removed unnecessary method overloading and fixed documentation

This commit is contained in:
Bhavna Jindal 2024-02-01 09:24:12 -08:00
parent 6d24ffc976
commit ce31fa912b
4 changed files with 75 additions and 109 deletions

View File

@ -10,33 +10,27 @@ Below functions are used for series data analysis.
## seriesOutliersDetectTukey
Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
**Syntax**
``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, K);
```
**Arguments**
- `series` - An array of numeric values.
- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'.
- `min_percentile` - The minimum percentile used to calculate the inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in the range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile used to calculate the inter-quantile range (IQR). The value must be in the range [2,98]. The default is 75.
- `K` - Non-negative constant value used to detect mild or stronger outliers. The default value is 1.5.
At least four data points are required in `series` to detect outliers.
Default quantile range:
- `tukey` - 25%/75%
- `ctukey` - 10%/90%
**Returned value**
- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series.
- A non-zero score indicates a possible anomaly.
- Returns an array of the same length as the input array, where each value represents the score of a possible anomaly for the corresponding element in the series. A non-zero score indicates a possible anomaly.
Type: [Array](../../sql-reference/data-types/array.md).
@ -51,23 +45,23 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4,
Result:
``` text
┌───────────print_0───────────────────
│[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0] │
└─────────────────────────────────────
┌───────────print_0─────────────────┐
│[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0] │
└───────────────────────────────────┘
```
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0;
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```
Result:
``` text
┌─print_0────────────────────────────┐
│ [0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0] │
└────────────────────────────────────┘
┌─print_0──────────────────────────────┐
│ [0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0] │
└──────────────────────────────────────┘
```
## seriesPeriodDetectFFT

View File

@ -14,9 +14,10 @@ namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
///Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences)
/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences)
class FunctionSeriesOutliersDetectTukey : public IFunction
{
public:
@ -36,9 +37,15 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() != 1 && arguments.size() != 4)
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function {} needs either 1 or 4 arguments; passed {}.",
getName(),
arguments.size());
FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray<IDataType>, nullptr, "Array"}};
FunctionArgumentDescriptors optional_args{
{"kind", &isString<IDataType>, isColumnConst, "const String"},
{"min_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
{"max_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
{"k", &isNativeNumber<IDataType>, isColumnConst, "Number"}};
@ -48,9 +55,9 @@ public:
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>());
}
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3, 4}; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
ColumnPtr col = arguments[0].column;
const ColumnArray * col_arr = checkAndGetColumn<ColumnArray>(col.get());
@ -58,62 +65,36 @@ public:
const IColumn & arr_data = col_arr->getData();
const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets();
Float64 min_percentile = 0.10; //default 10th percentile
Float64 max_percentile = 0.90; //default 90th percentile
ColumnPtr col_res;
if (input_rows_count == 0)
return ColumnArray::create(ColumnFloat64::create());
Float64 min_percentile = 0.25; /// default 25th percentile
Float64 max_percentile = 0.75; /// default 75th percentile
Float64 K = 1.50;
if (arguments.size() > 1)
{
//const IColumn * arg_column = arguments[1].column.get();
const ColumnConst * arg_string = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
Float64 p_min = arguments[1].column->getFloat64(0);
if (p_min < 2.0 || p_min > 98.0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName());
if (!arg_string)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument of function {} must be constant String", getName());
min_percentile = p_min / 100;
String kind = arg_string->getValue<String>();
if (kind == "ctukey")
{
if (arguments.size() > 2)
{
Float64 p_min = arguments[2].column->getFloat64(0);
if (p_min >= 2.0 && p_min <= 98.0)
min_percentile = p_min / 100;
else
throw Exception(
ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName());
}
Float64 p_max = arguments[2].column->getFloat64(0);
if (p_max < 2.0 || p_max > 98.0 || p_max < min_percentile * 100)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName());
if (arguments.size() > 3)
{
Float64 p_max = arguments[3].column->getFloat64(0);
if (p_max >= 2.0 && p_max <= 98.0 && p_max > min_percentile * 100)
max_percentile = p_max / 100;
else
throw Exception(
ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be in range [2.0, 98.0]", getName());
}
}
else if (kind == "tukey")
{
min_percentile = 0.25;
max_percentile = 0.75;
}
else
throw Exception(
ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} can only be 'tukey' or 'ctukey'.", getName());
max_percentile = p_max / 100;
auto k_val = arguments[3].column->getFloat64(0);
if (k_val < 0.0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName());
K = k_val;
}
Float64 K = 1.50;
if (arguments.size() == 5)
{
auto k_val = arguments[4].column->getFloat64(0);
if (k_val >= 0.0)
K = k_val;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fifth argument of function {} must be a positive number", getName());
}
ColumnPtr col_res;
if (executeNumber<UInt8>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<UInt16>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<UInt32>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
@ -172,7 +153,7 @@ private:
Float64 q1, q2;
auto p1 = len * min_percentile;
Float64 p1 = len * min_percentile;
if (p1 == static_cast<Int64>(p1))
{
size_t index = static_cast<size_t>(p1) - 1;
@ -184,7 +165,7 @@ private:
q1 = src_sorted[index];
}
auto p2 = len * max_percentile;
Float64 p2 = len * max_percentile;
if (p2 == static_cast<Int64>(p2))
{
size_t index = static_cast<size_t>(p2) - 1;
@ -219,33 +200,27 @@ REGISTER_FUNCTION(SeriesOutliersDetectTukey)
{
factory.registerFunction<FunctionSeriesOutliersDetectTukey>(FunctionDocumentation{
.description = R"(
Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
**Syntax**
``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, kind, min_percentile, max_percentile, K);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, K);
```
**Arguments**
- `series` - An array of numeric values.
- `kind` - Kind of algorithm to use. Supported values are 'tukey' for standard tukey and 'ctukey' for custom tukey algorithm. The default is 'ctukey'.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 10. This value is only supported for 'ctukey'.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range(IQR). The value must be in range [2,98]. The default is 90. This value is only supported for 'ctukey'.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75.
- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5
Default quantile range:
- `tukey` - 25%/75%
- `ctukey` - 10%/90%
At least four data points are required in `series` to detect outliers.
**Returned value**
- Returns an array of the same length where each value represents score of possible anomaly of corresponding element in the series.
- A non-zero score indicates a possible anomaly.
- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly.
Type: [Array](../../sql-reference/data-types/array.md).
@ -260,23 +235,23 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4,
Result:
``` text
print_0
[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0]
print_0
[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0]
```
Query:
``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 20, 80, 1.5) AS print_0;
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```
Result:
``` text
print_0
[0,0,0,0,0,0,0,0,0,12,0,0,0,0,0,0]
print_0
[0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0]
```)",
.categories{"Time series analysis"}});
}

View File

@ -1,14 +1,12 @@
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0]
[-4.475000000000001,0,6.925000000000001,0,0,0,0,0,0,0,0,7.925000000000001,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.975,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,26.1,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,11.100000000000001,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0]
[-2.4999999999999996,0,5.1,0,0,0,0,0,2.0999999999999996,50.1,2.0999999999999996,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,10.5,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27.3,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0]
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0]
[0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0]

View File

@ -1,11 +1,11 @@
DROP TABLE IF EXISTS tb1;
CREATE TABLE tb1 (n UInt32, a Array(Float64)) engine=Memory;
INSERT INTO tb1 VALUES (1, [-3,2.40,15,3.90,5,6,4.50,5.20,3,4,5,16,7,5,5,4]), (2, [-3,2.40,15,3.90,5,6,4.50,5.20,12,45,12,3.40,3,4,5,6]);
INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7, 5, 5, 4]), (2, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 45, 12, 3.40, 3, 4, 5, 6]);
-- non-const inputs
SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n;
SELECT seriesOutliersDetectTukey(a,'ctukey', 25,75) FROM tb1 ORDER BY n;
SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n;
DROP TABLE IF EXISTS tb1;
-- const inputs
@ -13,18 +13,17 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40,
SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]);
-- const inputs with optional arguments
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 25, 75);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 10, 90);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'tukey', 10, 90);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 'ctukey', 2, 98);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'ctukey', 2, 98);
SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)), 'tukey');
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, 3);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5);
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5);
SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)));
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3);
-- negative tests
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 'tukey', 25, 75, -1); -- { serverError BAD_ARGUMENTS}
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 'xyz', 33, 53); -- { serverError BAD_ARGUMENTS}
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS}
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN}
SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN}
SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS}