ClickHouse/src/Functions/seriesOutliersDetectTukey.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

259 lines
9.9 KiB
C++
Raw Normal View History

2023-12-05 15:44:42 +00:00
#include <Columns/ColumnArray.h>
2024-01-18 15:26:32 +00:00
#include <Columns/ColumnString.h>
2023-12-05 15:44:42 +00:00
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <cmath>
2023-12-05 15:44:42 +00:00
namespace DB
{
/// Error codes thrown by seriesOutliersDetectTukey.
/// They are only declared here (`extern`); the definitions live in another translation unit.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;                     /// invalid percentile / K value, or a series that is too short
    extern const int ILLEGAL_COLUMN;                    /// first argument is not an array of a supported numeric type
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;  /// called with an argument count other than 1 or 4
}
/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences)
2024-01-18 15:26:32 +00:00
class FunctionSeriesOutliersDetectTukey : public IFunction
2023-12-05 15:44:42 +00:00
{
public:
2024-01-18 15:26:32 +00:00
static constexpr auto name = "seriesOutliersDetectTukey";
2023-12-05 15:44:42 +00:00
2024-01-18 15:26:32 +00:00
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSeriesOutliersDetectTukey>(); }
2023-12-05 15:44:42 +00:00
std::string getName() const override { return name; }
2024-01-18 15:26:32 +00:00
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
2023-12-05 15:44:42 +00:00
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() != 1 && arguments.size() != 4)
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function {} needs either 1 or 4 arguments; passed {}.",
getName(),
arguments.size());
2024-01-18 15:26:32 +00:00
FunctionArgumentDescriptors mandatory_args{{"time_series", &isArray<IDataType>, nullptr, "Array"}};
FunctionArgumentDescriptors optional_args{
{"min_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
{"max_percentile", &isNativeNumber<IDataType>, isColumnConst, "Number"},
{"k", &isNativeNumber<IDataType>, isColumnConst, "Number"}};
2024-01-18 15:26:32 +00:00
validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);
2023-12-05 15:44:42 +00:00
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>());
}
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; }
2024-01-18 15:26:32 +00:00
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
2023-12-05 15:44:42 +00:00
{
ColumnPtr col = arguments[0].column;
const ColumnArray * col_arr = checkAndGetColumn<ColumnArray>(col.get());
2023-12-05 15:44:42 +00:00
const IColumn & arr_data = col_arr->getData();
const ColumnArray::Offsets & arr_offsets = col_arr->getOffsets();
2023-12-05 15:44:42 +00:00
ColumnPtr col_res;
if (input_rows_count == 0)
return ColumnArray::create(ColumnFloat64::create());
Float64 min_percentile = 0.25; /// default 25th percentile
Float64 max_percentile = 0.75; /// default 75th percentile
Float64 K = 1.50;
2024-01-18 15:26:32 +00:00
if (arguments.size() > 1)
2024-01-18 15:26:32 +00:00
{
Float64 p_min = arguments[1].column->getFloat64(0);
if (p_min < 2.0 || p_min > 98.0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName());
2024-01-18 15:26:32 +00:00
min_percentile = p_min / 100;
2024-01-18 15:26:32 +00:00
Float64 p_max = arguments[2].column->getFloat64(0);
if (p_max < 2.0 || p_max > 98.0 || p_max < min_percentile * 100)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName());
2024-01-18 15:26:32 +00:00
max_percentile = p_max / 100;
2023-12-05 15:44:42 +00:00
auto k_val = arguments[3].column->getFloat64(0);
if (k_val < 0.0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The fourth argument of function {} must be a positive number", getName());
K = k_val;
}
if (executeNumber<UInt8>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<UInt16>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<UInt32>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<UInt64>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Int8>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Int16>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Int32>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Int64>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Float32>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res)
|| executeNumber<Float64>(arr_data, arr_offsets, min_percentile, max_percentile, K, col_res))
2023-12-05 15:44:42 +00:00
{
return col_res;
2023-12-05 15:44:42 +00:00
}
else
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of first argument of function {}",
arguments[0].column->getName(),
getName());
}
private:
2023-12-05 15:44:42 +00:00
template <typename T>
bool executeNumber(
const IColumn & arr_data,
const ColumnArray::Offsets & arr_offsets,
Float64 min_percentile,
Float64 max_percentile,
Float64 K,
ColumnPtr & res_ptr) const
2023-12-05 15:44:42 +00:00
{
const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&arr_data);
2023-12-05 15:44:42 +00:00
if (!src_data_concrete)
return false;
const PaddedPODArray<T> & src_vec = src_data_concrete->getData();
auto outliers = ColumnFloat64::create();
auto & outlier_data = outliers->getData();
ColumnArray::ColumnOffsets::MutablePtr res_offsets = ColumnArray::ColumnOffsets::create();
auto & res_offsets_data = res_offsets->getData();
std::vector<Float64> src_sorted;
2023-12-05 15:44:42 +00:00
ColumnArray::Offset prev_src_offset = 0;
for (auto src_offset : arr_offsets)
2023-12-05 15:44:42 +00:00
{
chassert(prev_src_offset <= src_offset);
size_t len = src_offset - prev_src_offset;
2023-12-05 15:44:42 +00:00
if (len < 4)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "At least four data points are needed for function {}", getName());
src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset);
2023-12-05 15:44:42 +00:00
std::sort(src_sorted.begin(), src_sorted.end());
2024-01-18 15:26:32 +00:00
Float64 q1, q2;
2023-12-05 15:44:42 +00:00
Float64 p1 = len * min_percentile;
if (p1 == static_cast<Int64>(p1))
{
size_t index = static_cast<size_t>(p1) - 1;
q1 = (src_sorted[index] + src_sorted[index + 1]) / 2;
2024-01-18 15:26:32 +00:00
}
else
{
size_t index = static_cast<size_t>(std::ceil(p1)) - 1;
2024-01-18 15:26:32 +00:00
q1 = src_sorted[index];
}
Float64 p2 = len * max_percentile;
if (p2 == static_cast<Int64>(p2))
{
size_t index = static_cast<size_t>(p2) - 1;
q2 = (src_sorted[index] + src_sorted[index + 1]) / 2;
2024-01-18 15:26:32 +00:00
}
else
{
size_t index = static_cast<size_t>(std::ceil(p2)) - 1;
2024-01-18 15:26:32 +00:00
q2 = src_sorted[index];
}
2023-12-05 15:44:42 +00:00
Float64 iqr = q2 - q1; /// interquantile range
2023-12-05 15:44:42 +00:00
Float64 lower_fence = q1 - K * iqr;
Float64 upper_fence = q2 + K * iqr;
2023-12-05 15:44:42 +00:00
for (ColumnArray::Offset j = prev_src_offset; j < src_offset; ++j)
2023-12-05 15:44:42 +00:00
{
auto score = std::min((src_vec[j] - lower_fence), 0.0) + std::max((src_vec[j] - upper_fence), 0.0);
2023-12-05 15:44:42 +00:00
outlier_data.push_back(score);
}
res_offsets_data.push_back(outlier_data.size());
prev_src_offset = src_offset;
2023-12-05 15:44:42 +00:00
}
res_ptr = ColumnArray::create(std::move(outliers), std::move(res_offsets));
return true;
}
};
2024-01-18 15:26:32 +00:00
/// Registers seriesOutliersDetectTukey in the function factory together with its user-facing documentation.
REGISTER_FUNCTION(SeriesOutliersDetectTukey)
{
    factory.registerFunction<FunctionSeriesOutliersDetectTukey>(FunctionDocumentation{
        .description = R"(
Detects outliers in series data using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).

**Syntax**

``` sql
seriesOutliersDetectTukey(series);
seriesOutliersDetectTukey(series, min_percentile, max_percentile, K);
```

**Arguments**

- `series` - An array of numeric values.
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98] and must not be less than `min_percentile`. The default is 75.
- `K` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5.

At least four data points are required in `series` to detect outliers.

**Returned value**

- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series. A non-zero score indicates a possible anomaly: the score is negative for values below the lower fence and positive for values above the upper fence.

Type: [Array](../../sql-reference/data-types/array.md) of Float64.

**Examples**

Query:

``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6]) AS print_0;
```

Result:

``` text
print_0
[0,0,0,0,0,0,0,0,0,27,0,0,0,0,0,0]
```

Query:

``` sql
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
```

Result:

``` text
print_0
[0,0,0,0,0,0,0,0,0,19.5,0,0,0,0,0,0]
```)",
        .categories{"Time series analysis"}});
}
}