2023-12-05 15:44:42 +00:00
# include <Columns/ColumnArray.h>
2024-01-18 15:26:32 +00:00
# include <Columns/ColumnString.h>
2023-12-05 15:44:42 +00:00
# include <Columns/ColumnsNumber.h>
# include <DataTypes/DataTypeArray.h>
# include <DataTypes/DataTypesNumber.h>
# include <Functions/FunctionFactory.h>
# include <Functions/FunctionHelpers.h>
# include <Functions/IFunction.h>
2024-02-05 17:23:11 +00:00
# include <Common/NaNUtils.h>
2024-01-31 18:35:08 +00:00
# include <cmath>
2023-12-05 15:44:42 +00:00
namespace DB
{
namespace ErrorCodes
{
    /// Error codes thrown by the function in this file.
    /// They are only declared here (`extern`); the definitions live in another translation unit.
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
2024-02-01 17:24:12 +00:00
/// Detects a possible anomaly in series using [Tukey Fences](https://en.wikipedia.org/wiki/Outlier#Tukey%27s_fences)
2024-01-18 15:26:32 +00:00
class FunctionSeriesOutliersDetectTukey : public IFunction
2023-12-05 15:44:42 +00:00
{
public :
2024-01-18 15:26:32 +00:00
static constexpr auto name = " seriesOutliersDetectTukey " ;
2023-12-05 15:44:42 +00:00
2024-01-18 15:26:32 +00:00
static FunctionPtr create ( ContextPtr ) { return std : : make_shared < FunctionSeriesOutliersDetectTukey > ( ) ; }
2023-12-05 15:44:42 +00:00
std : : string getName ( ) const override { return name ; }
2024-01-18 15:26:32 +00:00
bool isVariadic ( ) const override { return true ; }
size_t getNumberOfArguments ( ) const override { return 0 ; }
2023-12-05 15:44:42 +00:00
bool useDefaultImplementationForConstants ( ) const override { return true ; }
bool isSuitableForShortCircuitArgumentsExecution ( const DataTypesWithConstInfo & /*arguments*/ ) const override { return true ; }
DataTypePtr getReturnTypeImpl ( const ColumnsWithTypeAndName & arguments ) const override
{
2024-02-01 17:24:12 +00:00
if ( arguments . size ( ) ! = 1 & & arguments . size ( ) ! = 4 )
throw Exception (
ErrorCodes : : NUMBER_OF_ARGUMENTS_DOESNT_MATCH ,
" Function {} needs either 1 or 4 arguments; passed {}. " ,
getName ( ) ,
arguments . size ( ) ) ;
2024-01-18 15:26:32 +00:00
FunctionArgumentDescriptors mandatory_args { { " time_series " , & isArray < IDataType > , nullptr , " Array " } } ;
FunctionArgumentDescriptors optional_args {
2024-02-29 12:15:25 +00:00
{ " min_percentile " , & isFloat < IDataType > , isColumnConst , " Number " } ,
{ " max_percentile " , & isFloat < IDataType > , isColumnConst , " Number " } ,
2024-01-31 18:35:08 +00:00
{ " k " , & isNativeNumber < IDataType > , isColumnConst , " Number " } } ;
2024-01-18 15:26:32 +00:00
validateFunctionArgumentTypes ( * this , arguments , mandatory_args , optional_args ) ;
2023-12-05 15:44:42 +00:00
return std : : make_shared < DataTypeArray > ( std : : make_shared < DataTypeFloat64 > ( ) ) ;
}
2024-02-01 17:24:12 +00:00
ColumnNumbers getArgumentsThatAreAlwaysConstant ( ) const override { return { 1 , 2 , 3 } ; }
2024-01-18 15:26:32 +00:00
2024-02-01 17:24:12 +00:00
ColumnPtr executeImpl ( const ColumnsWithTypeAndName & arguments , const DataTypePtr & , size_t input_rows_count ) const override
2023-12-05 15:44:42 +00:00
{
2024-01-31 18:35:08 +00:00
ColumnPtr col = arguments [ 0 ] . column ;
const ColumnArray * col_arr = checkAndGetColumn < ColumnArray > ( col . get ( ) ) ;
2023-12-05 15:44:42 +00:00
2024-01-31 18:35:08 +00:00
const IColumn & arr_data = col_arr - > getData ( ) ;
const ColumnArray : : Offsets & arr_offsets = col_arr - > getOffsets ( ) ;
2023-12-05 15:44:42 +00:00
2024-02-01 17:24:12 +00:00
ColumnPtr col_res ;
if ( input_rows_count = = 0 )
return ColumnArray : : create ( ColumnFloat64 : : create ( ) ) ;
Float64 min_percentile = 0.25 ; /// default 25th percentile
Float64 max_percentile = 0.75 ; /// default 75th percentile
2024-02-02 15:46:36 +00:00
Float64 k = 1.50 ;
2024-01-18 15:26:32 +00:00
2024-01-31 18:35:08 +00:00
if ( arguments . size ( ) > 1 )
2024-01-18 15:26:32 +00:00
{
2024-02-29 12:15:25 +00:00
static constexpr Float64 min_percentile_lower_bound = 0.02 ;
static constexpr Float64 max_percentile_upper_bound = 0.98 ;
2024-01-18 15:26:32 +00:00
2024-02-29 12:15:25 +00:00
min_percentile = arguments [ 1 ] . column - > getFloat64 ( 0 ) ;
if ( isnan ( min_percentile ) | | ! isFinite ( min_percentile ) | | min_percentile < min_percentile_lower_bound | | min_percentile > max_percentile_upper_bound )
throw Exception ( ErrorCodes : : BAD_ARGUMENTS , " The second argument of function {} must be in range [2.0, 98.0] " , getName ( ) ) ;
2024-01-18 15:26:32 +00:00
2024-02-29 12:15:25 +00:00
max_percentile = arguments [ 2 ] . column - > getFloat64 ( 0 ) ;
if ( isnan ( max_percentile ) | | ! isFinite ( max_percentile ) | | max_percentile < min_percentile_lower_bound | | max_percentile > max_percentile_upper_bound | | max_percentile < min_percentile )
2024-02-01 17:24:12 +00:00
throw Exception ( ErrorCodes : : BAD_ARGUMENTS , " The third argument of function {} must be in range [2.0, 98.0] " , getName ( ) ) ;
2024-01-18 15:26:32 +00:00
2024-02-29 12:15:25 +00:00
k = arguments [ 3 ] . column - > getFloat64 ( 0 ) ;
if ( k < 0.0 | | isnan ( k ) | | ! isFinite ( k ) )
2024-02-01 17:24:12 +00:00
throw Exception ( ErrorCodes : : BAD_ARGUMENTS , " The fourth argument of function {} must be a positive number " , getName ( ) ) ;
}
2024-01-31 18:35:08 +00:00
2024-02-02 15:46:36 +00:00
if ( executeNumber < UInt8 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < UInt16 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < UInt32 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < UInt64 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Int8 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Int16 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Int32 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Int64 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Float32 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res )
| | executeNumber < Float64 > ( arr_data , arr_offsets , min_percentile , max_percentile , k , col_res ) )
2023-12-05 15:44:42 +00:00
{
2024-01-31 18:35:08 +00:00
return col_res ;
2023-12-05 15:44:42 +00:00
}
else
throw Exception (
ErrorCodes : : ILLEGAL_COLUMN ,
" Illegal column {} of first argument of function {} " ,
arguments [ 0 ] . column - > getName ( ) ,
getName ( ) ) ;
}
2024-01-31 18:35:08 +00:00
private :
2023-12-05 15:44:42 +00:00
template < typename T >
2024-01-31 18:35:08 +00:00
bool executeNumber (
const IColumn & arr_data ,
const ColumnArray : : Offsets & arr_offsets ,
Float64 min_percentile ,
Float64 max_percentile ,
2024-02-02 15:46:36 +00:00
Float64 k ,
2024-01-31 18:35:08 +00:00
ColumnPtr & res_ptr ) const
2023-12-05 15:44:42 +00:00
{
2024-01-31 18:35:08 +00:00
const ColumnVector < T > * src_data_concrete = checkAndGetColumn < ColumnVector < T > > ( & arr_data ) ;
2023-12-05 15:44:42 +00:00
if ( ! src_data_concrete )
return false ;
const PaddedPODArray < T > & src_vec = src_data_concrete - > getData ( ) ;
auto outliers = ColumnFloat64 : : create ( ) ;
auto & outlier_data = outliers - > getData ( ) ;
ColumnArray : : ColumnOffsets : : MutablePtr res_offsets = ColumnArray : : ColumnOffsets : : create ( ) ;
auto & res_offsets_data = res_offsets - > getData ( ) ;
2024-01-31 18:35:08 +00:00
std : : vector < Float64 > src_sorted ;
2023-12-05 15:44:42 +00:00
ColumnArray : : Offset prev_src_offset = 0 ;
2024-01-31 18:35:08 +00:00
for ( auto src_offset : arr_offsets )
2023-12-05 15:44:42 +00:00
{
2024-01-31 18:35:08 +00:00
chassert ( prev_src_offset < = src_offset ) ;
size_t len = src_offset - prev_src_offset ;
2023-12-05 15:44:42 +00:00
if ( len < 4 )
throw Exception ( ErrorCodes : : BAD_ARGUMENTS , " At least four data points are needed for function {} " , getName ( ) ) ;
2024-01-31 18:35:08 +00:00
src_sorted . assign ( src_vec . begin ( ) + prev_src_offset , src_vec . begin ( ) + src_offset ) ;
2023-12-05 15:44:42 +00:00
std : : sort ( src_sorted . begin ( ) , src_sorted . end ( ) ) ;
2024-02-29 05:30:28 +00:00
Float64 q1 ;
Float64 q2 ;
2023-12-05 15:44:42 +00:00
2024-02-01 17:24:12 +00:00
Float64 p1 = len * min_percentile ;
2024-01-31 18:35:08 +00:00
if ( p1 = = static_cast < Int64 > ( p1 ) )
{
size_t index = static_cast < size_t > ( p1 ) - 1 ;
q1 = ( src_sorted [ index ] + src_sorted [ index + 1 ] ) / 2 ;
2024-01-18 15:26:32 +00:00
}
else
2024-01-31 18:35:08 +00:00
{
size_t index = static_cast < size_t > ( std : : ceil ( p1 ) ) - 1 ;
2024-01-18 15:26:32 +00:00
q1 = src_sorted [ index ] ;
}
2024-02-01 17:24:12 +00:00
Float64 p2 = len * max_percentile ;
2024-01-31 18:35:08 +00:00
if ( p2 = = static_cast < Int64 > ( p2 ) )
{
size_t index = static_cast < size_t > ( p2 ) - 1 ;
q2 = ( src_sorted [ index ] + src_sorted [ index + 1 ] ) / 2 ;
2024-01-18 15:26:32 +00:00
}
else
2024-01-31 18:35:08 +00:00
{
size_t index = static_cast < size_t > ( std : : ceil ( p2 ) ) - 1 ;
2024-01-18 15:26:32 +00:00
q2 = src_sorted [ index ] ;
}
2023-12-05 15:44:42 +00:00
2024-01-31 18:35:08 +00:00
Float64 iqr = q2 - q1 ; /// interquantile range
2023-12-05 15:44:42 +00:00
2024-02-02 15:46:36 +00:00
Float64 lower_fence = q1 - k * iqr ;
Float64 upper_fence = q2 + k * iqr ;
2023-12-05 15:44:42 +00:00
2024-01-31 18:35:08 +00:00
for ( ColumnArray : : Offset j = prev_src_offset ; j < src_offset ; + + j )
2023-12-05 15:44:42 +00:00
{
2024-01-31 18:35:08 +00:00
auto score = std : : min ( ( src_vec [ j ] - lower_fence ) , 0.0 ) + std : : max ( ( src_vec [ j ] - upper_fence ) , 0.0 ) ;
2023-12-05 15:44:42 +00:00
outlier_data . push_back ( score ) ;
}
res_offsets_data . push_back ( outlier_data . size ( ) ) ;
2024-01-31 18:35:08 +00:00
prev_src_offset = src_offset ;
2023-12-05 15:44:42 +00:00
}
res_ptr = ColumnArray : : create ( std : : move ( outliers ) , std : : move ( res_offsets ) ) ;
return true ;
}
} ;
2024-01-18 15:26:32 +00:00
REGISTER_FUNCTION ( SeriesOutliersDetectTukey )
2023-12-05 15:44:42 +00:00
{
2024-01-18 15:26:32 +00:00
factory . registerFunction < FunctionSeriesOutliersDetectTukey > ( FunctionDocumentation {
2023-12-05 15:44:42 +00:00
. description = R " (
2024-02-01 17:24:12 +00:00
Detects outliers in series data using [ Tukey Fences ] ( https : //en.wikipedia.org/wiki/Outlier#Tukey%27s_fences).
2024-01-31 18:35:08 +00:00
2023-12-05 15:44:42 +00:00
* * Syntax * *
` ` ` sql
2024-01-18 15:26:32 +00:00
seriesOutliersDetectTukey ( series ) ;
2024-02-02 15:46:36 +00:00
seriesOutliersDetectTukey ( series , min_percentile , max_percentile , k ) ;
2023-12-05 15:44:42 +00:00
` ` `
* * Arguments * *
2024-01-31 18:35:08 +00:00
- ` series ` - An array of numeric values .
2024-02-29 12:15:25 +00:00
- ` min_quantile ` - The minimum quantile to be used to calculate inter - quantile range [ ( IQR ) ] ( https : //en.wikipedia.org/wiki/Interquartile_range). The value must be in range [0.02,0.98]. The default is 0.25.
2024-02-29 05:30:28 +00:00
- ` max_quantile ` - The maximum quantile to be used to calculate inter - quantile range ( IQR ) . The value must be in range [ 0.02 , 0.98 ] . The default is 0.75 .
2024-02-02 15:46:36 +00:00
- ` k ` - Non - negative constant value to detect mild or stronger outliers . The default value is 1.5
2024-01-31 18:35:08 +00:00
2024-02-01 17:24:12 +00:00
At least four data points are required in ` series ` to detect outliers .
2023-12-05 15:44:42 +00:00
* * Returned value * *
2024-02-01 17:24:12 +00:00
- Returns an array of the same length as the input array where each value represents score of possible anomaly of corresponding element in the series . A non - zero score indicates a possible anomaly .
2023-12-05 15:44:42 +00:00
Type : [ Array ] ( . . / . . / sql - reference / data - types / array . md ) .
* * Examples * *
Query :
` ` ` sql
2024-01-31 18:35:08 +00:00
SELECT seriesOutliersDetectTukey ( [ - 3 , 2 , 15 , 3 , 5 , 6 , 4 , 5 , 12 , 45 , 12 , 3 , 3 , 4 , 5 , 6 ] ) AS print_0 ;
2023-12-05 15:44:42 +00:00
` ` `
Result :
` ` ` text
2024-02-01 17:24:12 +00:00
┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ print_0 ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
│ [ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 27 , 0 , 0 , 0 , 0 , 0 , 0 ] │
└ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
2023-12-05 15:44:42 +00:00
` ` `
Query :
` ` ` sql
2024-02-29 12:15:25 +00:00
SELECT seriesOutliersDetectTukey ( [ - 3 , 2 , 15 , 3 , 5 , 6 , 4.50 , 5 , 12 , 45 , 12 , 3.40 , 3 , 4 , 5 , 6 ] , 0.2 , 0.8 , 1.5 ) AS print_0 ;
2023-12-05 15:44:42 +00:00
` ` `
Result :
` ` ` text
2024-02-01 17:24:12 +00:00
┌ ─ print_0 ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┐
│ [ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 19.5 , 0 , 0 , 0 , 0 , 0 , 0 ] │
└ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
2023-12-05 15:44:42 +00:00
` ` ` ) " ,
. categories { " Time series analysis " } } ) ;
}
}