mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
General sanity in function seriesOutliersDetectTukey
This commit is contained in:
parent
3875c56369
commit
414e6eb600
@ -22,7 +22,7 @@
|
||||
* Backup & Restore support for `AzureBlobStorage`. Resolves [#50747](https://github.com/ClickHouse/ClickHouse/issues/50747). [#56988](https://github.com/ClickHouse/ClickHouse/pull/56988) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
|
||||
* The user can now specify the template string directly in the query using `format_schema_rows_template` as an alternative to `format_template_row`. Closes [#31363](https://github.com/ClickHouse/ClickHouse/issues/31363). [#59088](https://github.com/ClickHouse/ClickHouse/pull/59088) ([Shaun Struwig](https://github.com/Blargian)).
|
||||
* Implemented automatic conversion of merge tree tables of different kinds to replicated engine. Create empty `convert_to_replicated` file in table's data directory (`/clickhouse/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`) and that table will be converted automatically on next server start. [#57798](https://github.com/ClickHouse/ClickHouse/pull/57798) ([Kirill](https://github.com/kirillgarbar)).
|
||||
* Added function `seriesOutliersTukey` to detect outliers in series data using Tukey's fences algorithm. [#58632](https://github.com/ClickHouse/ClickHouse/pull/58632) ([Bhavna Jindal](https://github.com/bhavnajindal)).
|
||||
* Added function `seriesOutliersDetectTukey` to detect outliers in series data using Tukey's fences algorithm. [#58632](https://github.com/ClickHouse/ClickHouse/pull/58632) ([Bhavna Jindal](https://github.com/bhavnajindal)).
|
||||
* Added query `ALTER TABLE table FORGET PARTITION partition` that removes ZooKeeper nodes, related to an empty partition. [#59507](https://github.com/ClickHouse/ClickHouse/pull/59507) ([Sergei Trifonov](https://github.com/serxa)). This is an expert-level feature.
|
||||
* Support JWT credentials file for the NATS table engine. [#59543](https://github.com/ClickHouse/ClickHouse/pull/59543) ([Nickolaj Jepsen](https://github.com/nickolaj-jepsen)).
|
||||
* Implemented system.dns_cache table, which can be useful for debugging DNS issues. [#59856](https://github.com/ClickHouse/ClickHouse/pull/59856) ([Kirill Nikiforov](https://github.com/allmazz)).
|
||||
@ -60,7 +60,7 @@
|
||||
* Now if `StorageBuffer` has more than 1 shard (`num_layers` > 1) background flush will happen simultaneously for all shards in multiple threads. [#60111](https://github.com/ClickHouse/ClickHouse/pull/60111) ([alesapin](https://github.com/alesapin)).
|
||||
|
||||
#### Improvement
|
||||
* When output format is Pretty format and a block consists of a single numeric value which exceeds one million, A readable number will be printed on table right. [#60379](https://github.com/ClickHouse/ClickHouse/pull/60379) ([rogeryk](https://github.com/rogeryk)).
|
||||
* When output format is `Pretty` format and a block consists of a single numeric value which exceeds one million, A readable number will be printed on table right. [#60379](https://github.com/ClickHouse/ClickHouse/pull/60379) ([rogeryk](https://github.com/rogeryk)).
|
||||
* Added settings `split_parts_ranges_into_intersecting_and_non_intersecting_final` and `split_intersecting_parts_ranges_into_layers_final`. These settings are needed to disable optimizations for queries with `FINAL` and needed for debug only. [#59705](https://github.com/ClickHouse/ClickHouse/pull/59705) ([Maksim Kita](https://github.com/kitaisreal)). Actually not only for that - they can also lower memory usage at the expense of performance.
|
||||
* Rename the setting `extract_kvp_max_pairs_per_row` to `extract_key_value_pairs_max_pairs_per_row`. The issue (unnecessary abbreviation in the setting name) was introduced in https://github.com/ClickHouse/ClickHouse/pull/43606. Fix the documentation of this setting. [#59683](https://github.com/ClickHouse/ClickHouse/pull/59683) ([Alexey Milovidov](https://github.com/alexey-milovidov)). [#59960](https://github.com/ClickHouse/ClickHouse/pull/59960) ([jsc0218](https://github.com/jsc0218)).
|
||||
* Running `ALTER COLUMN MATERIALIZE` on a column with `DEFAULT` or `MATERIALIZED` expression now precisely follows the semantics. [#58023](https://github.com/ClickHouse/ClickHouse/pull/58023) ([Duc Canh Le](https://github.com/canhld94)).
|
||||
|
@ -24,8 +24,8 @@ class FunctionSeriesOutliersDetectTukey : public IFunction
|
||||
public:
|
||||
static constexpr auto name = "seriesOutliersDetectTukey";
|
||||
|
||||
static constexpr Float64 min_quartile = 2.0;
|
||||
static constexpr Float64 max_quartile = 98.0;
|
||||
static constexpr Float64 min_quartile = 0.02;
|
||||
static constexpr Float64 max_quartile = 0.98;
|
||||
|
||||
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSeriesOutliersDetectTukey>(); }
|
||||
|
||||
@ -73,7 +73,6 @@ public:
|
||||
if (input_rows_count == 0)
|
||||
return ColumnArray::create(ColumnFloat64::create());
|
||||
|
||||
|
||||
Float64 min_percentile = 0.25; /// default 25th percentile
|
||||
Float64 max_percentile = 0.75; /// default 75th percentile
|
||||
Float64 k = 1.50;
|
||||
@ -84,13 +83,13 @@ public:
|
||||
if (isnan(p_min) || !isFinite(p_min) || p_min < min_quartile|| p_min > max_quartile)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The second argument of function {} must be in range [2.0, 98.0]", getName());
|
||||
|
||||
min_percentile = p_min / 100;
|
||||
min_percentile = p_min;
|
||||
|
||||
Float64 p_max = arguments[2].column->getFloat64(0);
|
||||
if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile * 100)
|
||||
if (isnan(p_max) || !isFinite(p_max) || p_max < min_quartile || p_max > max_quartile || p_max < min_percentile)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The third argument of function {} must be in range [2.0, 98.0]", getName());
|
||||
|
||||
max_percentile = p_max / 100;
|
||||
max_percentile = p_max;
|
||||
|
||||
auto k_val = arguments[3].column->getFloat64(0);
|
||||
if (k_val < 0.0 || isnan(k_val) || !isFinite(k_val))
|
||||
@ -155,7 +154,8 @@ private:
|
||||
src_sorted.assign(src_vec.begin() + prev_src_offset, src_vec.begin() + src_offset);
|
||||
std::sort(src_sorted.begin(), src_sorted.end());
|
||||
|
||||
Float64 q1, q2;
|
||||
Float64 q1;
|
||||
Float64 q2;
|
||||
|
||||
Float64 p1 = len * min_percentile;
|
||||
if (p1 == static_cast<Int64>(p1))
|
||||
@ -216,8 +216,8 @@ seriesOutliersDetectTukey(series, min_percentile, max_percentile, k);
|
||||
**Arguments**
|
||||
|
||||
- `series` - An array of numeric values.
|
||||
- `min_percentile` - The minimum percentile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
|
||||
- `max_percentile` - The maximum percentile to be used to calculate inter-quantile range (IQR). The value must be in range [2,98]. The default is 75.
|
||||
- `min_quantile` - The minimum quantile to be used to calculate inter-quantile range [(IQR)](https://en.wikipedia.org/wiki/Interquartile_range). The value must be in range [2,98]. The default is 25.
|
||||
- `max_quantile` - The maximum quantile to be used to calculate inter-quantile range (IQR). The value must be in range [0.02, 0.98]. The default is 0.75.
|
||||
- `k` - Non-negative constant value to detect mild or stronger outliers. The default value is 1.5
|
||||
|
||||
At least four data points are required in `series` to detect outliers.
|
||||
@ -247,7 +247,7 @@ Result:
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 20, 80, 1.5) AS print_0;
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], .2, .8, 1.5) AS print_0;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
@ -8,7 +8,7 @@ INSERT INTO tb1 VALUES (1, [-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 3, 4, 5, 16, 7
|
||||
|
||||
-- non-const inputs
|
||||
SELECT seriesOutliersDetectTukey(a) FROM tb1 ORDER BY n;
|
||||
SELECT seriesOutliersDetectTukey(a,10,90,1.5) FROM tb1 ORDER BY n;
|
||||
SELECT seriesOutliersDetectTukey(a,.10,.90,1.5) FROM tb1 ORDER BY n;
|
||||
DROP TABLE IF EXISTS tb1;
|
||||
|
||||
-- const inputs
|
||||
@ -16,17 +16,17 @@ SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40,
|
||||
SELECT seriesOutliersDetectTukey([-3, 2.40, 15, 3.90, 5, 6, 4.50, 5.20, 12, 60, 12, 3.40, 3, 4, 5, 6, 3.40, 2.7]);
|
||||
|
||||
-- const inputs with optional arguments
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 25, 75, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 10, 90, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], 2, 98, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], .25, .75, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], .10, .90, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4.50, 5, 12, 45, 12, 3.40, 3, 4, 5, 6], .02, .98, 1.5);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 2, 98, 1.5);
|
||||
SELECT seriesOutliersDetectTukey(arrayMap(x -> sin(x / 10), range(30)));
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, 3);
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], .25, .75, 3);
|
||||
|
||||
-- negative tests
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], 25, 75, -1); -- { serverError BAD_ARGUMENTS}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33, 53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], 33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3, 5, 6, 4, 5, 12, 45, 12, 3, 3, 4, 5, 6], .25, .75, -1); -- { serverError BAD_ARGUMENTS}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], .33, .53); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2, 15, 3], .33); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2.4, 15, NULL]); -- { serverError ILLEGAL_COLUMN}
|
||||
SELECT seriesOutliersDetectTukey([]); -- { serverError ILLEGAL_COLUMN}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS}
|
||||
SELECT seriesOutliersDetectTukey([-3, 2.4, 15]); -- { serverError BAD_ARGUMENTS}
|
||||
|
Loading…
Reference in New Issue
Block a user