From 15e20f56fa4a9a497e4cc247aa9215cb840203f9 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Fri, 21 Jun 2024 16:29:25 +0800 Subject: [PATCH] Add statistics cmsketch --- contrib/datasketches-cpp-cmake/CMakeLists.txt | 1 + .../statements/alter/statistics.md | 2 +- src/CMakeLists.txt | 1 + .../Statistics/CMSketchStatistics.cpp | 82 +++++++++++++++++++ src/Storages/Statistics/CMSketchStatistics.h | 39 +++++++++ .../ConditionSelectivityEstimator.cpp | 35 +++----- .../ConditionSelectivityEstimator.h | 5 +- src/Storages/Statistics/Statistics.cpp | 41 +++++++++- src/Storages/Statistics/Statistics.h | 5 +- src/Storages/StatisticsDescription.cpp | 4 + src/Storages/StatisticsDescription.h | 1 + .../03174_statistics_cmsketch.reference | 26 ++++++ .../0_stateless/03174_statistics_cmsketch.sql | 78 ++++++++++++++++++ 13 files changed, 290 insertions(+), 30 deletions(-) create mode 100644 src/Storages/Statistics/CMSketchStatistics.cpp create mode 100644 src/Storages/Statistics/CMSketchStatistics.h create mode 100644 tests/queries/0_stateless/03174_statistics_cmsketch.reference create mode 100644 tests/queries/0_stateless/03174_statistics_cmsketch.sql diff --git a/contrib/datasketches-cpp-cmake/CMakeLists.txt b/contrib/datasketches-cpp-cmake/CMakeLists.txt index b12a88ad57b..497d6956d0e 100644 --- a/contrib/datasketches-cpp-cmake/CMakeLists.txt +++ b/contrib/datasketches-cpp-cmake/CMakeLists.txt @@ -9,6 +9,7 @@ set(DATASKETCHES_LIBRARY theta) add_library(_datasketches INTERFACE) target_include_directories(_datasketches SYSTEM BEFORE INTERFACE "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/common/include" + "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/count/include" "${ClickHouse_SOURCE_DIR}/contrib/datasketches-cpp/theta/include") add_library(ch_contrib::datasketches ALIAS _datasketches) diff --git a/docs/en/sql-reference/statements/alter/statistics.md b/docs/en/sql-reference/statements/alter/statistics.md index 80024781f88..22ff740d410 100644 --- a/docs/en/sql-reference/statements/alter/statistics.md +++ b/docs/en/sql-reference/statements/alter/statistics.md @@ -25,7 +25,7 @@ Also, they are replicated, syncing statistics metadata via ZooKeeper. There is an example adding two statistics types to two columns: ``` -ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq; +ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq, CMSketch; ``` :::note diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 84aaec17a5b..4efb6004172 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -546,6 +546,7 @@ endif() if (TARGET ch_contrib::datasketches) target_link_libraries (clickhouse_aggregate_functions PRIVATE ch_contrib::datasketches) + dbms_target_link_libraries(PRIVATE ch_contrib::datasketches) endif () target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::lz4) diff --git a/src/Storages/Statistics/CMSketchStatistics.cpp b/src/Storages/Statistics/CMSketchStatistics.cpp new file mode 100644 index 00000000000..2c217d30278 --- /dev/null +++ b/src/Storages/Statistics/CMSketchStatistics.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include +#include + +#if USE_DATASKETCHES + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int ILLEGAL_STATISTICS; +} + + +CMSketchStatistics::CMSketchStatistics(const SingleStatisticsDescription & stat_, DataTypePtr data_type_) + : IStatistics(stat_), data(CMSKETCH_HASH_COUNT, CMSKETCH_BUCKET_COUNT), data_type(data_type_) +{ +} + +Float64 CMSketchStatistics::estimateEqual(const Field & value) const +{ + if (auto float_val = getFloat64(value)) + return data.get_estimate(&float_val.value(), 8); + if (auto string_val = getString(value)) + return data.get_estimate(string_val->data(), string_val->size()); + UNREACHABLE(); +} + +void CMSketchStatistics::serialize(WriteBuffer & buf) +{ + auto bytes = data.serialize(); + writeIntBinary(static_cast(bytes.size()), buf); + buf.write(reinterpret_cast(bytes.data()), bytes.size()); +} + +void CMSketchStatistics::deserialize(ReadBuffer & buf) +{ + UInt64 size; + readIntBinary(size, buf); + String s; + s.reserve(size); + buf.readStrict(s.data(), size); /// Extra copy can be avoided by implementing count_min_sketch::deserialize with ReadBuffer + auto read_sketch = datasketches::count_min_sketch::deserialize(s.data(), size, datasketches::DEFAULT_SEED); + data.merge(read_sketch); +} + +void CMSketchStatistics::update(const ColumnPtr & column) +{ + size_t size = column->size(); + + for (size_t i = 0; i < size; ++i) + { + Field f; + column->get(i, f); + if (f.isNull()) + continue; + if (auto float_val = getFloat64(f)) + data.update(&float_val.value(), 8, 1.0); + else if (auto string_val = getString(f)) + data.update(*string_val, 1.0); + } +} + +void CMSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +{ + data_type = removeNullable(data_type); + data_type = removeLowCardinalityAndNullable(data_type); + if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type)) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'cmsketch' does not support type {}", data_type->getName()); +} + +StatisticsPtr CMSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +{ + return std::make_shared(stat, data_type); +} + +} + +#endif diff --git a/src/Storages/Statistics/CMSketchStatistics.h b/src/Storages/Statistics/CMSketchStatistics.h new file mode 100644 index 00000000000..03964614d57 --- /dev/null +++ b/src/Storages/Statistics/CMSketchStatistics.h @@ -0,0 +1,39 @@ +#pragma once + +#if USE_DATASKETCHES + +#include +#include +#include + +namespace DB +{ + +/// CMSketchStatistics is used to estimate expression like col = 'value' or col in ('v1', 'v2'). +class CMSketchStatistics : public IStatistics +{ +public: + explicit CMSketchStatistics(const SingleStatisticsDescription & stat_, DataTypePtr data_type_); + + Float64 estimateEqual(const Field & value) const; + + void serialize(WriteBuffer & buf) override; + + void deserialize(ReadBuffer & buf) override; + + void update(const ColumnPtr & column) override; + +private: + static constexpr size_t CMSKETCH_HASH_COUNT = 8; + static constexpr size_t CMSKETCH_BUCKET_COUNT = 2048; + + datasketches::count_min_sketch data; + DataTypePtr data_type; +}; + +StatisticsPtr CMSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void CMSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type); + +} + +#endif diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.cpp b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp index 757136fdf42..73c5c549a5d 100644 --- a/src/Storages/Statistics/ConditionSelectivityEstimator.cpp +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp @@ -35,11 +35,14 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreat return rows - estimateLess(val, rows); } -Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(Float64 val, Float64 rows) const +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(Field val, Float64 rows) const { + auto float_val = getFloat64(val); if (part_statistics.empty()) { - if (val < - threshold || val > threshold) + if (!float_val) + return default_unknown_cond_factor * rows; + else if (float_val.value() < - threshold || float_val.value() > threshold) return default_normal_cond_factor * rows; else return default_good_cond_factor * rows; @@ -87,7 +90,7 @@ static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNod return result; } -std::pair ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const +std::pair ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const { if (!node.isFunction()) return {}; @@ -123,18 +126,7 @@ std::pair ConditionSelectivityEstimator::extractBinaryOp(const DataTypePtr output_type; if (!constant_node->tryGetConstant(output_value, output_type)) return {}; - - const auto type = output_value.getType(); - Float64 value; - if (type == Field::Types::Int64) - value = output_value.get(); - else if (type == Field::Types::UInt64) - value = output_value.get(); - else if (type == Field::Types::Float64) - value = output_value.get(); - else - return {}; - return std::make_pair(function_name, value); + return std::make_pair(function_name, output_value); } Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const @@ -142,7 +134,7 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode auto result = tryToExtractSingleColumn(node); if (result.second != 1) { - return default_unknown_cond_factor; + return default_unknown_cond_factor * total_rows; } String col = result.first; auto it = column_estimators.find(col); @@ -152,19 +144,16 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode bool dummy = total_rows == 0; ColumnSelectivityEstimator estimator; if (it != column_estimators.end()) - { estimator = it->second; - } else - { dummy = true; - } auto [op, val] = extractBinaryOp(node, col); + auto float_val = getFloat64(val); if (op == "equals") { if (dummy) { - if (val < - threshold || val > threshold) + if (!float_val || (float_val < - threshold || float_val > threshold)) return default_normal_cond_factor * total_rows; else return default_good_cond_factor * total_rows; @@ -175,13 +164,13 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode { if (dummy) return default_normal_cond_factor * total_rows; - return estimator.estimateLess(val, total_rows); + return estimator.estimateLess(float_val.value(), total_rows); } else if (op == "greater" || op == "greaterOrEquals") { if (dummy) return default_normal_cond_factor * total_rows; - return estimator.estimateGreater(val, total_rows); + return estimator.estimateGreater(float_val.value(), total_rows); } else return default_unknown_cond_factor * total_rows; diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.h b/src/Storages/Statistics/ConditionSelectivityEstimator.h index f0599742276..9bf4940e563 100644 --- a/src/Storages/Statistics/ConditionSelectivityEstimator.h +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB { @@ -24,7 +25,7 @@ private: Float64 estimateGreater(Float64 val, Float64 rows) const; - Float64 estimateEqual(Float64 val, Float64 rows) const; + Float64 estimateEqual(Field val, Float64 rows) const; }; static constexpr auto default_good_cond_factor = 0.1; @@ -37,7 +38,7 @@ private: UInt64 total_rows = 0; std::set part_names; std::map column_estimators; - std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; + std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; public: /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index fed0bd61c03..b35d653b7ec 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -26,6 +27,28 @@ enum StatisticsFileVersion : UInt16 V0 = 0, }; +std::optional getFloat64(const Field & f) +{ + const auto type = f.getType(); + Float64 value; + if (type == Field::Types::Int64) + value = f.get(); + else if (type == Field::Types::UInt64) + value = f.get(); + else if (type == Field::Types::Float64) + value = f.get(); + else + return {}; + return value; +} + +std::optional getString(const Field & f) +{ + if (f.getType() == Field::Types::String) + return f.get(); + return {}; +} + IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) {} ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_) @@ -54,9 +77,10 @@ Float64 ColumnStatistics::estimateGreater(Float64 val) const return rows - estimateLess(val); } -Float64 ColumnStatistics::estimateEqual(Float64 val) const +Float64 ColumnStatistics::estimateEqual(Field val) const { - if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) + auto float_val = getFloat64(val); + if (float_val && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) { auto uniq_static = std::static_pointer_cast(stats.at(StatisticsType::Uniq)); /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) @@ -64,9 +88,16 @@ Float64 ColumnStatistics::estimateEqual(Float64 val) const if (uniq_static->getCardinality() < 2048) { auto tdigest_static = std::static_pointer_cast(stats.at(StatisticsType::TDigest)); - return tdigest_static->estimateEqual(val); + return tdigest_static->estimateEqual(float_val.value()); } } +#if USE_DATASKETCHES + if (stats.contains(StatisticsType::CMSketch)) + { + auto cmsketch_static = std::static_pointer_cast(stats.at(StatisticsType::CMSketch)); + return cmsketch_static->estimateEqual(val); + } +#endif if (val < - ConditionSelectivityEstimator::threshold || val > ConditionSelectivityEstimator::threshold) return rows * ConditionSelectivityEstimator::default_normal_cond_factor; else @@ -145,6 +176,10 @@ MergeTreeStatisticsFactory::MergeTreeStatisticsFactory() registerCreator(StatisticsType::Uniq, UniqCreator); registerValidator(StatisticsType::TDigest, TDigestValidator); registerValidator(StatisticsType::Uniq, UniqValidator); +#if USE_DATASKETCHES + registerCreator(StatisticsType::CMSketch, CMSketchCreator); + registerValidator(StatisticsType::CMSketch, CMSketchValidator); +#endif } MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 2ab1337af02..f6121d72256 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -58,7 +59,7 @@ public: Float64 estimateGreater(Float64 val) const; - Float64 estimateEqual(Float64 val) const; + Float64 estimateEqual(Field val) const; private: @@ -100,4 +101,6 @@ private: Validators validators; }; +std::optional getFloat64(const Field & f); +std::optional getString(const Field & f); } diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index f10fb78f933..08c79043ac4 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -54,6 +54,8 @@ static StatisticsType stringToStatisticsType(String type) return StatisticsType::TDigest; if (type == "uniq") return StatisticsType::Uniq; + if (type == "cmsketch") + return StatisticsType::CMSketch; throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); } @@ -65,6 +67,8 @@ String SingleStatisticsDescription::getTypeName() const return "TDigest"; case StatisticsType::Uniq: return "Uniq"; + case StatisticsType::CMSketch: + return "CMSketch"; default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); } diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index 4862fb79d45..a2005f59de1 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -13,6 +13,7 @@ enum class StatisticsType : UInt8 { TDigest = 0, Uniq = 1, + CMSketch = 2, Max = 63, }; diff --git a/tests/queries/0_stateless/03174_statistics_cmsketch.reference b/tests/queries/0_stateless/03174_statistics_cmsketch.reference new file mode 100644 index 00000000000..ea77f317a31 --- /dev/null +++ b/tests/queries/0_stateless/03174_statistics_cmsketch.reference @@ -0,0 +1,26 @@ +CREATE TABLE default.t1\n(\n `a` String STATISTICS(cmsketch),\n `b` Int64 STATISTICS(cmsketch),\n `c` UInt64 STATISTICS(cmsketch),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(a, \'0\'_String), equals(b, 0), equals(c, 0)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(a, \'0\'_String), equals(c, 0), greater(b, 0)) (removed) +After drop statistics for a + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(b, 0), equals(c, 0), equals(a, \'0\'_String)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(c, 0), equals(a, \'0\'_String), greater(b, 0)) (removed) +LowCardinality + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(a, \'0\'_String), equals(b, 0), equals(c, 0)) (removed) +Nullable + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(a, \'0\'_String), equals(b, 0), equals(c, 0)) (removed) +LowCardinality(Nullable) + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(a, \'0\'_String), equals(b, 0), equals(c, 0)) (removed) diff --git a/tests/queries/0_stateless/03174_statistics_cmsketch.sql b/tests/queries/0_stateless/03174_statistics_cmsketch.sql new file mode 100644 index 00000000000..c45d6186fdf --- /dev/null +++ b/tests/queries/0_stateless/03174_statistics_cmsketch.sql @@ -0,0 +1,78 @@ +-- Tags: no-fasttest +DROP TABLE IF EXISTS t1; + +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; + +CREATE TABLE t1 +( + a String STATISTICS(cmsketch), + b Int64 STATISTICS(cmsketch), + c UInt64 STATISTICS(cmsketch), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; + +SHOW CREATE TABLE t1; + +INSERT INTO t1 select toString(number % 1000), number % 100, number % 10, generateUUIDv4() FROM system.numbers LIMIT 10000; + +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE c = 0 and b = 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE c = 0 and b > 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +ALTER TABLE t1 DROP STATISTICS a; + +SELECT 'After drop statistics for a'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE c = 0 and b = 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE c = 0 and b > 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +SET allow_suspicious_low_cardinality_types=1; +CREATE TABLE t2 +( + a LowCardinality(String) STATISTICS(cmsketch), + b Int64 STATISTICS(cmsketch), + c UInt64 STATISTICS(cmsketch), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t2 select toString(number % 1000), number % 100, number % 10, generateUUIDv4() FROM system.numbers LIMIT 10000; + +SELECT 'LowCardinality'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t2 WHERE c = 0 and b = 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + + +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t3; + +CREATE TABLE t3 +( + a Nullable(String) STATISTICS(cmsketch), + b Int64 STATISTICS(cmsketch), + c UInt64 STATISTICS(cmsketch), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t3 select toString(number % 1000), number % 100, number % 10, generateUUIDv4() FROM system.numbers LIMIT 10000; + +SELECT 'Nullable'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t3 WHERE c = 0 and b = 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +DROP TABLE IF EXISTS t3; +DROP TABLE IF EXISTS t4; + +CREATE TABLE t4 +( + a LowCardinality(Nullable(String)) STATISTICS(cmsketch), + b Int64 STATISTICS(cmsketch), + c UInt64 STATISTICS(cmsketch), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t4 select toString(number % 1000), number % 100, number % 10, generateUUIDv4() FROM system.numbers LIMIT 10000; + +SELECT 'LowCardinality(Nullable)'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t4 WHERE c = 0 and b = 0 and a = '0') WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; + +DROP TABLE IF EXISTS t4;