Uniq statistics: support more data types and refactor the estimation logic used when there are no statistics.

JackyWoo 2024-07-24 17:43:27 +08:00
parent 4712b79960
commit 40fc0ca574
11 changed files with 94 additions and 89 deletions

View File

@@ -19,7 +19,7 @@ void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String par
 Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(const Field & val, Float64 rows) const
 {
     if (part_statistics.empty())
-        return default_normal_cond_factor * rows;
+        return default_cond_range_factor * rows;
     Float64 result = 0;
     Float64 part_rows = 0;
     for (const auto & [key, estimator] : part_statistics)
@@ -38,15 +38,8 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreat
 Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(const Field & val, Float64 rows) const
 {
     if (part_statistics.empty())
-    {
-        auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-        if (!float_val)
-            return default_unknown_cond_factor * rows;
-        else if (float_val.value() < - threshold || float_val.value() > threshold)
-            return default_normal_cond_factor * rows;
-        else
-            return default_good_cond_factor * rows;
-    }
+        return default_cond_equal_factor * rows;
     Float64 result = 0;
     Float64 partial_cnt = 0;
     for (const auto & [key, estimator] : part_statistics)
@@ -149,30 +142,22 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode
     auto [op, val] = extractBinaryOp(node, col);
+
+    if (dummy)
+    {
+        if (op == "equals")
+            return default_cond_equal_factor * total_rows;
+        else if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals")
+            return default_cond_range_factor * total_rows;
+        else
+            return default_unknown_cond_factor * total_rows;
+    }
+
     if (op == "equals")
-    {
-        if (dummy)
-        {
-            auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-            if (!float_val || (float_val < - threshold || float_val > threshold))
-                return default_normal_cond_factor * total_rows;
-            else
-                return default_good_cond_factor * total_rows;
-        }
         return estimator.estimateEqual(val, total_rows);
-    }
     else if (op == "less" || op == "lessOrEquals")
-    {
-        if (dummy)
-            return default_normal_cond_factor * total_rows;
         return estimator.estimateLess(val, total_rows);
-    }
     else if (op == "greater" || op == "greaterOrEquals")
-    {
-        if (dummy)
-            return default_normal_cond_factor * total_rows;
         return estimator.estimateGreater(val, total_rows);
-    }
     else
         return default_unknown_cond_factor * total_rows;
 }

View File

@@ -38,12 +38,10 @@ private:
     std::pair<String, Field> extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const;
 
-    static constexpr auto default_good_cond_factor = 0.1;
-    static constexpr auto default_normal_cond_factor = 0.5;
-    static constexpr auto default_unknown_cond_factor = 1.0;
-
-    /// Conditions like "x = N" are considered good if abs(N) > threshold.
-    /// This is used to assume that condition is likely to have good selectivity.
-    static constexpr auto threshold = 2;
+    /// Used to estimate the selectivity of a condition when there is no statistics.
+    static constexpr auto default_cond_range_factor = 0.5;
+    static constexpr auto default_cond_equal_factor = 0.01;
+    static constexpr auto default_unknown_cond_factor = 1;
 
     UInt64 total_rows = 0;
     std::map<String, ColumnSelectivityEstimator> column_estimators;
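
Taken together with the .cpp change above, the estimator no longer inspects the literal's magnitude when a column has no collected statistics; the fallback now depends only on the comparison operator. A minimal self-contained sketch of that behaviour (the factor values mirror the new constants above; the function and names are illustrative, not the ClickHouse API):

```cpp
#include <iostream>
#include <string>

// Illustrative sketch only: mirrors the new "no statistics" fallback.
// The factor values come from the header above; everything else is made up.
namespace
{
    constexpr double cond_equal_factor = 0.01;  // "x = N"       -> assume ~1% of rows match
    constexpr double cond_range_factor = 0.5;   // "x < N" etc.  -> assume half of the rows match
    constexpr double unknown_cond_factor = 1.0; // anything else -> assume no filtering

    double estimateWithoutStatistics(const std::string & op, double total_rows)
    {
        if (op == "equals")
            return cond_equal_factor * total_rows;
        if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals")
            return cond_range_factor * total_rows;
        return unknown_cond_factor * total_rows;
    }
}

int main()
{
    std::cout << estimateWithoutStatistics("equals", 1000000) << '\n'; // 10000
    std::cout << estimateWithoutStatistics("less", 1000000) << '\n';   // 500000
    std::cout << estimateWithoutStatistics("notIn", 1000000) << '\n';  // 1e+06
}
```

Compared to the removed threshold-based guess, the same predicate now receives the same fallback estimate regardless of the constant it compares against.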

View File

@@ -51,13 +51,6 @@ std::optional<Float64> StatisticsUtils::tryConvertToFloat64(const Field & field)
     }
 }
 
-std::optional<String> StatisticsUtils::tryConvertToString(const DB::Field & field)
-{
-    if (field.getType() == Field::Types::String)
-        return field.get<String>();
-    return {};
-}
-
 IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
     : stat(stat_)
 {
@@ -106,7 +99,7 @@ Float64 ColumnStatistics::estimateLess(const Field & val) const
         return stats.at(StatisticsType::TDigest)->estimateLess(val);
     if (stats.contains(StatisticsType::MinMax))
         return stats.at(StatisticsType::MinMax)->estimateLess(val);
-    return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
+    return rows * ConditionSelectivityEstimator::default_cond_range_factor;
 }
 
 Float64 ColumnStatistics::estimateGreater(const Field & val) const
@@ -116,8 +109,7 @@ Float64 ColumnStatistics::estimateGreater(const Field & val) const
 
 Float64 ColumnStatistics::estimateEqual(const Field & val) const
 {
-    auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-    if (float_val.has_value() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
+    if (stats_desc.data_type->isValueRepresentedByNumber() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
     {
         /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket.
         if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048)
@@ -127,10 +119,16 @@ Float64 ColumnStatistics::estimateEqual(const Field & val) const
     if (stats.contains(StatisticsType::CountMinSketch))
         return stats.at(StatisticsType::CountMinSketch)->estimateEqual(val);
 #endif
-    if (!float_val.has_value() && (float_val < - ConditionSelectivityEstimator::threshold || float_val > ConditionSelectivityEstimator::threshold))
-        return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
-    else
-        return rows * ConditionSelectivityEstimator::default_good_cond_factor;
+    if (stats.contains(StatisticsType::Uniq))
+    {
+        auto cardinality = stats.at(StatisticsType::Uniq)->estimateCardinality();
+        if (cardinality == 0)
+            return 0;
+        /// Assume that the value is uniformly distributed among the unique values.
+        return static_cast<Float64>(rows) / cardinality;
+    }
+    return rows * ConditionSelectivityEstimator::default_cond_equal_factor;
 }
 
 /// -------------------------------------
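
The comment in the new branch above is the whole model: if only a uniq statistic can answer the query, every distinct value is assumed to cover an equal share of the rows, so `col = v` is estimated as the row count divided by the estimated cardinality. A tiny stand-alone illustration (function name and numbers are invented):

```cpp
#include <cstdint>
#include <iostream>

// Uniform-distribution assumption: each of the `distinct_values` values
// is assumed to appear in the same number of rows.
double estimateEqualRows(double rows, uint64_t distinct_values)
{
    if (distinct_values == 0)
        return 0;
    return rows / static_cast<double>(distinct_values);
}

int main()
{
    // 1e6 rows and a uniq estimate of 2500 distinct values:
    // "col = v" is expected to match about 400 rows.
    std::cout << estimateEqualRows(1000000, 2500) << '\n'; // 400
}
```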

View File

@@ -19,7 +19,6 @@ struct StatisticsUtils
 {
     /// Returns std::nullopt if input Field cannot be converted to a concrete value
     static std::optional<Float64> tryConvertToFloat64(const Field & field);
-    static std::optional<String> tryConvertToString(const Field & field);
 };
 
 /// Statistics describe properties of the values in the column,
@@ -32,11 +31,6 @@ public:
     explicit IStatistics(const SingleStatisticsDescription & stat_);
     virtual ~IStatistics() = default;
 
-    virtual void update(const ColumnPtr & column) = 0;
-
-    virtual void serialize(WriteBuffer & buf) = 0;
-    virtual void deserialize(ReadBuffer & buf) = 0;
-
     /// Estimate the cardinality of the column.
     /// Throws if the statistics object is not able to do a meaningful estimation.
     virtual UInt64 estimateCardinality() const;
@@ -46,6 +40,11 @@ public:
     virtual Float64 estimateEqual(const Field & val) const; /// cardinality of val in the column
     virtual Float64 estimateLess(const Field & val) const;  /// summarized cardinality of values < val in the column
 
+    virtual void update(const ColumnPtr & column) = 0;
+
+    virtual void serialize(WriteBuffer & buf) = 0;
+    virtual void deserialize(ReadBuffer & buf) = 0;
+
 protected:
     SingleStatisticsDescription stat;
 };

View File

@@ -32,18 +32,18 @@ Float64 StatisticsMinMax::estimateLess(const Field & val) const
     if (val_converted.isNull())
         return 0;
 
-    auto val_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
 
-    if (val_float < min)
+    if (val_as_float < min)
         return 0;
 
-    if (val_float > max)
+    if (val_as_float > max)
         return row_count;
 
     if (max == min)
         return row_count;
 
-    return ((val_float - min) / (max - min)) * row_count;
+    return ((val_as_float - min) / (max - min)) * row_count;
 }
 
 void StatisticsMinMax::update(const ColumnPtr & column)
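
For reference, the estimate that the renamed variable feeds into is a plain linear interpolation between the column's minimum and maximum, clamped at both ends. A self-contained sketch of that formula (names and numbers are illustrative):

```cpp
#include <iostream>

// Linear interpolation used by a min/max statistic: the fraction of rows
// below `val` is approximated by val's relative position inside [min, max].
double estimateLessRows(double val, double min, double max, double row_count)
{
    if (val < min)
        return 0;
    if (val > max || min == max)
        return row_count;
    return (val - min) / (max - min) * row_count;
}

int main()
{
    // Column spans [0, 100] over 10000 rows: "x < 25" is estimated at ~2500 rows.
    std::cout << estimateLessRows(25, 0, 100, 10000) << '\n'; // 2500
}
```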

View File

@@ -1,6 +1,8 @@
 #include <Storages/Statistics/StatisticsTDigest.h>
 #include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <Interpreters/convertFieldToType.h>
+#include <Common/FieldVisitorConvertToNumber.h>
 
 namespace DB
 {
@@ -10,24 +12,20 @@ extern const int ILLEGAL_STATISTICS;
-extern const int LOGICAL_ERROR;
 }
 
-StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
-    : IStatistics(stat_)
+StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_)
+    : IStatistics(stat_), data_type(data_type_)
 {
 }
 
 void StatisticsTDigest::update(const ColumnPtr & column)
 {
-    size_t rows = column->size();
-    for (size_t row = 0; row < rows; ++row)
+    for (size_t row = 0; row < column->size(); ++row)
     {
-        Field field;
-        column->get(row, field);
-        if (field.isNull())
+        if (column->isNullAt(row))
             continue;
-        if (auto field_as_float = StatisticsUtils::tryConvertToFloat64(field))
-            t_digest.add(*field_as_float, 1);
+        auto data = column->getFloat64(row);
+        t_digest.add(data, 1);
     }
 }
@@ -43,18 +41,22 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf)
 
 Float64 StatisticsTDigest::estimateLess(const Field & val) const
 {
-    auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
-    if (val_as_float)
-        return t_digest.getCountLessThan(*val_as_float);
-    throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
+    Field val_converted = convertFieldToType(val, *data_type);
+    if (val_converted.isNull())
+        return 0;
+
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    return t_digest.getCountLessThan(val_as_float);
 }
 
 Float64 StatisticsTDigest::estimateEqual(const Field & val) const
 {
-    auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
-    if (val_as_float)
-        return t_digest.getCountEqual(*val_as_float);
-    throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
+    Field val_converted = convertFieldToType(val, *data_type);
+    if (val_converted.isNull())
+        return 0;
+
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    return t_digest.getCountEqual(val_as_float);
 }
 
 void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
@@ -65,9 +67,9 @@ void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type
         throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName());
 }
 
-StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr)
+StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type)
 {
-    return std::make_shared<StatisticsTDigest>(stat);
+    return std::make_shared<StatisticsTDigest>(stat, data_type);
 }
 
 }
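
The behavioural change above: instead of coercing the query constant straight to Float64 and throwing LOGICAL_ERROR when that fails, the constant is first converted to the column's own data type, and an inconvertible constant now simply yields an estimate of 0. A minimal stand-in for that control flow (the conversion helper below is hypothetical and only pretends the column is Float64; the real code path uses convertFieldToType):

```cpp
#include <iostream>
#include <optional>
#include <string>

// Stand-in for "convert the query constant to the column's type first".
std::optional<double> convertToColumnType(const std::string & literal)
{
    try
    {
        size_t pos = 0;
        double value = std::stod(literal, &pos);
        if (pos != literal.size())
            return std::nullopt; // trailing junk -> not representable
        return value;
    }
    catch (...)
    {
        return std::nullopt; // e.g. WHERE float_col < 'abc'
    }
}

// If the constant cannot be represented in the column type, the predicate
// cannot match, so the estimate is 0 instead of an exception.
double estimateLess(const std::string & literal, double rows_below_if_convertible)
{
    auto converted = convertToColumnType(literal);
    if (!converted)
        return 0;
    return rows_below_if_convertible; // a real implementation would query the t-digest here
}

int main()
{
    std::cout << estimateLess("42.5", 1234) << '\n'; // 1234
    std::cout << estimateLess("abc", 1234) << '\n';  // 0
}
```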

View File

@@ -9,18 +9,19 @@ namespace DB
 class StatisticsTDigest : public IStatistics
 {
 public:
-    explicit StatisticsTDigest(const SingleStatisticsDescription & stat_);
+    explicit StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_);
 
+    Float64 estimateLess(const Field & val) const override;
+    Float64 estimateEqual(const Field & val) const override;
+
     void update(const ColumnPtr & column) override;
 
     void serialize(WriteBuffer & buf) override;
     void deserialize(ReadBuffer & buf) override;
 
-    Float64 estimateLess(const Field & val) const override;
-    Float64 estimateEqual(const Field & val) const override;
-
 private:
     QuantileTDigest<Float64> t_digest;
+    DataTypePtr data_type;
 };
 
 void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type);

View File

@@ -56,7 +56,7 @@ void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
 {
     data_type = removeNullable(data_type);
     data_type = removeLowCardinalityAndNullable(data_type);
-    if (!data_type->isValueRepresentedByNumber())
+    if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type))
         throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName());
 }

View File

@@ -13,13 +13,13 @@ public:
     StatisticsUniq(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type);
     ~StatisticsUniq() override;
 
+    UInt64 estimateCardinality() const override;
+
     void update(const ColumnPtr & column) override;
 
     void serialize(WriteBuffer & buf) override;
     void deserialize(ReadBuffer & buf) override;
 
-    UInt64 estimateCardinality() const override;
-
 private:
     std::unique_ptr<Arena> arena;
     AggregateFunctionPtr collector;

View File

@@ -14,12 +14,19 @@ Test statistics multi-types:
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(equals(a, \'10000\'), equals(b, 0), less(c, 0)) (removed)
-Test statistics min_max and tdigest:
+Test estimating range condition:
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
+Test estimating equals condition:
+Prewhere info
+Prewhere filter
+Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
+Prewhere info
+Prewhere filter
+Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
 Test LowCardinality and Nullable data type:
 tab2

View File

@@ -68,7 +68,7 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
 
 ALTER TABLE tab DROP STATISTICS a, b, c, d;
 
-SELECT 'Test statistics min_max and tdigest:';
+SELECT 'Test estimating range condition:';
 
 ALTER TABLE tab ADD STATISTICS b TYPE min_max;
 ALTER TABLE tab MATERIALIZE STATISTICS b;
@@ -84,6 +84,21 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
 
 ALTER TABLE tab DROP STATISTICS b;
 
+SELECT 'Test estimating equals condition:';
+ALTER TABLE tab ADD STATISTICS a TYPE uniq;
+ALTER TABLE tab MATERIALIZE STATISTICS a;
+SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
+FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
+WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
+
+ALTER TABLE tab ADD STATISTICS a TYPE count_min;
+ALTER TABLE tab MATERIALIZE STATISTICS a;
+SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
+FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
+WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
+ALTER TABLE tab DROP STATISTICS a;
+
 DROP TABLE IF EXISTS tab SYNC;
 
@@ -93,8 +108,8 @@ SET allow_suspicious_low_cardinality_types=1;
 CREATE TABLE tab2
 (
     a LowCardinality(Int64) STATISTICS(count_min),
-    b Nullable(Int64) STATISTICS(count_min),
-    c LowCardinality(Nullable(Int64)) STATISTICS(count_min),
+    b Nullable(Int64) STATISTICS(min_max, count_min),
+    c LowCardinality(Nullable(Int64)) STATISTICS(min_max, count_min),
     pk String,
 ) Engine = MergeTree() ORDER BY pk;