support estimate by t-digest

2024-12-04 21:42:39 +00:00 · 2023-08-24 14:09:42 +02:00 · 2023-08-24 14:09:42 +02:00 · c29261e5b4
commit c29261e5b4
parent ec07032173
8 changed files with 90 additions and 20 deletions
--- a/src/AggregateFunctions/QuantileTDigest.h
+++ b/src/AggregateFunctions/QuantileTDigest.h
@ -337,12 +337,39 @@ public:

    Float64 getCountLessThan(Float64 value) const
    {
+        /// Estimates how many of the stored values are less than `value`.
+        /// The centroids are scanned in order while their counts are accumulated; once a
+        /// centroid whose mean is >= `value` is found, the result is interpolated between
+        /// the previous centroid and that one.
+        /// Returns 0 if already the first centroid satisfies mean >= value, and `count`
+        /// if no centroid does.
+        /// NOTE(review): relies on `centroids` being ordered by mean - presumably the
+        /// digest has been compressed by the caller beforehand; confirm.
+        bool first = true;
+        Count sum = 0;
+        Count prev_count = 0;
+        Float64 prev_x = 0;
+        Value prev_mean = 0;

-        ///Count sum = 0;
-        ///Value prev_mean = centroids.front().mean;
-        ///Count prev_count = centroids.front().count;
+        for (const auto & c : centroids)
+        {
+            /// Cumulative count taken at the middle of the current centroid.
+            Float64 current_x = sum + c.count * 0.5;
+            if (c.mean >= value)
+            {
+                /// value is not greater than the mean of the very first centroid.
+                if (first)
+                    return 0;

-        return value;
+                /// A centroid with count == 1 has its anchor moved by half a count
+                /// (the comparison converts to 0 or 1).
+                Float64 left = prev_x + 0.5 * (prev_count == 1);
+                Float64 right = current_x - 0.5 * (c.count == 1);
+                return checkOverflow<Float64>(interpolate(
+                    static_cast<Value>(value),
+                    prev_mean,
+                    static_cast<Value>(left),
+                    c.mean,
+                    static_cast<Value>(right)));
+            }
+            sum += c.count;
+            prev_mean = c.mean;
+            prev_count = c.count;
+            prev_x = current_x;
+            first = false;
+        }
+        /// value is larger than the mean of every centroid.
+        return count;
    }

    /** Calculates the quantile q [0, 1] based on the digest.
--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@ -716,8 +716,7 @@ void MutationsInterpreter::prepare(bool dry_run)
            if (it == std::cend(statistics_desc))
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown statistic: {}", command.statistic_name);

-            for (const auto & column : it->column_names)
-                dependencies.emplace(column, ColumnDependency::STATISTIC);
+            dependencies.emplace(it->column_name, ColumnDependency::STATISTIC);
            materialized_statistics.emplace(command.statistic_name);
        }
        else if (command.type == MutationCommand::MATERIALIZE_PROJECTION)
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
@ -10,7 +10,6 @@
 #include <Parsers/ASTSubquery.h>
 #include <Parsers/formatAST.h>
 #include <Interpreters/misc.h>
-#include "Common/logger_useful.h"
 #include <Common/typeid_cast.h>
 #include <DataTypes/NestedUtils.h>
 #include <Interpreters/ActionsDAG.h>
--- a/src/Storages/Statistic/Statistic.cpp
+++ b/src/Storages/Statistic/Statistic.cpp
@ -131,11 +131,6 @@ Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node)

 /// Builds a TDigestStatistic from `stat` and returns it wrapped in a StatisticPtr.
 StatisticPtr TDigestCreator(const StatisticDescription & stat)
 {
-    if (stat.column_names.size() != 1)
-    {
-        /// throw
-    }
-
    /// TODO: check column data types.
    return StatisticPtr(new TDigestStatistic(stat));
 }
--- a/src/Storages/Statistic/Statistic.h
+++ b/src/Storages/Statistic/Statistic.h
@ -45,7 +45,7 @@ public:

    /// Returns a reference to the column name held by `statistics`.
    const String & columnName() const
    {
-        return statistics.column_names[0];
+        return statistics.column_name;
    }
    /// const String& type() const = 0;
    /// virtual StatisticType statisticType() const = 0;
@ -93,7 +93,7 @@ public:

    void update(const Block & block) override
    {
-        const auto & column_with_type = block.getByName(statistics.column_names[0]);
+        const auto & column_with_type = block.getByName(statistics.column_name);
        size_t size = block.rows();

        for (size_t i = 0; i < size; ++i)
--- a/src/Storages/Statistic/tests/gtest_stats.cpp
+++ b/src/Storages/Statistic/tests/gtest_stats.cpp
@ -0,0 +1,44 @@
+#include <gtest/gtest.h>
+
+#include <Storages/Statistic/Statistic.h>
+
+TEST(Statistic, TDigestLessThan)
+{
+    /// The simplest possible data set: consecutive integers,
+    /// so the estimation error should be low.
+
+    std::vector<Int64> data;
+    data.reserve(100000);
+    for (Int64 i = 0; i < 100000; ++i)
+        data.push_back(i);
+
+    /// Builds a digest from `data1`, compresses it, and for every probe v[i] checks that
+    /// getCountLessThan(v[i]) lies within [answers[i] * (1 - eps[i]), answers[i] * (1 + eps[i])].
+    auto test_less_than = [](const std::vector<Int64> & data1,
+                             const std::vector<double> & v,
+                             const std::vector<double> & answers,
+                             const std::vector<double> & eps)
+    {
+        /// The three vectors are indexed in lockstep below, so they must have equal length.
+        ASSERT_EQ(v.size(), answers.size());
+        ASSERT_EQ(v.size(), eps.size());
+
+        DB::QuantileTDigest<Int64> t_digest;
+
+        for (const auto item : data1)
+            t_digest.add(item);
+        t_digest.compress();
+
+        for (size_t i = 0; i < v.size(); ++i)
+        {
+            auto value = v[i];
+            auto result = t_digest.getCountLessThan(value);
+            auto answer = answers[i];
+            auto error = eps[i];
+            ASSERT_LE(result, answer * (1 + error));
+            ASSERT_GE(result, answer * (1 - error));
+        }
+    };
+    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});
+
+    /// Feed the same values in descending order.
+    /// NOTE(review): this comment used to say the error gets as bad as 5% for reversed input,
+    /// yet the tolerances passed below are the same 0.1% as above - confirm which is intended.
+    std::reverse(data.begin(), data.end());
+    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});
+}
--- a/src/Storages/StatisticsDescription.cpp
+++ b/src/Storages/StatisticsDescription.cpp
@ -41,14 +41,18 @@ StatisticDescription StatisticDescription::getStatisticFromAST(const ASTPtr & de
    stat.type = Poco::toLower(stat_definition->type->name);

    ASTPtr expr_list = extractKeyExpressionList(stat_definition->columns->clone());
+    if (expr_list->children.size() != 1)
+    {
+        throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic must contain exactly one column");
+    }
    for (const auto & ast : expr_list->children)
    {
        ASTIdentifier* ident = ast->as<ASTIdentifier>();
        if (!ident || !columns.hasPhysical(ident->getColumnName()))
            throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column");
        const auto & column = columns.get(ident->getColumnName());
-        stat.column_names.push_back(column.name);
-        stat.data_types.push_back(column.type);
+        stat.column_name = column.name;
+        stat.data_type = column.type;
    }

    UNUSED(context);
@ -60,7 +64,8 @@ StatisticDescription::StatisticDescription(const StatisticDescription & other)
    : definition_ast(other.definition_ast ? other.definition_ast->clone() : nullptr)
    , name(other.name)
    , type(other.type)
-    , column_names(other.column_names)
+    , column_name(other.column_name)
+    , data_type(other.data_type)
 {
 }

@ -76,7 +81,8 @@ StatisticDescription & StatisticDescription::operator=(const StatisticDescriptio

    name = other.name;
    type = other.type;
-    column_names = other.column_names;
+    column_name = other.column_name;
+    data_type = other.data_type;

    return *this;
 }
--- a/src/Storages/StatisticsDescription.h
+++ b/src/Storages/StatisticsDescription.h
@ -18,10 +18,10 @@ struct StatisticDescription
    String type;

    /// Name of the statistic column
-    Names column_names;
+    String column_name;

    /// Data type of the statistic column
-    DataTypes data_types;
+    DataTypePtr data_type;

    static StatisticDescription getStatisticFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context);