support estimate by t-digest

This commit is contained in:
Han Fei 2023-08-24 14:09:42 +02:00
parent ec07032173
commit c29261e5b4
8 changed files with 90 additions and 20 deletions

View File

@ -337,12 +337,39 @@ public:
/** Estimates how many of the digested values are less than `value`.
  *
  * Walks the centroids in order, tracking for each one the cumulative position
  * `sum + c.count * 0.5` (the count accumulated before the centroid plus half of its own count).
  *  - If the very first centroid already has mean >= value, returns 0.
  *  - If some later centroid has mean >= value, returns the result of `interpolate` between the
  *    previous centroid (prev_mean, left) and that centroid (c.mean, right), passed through
  *    `checkOverflow<Float64>`.
  *  - If no centroid has mean >= value, returns the digest's `count` member.
  *
  * NOTE(review): this relies on `centroids` being ordered by mean (the unit test calls compress()
  * before querying) — confirm that callers guarantee it.
  */
Float64 getCountLessThan(Float64 value) const
{
    bool first = true;
    Count sum = 0;
    Count prev_count = 0;
    Float64 prev_x = 0;
    Value prev_mean = 0;

    for (const auto & c : centroids)
    {
        /// Cumulative position of the middle of the current centroid.
        Float64 current_x = sum + c.count * 0.5;

        if (c.mean >= value)
        {
            /// value is not greater than the smallest centroid mean.
            if (first)
                return 0;

            /// A centroid with count == 1 has its interpolation endpoint shifted by 0.5
            /// towards the other centroid (presumably because a singleton holds an exact value — TODO confirm).
            Float64 left = prev_x + 0.5 * (prev_count == 1);
            Float64 right = current_x - 0.5 * (c.count == 1);

            return checkOverflow<Float64>(interpolate(
                static_cast<Value>(value),
                prev_mean,
                static_cast<Value>(left),
                c.mean,
                static_cast<Value>(right)));
        }

        sum += c.count;
        prev_mean = c.mean;
        prev_count = c.count;
        prev_x = current_x;
        first = false;
    }

    /// value is larger than every centroid mean.
    return count;
}
/** Calculates the quantile q [0, 1] based on the digest.

View File

@ -716,8 +716,7 @@ void MutationsInterpreter::prepare(bool dry_run)
if (it == std::cend(statistics_desc))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown statistic: {}", command.statistic_name);
for (const auto & column : it->column_names)
dependencies.emplace(column, ColumnDependency::STATISTIC);
dependencies.emplace(it->column_name, ColumnDependency::STATISTIC);
materialized_statistics.emplace(command.statistic_name);
}
else if (command.type == MutationCommand::MATERIALIZE_PROJECTION)

View File

@ -10,7 +10,6 @@
#include <Parsers/ASTSubquery.h>
#include <Parsers/formatAST.h>
#include <Interpreters/misc.h>
#include "Common/logger_useful.h"
#include <Common/typeid_cast.h>
#include <DataTypes/NestedUtils.h>
#include <Interpreters/ActionsDAG.h>

View File

@ -131,11 +131,6 @@ Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node)
/// Creator for the t-digest statistic: builds a TDigestStatistic from `stat`
/// and returns it wrapped in a StatisticPtr.
StatisticPtr TDigestCreator(const StatisticDescription & stat)
{
    /// TODO: check column data types.
    return StatisticPtr(new TDigestStatistic(stat));
}

View File

@ -45,7 +45,7 @@ public:
/// Returns the name of the column this statistic is built on,
/// as stored in the statistic description.
const String & columnName() const
{
    return statistics.column_name;
}
/// const String& type() const = 0;
/// virtual StatisticType statisticType() const = 0;
@ -93,7 +93,7 @@ public:
void update(const Block & block) override
{
const auto & column_with_type = block.getByName(statistics.column_names[0]);
const auto & column_with_type = block.getByName(statistics.column_name);
size_t size = block.rows();
for (size_t i = 0; i < size; ++i)

View File

@ -0,0 +1,44 @@
#include <gtest/gtest.h>
#include <Storages/Statistic/Statistic.h>
/// Checks QuantileTDigest::getCountLessThan against exact answers on a simple data set.
TEST(Statistic, TDigestLessThan)
{
    /// This is the simplest data: consecutive integers,
    /// so the estimation errors should be low.
    std::vector<Int64> data;
    data.reserve(100000);
    for (int i = 0; i < 100000; i++)
        data.push_back(i);

    /// Builds a digest from `data1`, then for every probe v[i] asserts that the estimate
    /// lies within the relative tolerance eps[i] of answers[i].
    auto test_less_than = [](const std::vector<Int64> & data1,
                             const std::vector<double> & v,
                             const std::vector<double> & answers,
                             const std::vector<double> & eps)
    {
        DB::QuantileTDigest<Int64> t_digest;

        for (const Int64 item : data1)
            t_digest.add(item);
        t_digest.compress();

        for (size_t i = 0; i < v.size(); ++i)
        {
            const auto value = v[i];
            const auto result = t_digest.getCountLessThan(value);
            const auto answer = answers[i];
            const auto error = eps[i];

            ASSERT_LE(result, answer * (1 + error));
            ASSERT_GE(result, answer * (1 - error));
        }
    };

    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});

    /// Same checks with the digest constructed from the data in reverse order.
    /// NOTE(review): the original comment said the error can be as bad as 5% in this case,
    /// yet the tolerances asserted below are 0.1% — confirm which is intended.
    std::reverse(data.begin(), data.end());
    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});
}

View File

@ -41,14 +41,18 @@ StatisticDescription StatisticDescription::getStatisticFromAST(const ASTPtr & de
stat.type = Poco::toLower(stat_definition->type->name);
ASTPtr expr_list = extractKeyExpressionList(stat_definition->columns->clone());
if (expr_list->children.size() != 1)
{
throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic must contain exactly one column");
}
for (const auto & ast : expr_list->children)
{
ASTIdentifier* ident = ast->as<ASTIdentifier>();
if (!ident || !columns.hasPhysical(ident->getColumnName()))
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column");
const auto & column = columns.get(ident->getColumnName());
stat.column_names.push_back(column.name);
stat.data_types.push_back(column.type);
stat.column_name = column.name;
stat.data_type = column.type;
}
UNUSED(context);
@ -60,7 +64,8 @@ StatisticDescription::StatisticDescription(const StatisticDescription & other)
: definition_ast(other.definition_ast ? other.definition_ast->clone() : nullptr)
, name(other.name)
, type(other.type)
, column_names(other.column_names)
, column_name(other.column_name)
, data_type(other.data_type)
{
}
@ -76,7 +81,8 @@ StatisticDescription & StatisticDescription::operator=(const StatisticDescriptio
name = other.name;
type = other.type;
column_names = other.column_names;
column_name = other.column_name;
data_type = other.data_type;
return *this;
}

View File

@ -18,10 +18,10 @@ struct StatisticDescription
String type;
/// Names of statistic columns
Names column_names;
String column_name;
/// Data types of statistic columns
DataTypes data_types;
DataTypePtr data_type;
static StatisticDescription getStatisticFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context);