mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-04 21:42:39 +00:00
support estimate by t-digest
This commit is contained in:
parent
ec07032173
commit
c29261e5b4
@ -337,12 +337,39 @@ public:
|
||||
|
||||
/** Estimates how many of the values added to the digest are less than `value`.
  *
  * Walks the centroids in their stored order and stops at the first centroid whose
  * mean is >= `value`. The result is interpolated between the cumulative-count position
  * of the previous centroid and the position of that centroid.
  * NOTE(review): this assumes the centroids are ordered by ascending mean, i.e. the digest
  * was compressed before the call - confirm against callers.
  *
  * Returns 0 when `value` is not greater than the first centroid's mean, and the digest's
  * `count` when every centroid's mean is below `value` (this includes an empty digest).
  */
Float64 getCountLessThan(Float64 value) const
{
    bool first = true;
    Count sum = 0;          /// Sum of the counts of the centroids already passed.
    Count prev_count = 0;
    Float64 prev_x = 0;     /// Cumulative-count position assigned to the previous centroid.
    Value prev_mean = 0;

    for (const auto & c : centroids)
    {
        /// The centroid's mean is positioned in the middle of the counts it holds.
        const Float64 current_x = sum + c.count * 0.5;

        if (c.mean >= value)
        {
            /// value is not greater than the smallest centroid mean.
            if (first)
                return 0;

            /// For a centroid with count == 1 the interpolation endpoint is moved
            /// by half a count towards the other centroid.
            const Float64 left = prev_x + 0.5 * (prev_count == 1);
            const Float64 right = current_x - 0.5 * (c.count == 1);

            return checkOverflow<Float64>(interpolate(
                static_cast<Value>(value),
                prev_mean,
                static_cast<Value>(left),
                c.mean,
                static_cast<Value>(right)));
        }

        sum += c.count;
        prev_mean = c.mean;
        prev_count = c.count;
        prev_x = current_x;
        first = false;
    }

    /// value is larger than every centroid mean.
    return count;
}
|
||||
|
||||
/** Calculates the quantile q [0, 1] based on the digest.
|
||||
|
@ -716,8 +716,7 @@ void MutationsInterpreter::prepare(bool dry_run)
|
||||
if (it == std::cend(statistics_desc))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown statistic: {}", command.statistic_name);
|
||||
|
||||
for (const auto & column : it->column_names)
|
||||
dependencies.emplace(column, ColumnDependency::STATISTIC);
|
||||
dependencies.emplace(it->column_name, ColumnDependency::STATISTIC);
|
||||
materialized_statistics.emplace(command.statistic_name);
|
||||
}
|
||||
else if (command.type == MutationCommand::MATERIALIZE_PROJECTION)
|
||||
|
@ -10,7 +10,6 @@
|
||||
#include <Parsers/ASTSubquery.h>
|
||||
#include <Parsers/formatAST.h>
|
||||
#include <Interpreters/misc.h>
|
||||
#include "Common/logger_useful.h"
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Interpreters/ActionsDAG.h>
|
||||
|
@ -131,11 +131,6 @@ Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node)
|
||||
|
||||
/// Creates a t-digest statistic object from its description.
/// No column-count check is done here: the previous check had an empty body and did nothing.
/// NOTE(review): a description is expected to reference exactly one column; that appears to be
/// validated when the description is built from the AST - confirm no other construction path exists.
StatisticPtr TDigestCreator(const StatisticDescription & stat)
{
    /// TODO: check column data types.
    return StatisticPtr(new TDigestStatistic(stat));
}
|
||||
|
@ -45,7 +45,7 @@ public:
|
||||
|
||||
/// Returns the name of the single column this statistic is built on.
const String & columnName() const
{
    return statistics.column_name;
}
|
||||
/// const String& type() const = 0;
|
||||
/// virtual StatisticType statisticType() const = 0;
|
||||
@ -93,7 +93,7 @@ public:
|
||||
|
||||
void update(const Block & block) override
|
||||
{
|
||||
const auto & column_with_type = block.getByName(statistics.column_names[0]);
|
||||
const auto & column_with_type = block.getByName(statistics.column_name);
|
||||
size_t size = block.rows();
|
||||
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
|
44
src/Storages/Statistic/tests/gtest_stats.cpp
Normal file
44
src/Storages/Statistic/tests/gtest_stats.cpp
Normal file
@ -0,0 +1,44 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Storages/Statistic/Statistic.h>
|
||||
|
||||
/// Checks QuantileTDigest::getCountLessThan against exact answers on a dense integer data set.
TEST(Statistic, TDigestLessThan)
{
    /// This is the simplest data: consecutive integers,
    /// so the estimation errors should be low.

    std::vector<Int64> data;
    data.reserve(100000);
    for (int i = 0; i < 100000; i++)
        data.push_back(i);

    /// Builds a digest from `data1` and, for every probe v[i], asserts that the estimate lies
    /// within the relative tolerance eps[i] of answers[i].
    auto test_less_than = [](const std::vector<Int64> & data1,
                             const std::vector<double> & v,
                             const std::vector<double> & answers,
                             const std::vector<double> & eps)
    {
        /// Every probe needs its own expected answer and tolerance; otherwise the loop below
        /// would read past the end of the shorter vector.
        ASSERT_EQ(v.size(), answers.size());
        ASSERT_EQ(v.size(), eps.size());

        DB::QuantileTDigest<Int64> t_digest;

        for (const Int64 element : data1)
            t_digest.add(element);
        t_digest.compress();

        for (size_t i = 0; i < v.size(); ++i)
        {
            auto value = v[i];
            auto result = t_digest.getCountLessThan(value);
            auto answer = answers[i];
            auto error = eps[i];
            ASSERT_LE(result, answer * (1 + error));
            ASSERT_GE(result, answer * (1 - error));
        }
    };

    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});

    /// The same data inserted in reverse order. The original note states that the error can be
    /// as bad as 5% in this case.
    /// NOTE(review): the tolerances passed below are still 0.1% - confirm which one is intended.
    std::reverse(data.begin(), data.end());
    test_less_than(data, {-1, 1e9, 50000.0, 3000.0, 30.0}, {0, 100000, 50000, 3000, 30}, {0, 0, 0.001, 0.001, 0.001});
}
|
@ -41,14 +41,18 @@ StatisticDescription StatisticDescription::getStatisticFromAST(const ASTPtr & de
|
||||
stat.type = Poco::toLower(stat_definition->type->name);
|
||||
|
||||
ASTPtr expr_list = extractKeyExpressionList(stat_definition->columns->clone());
|
||||
if (expr_list->children.size() != 1)
|
||||
{
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic must contain exactly one column");
|
||||
}
|
||||
for (const auto & ast : expr_list->children)
|
||||
{
|
||||
ASTIdentifier* ident = ast->as<ASTIdentifier>();
|
||||
if (!ident || !columns.hasPhysical(ident->getColumnName()))
|
||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column");
|
||||
const auto & column = columns.get(ident->getColumnName());
|
||||
stat.column_names.push_back(column.name);
|
||||
stat.data_types.push_back(column.type);
|
||||
stat.column_name = column.name;
|
||||
stat.data_type = column.type;
|
||||
}
|
||||
|
||||
UNUSED(context);
|
||||
@ -60,7 +64,8 @@ StatisticDescription::StatisticDescription(const StatisticDescription & other)
|
||||
: definition_ast(other.definition_ast ? other.definition_ast->clone() : nullptr)
|
||||
, name(other.name)
|
||||
, type(other.type)
|
||||
, column_names(other.column_names)
|
||||
, column_name(other.column_name)
|
||||
, data_type(other.data_type)
|
||||
{
|
||||
}
|
||||
|
||||
@ -76,7 +81,8 @@ StatisticDescription & StatisticDescription::operator=(const StatisticDescriptio
|
||||
|
||||
name = other.name;
|
||||
type = other.type;
|
||||
column_names = other.column_names;
|
||||
column_name = other.column_name;
|
||||
data_type = other.data_type;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
@ -18,10 +18,10 @@ struct StatisticDescription
|
||||
String type;
|
||||
|
||||
/// Names of statistic columns
|
||||
Names column_names;
|
||||
String column_name;
|
||||
|
||||
/// Data types of statistic columns
|
||||
DataTypes data_types;
|
||||
DataTypePtr data_type;
|
||||
|
||||
static StatisticDescription getStatisticFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user