update docs and refine statements

This commit is contained in:
Han Fei 2023-09-08 02:27:17 +02:00
parent f60dad0598
commit ddcb64f39f
17 changed files with 209 additions and 161 deletions

View File

@ -44,7 +44,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
INDEX index_name2 expr2 TYPE type2(...) [GRANULARITY value2],
...
PROJECTION projection_name_1 (SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY]),
PROJECTION projection_name_2 (SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY])
PROJECTION projection_name_2 (SELECT <COLUMN LIST EXPR> [GROUP BY] [ORDER BY]),
...
STATISTIC <COLUMN LIST> TYPE type1,
STATISTIC <COLUMN LIST> TYPE type2
) ENGINE = MergeTree()
ORDER BY expr
[PARTITION BY expr]
@ -1353,3 +1356,22 @@ In this sample configuration:
- `_part_uuid` — Unique part identifier (if enabled MergeTree setting `assign_part_uuids`).
- `_partition_value` — Values (a tuple) of a `partition by` expression.
- `_sample_factor` — Sample factor (from the query).
## Column Statistics (Experimental) {#column-statistics}
Statistics are declared in the columns section of the `CREATE` query.
``` sql
STATISTIC <list of columns> TYPE type
```
Statistics can be specified for tables from the `*MergeTree` family.
These lightweight statistics aggregate information about the distribution of values in columns.
They can be used for query optimization (currently, they are used for moving expressions to PREWHERE).
#### Available Types of Column Statistics {#available-types-of-column-statistics}
- `tdigest`
Stores the distribution of values from numeric columns in a [TDigest](https://github.com/tdunning/t-digest) sketch.

View File

@ -459,8 +459,8 @@ ASTPtr InterpreterCreateQuery::formatStatistics(const StatisticsDescriptions & s
{
auto res = std::make_shared<ASTExpressionList>();
for (const auto & statistic : statistics)
res->children.push_back(statistic.definition_ast->clone());
for (const auto & definition_ast : statistics.definition_asts)
res->children.push_back(definition_ast->clone());
return res;
}
@ -721,8 +721,10 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
}
if (create.columns_list->stats)
for (const auto & statistic : create.columns_list->stats->children)
properties.stats.push_back(
StatisticDescription::getStatisticFromAST(statistic->clone(), properties.columns, getContext()));
{
auto stats = StatisticsDescriptions::getStatisticsFromAST(statistic->clone(), properties.columns, getContext());
properties.stats.merge(stats);
}
if (create.columns_list->projections)
for (const auto & projection_ast : create.columns_list->projections->children)

View File

@ -724,17 +724,20 @@ void MutationsInterpreter::prepare(bool dry_run)
else if (command.type == MutationCommand::MATERIALIZE_STATISTIC)
{
mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION);
auto it = std::find_if(
std::cbegin(statistics_desc), std::end(statistics_desc),
[&](const StatisticDescription & statistic)
{
return statistic.column_name == command.statistic_column_name;
});
if (it == std::cend(statistics_desc))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown statistic column: {}", command.statistic_column_name);
for (const auto & stat_column_name: command.statistic_columns)
{
auto it = std::find_if(
std::cbegin(statistics_desc), std::end(statistics_desc),
[&](const StatisticDescription & statistic)
{
return statistic.column_name == stat_column_name;
});
if (it == std::cend(statistics_desc))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown statistic column: {}", stat_column_name);
dependencies.emplace(it->column_name, ColumnDependency::STATISTIC);
materialized_statistics.emplace(command.statistic_column_name);
dependencies.emplace(it->column_name, ColumnDependency::STATISTIC);
materialized_statistics.emplace(stat_column_name);
}
}
else if (command.type == MutationCommand::MATERIALIZE_PROJECTION)
{
@ -755,7 +758,8 @@ void MutationsInterpreter::prepare(bool dry_run)
else if (command.type == MutationCommand::DROP_STATISTIC)
{
mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION);
materialized_statistics.erase(command.statistic_column_name);
for (const auto & stat_column_name: command.statistic_columns)
materialized_statistics.erase(stat_column_name);
}
else if (command.type == MutationCommand::DROP_PROJECTION)
{

View File

@ -1,4 +1,5 @@
#include <Parsers/ASTStatisticDeclaration.h>
#include <Parsers/ASTIdentifier.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
@ -12,17 +13,27 @@ ASTPtr ASTStatisticDeclaration::clone() const
{
auto res = std::make_shared<ASTStatisticDeclaration>();
res->column_name = column_name;
res->set(res->columns, columns->clone());
res->type = type;
return res;
}
void ASTStatisticDeclaration::formatImpl(const FormatSettings & s, FormatState &, FormatStateStacked) const
std::vector<String> ASTStatisticDeclaration::getColumnNames() const
{
s.ostr << backQuoteIfNeed(column_name);
s.ostr << " ";
std::vector<String> result;
result.reserve(columns->children.size());
for (const ASTPtr & column_ast : columns->children)
{
result.push_back(column_ast->as<ASTIdentifier &>().name());
}
return result;
}
void ASTStatisticDeclaration::formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const
{
columns->formatImpl(s, state, frame);
s.ostr << (s.hilite ? hilite_keyword : "") << " TYPE " << (s.hilite ? hilite_none : "");
s.ostr << backQuoteIfNeed(type);
}

View File

@ -12,13 +12,15 @@ class ASTFunction;
class ASTStatisticDeclaration : public IAST
{
public:
String column_name;
/// We do not support to set bucket number for tdigest
IAST * columns;
/// TODO type should be a list of ASTFunction, for example, 'tdigest(256), hyperloglog(128)', etc.
String type;
/** Get the text that identifies this element. */
String getID(char) const override { return "Stat"; }
std::vector<String> getColumnNames() const;
ASTPtr clone() const override;
void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override;
};

View File

@ -165,14 +165,13 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected &
{
ParserKeyword s_type("TYPE");
ParserIdentifier name_p;
ParserList columns_p(std::make_unique<ParserIdentifier>(), std::make_unique<ParserToken>(TokenType::Comma), false);
ParserIdentifier type_p;
ASTPtr name;
ASTPtr column;
ASTPtr columns;
ASTPtr type;
if (!name_p.parse(pos, name, expected))
if (!columns_p.parse(pos, columns, expected))
return false;
if (!s_type.ignore(pos, expected))
@ -182,7 +181,7 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected &
return false;
auto stat = std::make_shared<ASTStatisticDeclaration>();
stat->column_name = name->as<ASTIdentifier &>().name();
stat->set(stat->columns, columns);
stat->type = type->as<ASTIdentifier &>().name();
node = stat;

View File

@ -242,9 +242,8 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_
const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticDeclaration &>();
command.statistic_column_name = ast_stat_decl.column_name;
command.statistic_columns = ast_stat_decl.getColumnNames();
command.statistic_type = ast_stat_decl.type;
command.if_not_exists = command_ast->if_not_exists;
return command;
@ -315,7 +314,7 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_
command.type = AlterCommand::DROP_STATISTIC;
const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticDeclaration &>();
command.statistic_column_name = ast_stat_decl.column_name;
command.statistic_columns = ast_stat_decl.getColumnNames();
command.statistic_type = ast_stat_decl.type;
command.if_exists = command_ast->if_exists;
command.clear = command_ast->clear_statistic;
@ -589,45 +588,47 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context)
}
else if (type == ADD_STATISTIC)
{
if (!if_not_exists && std::any_of(
metadata.statistics.cbegin(),
metadata.statistics.cend(),
[this](const auto & statistic)
{
return statistic.column_name == statistic_column_name && statistic.type == statistic_type;
}))
/// TODO: Right now we assume there is only one type of statistics for simple implement.
for (const auto & statistic_column_name : statistic_columns)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot add statistic {} with type {}: statistic on this column with this type already exists", statistic_column_name, statistic_type);
if (!if_not_exists && std::any_of(
metadata.statistics.cbegin(),
metadata.statistics.cend(),
[&](const auto & statistic)
{
return statistic.column_name == statistic_column_name;
}))
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot add statistic {} with type {}: statistic on this column with this type already exists", statistic_column_name, statistic_type);
}
}
auto insert_it = metadata.statistics.end();
/// insert the index in the beginning of the indices list
if (first)
insert_it = metadata.statistics.begin();
metadata.statistics.emplace(insert_it, StatisticDescription::getStatisticFromAST(statistic_decl, metadata.columns, context));
auto stats = StatisticsDescriptions::getStatisticsFromAST(statistic_decl, metadata.columns, context);
metadata.statistics.merge(stats);
}
else if (type == DROP_STATISTIC)
{
if (!partition && !clear)
{
auto erase_it = std::find_if(
metadata.statistics.begin(),
metadata.statistics.end(),
[this](const auto & statistic)
{
return statistic.column_name == statistic_column_name && statistic.type == statistic_type;
});
if (erase_it == metadata.statistics.end())
for (const auto & stat_column_name : statistic_columns)
{
if (if_exists)
return;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong statistic name. Cannot find statistic {} with type {} to drop", backQuote(statistic_column_name), statistic_type);
}
auto erase_it = std::find_if(
metadata.statistics.begin(),
metadata.statistics.end(),
[stat_column_name](const auto & statistic)
{
return statistic.column_name == stat_column_name;
});
metadata.statistics.erase(erase_it);
if (erase_it == metadata.statistics.end())
{
if (if_exists)
return;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong statistic name. Cannot find statistic {} with type {} to drop", backQuote(stat_column_name), statistic_type);
}
LOG_INFO(&Poco::Logger::get("drop_stat"), "dropping statistic {}", erase_it->column_name);
metadata.statistics.erase(erase_it);
}
}
}
else if (type == ADD_CONSTRAINT)
@ -958,7 +959,7 @@ std::optional<MutationCommand> AlterCommand::tryConvertToMutationCommand(Storage
else if (type == DROP_STATISTIC)
{
result.type = MutationCommand::Type::DROP_STATISTIC;
result.column_name = statistic_column_name;
result.statistic_columns = statistic_columns;
if (clear)
result.clear = true;

View File

@ -121,7 +121,7 @@ struct AlterCommand
String projection_name;
ASTPtr statistic_decl = nullptr;
String statistic_column_name;
std::vector<String> statistic_columns;
String statistic_type;
/// For MODIFY TTL

View File

@ -580,9 +580,11 @@ static StoragePtr create(const StorageFactory::Arguments & args)
metadata.secondary_indices.push_back(IndexDescription::getIndexFromAST(index, columns, context));
if (args.query.columns_list && args.query.columns_list->stats)
for (const auto & stat : args.query.columns_list->stats->children)
metadata.statistics.push_back(
StatisticDescription::getStatisticFromAST(stat, columns, args.getContext()));
for (const auto & stat_ast : args.query.columns_list->stats->children)
{
auto stats = StatisticsDescriptions::getStatisticsFromAST(stat_ast, columns, args.getContext());
metadata.statistics.merge(stats);
}
if (args.query.columns_list && args.query.columns_list->projections)
for (auto & projection_ast : args.query.columns_list->projections->children)

View File

@ -77,7 +77,11 @@ std::optional<MutationCommand> MutationCommand::parse(ASTAlterCommand * command,
res.type = MATERIALIZE_STATISTIC;
res.partition = command->partition;
res.predicate = nullptr;
res.statistic_column_name = command->statistic_decl->as<ASTStatisticDeclaration &>().column_name;
for (const ASTPtr & column_ast : command->statistic_decl->as<ASTStatisticDeclaration &>().columns->children)
{
const auto & column = column_ast->as<ASTIdentifier &>().getColumnName();
res.statistic_columns.push_back(column);
}
return res;
}
else if (command->type == ASTAlterCommand::MATERIALIZE_PROJECTION)

View File

@ -53,7 +53,7 @@ struct MutationCommand
/// For MATERIALIZE INDEX and PROJECTION and STATISTIC
String index_name = {};
String projection_name = {};
String statistic_column_name = {};
std::vector<String> statistic_columns = {};
/// For MATERIALIZE INDEX, UPDATE and DELETE.
ASTPtr partition = {};

View File

@ -135,7 +135,7 @@ StatisticPtr TDigestCreator(const StatisticDescription & stat)
return StatisticPtr(new TDigestStatistic(stat));
}
void MergeTreeStatisticFactory::registerCreator(const std::string & stat_type, Creator creator)
void MergeTreeStatisticFactory::registerCreator(StatisticType stat_type, Creator creator)
{
if (!creators.emplace(stat_type, std::move(creator)).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticFactory: the statistic creator type {} is not unique", stat_type);
@ -143,7 +143,7 @@ void MergeTreeStatisticFactory::registerCreator(const std::string & stat_type, C
MergeTreeStatisticFactory::MergeTreeStatisticFactory()
{
registerCreator("tdigest", TDigestCreator);
registerCreator(TDigest, TDigestCreator);
///registerCreator("cm_sketch", CMSketchCreator);
}
@ -160,16 +160,7 @@ StatisticPtr MergeTreeStatisticFactory::get(const StatisticDescription & stat) c
if (it == creators.end())
{
throw Exception(ErrorCodes::INCORRECT_QUERY,
"Unknown Statistic type '{}'. Available types: {}", stat.type,
std::accumulate(creators.cbegin(), creators.cend(), std::string{},
[] (auto && left, const auto & right) -> std::string
{
if (left.empty())
return right.first;
else
return left + ", " + right.first;
})
);
"Unknown Statistic type '{}'. Available types: tdigest", stat.type);
}
return std::make_shared<TDigestStatistic>(stat);
}

View File

@ -43,10 +43,10 @@ public:
return stat.column_name;
}
const String & type() const
{
return stat.type;
}
//const String & type() const
//{
// return stat.type;
//}
virtual void serialize(WriteBuffer & buf) = 0;
@ -118,13 +118,13 @@ public:
Statistics getMany(const std::vector<StatisticDescription> & stats) const;
void registerCreator(const std::string & type, Creator creator);
void registerCreator(StatisticType type, Creator creator);
protected:
MergeTreeStatisticFactory();
private:
using Creators = std::unordered_map<std::string, Creator>;
using Creators = std::unordered_map<StatisticType, Creator>;
Creators creators;
};

View File

@ -10,6 +10,8 @@
#include <Storages/extractKeyExpressionList.h>
#include <Storages/StatisticsDescription.h>
#include <Common/logger_useful.h>
namespace DB
{
@ -19,58 +21,51 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
};
StatisticDescription StatisticDescription::getStatisticFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context)
/// Translates the textual statistic type of a declaration into a StatisticType.
/// An empty string selects the default type, TDigest. The match against "tdigest"
/// is exact (case-sensitive); any other non-empty name is rejected with INCORRECT_QUERY.
StatisticType StatisticDescription::stringToType(String type)
{
    const bool is_tdigest = type.empty() || type == "tdigest";
    if (!is_tdigest)
        throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}", type);

    return TDigest;
}
StatisticsDescriptions StatisticsDescriptions::getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context)
{
const auto * stat_definition = definition_ast->as<ASTStatisticDeclaration>();
if (!stat_definition)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create statistic from non ASTStatisticDeclaration AST");
StatisticDescription stat;
stat.definition_ast = definition_ast->clone();
stat.type = Poco::toLower(stat_definition->type);
if (stat.type != "tdigest")
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect type name {}", stat.type);
String column_name = stat_definition->column_name;
LOG_INFO(&Poco::Logger::get("stats_desc"), "stat_def is like {}", stat_definition->dumpTree());
if (!columns.hasPhysical(column_name))
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", column_name);
StatisticsDescriptions stats;
for (const auto & column_ast : stat_definition->columns->children)
{
StatisticDescription stat;
stat.type = StatisticDescription::stringToType(Poco::toLower(stat_definition->type));
String column_name = column_ast->as<ASTIdentifier &>().name();
const auto & column = columns.getPhysical(column_name);
stat.column_name = column.name;
/// TODO: check if it is numeric.
stat.data_type = column.type;
if (!columns.hasPhysical(column_name))
throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", column_name);
const auto & column = columns.getPhysical(column_name);
stat.column_name = column.name;
/// TODO: check if it is numeric.
stat.data_type = column.type;
stats.push_back(stat);
}
stats.definition_asts.push_back(definition_ast);
if (stats.empty())
throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistic column list");
LOG_INFO(&Poco::Logger::get("stats_desc"), "there are {} stats", stats.size());
UNUSED(context);
return stat;
return stats;
}
StatisticDescription::StatisticDescription(const StatisticDescription & other)
: definition_ast(other.definition_ast ? other.definition_ast->clone() : nullptr)
, type(other.type)
, column_name(other.column_name)
, data_type(other.data_type)
{
}
StatisticDescription & StatisticDescription::operator=(const StatisticDescription & other)
{
if (&other == this)
return *this;
if (other.definition_ast)
definition_ast = other.definition_ast->clone();
else
definition_ast.reset();
type = other.type;
column_name = other.column_name;
data_type = other.data_type;
return *this;
}
bool StatisticsDescriptions::has(const String & name) const
{
for (const auto & statistic : *this)
@ -79,31 +74,22 @@ bool StatisticsDescriptions::has(const String & name) const
return false;
}
void StatisticsDescriptions::merge(const StatisticsDescriptions & other)
{
insert(end(), other.begin(), other.end());
definition_asts.insert(definition_asts.end(), other.definition_asts.begin(), other.definition_asts.end());
}
String StatisticsDescriptions::toString() const
{
if (empty())
return {};
ASTExpressionList list;
for (const auto & statistic : *this)
list.children.push_back(statistic.definition_ast);
for (const auto & ast : definition_asts)
list.children.push_back(ast);
return serializeAST(list);
}
StatisticsDescriptions StatisticsDescriptions::parse(const String & str, const ColumnsDescription & columns, ContextPtr context)
{
StatisticsDescriptions result;
if (str.empty())
return result;
ParserStatisticDeclaration parser;
ASTPtr list = parseQuery(parser, str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
for (const auto & index : list->children)
result.emplace_back(StatisticDescription::getStatisticFromAST(index, columns, context));
return result;
}
}

View File

@ -7,13 +7,15 @@
namespace DB
{
/// Kinds of column statistics that can be declared. TDigest is the only kind defined here.
enum StatisticType
{
TDigest = 0,
};
struct StatisticDescription
{
/// Definition AST of statistic
ASTPtr definition_ast;
/// the type of statistic, right now it's only tdigest.
String type;
StatisticType type;
/// Names of statistic columns
String column_name;
@ -21,24 +23,22 @@ struct StatisticDescription
/// Data types of statistic columns
DataTypePtr data_type;
static StatisticDescription getStatisticFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context);
StatisticDescription() = default;
/// We need custom copy constructors because we don't want
/// unintentionaly share AST variables and modify them.
StatisticDescription(const StatisticDescription & other);
StatisticDescription & operator=(const StatisticDescription & other);
static StatisticType stringToType(String type);
};
struct StatisticsDescriptions : public std::vector<StatisticDescription>
{
std::vector<ASTPtr> definition_asts;
/// Stat with name exists
bool has(const String & name) const;
/// merge with other Statistics
void merge(const StatisticsDescriptions & other);
/// Convert description to string
String toString() const;
/// Parse description from string
static StatisticsDescriptions parse(const String & str, const ColumnsDescription & columns, ContextPtr context);
static StatisticsDescriptions getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns, ContextPtr context);
};
}

View File

@ -0,0 +1,22 @@
CREATE TABLE default.t1\n(\n `a` Int64,\n `b` Float64,\n `pk` String,\n STATISTIC a, b TYPE tdigest\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS index_granularity = 8192
After insert
SELECT count()
FROM t1
PREWHERE (a < 10) AND (b < 10)
10
After drop statistic
SELECT count()
FROM t1
PREWHERE (b < 10) AND (a < 10)
10
After add statistic
After materialize statistic
SELECT count()
FROM t1
PREWHERE (a < 10) AND (b < 10)
20
After merge
SELECT count()
FROM t1
PREWHERE (a < 10) AND (b < 10)
20

View File

@ -8,8 +8,7 @@ CREATE TABLE t1
a Int64,
b Float64,
pk String,
STATISTIC a TYPE tdigest,
STATISTIC b TYPE tdigest
STATISTIC a, b TYPE tdigest,
) Engine = MergeTree() ORDER BY pk;
SHOW CREATE TABLE t1;
@ -20,18 +19,21 @@ SELECT 'After insert';
EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and a < 10;
SELECT count(*) FROM t1 WHERE b < 10 and a < 10;
ALTER TABLE t1 DROP STATISTIC a TYPE tdigest;
ALTER TABLE t1 DROP STATISTIC b TYPE tdigest;
ALTER TABLE t1 DROP STATISTIC a, b TYPE tdigest;
SELECT 'After drop statistic';
EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and a < 10;
SELECT count(*) FROM t1 WHERE b < 10 and a < 10;
ALTER TABLE t1 ADD STATISTIC a TYPE tdigest;
ALTER TABLE t1 ADD STATISTIC b TYPE tdigest;
--SHOW CREATE TABLE t1;
ALTER TABLE t1 MATERIALIZE STATISTIC a TYPE tdigest;
ALTER TABLE t1 MATERIALIZE STATISTIC b TYPE tdigest;
ALTER TABLE t1 ADD STATISTIC a, b TYPE tdigest;
SELECT 'After add statistic';
--SHOW CREATE TABLE t1;
ALTER TABLE t1 MATERIALIZE STATISTIC a, b TYPE tdigest;
INSERT INTO t1 select number, -number, generateUUIDv4() FROM system.numbers LIMIT 10000;
SELECT 'After materialize statistic';