Uniq statistics: support more data types and refactor the estimation logic used when there are no statistics.

JackyWoo 2024-07-24 17:43:27 +08:00
parent 4712b79960
commit 40fc0ca574
11 changed files with 94 additions and 89 deletions

View File

@@ -19,7 +19,7 @@ void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String par
 Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(const Field & val, Float64 rows) const
 {
     if (part_statistics.empty())
-        return default_normal_cond_factor * rows;
+        return default_cond_range_factor * rows;
     Float64 result = 0;
     Float64 part_rows = 0;
     for (const auto & [key, estimator] : part_statistics)
@@ -38,15 +38,8 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreat
 Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(const Field & val, Float64 rows) const
 {
     if (part_statistics.empty())
-    {
-        auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-        if (!float_val)
-            return default_unknown_cond_factor * rows;
-        else if (float_val.value() < - threshold || float_val.value() > threshold)
-            return default_normal_cond_factor * rows;
-        else
-            return default_good_cond_factor * rows;
-    }
+        return default_cond_equal_factor * rows;
     Float64 result = 0;
     Float64 partial_cnt = 0;
     for (const auto & [key, estimator] : part_statistics)
@@ -149,30 +142,22 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode
     auto [op, val] = extractBinaryOp(node, col);
+
+    if (dummy)
+    {
+        if (op == "equals")
+            return default_cond_equal_factor * total_rows;
+        else if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals")
+            return default_cond_range_factor * total_rows;
+        else
+            return default_unknown_cond_factor * total_rows;
+    }
+
     if (op == "equals")
-    {
-        if (dummy)
-        {
-            auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-            if (!float_val || (float_val < - threshold || float_val > threshold))
-                return default_normal_cond_factor * total_rows;
-            else
-                return default_good_cond_factor * total_rows;
-        }
         return estimator.estimateEqual(val, total_rows);
-    }
     else if (op == "less" || op == "lessOrEquals")
-    {
-        if (dummy)
-            return default_normal_cond_factor * total_rows;
         return estimator.estimateLess(val, total_rows);
-    }
     else if (op == "greater" || op == "greaterOrEquals")
-    {
-        if (dummy)
-            return default_normal_cond_factor * total_rows;
         return estimator.estimateGreater(val, total_rows);
-    }
     else
         return default_unknown_cond_factor * total_rows;
 }

View File

@@ -38,12 +38,10 @@ private:
     std::pair<String, Field> extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const;
 
-    static constexpr auto default_good_cond_factor = 0.1;
-    static constexpr auto default_normal_cond_factor = 0.5;
-    static constexpr auto default_unknown_cond_factor = 1.0;
-
-    /// Conditions like "x = N" are considered good if abs(N) > threshold.
-    /// This is used to assume that condition is likely to have good selectivity.
-    static constexpr auto threshold = 2;
+    /// Used to estimate the selectivity of a condition when there is no statistics.
+    static constexpr auto default_cond_range_factor = 0.5;
+    static constexpr auto default_cond_equal_factor = 0.01;
+    static constexpr auto default_unknown_cond_factor = 1;
 
     UInt64 total_rows = 0;
     std::map<String, ColumnSelectivityEstimator> column_estimators;
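
Taken together with the .cpp change above, the estimator no longer inspects the literal's magnitude when a column has no collected statistics; the fallback now depends only on the comparison operator. A minimal self-contained sketch of that behaviour (the factor values mirror the new constants above; the function and names are illustrative, not the ClickHouse API):

```cpp
#include <iostream>
#include <string>

// Illustrative sketch only: mirrors the new "no statistics" fallback.
// The factor values come from the header above; everything else is made up.
namespace
{
    constexpr double cond_equal_factor = 0.01;  // "x = N"       -> assume ~1% of rows match
    constexpr double cond_range_factor = 0.5;   // "x < N" etc.  -> assume half of the rows match
    constexpr double unknown_cond_factor = 1.0; // anything else -> assume no filtering

    double estimateWithoutStatistics(const std::string & op, double total_rows)
    {
        if (op == "equals")
            return cond_equal_factor * total_rows;
        if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals")
            return cond_range_factor * total_rows;
        return unknown_cond_factor * total_rows;
    }
}

int main()
{
    std::cout << estimateWithoutStatistics("equals", 1000000) << '\n'; // 10000
    std::cout << estimateWithoutStatistics("less", 1000000) << '\n';   // 500000
    std::cout << estimateWithoutStatistics("notIn", 1000000) << '\n';  // 1e+06
}
```

Compared to the removed threshold-based guess, the same predicate now receives the same fallback estimate regardless of the constant it compares against.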

View File

@@ -51,13 +51,6 @@ std::optional<Float64> StatisticsUtils::tryConvertToFloat64(const Field & field)
     }
 }
 
-std::optional<String> StatisticsUtils::tryConvertToString(const DB::Field & field)
-{
-    if (field.getType() == Field::Types::String)
-        return field.get<String>();
-    return {};
-}
-
 IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
     : stat(stat_)
 {
@@ -106,7 +99,7 @@ Float64 ColumnStatistics::estimateLess(const Field & val) const
         return stats.at(StatisticsType::TDigest)->estimateLess(val);
     if (stats.contains(StatisticsType::MinMax))
         return stats.at(StatisticsType::MinMax)->estimateLess(val);
-    return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
+    return rows * ConditionSelectivityEstimator::default_cond_range_factor;
 }
 
 Float64 ColumnStatistics::estimateGreater(const Field & val) const
@@ -116,8 +109,7 @@ Float64 ColumnStatistics::estimateGreater(const Field & val) const
 
 Float64 ColumnStatistics::estimateEqual(const Field & val) const
 {
-    auto float_val = StatisticsUtils::tryConvertToFloat64(val);
-    if (float_val.has_value() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
+    if (stats_desc.data_type->isValueRepresentedByNumber() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
     {
         /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket.
         if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048)
@@ -127,10 +119,16 @@ Float64 ColumnStatistics::estimateEqual(const Field & val) const
     if (stats.contains(StatisticsType::CountMinSketch))
         return stats.at(StatisticsType::CountMinSketch)->estimateEqual(val);
 #endif
-    if (!float_val.has_value() && (float_val < - ConditionSelectivityEstimator::threshold || float_val > ConditionSelectivityEstimator::threshold))
-        return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
-    else
-        return rows * ConditionSelectivityEstimator::default_good_cond_factor;
+    if (stats.contains(StatisticsType::Uniq))
+    {
+        auto cardinality = stats.at(StatisticsType::Uniq)->estimateCardinality();
+        if (cardinality == 0)
+            return 0;
+        /// Assume that the value is uniformly distributed among the unique values.
+        return static_cast<Float64>(rows) / cardinality;
+    }
+    return rows * ConditionSelectivityEstimator::default_cond_equal_factor;
 }
 
 /// -------------------------------------
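
The comment in the new branch above is the whole model: if only a uniq statistic can answer the query, every distinct value is assumed to cover an equal share of the rows, so `col = v` is estimated as the row count divided by the estimated cardinality. A tiny stand-alone illustration (function name and numbers are invented):

```cpp
#include <cstdint>
#include <iostream>

// Uniform-distribution assumption: each of the `distinct_values` values
// is assumed to appear in the same number of rows.
double estimateEqualRows(double rows, uint64_t distinct_values)
{
    if (distinct_values == 0)
        return 0;
    return rows / static_cast<double>(distinct_values);
}

int main()
{
    // 1e6 rows and a uniq estimate of 2500 distinct values:
    // "col = v" is expected to match about 400 rows.
    std::cout << estimateEqualRows(1000000, 2500) << '\n'; // 400
}
```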

View File

@@ -19,7 +19,6 @@ struct StatisticsUtils
 {
     /// Returns std::nullopt if input Field cannot be converted to a concrete value
     static std::optional<Float64> tryConvertToFloat64(const Field & field);
-    static std::optional<String> tryConvertToString(const Field & field);
 };
 
 /// Statistics describe properties of the values in the column,
@@ -32,11 +31,6 @@ public:
     explicit IStatistics(const SingleStatisticsDescription & stat_);
     virtual ~IStatistics() = default;
 
-    virtual void update(const ColumnPtr & column) = 0;
-
-    virtual void serialize(WriteBuffer & buf) = 0;
-    virtual void deserialize(ReadBuffer & buf) = 0;
-
     /// Estimate the cardinality of the column.
     /// Throws if the statistics object is not able to do a meaningful estimation.
     virtual UInt64 estimateCardinality() const;
@@ -46,6 +40,11 @@ public:
     virtual Float64 estimateEqual(const Field & val) const; /// cardinality of val in the column
     virtual Float64 estimateLess(const Field & val) const;  /// summarized cardinality of values < val in the column
 
+    virtual void update(const ColumnPtr & column) = 0;
+
+    virtual void serialize(WriteBuffer & buf) = 0;
+    virtual void deserialize(ReadBuffer & buf) = 0;
+
 protected:
     SingleStatisticsDescription stat;
 };

View File

@@ -32,18 +32,18 @@ Float64 StatisticsMinMax::estimateLess(const Field & val) const
     if (val_converted.isNull())
         return 0;
 
-    auto val_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
 
-    if (val_float < min)
+    if (val_as_float < min)
         return 0;
 
-    if (val_float > max)
+    if (val_as_float > max)
         return row_count;
 
     if (max == min)
         return row_count;
 
-    return ((val_float - min) / (max - min)) * row_count;
+    return ((val_as_float - min) / (max - min)) * row_count;
 }
 
 void StatisticsMinMax::update(const ColumnPtr & column)
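
For reference, the estimate that the renamed variable feeds into is a plain linear interpolation between the column's minimum and maximum, clamped at both ends. A self-contained sketch of that formula (names and numbers are illustrative):

```cpp
#include <iostream>

// Linear interpolation used by a min/max statistic: the fraction of rows
// below `val` is approximated by val's relative position inside [min, max].
double estimateLessRows(double val, double min, double max, double row_count)
{
    if (val < min)
        return 0;
    if (val > max || min == max)
        return row_count;
    return (val - min) / (max - min) * row_count;
}

int main()
{
    // Column spans [0, 100] over 10000 rows: "x < 25" is estimated at ~2500 rows.
    std::cout << estimateLessRows(25, 0, 100, 10000) << '\n'; // 2500
}
```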

View File

@@ -1,6 +1,8 @@
 #include <Storages/Statistics/StatisticsTDigest.h>
 #include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <Interpreters/convertFieldToType.h>
+#include <Common/FieldVisitorConvertToNumber.h>
 
 namespace DB
 {
@@ -10,24 +12,20 @@ extern const int ILLEGAL_STATISTICS;
-extern const int LOGICAL_ERROR;
 }
 
-StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
-    : IStatistics(stat_)
+StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_)
+    : IStatistics(stat_), data_type(data_type_)
 {
 }
 
 void StatisticsTDigest::update(const ColumnPtr & column)
 {
-    size_t rows = column->size();
-    for (size_t row = 0; row < rows; ++row)
+    for (size_t row = 0; row < column->size(); ++row)
     {
-        Field field;
-        column->get(row, field);
-        if (field.isNull())
+        if (column->isNullAt(row))
             continue;
-        if (auto field_as_float = StatisticsUtils::tryConvertToFloat64(field))
-            t_digest.add(*field_as_float, 1);
+        auto data = column->getFloat64(row);
+        t_digest.add(data, 1);
     }
 }
@@ -43,18 +41,22 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf)
 
 Float64 StatisticsTDigest::estimateLess(const Field & val) const
 {
-    auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
-    if (val_as_float)
-        return t_digest.getCountLessThan(*val_as_float);
-    throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
+    Field val_converted = convertFieldToType(val, *data_type);
+    if (val_converted.isNull())
+        return 0;
+
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    return t_digest.getCountLessThan(val_as_float);
 }
 
 Float64 StatisticsTDigest::estimateEqual(const Field & val) const
 {
-    auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
-    if (val_as_float)
-        return t_digest.getCountEqual(*val_as_float);
-    throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
+    Field val_converted = convertFieldToType(val, *data_type);
+    if (val_converted.isNull())
+        return 0;
+
+    auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
+    return t_digest.getCountEqual(val_as_float);
 }
 
 void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
@@ -65,9 +67,9 @@ void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type
         throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName());
 }
 
-StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr)
+StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type)
 {
-    return std::make_shared<StatisticsTDigest>(stat);
+    return std::make_shared<StatisticsTDigest>(stat, data_type);
 }
 
 }
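
The behavioural change above: instead of coercing the query constant straight to Float64 and throwing LOGICAL_ERROR when that fails, the constant is first converted to the column's own data type, and an inconvertible constant now simply yields an estimate of 0. A minimal stand-in for that control flow (the conversion helper below is hypothetical and only pretends the column is Float64; the real code path uses convertFieldToType):

```cpp
#include <iostream>
#include <optional>
#include <string>

// Stand-in for "convert the query constant to the column's type first".
std::optional<double> convertToColumnType(const std::string & literal)
{
    try
    {
        size_t pos = 0;
        double value = std::stod(literal, &pos);
        if (pos != literal.size())
            return std::nullopt; // trailing junk -> not representable
        return value;
    }
    catch (...)
    {
        return std::nullopt; // e.g. WHERE float_col < 'abc'
    }
}

// If the constant cannot be represented in the column type, the predicate
// cannot match, so the estimate is 0 instead of an exception.
double estimateLess(const std::string & literal, double rows_below_if_convertible)
{
    auto converted = convertToColumnType(literal);
    if (!converted)
        return 0;
    return rows_below_if_convertible; // a real implementation would query the t-digest here
}

int main()
{
    std::cout << estimateLess("42.5", 1234) << '\n'; // 1234
    std::cout << estimateLess("abc", 1234) << '\n';  // 0
}
```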

View File

@@ -9,18 +9,19 @@ namespace DB
 class StatisticsTDigest : public IStatistics
 {
 public:
-    explicit StatisticsTDigest(const SingleStatisticsDescription & stat_);
+    explicit StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_);
 
+    Float64 estimateLess(const Field & val) const override;
+    Float64 estimateEqual(const Field & val) const override;
+
     void update(const ColumnPtr & column) override;
 
     void serialize(WriteBuffer & buf) override;
     void deserialize(ReadBuffer & buf) override;
 
-    Float64 estimateLess(const Field & val) const override;
-    Float64 estimateEqual(const Field & val) const override;
-
 private:
     QuantileTDigest<Float64> t_digest;
+    DataTypePtr data_type;
 };
 
 void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type);

View File

@@ -56,7 +56,7 @@ void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
 {
     data_type = removeNullable(data_type);
     data_type = removeLowCardinalityAndNullable(data_type);
-    if (!data_type->isValueRepresentedByNumber())
+    if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type))
         throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName());
 }

View File

@@ -13,13 +13,13 @@ public:
     StatisticsUniq(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type);
     ~StatisticsUniq() override;
 
+    UInt64 estimateCardinality() const override;
+
     void update(const ColumnPtr & column) override;
 
     void serialize(WriteBuffer & buf) override;
     void deserialize(ReadBuffer & buf) override;
 
-    UInt64 estimateCardinality() const override;
-
 private:
     std::unique_ptr<Arena> arena;
     AggregateFunctionPtr collector;

View File

@@ -14,12 +14,19 @@ Test statistics multi-types:
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(equals(a, \'10000\'), equals(b, 0), less(c, 0)) (removed)
-Test statistics min_max and tdigest:
+Test estimating range condition:
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
 Prewhere info
 Prewhere filter
 Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
+Test estimating equals condition:
+Prewhere info
+Prewhere filter
+Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
+Prewhere info
+Prewhere filter
+Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
 Test LowCardinality and Nullable data type:
 tab2

View File

@@ -68,7 +68,7 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
 
 ALTER TABLE tab DROP STATISTICS a, b, c, d;
 
-SELECT 'Test statistics min_max and tdigest:';
+SELECT 'Test estimating range condition:';
 
 ALTER TABLE tab ADD STATISTICS b TYPE min_max;
 ALTER TABLE tab MATERIALIZE STATISTICS b;
@@ -84,6 +84,21 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
 
 ALTER TABLE tab DROP STATISTICS b;
 
+SELECT 'Test estimating equals condition:';
+ALTER TABLE tab ADD STATISTICS a TYPE uniq;
+ALTER TABLE tab MATERIALIZE STATISTICS a;
+SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
+FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
+WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
+
+ALTER TABLE tab ADD STATISTICS a TYPE count_min;
+ALTER TABLE tab MATERIALIZE STATISTICS a;
+SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
+FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
+WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
+ALTER TABLE tab DROP STATISTICS a;
+
 DROP TABLE IF EXISTS tab SYNC;
 
@@ -93,8 +108,8 @@ SET allow_suspicious_low_cardinality_types=1;
 CREATE TABLE tab2
 (
     a LowCardinality(Int64) STATISTICS(count_min),
-    b Nullable(Int64) STATISTICS(count_min),
-    c LowCardinality(Nullable(Int64)) STATISTICS(count_min),
+    b Nullable(Int64) STATISTICS(min_max, count_min),
+    c LowCardinality(Nullable(Int64)) STATISTICS(min_max, count_min),
     pk String,
 ) Engine = MergeTree() ORDER BY pk;