mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Uniq statistics support more data types; refactor the logic used when there are no statistics.
This commit is contained in:
parent
4712b79960
commit
40fc0ca574
@ -19,7 +19,7 @@ void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String par
|
||||
Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(const Field & val, Float64 rows) const
|
||||
{
|
||||
if (part_statistics.empty())
|
||||
return default_normal_cond_factor * rows;
|
||||
return default_cond_range_factor * rows;
|
||||
Float64 result = 0;
|
||||
Float64 part_rows = 0;
|
||||
for (const auto & [key, estimator] : part_statistics)
|
||||
@ -38,15 +38,8 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreat
|
||||
Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(const Field & val, Float64 rows) const
|
||||
{
|
||||
if (part_statistics.empty())
|
||||
{
|
||||
auto float_val = StatisticsUtils::tryConvertToFloat64(val);
|
||||
if (!float_val)
|
||||
return default_unknown_cond_factor * rows;
|
||||
else if (float_val.value() < - threshold || float_val.value() > threshold)
|
||||
return default_normal_cond_factor * rows;
|
||||
else
|
||||
return default_good_cond_factor * rows;
|
||||
}
|
||||
return default_cond_equal_factor * rows;
|
||||
|
||||
Float64 result = 0;
|
||||
Float64 partial_cnt = 0;
|
||||
for (const auto & [key, estimator] : part_statistics)
|
||||
@ -149,30 +142,22 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode
|
||||
|
||||
auto [op, val] = extractBinaryOp(node, col);
|
||||
|
||||
if (dummy)
|
||||
{
|
||||
if (op == "equals")
|
||||
return default_cond_equal_factor * total_rows;
|
||||
else if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals")
|
||||
return default_cond_range_factor * total_rows;
|
||||
else
|
||||
return default_unknown_cond_factor * total_rows;
|
||||
}
|
||||
|
||||
if (op == "equals")
|
||||
{
|
||||
if (dummy)
|
||||
{
|
||||
auto float_val = StatisticsUtils::tryConvertToFloat64(val);
|
||||
if (!float_val || (float_val < - threshold || float_val > threshold))
|
||||
return default_normal_cond_factor * total_rows;
|
||||
else
|
||||
return default_good_cond_factor * total_rows;
|
||||
}
|
||||
return estimator.estimateEqual(val, total_rows);
|
||||
}
|
||||
else if (op == "less" || op == "lessOrEquals")
|
||||
{
|
||||
if (dummy)
|
||||
return default_normal_cond_factor * total_rows;
|
||||
return estimator.estimateLess(val, total_rows);
|
||||
}
|
||||
else if (op == "greater" || op == "greaterOrEquals")
|
||||
{
|
||||
if (dummy)
|
||||
return default_normal_cond_factor * total_rows;
|
||||
return estimator.estimateGreater(val, total_rows);
|
||||
}
|
||||
else
|
||||
return default_unknown_cond_factor * total_rows;
|
||||
}
|
||||
|
@ -38,12 +38,10 @@ private:
|
||||
|
||||
std::pair<String, Field> extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const;
|
||||
|
||||
static constexpr auto default_good_cond_factor = 0.1;
|
||||
static constexpr auto default_normal_cond_factor = 0.5;
|
||||
static constexpr auto default_unknown_cond_factor = 1.0;
|
||||
/// Conditions like "x = N" are considered good if abs(N) > threshold.
|
||||
/// This is used to assume that condition is likely to have good selectivity.
|
||||
static constexpr auto threshold = 2;
|
||||
/// Used to estimate the selectivity of a condition when there are no statistics.
|
||||
static constexpr auto default_cond_range_factor = 0.5;
|
||||
static constexpr auto default_cond_equal_factor = 0.01;
|
||||
static constexpr auto default_unknown_cond_factor = 1;
|
||||
|
||||
UInt64 total_rows = 0;
|
||||
std::map<String, ColumnSelectivityEstimator> column_estimators;
|
||||
|
@ -51,13 +51,6 @@ std::optional<Float64> StatisticsUtils::tryConvertToFloat64(const Field & field)
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<String> StatisticsUtils::tryConvertToString(const DB::Field & field)
|
||||
{
|
||||
if (field.getType() == Field::Types::String)
|
||||
return field.get<String>();
|
||||
return {};
|
||||
}
|
||||
|
||||
IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
|
||||
: stat(stat_)
|
||||
{
|
||||
@ -106,7 +99,7 @@ Float64 ColumnStatistics::estimateLess(const Field & val) const
|
||||
return stats.at(StatisticsType::TDigest)->estimateLess(val);
|
||||
if (stats.contains(StatisticsType::MinMax))
|
||||
return stats.at(StatisticsType::MinMax)->estimateLess(val);
|
||||
return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
|
||||
return rows * ConditionSelectivityEstimator::default_cond_range_factor;
|
||||
}
|
||||
|
||||
Float64 ColumnStatistics::estimateGreater(const Field & val) const
|
||||
@ -116,8 +109,7 @@ Float64 ColumnStatistics::estimateGreater(const Field & val) const
|
||||
|
||||
Float64 ColumnStatistics::estimateEqual(const Field & val) const
|
||||
{
|
||||
auto float_val = StatisticsUtils::tryConvertToFloat64(val);
|
||||
if (float_val.has_value() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
|
||||
if (stats_desc.data_type->isValueRepresentedByNumber() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest))
|
||||
{
|
||||
/// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket.
|
||||
if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048)
|
||||
@ -127,10 +119,16 @@ Float64 ColumnStatistics::estimateEqual(const Field & val) const
|
||||
if (stats.contains(StatisticsType::CountMinSketch))
|
||||
return stats.at(StatisticsType::CountMinSketch)->estimateEqual(val);
|
||||
#endif
|
||||
if (!float_val.has_value() && (float_val < - ConditionSelectivityEstimator::threshold || float_val > ConditionSelectivityEstimator::threshold))
|
||||
return rows * ConditionSelectivityEstimator::default_normal_cond_factor;
|
||||
else
|
||||
return rows * ConditionSelectivityEstimator::default_good_cond_factor;
|
||||
if (stats.contains(StatisticsType::Uniq))
|
||||
{
|
||||
auto cardinality = stats.at(StatisticsType::Uniq)->estimateCardinality();
|
||||
if (cardinality == 0)
|
||||
return 0;
|
||||
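/// Assume that the value is uniformly distributed among the unique values.
/// NOTE(review): the expression below yields a fraction (1 / cardinality), while the fallback at the end of this function returns a row count (rows * factor) — confirm whether it should be scaled by rows.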
|
||||
return static_cast<Float64>(1) / stats.at(StatisticsType::Uniq)->estimateCardinality();
|
||||
}
|
||||
|
||||
return rows * ConditionSelectivityEstimator::default_cond_equal_factor;
|
||||
}
|
||||
|
||||
/// -------------------------------------
|
||||
|
@ -19,7 +19,6 @@ struct StatisticsUtils
|
||||
{
|
||||
/// Returns std::nullopt if input Field cannot be converted to a concrete value
|
||||
static std::optional<Float64> tryConvertToFloat64(const Field & field);
|
||||
static std::optional<String> tryConvertToString(const Field & field);
|
||||
};
|
||||
|
||||
/// Statistics describe properties of the values in the column,
|
||||
@ -32,11 +31,6 @@ public:
|
||||
explicit IStatistics(const SingleStatisticsDescription & stat_);
|
||||
virtual ~IStatistics() = default;
|
||||
|
||||
virtual void update(const ColumnPtr & column) = 0;
|
||||
|
||||
virtual void serialize(WriteBuffer & buf) = 0;
|
||||
virtual void deserialize(ReadBuffer & buf) = 0;
|
||||
|
||||
/// Estimate the cardinality of the column.
|
||||
/// Throws if the statistics object is not able to do a meaningful estimation.
|
||||
virtual UInt64 estimateCardinality() const;
|
||||
@ -46,6 +40,11 @@ public:
|
||||
virtual Float64 estimateEqual(const Field & val) const; /// cardinality of val in the column
|
||||
virtual Float64 estimateLess(const Field & val) const; /// summarized cardinality of values < val in the column
|
||||
|
||||
virtual void update(const ColumnPtr & column) = 0;
|
||||
|
||||
virtual void serialize(WriteBuffer & buf) = 0;
|
||||
virtual void deserialize(ReadBuffer & buf) = 0;
|
||||
|
||||
protected:
|
||||
SingleStatisticsDescription stat;
|
||||
};
|
||||
|
@ -32,18 +32,18 @@ Float64 StatisticsMinMax::estimateLess(const Field & val) const
|
||||
if (val_converted.isNull())
|
||||
return 0;
|
||||
|
||||
auto val_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
|
||||
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
|
||||
|
||||
if (val_float < min)
|
||||
if (val_as_float < min)
|
||||
return 0;
|
||||
|
||||
if (val_float > max)
|
||||
if (val_as_float > max)
|
||||
return row_count;
|
||||
|
||||
if (max == min)
|
||||
return row_count;
|
||||
|
||||
return ((val_float - min) / (max - min)) * row_count;
|
||||
return ((val_as_float - min) / (max - min)) * row_count;
|
||||
}
|
||||
|
||||
void StatisticsMinMax::update(const ColumnPtr & column)
|
||||
|
@ -1,6 +1,8 @@
|
||||
#include <Storages/Statistics/StatisticsTDigest.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
#include <Interpreters/convertFieldToType.h>
|
||||
#include <Common/FieldVisitorConvertToNumber.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -10,24 +12,20 @@ extern const int ILLEGAL_STATISTICS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_)
|
||||
: IStatistics(stat_)
|
||||
StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_)
|
||||
: IStatistics(stat_), data_type(data_type_)
|
||||
{
|
||||
}
|
||||
|
||||
void StatisticsTDigest::update(const ColumnPtr & column)
|
||||
{
|
||||
size_t rows = column->size();
|
||||
for (size_t row = 0; row < rows; ++row)
|
||||
for (size_t row = 0; row < column->size(); ++row)
|
||||
{
|
||||
Field field;
|
||||
column->get(row, field);
|
||||
|
||||
if (field.isNull())
|
||||
if (column->isNullAt(row))
|
||||
continue;
|
||||
|
||||
if (auto field_as_float = StatisticsUtils::tryConvertToFloat64(field))
|
||||
t_digest.add(*field_as_float, 1);
|
||||
auto data = column->getFloat64(row);
|
||||
t_digest.add(data, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -43,18 +41,22 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf)
|
||||
|
||||
Float64 StatisticsTDigest::estimateLess(const Field & val) const
|
||||
{
|
||||
auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
|
||||
if (val_as_float)
|
||||
return t_digest.getCountLessThan(*val_as_float);
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
|
||||
Field val_converted = convertFieldToType(val, *data_type);
|
||||
if (val_converted.isNull())
|
||||
return 0;
|
||||
|
||||
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
|
||||
return t_digest.getCountLessThan(val_as_float);
|
||||
}
|
||||
|
||||
Float64 StatisticsTDigest::estimateEqual(const Field & val) const
|
||||
{
|
||||
auto val_as_float = StatisticsUtils::tryConvertToFloat64(val);
|
||||
if (val_as_float)
|
||||
return t_digest.getCountEqual(*val_as_float);
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName());
|
||||
Field val_converted = convertFieldToType(val, *data_type);
|
||||
if (val_converted.isNull())
|
||||
return 0;
|
||||
|
||||
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
|
||||
return t_digest.getCountEqual(val_as_float);
|
||||
}
|
||||
|
||||
void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
|
||||
@ -65,9 +67,9 @@ void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName());
|
||||
}
|
||||
|
||||
StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr)
|
||||
StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type)
|
||||
{
|
||||
return std::make_shared<StatisticsTDigest>(stat);
|
||||
return std::make_shared<StatisticsTDigest>(stat, data_type);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -9,18 +9,19 @@ namespace DB
|
||||
class StatisticsTDigest : public IStatistics
|
||||
{
|
||||
public:
|
||||
explicit StatisticsTDigest(const SingleStatisticsDescription & stat_);
|
||||
explicit StatisticsTDigest(const SingleStatisticsDescription & stat_, DataTypePtr data_type_);
|
||||
|
||||
Float64 estimateLess(const Field & val) const override;
|
||||
Float64 estimateEqual(const Field & val) const override;
|
||||
|
||||
void update(const ColumnPtr & column) override;
|
||||
|
||||
void serialize(WriteBuffer & buf) override;
|
||||
void deserialize(ReadBuffer & buf) override;
|
||||
|
||||
Float64 estimateLess(const Field & val) const override;
|
||||
Float64 estimateEqual(const Field & val) const override;
|
||||
|
||||
private:
|
||||
QuantileTDigest<Float64> t_digest;
|
||||
DataTypePtr data_type;
|
||||
};
|
||||
|
||||
void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type);
|
||||
|
@ -56,7 +56,7 @@ void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type)
|
||||
{
|
||||
data_type = removeNullable(data_type);
|
||||
data_type = removeLowCardinalityAndNullable(data_type);
|
||||
if (!data_type->isValueRepresentedByNumber())
|
||||
if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type))
|
||||
throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName());
|
||||
}
|
||||
|
||||
|
@ -13,13 +13,13 @@ public:
|
||||
StatisticsUniq(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type);
|
||||
~StatisticsUniq() override;
|
||||
|
||||
UInt64 estimateCardinality() const override;
|
||||
|
||||
void update(const ColumnPtr & column) override;
|
||||
|
||||
void serialize(WriteBuffer & buf) override;
|
||||
void deserialize(ReadBuffer & buf) override;
|
||||
|
||||
UInt64 estimateCardinality() const override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<Arena> arena;
|
||||
AggregateFunctionPtr collector;
|
||||
|
@ -14,12 +14,19 @@ Test statistics multi-types:
|
||||
Prewhere info
|
||||
Prewhere filter
|
||||
Prewhere filter column: and(equals(a, \'10000\'), equals(b, 0), less(c, 0)) (removed)
|
||||
Test statistics min_max and tdigest:
|
||||
Test estimating range condition:
|
||||
Prewhere info
|
||||
Prewhere filter
|
||||
Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
|
||||
Prewhere info
|
||||
Prewhere filter
|
||||
Prewhere filter column: and(less(b, 10), less(c, 0)) (removed)
|
||||
Test estimating equals condition:
|
||||
Prewhere info
|
||||
Prewhere filter
|
||||
Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
|
||||
Prewhere info
|
||||
Prewhere filter
|
||||
Prewhere filter column: and(equals(a, \'0\'), equals(b, 10)) (removed)
|
||||
Test LowCardinality and Nullable data type:
|
||||
tab2
|
||||
|
@ -68,7 +68,7 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
|
||||
ALTER TABLE tab DROP STATISTICS a, b, c, d;
|
||||
|
||||
|
||||
SELECT 'Test statistics min_max and tdigest:';
|
||||
SELECT 'Test estimating range condition:';
|
||||
|
||||
ALTER TABLE tab ADD STATISTICS b TYPE min_max;
|
||||
ALTER TABLE tab MATERIALIZE STATISTICS b;
|
||||
@ -84,6 +84,21 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
|
||||
ALTER TABLE tab DROP STATISTICS b;
|
||||
|
||||
|
||||
SELECT 'Test estimating equals condition:';
|
||||
|
||||
ALTER TABLE tab ADD STATISTICS a TYPE uniq;
|
||||
ALTER TABLE tab MATERIALIZE STATISTICS a;
|
||||
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
|
||||
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
|
||||
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
|
||||
|
||||
ALTER TABLE tab ADD STATISTICS a TYPE count_min;
|
||||
ALTER TABLE tab MATERIALIZE STATISTICS a;
|
||||
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String', '')
|
||||
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE b = 10/*100*/ and a = '0'/*1*/)
|
||||
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
|
||||
ALTER TABLE tab DROP STATISTICS a;
|
||||
|
||||
DROP TABLE IF EXISTS tab SYNC;
|
||||
|
||||
|
||||
@ -93,8 +108,8 @@ SET allow_suspicious_low_cardinality_types=1;
|
||||
CREATE TABLE tab2
|
||||
(
|
||||
a LowCardinality(Int64) STATISTICS(count_min),
|
||||
b Nullable(Int64) STATISTICS(count_min),
|
||||
c LowCardinality(Nullable(Int64)) STATISTICS(count_min),
|
||||
b Nullable(Int64) STATISTICS(min_max, count_min),
|
||||
c LowCardinality(Nullable(Int64)) STATISTICS(min_max, count_min),
|
||||
pk String,
|
||||
) Engine = MergeTree() ORDER BY pk;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user