Handle edge case: col_int32 > 10.6

This commit is contained in:
JackyWoo 2024-08-07 10:43:49 +08:00
parent 3769f8a465
commit aafe498b7f
5 changed files with 43 additions and 21 deletions

View File

@ -1,6 +1,8 @@
#include <Storages/Statistics/Statistics.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeFactory.h>
#include <Interpreters/convertFieldToType.h>
#include <Storages/ColumnsDescription.h>
#include <Storages/Statistics/ConditionSelectivityEstimator.h>
#include <Storages/Statistics/StatisticsCountMinSketch.h>
@ -10,6 +12,7 @@
#include <Storages/StatisticsDescription.h>
#include <Common/Exception.h>
#include <Common/logger_useful.h>
#include <Common/FieldVisitorConvertToNumber.h>
#include "config.h" /// USE_DATASKETCHES
@ -28,6 +31,25 @@ enum StatisticsFileVersion : UInt16
V0 = 0,
};
std::optional<Float64> StatisticsUtils::tryConvertToFloat64(const Field & value, const DataTypePtr & value_data_type)
{
if (value_data_type->isValueRepresentedByNumber())
{
Field val_converted;
/// For case val_int32 < 10.5 or val_int32 < '10.5' we should convert 10.5 to Float64.
if (isInteger(value_data_type) && (value.getType() == Field::Types::Float64 || value.getType() == Field::Types::String))
val_converted = convertFieldToType(value, *DataTypeFactory::instance().get("Float64"));
/// We should convert value to the real column data type and then translate it to Float64.
/// For example for expression col_date > '2024-08-07', if we directly convert '2024-08-07' to Float64, we will get null.
val_converted = convertFieldToType(value, *value_data_type);
if (val_converted.isNull())
return {};
return applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
}
return {};
}
IStatistics::IStatistics(const SingleStatisticsDescription & stat_)
: stat(stat_)

View File

@ -15,6 +15,12 @@ constexpr auto STATS_FILE_PREFIX = "statistics_";
constexpr auto STATS_FILE_SUFFIX = ".stats";
/// Helper routines used by the statistics implementations (e.g. the min-max and t-digest estimators).
struct StatisticsUtils
{
/// Tries to express `value` as a Float64, taking the data type of the compared column (`value_data_type`) into account.
/// Returns std::nullopt if input Field cannot be converted to a concrete value
/// (including the case where the column data type is not represented by a number).
static std::optional<Float64> tryConvertToFloat64(const Field & value, const DataTypePtr & value_data_type);
};
/// Statistics describe properties of the values in the column,
/// e.g. how many unique values exist,
/// what are the N most frequent values,

View File

@ -1,10 +1,8 @@
#include <Storages/Statistics/StatisticsMinMax.h>
#include <Common/FieldVisitorConvertToNumber.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/convertFieldToType.h>
#include <algorithm>
@ -53,12 +51,10 @@ void StatisticsMinMax::deserialize(ReadBuffer & buf)
Float64 StatisticsMinMax::estimateLess(const Field & val) const
{
Field val_converted = convertFieldToType(val, *data_type);
if (val_converted.isNull())
auto val_as_float = StatisticsUtils::tryConvertToFloat64(val, data_type);
if (!val_as_float.has_value())
return 0;
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
if (val_as_float < min)
return 0;
@ -68,7 +64,7 @@ Float64 StatisticsMinMax::estimateLess(const Field & val) const
if (min == max)
return (val_as_float != max) ? 0 : row_count;
return ((val_as_float - min) / (max - min)) * row_count;
return ((*val_as_float - min) / (max - min)) * row_count;
}
void minMaxStatisticsValidator(const SingleStatisticsDescription & /*statistics_description*/, DataTypePtr data_type)

View File

@ -41,22 +41,18 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf)
/// Estimates the "less than `val`" count from the t-digest: `val` is converted to Float64
/// and passed to t_digest.getCountLessThan().
/// Returns 0 when `val` cannot be converted for this column's data type.
/// NOTE(review): this hunk is rendered without +/- markers, so the superseded
/// convertFieldToType/applyVisitor lines appear next to the tryConvertToFloat64 lines
/// that presumably replace them — confirm against the actual commit.
Float64 StatisticsTDigest::estimateLess(const Field & val) const
{
Field val_converted = convertFieldToType(val, *data_type);
if (val_converted.isNull())
auto val_as_float = StatisticsUtils::tryConvertToFloat64(val, data_type);
if (!val_as_float.has_value())
return 0;
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
return t_digest.getCountLessThan(val_as_float);
return t_digest.getCountLessThan(*val_as_float);
}
/// Estimates the "equal to `val`" count from the t-digest: `val` is converted to Float64
/// and passed to t_digest.getCountEqual().
/// Returns 0 when `val` cannot be converted for this column's data type.
/// NOTE(review): this hunk is rendered without +/- markers, so the superseded
/// convertFieldToType/applyVisitor lines appear next to the tryConvertToFloat64 lines
/// that presumably replace them — confirm against the actual commit.
Float64 StatisticsTDigest::estimateEqual(const Field & val) const
{
Field val_converted = convertFieldToType(val, *data_type);
if (val_converted.isNull())
auto val_as_float = StatisticsUtils::tryConvertToFloat64(val, data_type);
if (!val_as_float.has_value())
return 0;
auto val_as_float = applyVisitor(FieldVisitorConvertToNumber<Float64>(), val_converted);
return t_digest.getCountEqual(val_as_float);
return t_digest.getCountEqual(*val_as_float);
}
void tdigestStatisticsValidator(const SingleStatisticsDescription & /*statistics_description*/, DataTypePtr data_type)

View File

@ -67,9 +67,7 @@ ALTER TABLE tab DROP STATISTICS a, b, c, d;
SELECT 'Test statistics multi-types:';
ALTER TABLE tab ADD STATISTICS a TYPE count_min, uniq;
ALTER TABLE tab ADD STATISTICS b TYPE count_min, minmax, uniq, tdigest;
ALTER TABLE tab ADD STATISTICS c TYPE count_min, minmax, uniq, tdigest;
ALTER TABLE tab ADD STATISTICS d TYPE count_min, minmax, uniq, tdigest;
ALTER TABLE tab ADD STATISTICS b, c, d TYPE count_min, minmax, uniq, tdigest;
ALTER TABLE tab MATERIALIZE STATISTICS a, b, c, d;
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String|_DateTime', '')
@ -82,10 +80,14 @@ WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
SELECT 'Test statistics implicitly type conversion:';
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String|_DateTime', '')
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE d = '2024-08-06 09:58:09'/*0*/ and c = '0'/*100*/ and b > 0/*9990*/ and a = '1'/*1*/)
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE d = '2024-08-06 09:58:09'/*0*/ and c = '0'/*100*/)
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8|_UInt16|_String|_DateTime', '')
FROM (EXPLAIN actions=1 SELECT count(*) FROM tab WHERE d = '2024-08-06 09:58:09'/*0*/ and b > 50.1/*5000*/)
WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
ALTER TABLE tab DROP STATISTICS a, b, c, d;