ClickHouse/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp

#include <Storages/MergeTree/MergeTreeIndexMinMax.h>

#include <Interpreters/ExpressionActions.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/TreeRewriter.h>
#include <Parsers/ASTFunction.h>
#include <Poco/Logger.h>
#include <Common/FieldVisitorsAccurateComparison.h>
namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}

MergeTreeIndexGranuleMinMax::MergeTreeIndexGranuleMinMax(const String & index_name_, const Block & index_sample_block_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
{}
MergeTreeIndexGranuleMinMax::MergeTreeIndexGranuleMinMax(
    const String & index_name_,
    const Block & index_sample_block_,
    std::vector<Range> && hyperrectangle_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
    , hyperrectangle(std::move(hyperrectangle_)) {}
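
/// On-disk layout: for each indexed column, the granule stores the inclusive
/// [left, right] bounds of the values seen in that granule, written with the
/// column type's default serialization.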
void MergeTreeIndexGranuleMinMax::serializeBinary(WriteBuffer & ostr) const
{
    if (empty())
        throw Exception(
            "Attempt to write empty minmax index " + backQuote(index_name), ErrorCodes::LOGICAL_ERROR);

    for (size_t i = 0; i < index_sample_block.columns(); ++i)
    {
        const DataTypePtr & type = index_sample_block.getByPosition(i).type;
        auto serialization = type->getDefaultSerialization();

        serialization->serializeBinary(hyperrectangle[i].left, ostr);
        serialization->serializeBinary(hyperrectangle[i].right, ostr);
    }
}
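
/// Two on-disk formats exist (see the version handling below). Version 1 is the
/// legacy format, which prefixed ranges over Nullable columns with an is_null
/// flag; version 2 writes the values directly with Nullable-aware serialization.
/// The split keeps indexes written before and after
/// https://github.com/ClickHouse/ClickHouse/pull/12455 readable.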
void MergeTreeIndexGranuleMinMax::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version)
{
    hyperrectangle.clear();
    Field min_val;
    Field max_val;

    for (size_t i = 0; i < index_sample_block.columns(); ++i)
    {
        const DataTypePtr & type = index_sample_block.getByPosition(i).type;
        auto serialization = type->getDefaultSerialization();

        switch (version)
        {
            case 1:
                if (!type->isNullable())
                {
                    serialization->deserializeBinary(min_val, istr);
                    serialization->deserializeBinary(max_val, istr);
                }
                else
                {
                    /// NOTE: this serialization differs from
                    /// IMergeTreeDataPart::MinMaxIndex::load() to preserve
                    /// backward compatibility.
                    ///
                    /// But this is a deprecated format, so this is OK.
                    bool is_null;
                    readBinary(is_null, istr);
                    if (!is_null)
                    {
                        serialization->deserializeBinary(min_val, istr);
                        serialization->deserializeBinary(max_val, istr);
                    }
                    else
                    {
                        min_val = Null();
                        max_val = Null();
                    }
                }
                break;
            /// New format with proper Nullable support for values that include Nulls.
            case 2:
                serialization->deserializeBinary(min_val, istr);
                serialization->deserializeBinary(max_val, istr);

                /// NULL_LAST: Nulls sort after every other value, so a Null bound is
                /// widened to +inf before the range is used in comparisons.
                if (min_val.isNull())
                    min_val = POSITIVE_INFINITY;
                if (max_val.isNull())
                    max_val = POSITIVE_INFINITY;
                break;
            default:
                throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version);
        }

        hyperrectangle.emplace_back(min_val, true, max_val, true);
    }
}
MergeTreeIndexAggregatorMinMax::MergeTreeIndexAggregatorMinMax(const String & index_name_, const Block & index_sample_block_)
    : index_name(index_name_)
    , index_sample_block(index_sample_block_)
{}

MergeTreeIndexGranulePtr MergeTreeIndexAggregatorMinMax::getGranuleAndReset()
{
    return std::make_shared<MergeTreeIndexGranuleMinMax>(index_name, index_sample_block, std::move(hyperrectangle));
}
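
/// Consumes up to `limit` rows starting at *pos and widens the per-column
/// min/max ranges accumulated so far; getGranuleAndReset() then snapshots
/// them into a granule.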
void MergeTreeIndexAggregatorMinMax::update(const Block & block, size_t * pos, size_t limit)
{
    if (*pos >= block.rows())
        throw Exception(
            "The provided position is not less than the number of block rows. Position: "
            + toString(*pos) + ", Block rows: " + toString(block.rows()) + ".", ErrorCodes::LOGICAL_ERROR);

    size_t rows_read = std::min(limit, block.rows() - *pos);

    FieldRef field_min;
    FieldRef field_max;
    for (size_t i = 0; i < index_sample_block.columns(); ++i)
    {
        auto index_column_name = index_sample_block.getByPosition(i).name;
        const auto & column = block.getByName(index_column_name).column->cut(*pos, rows_read);
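
        /// For Nullable columns take NULL-last extremes: Nulls are treated as
        /// greater than all other values, so they widen the right bound instead
        /// of shadowing the real minimum.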
        if (const auto * column_nullable = typeid_cast<const ColumnNullable *>(column.get()))
            column_nullable->getExtremesNullLast(field_min, field_max);
        else
            column->getExtremes(field_min, field_max);

        if (hyperrectangle.size() <= i)
        {
            hyperrectangle.emplace_back(field_min, true, field_max, true);
        }
        else
        {
            hyperrectangle[i].left
                = applyVisitor(FieldVisitorAccurateLess(), hyperrectangle[i].left, field_min) ? hyperrectangle[i].left : field_min;
            hyperrectangle[i].right
                = applyVisitor(FieldVisitorAccurateLess(), hyperrectangle[i].right, field_max) ? field_max : hyperrectangle[i].right;
        }
    }

    *pos += rows_read;
}
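
/// The condition is built from the query over the index columns and the index
/// expression; granules are pruned by checking the extracted query ranges
/// against each granule's hyperrectangle.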
MergeTreeIndexConditionMinMax::MergeTreeIndexConditionMinMax(
    const IndexDescription & index,
    const SelectQueryInfo & query,
    ContextPtr context)
    : index_data_types(index.data_types)
    , condition(query, context, index.column_names, index.expression)
{
}

bool MergeTreeIndexConditionMinMax::alwaysUnknownOrTrue() const
{
    return condition.alwaysUnknownOrTrue();
}
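
/// Returns whether the condition may hold anywhere inside the granule's ranges;
/// a false result lets the whole granule be skipped.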
bool MergeTreeIndexConditionMinMax::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const
{
    std::shared_ptr<MergeTreeIndexGranuleMinMax> granule
        = std::dynamic_pointer_cast<MergeTreeIndexGranuleMinMax>(idx_granule);
    if (!granule)
        throw Exception(
            "Minmax index condition got a granule with the wrong type.", ErrorCodes::LOGICAL_ERROR);
    return condition.checkInHyperrectangle(granule->hyperrectangle, index_data_types).can_be_true;
}
MergeTreeIndexGranulePtr MergeTreeIndexMinMax::createIndexGranule() const
{
    return std::make_shared<MergeTreeIndexGranuleMinMax>(index.name, index.sample_block);
}

MergeTreeIndexAggregatorPtr MergeTreeIndexMinMax::createIndexAggregator() const
{
    return std::make_shared<MergeTreeIndexAggregatorMinMax>(index.name, index.sample_block);
}

MergeTreeIndexConditionPtr MergeTreeIndexMinMax::createIndexCondition(
    const SelectQueryInfo & query, ContextPtr context) const
{
    return std::make_shared<MergeTreeIndexConditionMinMax>(index, query, context);
}
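
/// An IN clause can use this index if its left side is one of the indexed
/// columns, either directly or under a chain of single-argument functions.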
bool MergeTreeIndexMinMax::mayBenefitFromIndexForIn(const ASTPtr & node) const
{
    const String column_name = node->getColumnName();

    for (const auto & cname : index.column_names)
        if (column_name == cname)
            return true;

    if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
        if (func->arguments->children.size() == 1)
            return mayBenefitFromIndexForIn(func->arguments->children.front());

    return false;
}
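
/// Probes the disk for the index file to decide which format version to read:
/// ".idx2" means version 2 (Nullable-aware), ".idx" means the legacy version 1,
/// and version 0 means no index file was found.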
MergeTreeIndexFormat MergeTreeIndexMinMax::getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const
{
    if (disk->exists(relative_path_prefix + ".idx2"))
        return {2, ".idx2"};
    else if (disk->exists(relative_path_prefix + ".idx"))
        return {1, ".idx"};
    return {0 /* unknown */, ""};
}
MergeTreeIndexPtr minmaxIndexCreator(
    const IndexDescription & index)
{
    return std::make_shared<MergeTreeIndexMinMax>(index);
}
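
/// Minmax indexes can be built over any comparable expression, so there are
/// no type restrictions to validate here.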
void minmaxIndexValidator(const IndexDescription & /* index */, bool /* attach */)
{
}

}