From e98d09c93e6c54a2cc4eadab8614539c0a5eb0f8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 21 Feb 2024 21:41:04 +0100 Subject: [PATCH] Do not load useless columns from the index in memory --- contrib/rapidjson | 2 +- src/Processors/QueryPlan/PartsSplitter.cpp | 9 ++++--- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 21 +++++++++++++++ .../MergeTree/MergeTreeDataSelectExecutor.cpp | 26 +++++++++++++++---- src/Storages/MergeTree/MergeTreeSettings.h | 2 +- 5 files changed, 49 insertions(+), 11 deletions(-) diff --git a/contrib/rapidjson b/contrib/rapidjson index c4ef90ccdbc..a9bc56c9165 160000 --- a/contrib/rapidjson +++ b/contrib/rapidjson @@ -1 +1 @@ -Subproject commit c4ef90ccdbc21d5d5a628d08316bfd301e32d6fa +Subproject commit a9bc56c9165f1dbbbcada64221bd3a59042c5b95 diff --git a/src/Processors/QueryPlan/PartsSplitter.cpp b/src/Processors/QueryPlan/PartsSplitter.cpp index 0fc6ddd6408..fcb1d8dd92c 100644 --- a/src/Processors/QueryPlan/PartsSplitter.cpp +++ b/src/Processors/QueryPlan/PartsSplitter.cpp @@ -32,9 +32,9 @@ std::string toString(const Values & value) int compareValues(const Values & lhs, const Values & rhs) { - chassert(lhs.size() == rhs.size()); + size_t size = std::min(lhs.size(), rhs.size()); - for (size_t i = 0; i < lhs.size(); ++i) + for (size_t i = 0; i < size; ++i) { if (applyVisitor(FieldVisitorAccurateLess(), lhs[i], rhs[i])) return -1; @@ -55,8 +55,9 @@ public: Values getValue(size_t part_idx, size_t mark) const { const auto & index = parts[part_idx].data_part->getIndex(); - Values values(index.size()); - for (size_t i = 0; i < values.size(); ++i) + size_t size = index.size(); + Values values(size); + for (size_t i = 0; i < size; ++i) { index[i]->get(mark, values[i]); if (values[i].isNull()) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 11ede661f78..629f3688874 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -869,6 +869,27 @@ void IMergeTreeDataPart::loadIndex() const for (size_t j = 0; j < key_size; ++j) key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file, {}); + /// Cut useless suffix columns, if necessary. + Float64 ratio_to_drop_suffix_columns = storage.getSettings()->primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns; + if (key_size > 1 && ratio_to_drop_suffix_columns > 0 && ratio_to_drop_suffix_columns < 1) + { + chassert(marks_count > 0); + for (size_t j = 0; j < key_size - 1; ++j) + { + size_t num_changes = 0; + for (size_t i = 1; i < marks_count; ++i) + if (0 != loaded_index[j]->compareAt(i, i - 1, *loaded_index[j], 0)) + ++num_changes; + + if (static_cast(num_changes) / marks_count >= ratio_to_drop_suffix_columns) + { + key_size = j + 1; + loaded_index.resize(key_size); + break; + } + } + } + for (size_t i = 0; i < key_size; ++i) { loaded_index[i]->shrinkToFit(); diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 1ba28713680..175419f20e0 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1110,7 +1110,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( DataTypes key_types; for (size_t i : key_indices) { - index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]}); + if (i < index.size()) + index_columns->emplace_back(index[i], primary_key.data_types[i], primary_key.column_names[i]); + else + index_columns->emplace_back(); /// The column of the primary key was not loaded in memory - we'll skip it. + key_types.emplace_back(primary_key.data_types[i]); } @@ -1119,7 +1123,6 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( std::function create_field_ref; if (key_condition.hasMonotonicFunctionsChain()) { - create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field) { field = {index_columns.get(), row, column}; @@ -1159,7 +1162,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( { for (size_t i = 0; i < used_key_size; ++i) { - create_field_ref(range.begin, i, index_left[i]); + if ((*index_columns)[i].column) + create_field_ref(range.begin, i, index_left[i]); + else + index_left[i] = NEGATIVE_INFINITY; + index_right[i] = POSITIVE_INFINITY; } } @@ -1170,8 +1177,17 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( for (size_t i = 0; i < used_key_size; ++i) { - create_field_ref(range.begin, i, index_left[i]); - create_field_ref(range.end, i, index_right[i]); + if ((*index_columns)[i].column) + { + create_field_ref(range.begin, i, index_left[i]); + create_field_ref(range.end, i, index_right[i]); + } + else + { + /// If the PK column was not loaded in memory - exclude it from the analysis. + index_left[i] = NEGATIVE_INFINITY; + index_right[i] = POSITIVE_INFINITY; + } } } key_condition_maybe_true = key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types); diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index b64632b6139..1cff44142bc 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -202,7 +202,7 @@ struct Settings; M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \ M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \ M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \ - \ + M(Float, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns, 0.9f, "If the value of a column of the primary key in data part changes at least in this ratio of times, skip loading next columns in memory. This allows to save memory usage by not loading useless columns of the primary key.", 0) \ /** Projection settings. */ \ M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \