Do not load useless columns of the primary key index into memory

This commit is contained in:
Alexey Milovidov 2024-02-21 21:41:04 +01:00
parent 64a80f1011
commit e98d09c93e
5 changed files with 49 additions and 11 deletions

2
contrib/rapidjson vendored

@ -1 +1 @@
Subproject commit c4ef90ccdbc21d5d5a628d08316bfd301e32d6fa
Subproject commit a9bc56c9165f1dbbbcada64221bd3a59042c5b95

View File

@ -32,9 +32,9 @@ std::string toString(const Values & value)
int compareValues(const Values & lhs, const Values & rhs)
{
chassert(lhs.size() == rhs.size());
size_t size = std::min(lhs.size(), rhs.size());
for (size_t i = 0; i < lhs.size(); ++i)
for (size_t i = 0; i < size; ++i)
{
if (applyVisitor(FieldVisitorAccurateLess(), lhs[i], rhs[i]))
return -1;
@ -55,8 +55,9 @@ public:
Values getValue(size_t part_idx, size_t mark) const
{
const auto & index = parts[part_idx].data_part->getIndex();
Values values(index.size());
for (size_t i = 0; i < values.size(); ++i)
size_t size = index.size();
Values values(size);
for (size_t i = 0; i < size; ++i)
{
index[i]->get(mark, values[i]);
if (values[i].isNull())

View File

@ -869,6 +869,27 @@ void IMergeTreeDataPart::loadIndex() const
for (size_t j = 0; j < key_size; ++j)
key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file, {});
/// Cut useless suffix columns, if necessary.
Float64 ratio_to_drop_suffix_columns = storage.getSettings()->primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns;
if (key_size > 1 && ratio_to_drop_suffix_columns > 0 && ratio_to_drop_suffix_columns < 1)
{
chassert(marks_count > 0);
for (size_t j = 0; j < key_size - 1; ++j)
{
size_t num_changes = 0;
for (size_t i = 1; i < marks_count; ++i)
if (0 != loaded_index[j]->compareAt(i, i - 1, *loaded_index[j], 0))
++num_changes;
if (static_cast<Float64>(num_changes) / marks_count >= ratio_to_drop_suffix_columns)
{
key_size = j + 1;
loaded_index.resize(key_size);
break;
}
}
}
for (size_t i = 0; i < key_size; ++i)
{
loaded_index[i]->shrinkToFit();

View File

@ -1110,7 +1110,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
DataTypes key_types;
for (size_t i : key_indices)
{
index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});
if (i < index.size())
index_columns->emplace_back(index[i], primary_key.data_types[i], primary_key.column_names[i]);
else
index_columns->emplace_back(); /// The column of the primary key was not loaded in memory - we'll skip it.
key_types.emplace_back(primary_key.data_types[i]);
}
@ -1119,7 +1123,6 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
if (key_condition.hasMonotonicFunctionsChain())
{
create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
{
field = {index_columns.get(), row, column};
@ -1159,7 +1162,11 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
{
for (size_t i = 0; i < used_key_size; ++i)
{
if ((*index_columns)[i].column)
create_field_ref(range.begin, i, index_left[i]);
else
index_left[i] = NEGATIVE_INFINITY;
index_right[i] = POSITIVE_INFINITY;
}
}
@ -1169,10 +1176,19 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
range.end -= 1; /// Remove final empty mark. It's useful only for primary key condition.
for (size_t i = 0; i < used_key_size; ++i)
{
if ((*index_columns)[i].column)
{
create_field_ref(range.begin, i, index_left[i]);
create_field_ref(range.end, i, index_right[i]);
}
else
{
/// If the PK column was not loaded in memory - exclude it from the analysis.
index_left[i] = NEGATIVE_INFINITY;
index_right[i] = POSITIVE_INFINITY;
}
}
}
key_condition_maybe_true = key_condition.mayBeTrueInRange(used_key_size, index_left.data(), index_right.data(), key_types);
}

View File

@ -202,7 +202,7 @@ struct Settings;
M(UInt64, marks_compress_block_size, 65536, "Mark compress block size, the actual size of the block to compress.", 0) \
M(UInt64, primary_key_compress_block_size, 65536, "Primary compress block size, the actual size of the block to compress.", 0) \
M(Bool, primary_key_lazy_load, true, "Load primary key in memory on first use instead of on table initialization. This can save memory in the presence of a large number of tables.", 0) \
\
M(Float, primary_key_ratio_of_unique_prefix_values_to_skip_suffix_columns, 0.9f, "If the value of a column of the primary key in data part changes at least in this ratio of times, skip loading next columns in memory. This allows to save memory usage by not loading useless columns of the primary key.", 0) \
/** Projection settings. */ \
M(UInt64, max_projections, 25, "The maximum number of merge tree projections.", 0) \