fix index granularity

This commit is contained in:
Anton Popov 2024-11-12 16:42:49 +00:00
parent c5817d528c
commit 3bba64e975
9 changed files with 59 additions and 58 deletions

View File

@ -41,6 +41,11 @@ size_t MergeTreeIndexGranularity::getMarksCountWithoutFinal() const
return total - hasFinalMark();
}
/// Returns the number of rows located before mark `mark_index`,
/// computed as the row count of the half-open mark range [0, mark_index).
size_t MergeTreeIndexGranularity::getMarkStartingRow(size_t mark_index) const
{
    const size_t rows_before_mark = getRowsCountInRange(0, mark_index);
    return rows_before_mark;
}
size_t MergeTreeIndexGranularity::getLastMarkRows() const
{
return getMarkRows(getMarksCount() - 1);
@ -48,7 +53,7 @@ size_t MergeTreeIndexGranularity::getLastMarkRows() const
size_t MergeTreeIndexGranularity::getLastNonFinalMarkRows() const
{
size_t last_mark_rows = getLastMarkRows();
size_t last_mark_rows = getMarkRows(getMarksCount() - 1);
if (last_mark_rows != 0)
return last_mark_rows;
return getMarkRows(getMarksCount() - 2);
@ -140,5 +145,4 @@ MergeTreeIndexGranularityPtr createMergeTreeIndexGranularity(
return std::make_shared<MergeTreeIndexGranularityConstant>(computed_granularity);
}
}

View File

@ -5,12 +5,7 @@
namespace DB
{
/// Class contains information about index granularity in rows of IMergeTreeDataPart
/// Inside it contains vector of partial sums of rows after mark:
/// |-----|---|----|----|
/// | 5 | 8 | 12 | 16 |
/// If user doesn't specify setting index_granularity_bytes for MergeTree* table
/// all values in inner vector would have constant stride (default 8192).
/// Class that contains information about index granularity in rows of IMergeTreeDataPart
class MergeTreeIndexGranularity
{
public:
@ -49,7 +44,7 @@ public:
virtual size_t getMarkRows(size_t mark_index) const = 0;
/// Returns the number of rows preceding the given mark
virtual size_t getMarkStartingRow(size_t mark_index) const = 0;
size_t getMarkStartingRow(size_t mark_index) const;
/// Number of rows in the last mark
size_t getLastMarkRows() const;

View File

@ -27,22 +27,11 @@ size_t MergeTreeIndexGranularityAdaptive::getMarkRows(size_t mark_index) const
return marks_rows_partial_sums[mark_index] - marks_rows_partial_sums[mark_index - 1];
}
size_t MergeTreeIndexGranularityAdaptive::getMarkStartingRow(size_t mark_index) const
{
if (mark_index > getMarksCount())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get non existing mark {}, while size is {}", mark_index, getMarksCount());
if (mark_index == 0)
return 0;
return marks_rows_partial_sums[mark_index - 1];
}
/// Returns false when no partial sums are stored; otherwise the result
/// is a comparison against zero (see the note below).
bool MergeTreeIndexGranularityAdaptive::hasFinalMark() const
{
if (marks_rows_partial_sums.empty())
return false;
/// NOTE(review): two unconditional returns follow, so the second one can never execute.
/// They look like the removed and the added variant of one line from the commit diff —
/// confirm which one is intended and keep only that one.
return marks_rows_partial_sums.back() == 0;
return getLastMarkRows() == 0;
}
size_t MergeTreeIndexGranularityAdaptive::getMarksCount() const
@ -92,9 +81,16 @@ void MergeTreeIndexGranularityAdaptive::adjustLastMark(size_t rows_count)
size_t MergeTreeIndexGranularityAdaptive::getRowsCountInRange(size_t begin, size_t end) const
{
if (end > getMarksCount())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get marks in range [{}; {}), while size is {}", begin, end, getMarksCount());
if (end == 0)
return 0;
size_t subtrahend = 0;
if (begin != 0)
subtrahend = marks_rows_partial_sums[begin - 1];
return marks_rows_partial_sums[end - 1] - subtrahend;
}
@ -126,6 +122,7 @@ uint64_t MergeTreeIndexGranularityAdaptive::getBytesSize() const
{
return marks_rows_partial_sums.size() * sizeof(size_t);
}
uint64_t MergeTreeIndexGranularityAdaptive::getBytesAllocated() const
{
return marks_rows_partial_sums.capacity() * sizeof(size_t);

View File

@ -4,6 +4,10 @@
namespace DB
{
/// Class that stores adaptive index granularity.
/// Inside it contains vector of partial sums of rows after mark:
/// |-----|---|----|----|
/// | 5 | 8 | 12 | 16 |
class MergeTreeIndexGranularityAdaptive : public MergeTreeIndexGranularity
{
public:
@ -19,7 +23,6 @@ public:
size_t getTotalRows() const override;
size_t getMarkRows(size_t mark_index) const override;
size_t getMarkStartingRow(size_t mark_index) const override;
bool hasFinalMark() const override;
void appendMark(size_t rows_count) override;

View File

@ -1,6 +1,5 @@
#include <Storages/MergeTree/MergeTreeIndexGranularityConstant.h>
namespace DB
{
@ -38,22 +37,6 @@ size_t MergeTreeIndexGranularityConstant::getMarkRows(size_t mark_index) const
return 0; // Final mark.
}
size_t MergeTreeIndexGranularityConstant::getMarkStartingRow(size_t mark_index) const
{
if (mark_index > getMarksCount())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get non existing mark {}, while size is {}", mark_index, getMarksCount());
size_t total_rows = 0;
if (mark_index >= num_marks_without_final && mark_index != 0)
{
total_rows += last_mark_granularity;
mark_index = num_marks_without_final - 1;
}
total_rows += constant_granularity * mark_index;
return total_rows;
}
size_t MergeTreeIndexGranularityConstant::getMarksCount() const
{
return num_marks_without_final + has_final_mark;
@ -104,8 +87,14 @@ void MergeTreeIndexGranularityConstant::adjustLastMark(size_t rows_count)
size_t MergeTreeIndexGranularityConstant::getRowsCountInRange(size_t begin, size_t end) const
{
if (end > getMarksCount())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get marks in range [{}; {}), while size is {}", begin, end, getMarksCount());
if (end == 0)
return 0;
size_t total_rows = 0;
if (end >= num_marks_without_final && end != 0)
if (end >= num_marks_without_final)
{
total_rows += last_mark_granularity;
end = num_marks_without_final - 1;
@ -115,27 +104,31 @@ size_t MergeTreeIndexGranularityConstant::getRowsCountInRange(size_t begin, size
return total_rows;
}
/// Maps a row position to a mark index used as an upper bound:
/// the row position divided by the constant granularity, rounded up,
/// or the total marks count once the position is at or past the constant-sized granules.
size_t MergeTreeIndexGranularityConstant::getMarkUpperBoundForRow(size_t row_index) const
{
    /// Rows covered by all non-final marks except the last one.
    const size_t constant_part_rows = constant_granularity * (num_marks_without_final - 1);

    if (row_index < constant_part_rows)
        return (row_index + constant_granularity - 1) / constant_granularity;

    /// All granules with constant granularity + last granule + final granule
    return getMarksCount();
}
/// Computes a mark count relative to `from_mark` for the row position
/// reached `number_of_rows` rows after the start of `from_mark`.
size_t MergeTreeIndexGranularityConstant::countMarksForRows(size_t from_mark, size_t number_of_rows) const
{
size_t rows_before_mark = getMarkStartingRow(from_mark);
/// Row position reached after adding `number_of_rows` to the start of `from_mark`.
size_t last_row_pos = rows_before_mark + number_of_rows;
/// NOTE(review): every path below returns before the final
/// `getMarkUpperBoundForRow` line, so that line is unreachable as written.
/// This looks like the removed and the added code of the commit diff superimposed —
/// confirm which variant is intended.
if (last_row_pos >= (num_marks_without_final - 1) * constant_granularity)
return num_marks_without_final - from_mark;
return (last_row_pos + constant_granularity - 1) / constant_granularity - from_mark;
return getMarkUpperBoundForRow(last_row_pos) - from_mark;
}
/// NOTE(review): as written this function always returns 0, because of the
/// unconditional `return 0;` right after UNUSED(...); everything below it is unreachable.
/// The body looks like the old stub and the new implementation from the commit diff
/// superimposed — confirm which variant is intended.
size_t MergeTreeIndexGranularityConstant::countRowsForRows(size_t from_mark, size_t number_of_rows, size_t offset_in_rows) const
{
UNUSED(from_mark, number_of_rows, offset_in_rows);
return 0;
// size_t rows_before_mark = getMarkStartingRow(from_mark);
// size_t last_row_pos = rows_before_mark + offset_in_rows + number_of_rows;
// auto it = std::upper_bound(marks_rows_partial_sums.begin(), marks_rows_partial_sums.end(), last_row_pos);
// size_t to_mark = it - marks_rows_partial_sums.begin();
size_t rows_before_mark = getMarkStartingRow(from_mark);
/// Row position reached after skipping `offset_in_rows` and reading `number_of_rows`.
size_t last_row_pos = rows_before_mark + offset_in_rows + number_of_rows;
// return getRowsCountInRange(from_mark, std::max(1UL, to_mark)) - offset_in_rows;
/// The range end is clamped to at least 1 before counting rows; `offset_in_rows` is then subtracted.
return getRowsCountInRange(from_mark, std::max(1UL, getMarkUpperBoundForRow(last_row_pos))) - offset_in_rows;
}
std::string MergeTreeIndexGranularityConstant::describe() const

View File

@ -4,6 +4,8 @@
namespace DB
{
/// Class that stores constant index granularity for whole part, except
/// last non-zero granule and final granule which always has zero rows.
class MergeTreeIndexGranularityConstant : public MergeTreeIndexGranularity
{
private:
@ -13,6 +15,8 @@ private:
size_t num_marks_without_final = 0;
bool has_final_mark = false;
size_t getMarkUpperBoundForRow(size_t row_index) const;
public:
MergeTreeIndexGranularityConstant() = default;
explicit MergeTreeIndexGranularityConstant(size_t constant_granularity_);
@ -27,7 +31,6 @@ public:
size_t getTotalRows() const override;
size_t getMarkRows(size_t mark_index) const override;
size_t getMarkStartingRow(size_t mark_index) const override;
bool hasFinalMark() const override { return has_final_mark; }
void appendMark(size_t rows_count) override;

View File

@ -4,7 +4,6 @@
#include <Columns/FilterDescription.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsCommon.h>
#include "Common/Logger.h"
#include <Common/TargetSpecific.h>
#include <Common/logger_useful.h>
#include <Core/UUID.h>

View File

@ -35,7 +35,6 @@
#include <DataTypes/DataTypeVariant.h>
#include <boost/algorithm/string/replace.hpp>
#include <Common/ProfileEventsScope.h>
#include "Storages/MergeTree/MergeTreeIndexGranularity.h"
#include <Core/ColumnsWithTypeAndName.h>
@ -75,6 +74,7 @@ namespace MergeTreeSetting
extern const MergeTreeSettingsFloat ratio_of_defaults_for_sparse_serialization;
extern const MergeTreeSettingsBool replace_long_file_name_to_hash;
extern const MergeTreeSettingsBool ttl_only_drop_parts;
extern const MergeTreeSettingsBool enable_index_granularity_compression;
}
namespace ErrorCodes
@ -985,12 +985,19 @@ void finalizeMutatedPart(
new_data_part->rows_count = source_part->rows_count;
new_data_part->index_granularity = source_part->index_granularity;
/// Just in case
new_data_part->index_granularity->shrinkToFitInMemory();
new_data_part->setIndex(*source_part->getIndex());
new_data_part->minmax_idx = source_part->minmax_idx;
new_data_part->modification_time = time(nullptr);
if ((*new_data_part->storage.getSettings())[MergeTreeSetting::enable_index_granularity_compression])
{
if (auto new_index_granularity = new_data_part->index_granularity->optimize())
new_data_part->index_granularity = std::move(new_index_granularity);
}
/// Just in case
new_data_part->index_granularity->shrinkToFitInMemory();
/// Load rest projections which are hardlinked
bool noop;
new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */);

View File

@ -11,8 +11,8 @@ all_2_2_0 5 2 25
all_2_2_0 6 2 27
all_2_2_0 7 1 29
all_2_2_0 8 0 29
all_1_1_0 24
all_2_2_0 72
all_1_1_0 25
all_2_2_0 25
adaptive non-const, after merge
all_1_2_1 0 10 0
all_1_2_1 1 5 10