ClickHouse/src/Storages/MergeTree/SimpleMergeSelector.cpp

#include <Storages/MergeTree/SimpleMergeSelector.h>
#include <base/interpolate.h>
#include <cmath>
#include <cassert>
#include <iostream>
namespace DB
{

namespace
{

/** Estimates the best set of parts to merge among the passed alternatives.
  */
struct Estimator
{
    using Iterator = SimpleMergeSelector::PartsRange::const_iterator;

    void consider(Iterator begin, Iterator end, size_t sum_size, size_t size_prev_at_left, const SimpleMergeSelector::Settings & settings)
    {
        double current_score = score(end - begin, sum_size, settings.size_fixed_cost_to_add);

        if (settings.enable_heuristic_to_align_parts
            && size_prev_at_left > sum_size * settings.heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part)
        {
            double difference = std::abs(log2(static_cast<double>(sum_size) / size_prev_at_left));
            if (difference < settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two)
                current_score *= interpolateLinear(settings.heuristic_to_align_parts_max_score_adjustment, 1,
                    difference / settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two);
        }

        if (settings.enable_heuristic_to_remove_small_parts_at_right)
            while (end >= begin + 3 && (end - 1)->size < settings.heuristic_to_remove_small_parts_at_right_max_ratio * sum_size)
                --end;

        if (min_score == 0.0 || current_score < min_score)
        {
            min_score = current_score;
            best_begin = begin;
            best_end = end;
        }
    }

    SimpleMergeSelector::PartsRange getBest() const
    {
        return SimpleMergeSelector::PartsRange(best_begin, best_end);
    }

    static double score(double count, double sum_size, double sum_size_fixed_cost)
    {
        /** Suppose we have two alternative ranges of data parts to merge.
          * Assume the time to merge a range is proportional to the sum of the sizes of its parts.
          *
          * The cost of query execution is proportional to the total number of data parts at any moment in time.
          * Our target: minimize the average (over time) total number of data parts.
          *
          * Calculate the integral over time of the total number of parts for merging one range or the other.
          * The range with the lower integral is the better one to merge.
          *
          * The integral is lower iff the following formula is lower:
          *
          *  sum_size / (count - 1)
          *
          * But we apply some adjustments to prefer longer ranges.
          */
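        /// A worked illustration of the formula below (taking sum_size_fixed_cost == 0):
        /// merging 6 parts totalling 100 MB scores 100 / (6 - 1.9) ≈ 24.4, while merging
        /// 3 parts totalling 100 MB scores 100 / (3 - 1.9) ≈ 90.9; the lower score wins,
        /// so the wider range is preferred.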
        return (sum_size + sum_size_fixed_cost * count) / (count - 1.9);
    }

    double min_score = 0.0;
    Iterator best_begin {};
    Iterator best_end {};
};

/**
 * 1       _____
 *        /
 * 0_____/
 *       ^  ^
 *      min max
 */
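/// For example, mapPiecewiseLinearToUnit(5.0, 0.0, 10.0) returns 0.5; values at or below `min`
/// map to 0 and values at or above `max` map to 1.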
double mapPiecewiseLinearToUnit(double value, double min, double max)
{
    return value <= min ? 0
        : (value >= max ? 1
        : ((value - min) / (max - min)));
}

/** Whether it is allowed to merge a range of parts with the given properties.
  */
bool allow(
    double sum_size,
    double max_size,
    double min_age,
    double range_size,
    double partition_size,
    double min_size_to_lower_base_log,
    double max_size_to_lower_base_log,
    const SimpleMergeSelector::Settings & settings)
{
    if (settings.min_age_to_force_merge && min_age >= settings.min_age_to_force_merge)
        return true;
// std::cerr << "sum_size: " << sum_size << "\n";
/// Map size to 0..1 using logarithmic scale
/// Use log(1 + x) instead of log1p(x) because our sum_size is always integer.
2020-10-20 20:05:20 +00:00
/// Also log1p seems to be slow and significantly affect performance of merges assignment.
double size_normalized = mapPiecewiseLinearToUnit(log(1 + sum_size), min_size_to_lower_base_log, max_size_to_lower_base_log);
// std::cerr << "size_normalized: " << size_normalized << "\n";
/// Calculate boundaries for age
double min_age_to_lower_base = interpolateLinear(settings.min_age_to_lower_base_at_min_size, settings.min_age_to_lower_base_at_max_size, size_normalized);
double max_age_to_lower_base = interpolateLinear(settings.max_age_to_lower_base_at_min_size, settings.max_age_to_lower_base_at_max_size, size_normalized);
// std::cerr << "min_age_to_lower_base: " << min_age_to_lower_base << "\n";
// std::cerr << "max_age_to_lower_base: " << max_age_to_lower_base << "\n";
/// Map age to 0..1
double age_normalized = mapPiecewiseLinearToUnit(min_age, min_age_to_lower_base, max_age_to_lower_base);
// std::cerr << "age: " << min_age << "\n";
// std::cerr << "age_normalized: " << age_normalized << "\n";
/// Map partition_size to 0..1
double num_parts_normalized = mapPiecewiseLinearToUnit(partition_size, settings.min_parts_to_lower_base, settings.max_parts_to_lower_base);
// std::cerr << "partition_size: " << partition_size << "\n";
// std::cerr << "num_parts_normalized: " << num_parts_normalized << "\n";
double combined_ratio = std::min(1.0, age_normalized + num_parts_normalized);
// std::cerr << "combined_ratio: " << combined_ratio << "\n";
double lowered_base = interpolateLinear(settings.base, 2.0, combined_ratio);
// std::cerr << "------- lowered_base: " << lowered_base << "\n";
    return (sum_size + range_size * settings.size_fixed_cost_to_add) / (max_size + settings.size_fixed_cost_to_add) >= lowered_base;
}

void selectWithinPartition(
    const SimpleMergeSelector::PartsRange & parts,
    const size_t max_total_size_to_merge,
    Estimator & estimator,
    const SimpleMergeSelector::Settings & settings,
    double min_size_to_lower_base_log,
    double max_size_to_lower_base_log)
{
    size_t parts_count = parts.size();
    if (parts_count <= 1)
        return;

    /// If the parts in the parts vector are sorted by block number,
    /// it may not be ideal to select parts for merging only from the first N of them.
    /// This is because if there are more than N parts in the partition,
    /// we will not be able to assign a merge for newly created parts.
    /// As a result, the total number of parts within the partition could
    /// grow uncontrollably, similar to a snowball effect.
    /// To address this, we try to assign a merge taking into consideration
    /// only the last N parts.
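    /// For example, with 2500 parts and the threshold below, only the last 1000 parts
    /// (indices 1500..2499) are considered as the left boundary of a merge range.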
    static constexpr size_t parts_threshold = 1000;
    size_t begin = 0;
    if (parts_count >= parts_threshold)
        begin = parts_count - parts_threshold;

    for (; begin < parts_count; ++begin)
    {
        if (!parts[begin].shall_participate_in_merges)
            continue;

        size_t sum_size = parts[begin].size;
        size_t max_size = parts[begin].size;
        size_t min_age = parts[begin].age;

        for (size_t end = begin + 2; end <= parts_count; ++end)
        {
            assert(end > begin);
            if (settings.max_parts_to_merge_at_once && end - begin > settings.max_parts_to_merge_at_once)
                break;

            if (!parts[end - 1].shall_participate_in_merges)
                break;

            size_t cur_size = parts[end - 1].size;
            size_t cur_age = parts[end - 1].age;

            sum_size += cur_size;
            max_size = std::max(max_size, cur_size);
            min_age = std::min(min_age, cur_age);

            if (max_total_size_to_merge && sum_size > max_total_size_to_merge)
                break;

            if (allow(sum_size, max_size, min_age, end - begin, parts_count, min_size_to_lower_base_log, max_size_to_lower_base_log, settings))
                estimator.consider(
                    parts.begin() + begin,
                    parts.begin() + end,
                    sum_size,
                    begin == 0 ? 0 : parts[begin - 1].size,
                    settings);
        }
    }
}
}
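
/// A minimal usage sketch (hypothetical caller; assumes the selector is constructed from a Settings
/// value and that each Part provides at least `size`, `age` and `shall_participate_in_merges`):
///
///     SimpleMergeSelector selector(SimpleMergeSelector::Settings{});
///     auto range_to_merge = selector.select(parts_ranges, /* max_total_size_to_merge = */ 0);
///     /// An empty result means no range satisfied the `allow` condition.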
SimpleMergeSelector::PartsRange SimpleMergeSelector::select(
    const PartsRanges & parts_ranges,
    size_t max_total_size_to_merge)
{
    Estimator estimator;

    /// Precompute the logarithms of the settings boundaries, because the log function is quite expensive in terms of performance.
    const double min_size_to_lower_base_log = log(1 + settings.min_size_to_lower_base);
    const double max_size_to_lower_base_log = log(1 + settings.max_size_to_lower_base);

    for (const auto & part_range : parts_ranges)
        selectWithinPartition(part_range, max_total_size_to_merge, estimator, settings, min_size_to_lower_base_log, max_size_to_lower_base_log);

    return estimator.getBest();
}
}