ClickHouse/dbms/src/Storages/MergeTree/SimpleMergeSelector.cpp

148 lines
4.1 KiB
C++
Raw Normal View History

2016-10-27 23:28:35 +00:00
#include <DB/Storages/MergeTree/SimpleMergeSelector.h>
#include <DB/Common/interpolate.h>
2016-10-27 23:28:35 +00:00
#include <cmath>
2016-10-27 23:28:35 +00:00
namespace DB
{
namespace
{
/** Estimates best set of parts to merge within passed alternatives.
*/
2016-10-27 23:28:35 +00:00
struct Estimator
{
using Iterator = SimpleMergeSelector::PartsInPartition::const_iterator;
void consider(Iterator begin, Iterator end, size_t sum_size, size_t size_next_at_left, size_t size_next_at_right)
2016-10-27 23:28:35 +00:00
{
double current_score = score(end - begin, sum_size);
if (size_next_at_left > sum_size * 0.9)
{
double difference = std::abs(log2(static_cast<double>(sum_size) / size_next_at_left));
if (difference < 0.5)
current_score *= 0.75 + difference * 0.5;
}
if (size_next_at_right == 0)
current_score *= 0.9;
if (size_next_at_right > sum_size * 0.9)
{
double difference = std::abs(log2(static_cast<double>(sum_size) / size_next_at_right));
if (difference < 0.5)
current_score *= 0.75 + difference * 0.5;
}
if (!min_score || current_score < min_score)
2016-10-27 23:28:35 +00:00
{
min_score = current_score;
best_begin = begin;
best_end = end;
2016-10-27 23:28:35 +00:00
}
}
SimpleMergeSelector::PartsInPartition getBest()
{
return SimpleMergeSelector::PartsInPartition(best_begin, best_end);
2016-10-27 23:28:35 +00:00
}
static double score(double count, double sum_size)
{
/** Consider we have two alternative ranges of data parts to merge.
* Assume time to merge a range is proportional to sum size of its parts.
*
* Cost of query execution is proportional to total number of data parts in a moment of time.
* Let define our target: to minimize average (in time) total number of data parts.
*
* Let calculate integral of total number of parts, if we are going to do merge of one or another range.
* It must be lower, and thus we decide, what range is better to merge.
*
* The integral is lower iff the following formula is lower:
*/
return sum_size / (count - 1);
}
double min_score = 0;
Iterator best_begin;
Iterator best_end;
2016-10-27 23:28:35 +00:00
};
/** Enumerates contiguous ranges of at least two parts inside one partition
  * and passes every acceptable range to the estimator.
  *
  * parts                    - parts of the partition, in the order they are stored in the container.
  * max_total_size_to_merge  - upper limit for the sum of sizes of a range; 0 disables the limit.
  * current_min_part_age     - age value used to lower the required base (see below).
  * estimator                - receives the candidates via Estimator::consider().
  * settings                 - tuning parameters (base, lower_base_after_*, max_parts_to_merge_at_once).
  */
void selectWithinPartition(
    const SimpleMergeSelector::PartsInPartition & parts,
    const size_t max_total_size_to_merge,
    const time_t current_min_part_age,
    Estimator & estimator,
    const SimpleMergeSelector::Settings & settings)
{
    const size_t total_parts = parts.size();

    /// Nothing to merge with fewer than two parts.
    if (total_parts < 2)
        return;

    /// Position of the number of parts and of the age inside their configured [start, end] intervals.
    const auto ratio_by_count = (static_cast<double>(total_parts) - settings.lower_base_after_num_parts_start)
        / (settings.lower_base_after_num_parts_end - settings.lower_base_after_num_parts_start);
    const auto ratio_by_age = (static_cast<double>(current_min_part_age) - settings.lower_base_after_seconds_start)
        / (settings.lower_base_after_seconds_end - settings.lower_base_after_seconds_start);

    /// The required base is the smaller of the two interpolated values, kept within [1, settings.base].
    const double actual_base = std::max(1.0, std::min(
        settings.base,
        std::min(
            interpolateLinear(settings.base, 1.0, ratio_by_count),
            interpolateLinear(settings.base, 1.0, ratio_by_age))));

    /// Only starting positions 0..1000 are tried, to bound the work when there are very many parts.
    for (size_t first = 0; first < total_parts && first <= 1000; ++first)
    {
        size_t range_sum = parts[first].size;
        size_t range_max = parts[first].size;

        /// 'last' is the index of the last part of the range, so the range is [first, last].
        for (size_t last = first + 1; last < total_parts; ++last)
        {
            const size_t range_length = last - first + 1;

            /// A zero max_parts_to_merge_at_once means no limit on the number of parts.
            if (settings.max_parts_to_merge_at_once && range_length > settings.max_parts_to_merge_at_once)
                break;

            range_sum += parts[last].size;
            if (parts[last].size > range_max)
                range_max = parts[last].size;

            /// Once the size limit is exceeded, longer ranges from this start are not tried.
            if (max_total_size_to_merge && range_sum > max_total_size_to_merge)
                break;

            /// The range is a candidate only if its total size is at least actual_base times its largest part.
            if (static_cast<double>(range_sum) / range_max >= actual_base)
            {
                estimator.consider(
                    parts.begin() + first,
                    parts.begin() + (last + 1),
                    range_sum,
                    first == 0 ? 0 : parts[first - 1].size,
                    last + 1 == total_parts ? 0 : parts[last + 1].size);
            }
        }
    }
}
}
/** Selects the range of parts to merge among all partitions.
  * Every partition is scanned by selectWithinPartition() with one shared Estimator,
  * and the best range found over all of them is returned (empty if there were no candidates).
  */
SimpleMergeSelector::PartsInPartition SimpleMergeSelector::select(
    const Partitions & partitions,
    const size_t max_total_size_to_merge)
{
    /// Smallest age among all parts of all partitions; -1 means that no part has been seen yet.
    time_t smallest_age = -1;
    for (const auto & partition : partitions)
    {
        for (const auto & part : partition)
        {
            if (smallest_age != -1 && !(part.age < smallest_age))
                continue;

            smallest_age = part.age;
        }
    }

    Estimator estimator;

    /// The same minimal age is passed for every partition.
    for (const auto & partition : partitions)
    {
        selectWithinPartition(partition, max_total_size_to_merge, smallest_age, estimator, settings);
    }

    return estimator.getBest();
}
}