mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 09:32:01 +00:00
Attempt to improve merge selecting algorithm [#METR-23632].
This commit is contained in:
parent
cac8f9216f
commit
0a2d1ebbb3
@ -11,6 +11,9 @@ class SimpleMergeSelector : public IMergeSelector
|
||||
public:
|
||||
struct Settings
|
||||
{
|
||||
/// Zero means unlimited.
|
||||
size_t max_parts_to_merge_at_once = 100;
|
||||
|
||||
/** Minimum ratio of size of one part to all parts in set of parts to merge (for usual cases).
|
||||
* For example, if all parts have equal size, it means, that at least 'base' number of parts should be merged.
|
||||
* If parts have non-uniform sizes, then the minimum number of parts to merge is effectively increased.
|
||||
@ -35,7 +38,7 @@ public:
|
||||
* 3. Sum size of parts participating in merge. If higher - then more age is required to lower base. So, base is lowered slower.
|
||||
* It means: for small parts, it's worth to merge faster, even not so wide or balanced.
|
||||
*
|
||||
* We have multivariative dependency. Let it be logarithmic of size and multi-linear by other variables,
|
||||
* We have a multivariate dependency. Let it be logarithmic in size and somewhat multi-linear in the other variables,
|
||||
* between some boundary points, and constant outside.
|
||||
*/
|
||||
|
||||
@ -53,8 +56,19 @@ public:
|
||||
/// Add this to size before all calculations. It means: merging even very small parts has its fixed cost.
|
||||
size_t size_fixed_cost_to_add = 5 * 1024 * 1024;
|
||||
|
||||
/// Zero means unlimited.
|
||||
size_t max_parts_to_merge_at_once = 100;
|
||||
/** Heuristic:
|
||||
* Give some preference to ranges whose sum_size is similar (in terms of ratio) to the size of the previous part at the left.
|
||||
*/
|
||||
bool enable_heuristic_to_align_parts = true;
|
||||
double heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part = 0.9;
|
||||
double heuristic_to_align_parts_max_absolute_difference_in_powers_of_two = 0.5;
|
||||
double heuristic_to_align_parts_max_score_adjustment = 0.75;
|
||||
|
||||
/** Heuristic:
|
||||
* From the right side of the range, remove all parts whose size is less than the specified ratio of sum_size.
|
||||
*/
|
||||
bool enable_heuristic_to_remove_small_parts_at_right = true;
|
||||
double heuristic_to_remove_small_parts_at_right_max_ratio = 0.01;
|
||||
};
|
||||
|
||||
SimpleMergeSelector(const Settings & settings) : settings(settings) {}
|
||||
|
@ -17,25 +17,22 @@ struct Estimator
|
||||
{
|
||||
using Iterator = SimpleMergeSelector::PartsInPartition::const_iterator;
|
||||
|
||||
void consider(Iterator begin, Iterator end, size_t sum_size, size_t sum_size_fixed_cost, size_t size_prev_at_left)
|
||||
void consider(Iterator begin, Iterator end, size_t sum_size, size_t size_prev_at_left, const SimpleMergeSelector::Settings & settings)
|
||||
{
|
||||
double current_score = score(end - begin, sum_size, sum_size_fixed_cost);
|
||||
double current_score = score(end - begin, sum_size, settings.size_fixed_cost_to_add);
|
||||
|
||||
/** Heuristic:
|
||||
* Give some preference to ranges whose sum_size is similar (in terms of ratio) to the size of the previous part at the left.
|
||||
*/
|
||||
if (size_prev_at_left > sum_size * 0.9)
|
||||
if (settings.enable_heuristic_to_align_parts
|
||||
&& size_prev_at_left > sum_size * settings.heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part)
|
||||
{
|
||||
double difference = std::abs(log2(static_cast<double>(sum_size) / size_prev_at_left));
|
||||
if (difference < 0.5)
|
||||
current_score *= 0.75 + difference * 0.5;
|
||||
if (difference < settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two)
|
||||
current_score *= interpolateLinear(settings.heuristic_to_align_parts_max_score_adjustment, 1,
|
||||
difference / settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two);
|
||||
}
|
||||
|
||||
/** Heuristic:
|
||||
* From right side of range, remove all parts, that size is less than 1% of sum_size.
|
||||
*/
|
||||
while (end >= begin + 3 && (end - 1)->size < 0.01 * sum_size)
|
||||
--end;
|
||||
if (settings.enable_heuristic_to_remove_small_parts_at_right)
|
||||
while (end >= begin + 3 && (end - 1)->size < settings.heuristic_to_remove_small_parts_at_right_max_ratio * sum_size)
|
||||
--end;
|
||||
|
||||
if (!min_score || current_score < min_score)
|
||||
{
|
||||
@ -175,8 +172,8 @@ void selectWithinPartition(
|
||||
parts.begin() + begin,
|
||||
parts.begin() + end,
|
||||
sum_size,
|
||||
settings.size_fixed_cost_to_add,
|
||||
begin == 0 ? 0 : parts[begin - 1].size);
|
||||
begin == 0 ? 0 : parts[begin - 1].size,
|
||||
settings);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -41,25 +41,34 @@ int main(int argc, char ** argv)
|
||||
IMergeSelector::Part part;
|
||||
in >> part.size >> "\t" >> part.age >> "\t" >> part.level >> "\t" >> part_names.back() >> "\n";
|
||||
part.data = part_names.back().data();
|
||||
part.level = 0;
|
||||
parts.emplace_back(part);
|
||||
sum_parts_size += part.size;
|
||||
}
|
||||
|
||||
size_t sum_size_written = sum_parts_size;
|
||||
size_t num_merges = 1;
|
||||
size_t age_passed = 0;
|
||||
|
||||
while (parts.size() > 1)
|
||||
{
|
||||
IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0);
|
||||
IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0 /*100ULL * 1024 * 1024 * 1024*/);
|
||||
|
||||
if (selected_parts.empty())
|
||||
{
|
||||
std::cout << '.';
|
||||
++age_passed;
|
||||
for (auto & part : parts)
|
||||
++part.age;
|
||||
|
||||
if (age_passed > 60 * 86400)
|
||||
break;
|
||||
|
||||
if (age_passed % 86400 == 0)
|
||||
std::cout << ".";
|
||||
|
||||
continue;
|
||||
}
|
||||
std::cout << '\n';
|
||||
std::cout << "Time passed: " << age_passed << '\n';
|
||||
|
||||
size_t sum_merged_size = 0;
|
||||
size_t start_index = 0;
|
||||
@ -75,7 +84,7 @@ int main(int argc, char ** argv)
|
||||
start_index = i;
|
||||
}
|
||||
|
||||
std::cout << parts[i].size;
|
||||
std::cout << (parts[i].size / 1024) << "_" << parts[i].level;
|
||||
if (in_range)
|
||||
{
|
||||
sum_merged_size += parts[i].size;
|
||||
@ -101,6 +110,14 @@ int main(int argc, char ** argv)
|
||||
|
||||
sum_size_written += sum_merged_size;
|
||||
++num_merges;
|
||||
|
||||
double time_to_merge = sum_merged_size / (1048576 * 10.0);
|
||||
|
||||
age_passed += time_to_merge;
|
||||
for (auto & part : parts)
|
||||
part.age += time_to_merge;
|
||||
|
||||
std::cout << "Time passed: " << age_passed << ", num parts: " << parts.size() << '\n';
|
||||
}
|
||||
|
||||
std::cout << std::fixed << std::setprecision(2)
|
||||
|
Loading…
Reference in New Issue
Block a user