diff --git a/dbms/include/DB/Storages/MergeTree/SimpleMergeSelector.h b/dbms/include/DB/Storages/MergeTree/SimpleMergeSelector.h index 116383b68b2..b3a56d0bc3f 100644 --- a/dbms/include/DB/Storages/MergeTree/SimpleMergeSelector.h +++ b/dbms/include/DB/Storages/MergeTree/SimpleMergeSelector.h @@ -11,6 +11,9 @@ class SimpleMergeSelector : public IMergeSelector public: struct Settings { + /// Zero means unlimited. + size_t max_parts_to_merge_at_once = 100; + /** Minimum ratio of size of one part to all parts in set of parts to merge (for usual cases). * For example, if all parts have equal size, it means, that at least 'base' number of parts should be merged. * If parts has non-uniform sizes, then minumum number of parts to merge is effectively increased. @@ -35,7 +38,7 @@ public: * 3. Sum size of parts participating in merge. If higher - then more age is required to lower base. So, base is lowered slower. * It means: for small parts, it's worth to merge faster, even not so wide or balanced. * - * We have multivariative dependency. Let it be logarithmic of size and multi-linear by other variables, + * We have multivariative dependency. Let it be logarithmic of size and somewhat multi-linear by other variables, * between some boundary points, and constant outside. */ @@ -53,8 +56,19 @@ public: /// Add this to size before all calculations. It means: merging even very small parts has it's fixed cost. size_t size_fixed_cost_to_add = 5 * 1024 * 1024; - /// Zero means unlimited. - size_t max_parts_to_merge_at_once = 100; + /** Heuristic: + * Make some preference for ranges, that sum_size is like (in terms of ratio) to part previous at left. 
+ */ + bool enable_heuristic_to_align_parts = true; + double heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part = 0.9; + double heuristic_to_align_parts_max_absolute_difference_in_powers_of_two = 0.5; + double heuristic_to_align_parts_max_score_adjustment = 0.75; + + /** Heuristic: + * From right side of range, remove all parts, that size is less than specified ratio of sum_size. + */ + bool enable_heuristic_to_remove_small_parts_at_right = true; + double heuristic_to_remove_small_parts_at_right_max_ratio = 0.01; }; SimpleMergeSelector(const Settings & settings) : settings(settings) {} diff --git a/dbms/src/Storages/MergeTree/SimpleMergeSelector.cpp b/dbms/src/Storages/MergeTree/SimpleMergeSelector.cpp index 6b07c22c0bd..a790aa6fa30 100644 --- a/dbms/src/Storages/MergeTree/SimpleMergeSelector.cpp +++ b/dbms/src/Storages/MergeTree/SimpleMergeSelector.cpp @@ -17,25 +17,22 @@ struct Estimator { using Iterator = SimpleMergeSelector::PartsInPartition::const_iterator; - void consider(Iterator begin, Iterator end, size_t sum_size, size_t sum_size_fixed_cost, size_t size_prev_at_left) + void consider(Iterator begin, Iterator end, size_t sum_size, size_t size_prev_at_left, const SimpleMergeSelector::Settings & settings) { - double current_score = score(end - begin, sum_size, sum_size_fixed_cost); + double current_score = score(end - begin, sum_size, settings.size_fixed_cost_to_add); - /** Heuristic: - * Make some preference for ranges, that sum_size is like (in terms of ratio) to part previous at left. 
- */ - if (size_prev_at_left > sum_size * 0.9) + if (settings.enable_heuristic_to_align_parts + && size_prev_at_left > sum_size * settings.heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part) { double difference = std::abs(log2(static_cast<double>(sum_size) / size_prev_at_left)); - if (difference < 0.5) - current_score *= 0.75 + difference * 0.5; + if (difference < settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two) + current_score *= interpolateLinear(settings.heuristic_to_align_parts_max_score_adjustment, 1, + difference / settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two); } - /** Heuristic: - * From right side of range, remove all parts, that size is less than 1% of sum_size. - */ - while (end >= begin + 3 && (end - 1)->size < 0.01 * sum_size) - --end; + if (settings.enable_heuristic_to_remove_small_parts_at_right) + while (end >= begin + 3 && (end - 1)->size < settings.heuristic_to_remove_small_parts_at_right_max_ratio * sum_size) + --end; if (!min_score || current_score < min_score) { @@ -175,8 +172,8 @@ void selectWithinPartition( parts.begin() + begin, parts.begin() + end, sum_size, - settings.size_fixed_cost_to_add, - begin == 0 ? 0 : parts[begin - 1].size); + begin == 0 ? 
0 : parts[begin - 1].size, + settings); } } } diff --git a/dbms/src/Storages/tests/merge_selector2.cpp b/dbms/src/Storages/tests/merge_selector2.cpp index a4f50e38fe5..e27fbed8cd5 100644 --- a/dbms/src/Storages/tests/merge_selector2.cpp +++ b/dbms/src/Storages/tests/merge_selector2.cpp @@ -41,25 +41,34 @@ int main(int argc, char ** argv) IMergeSelector::Part part; in >> part.size >> "\t" >> part.age >> "\t" >> part.level >> "\t" >> part_names.back() >> "\n"; part.data = part_names.back().data(); + part.level = 0; parts.emplace_back(part); sum_parts_size += part.size; } size_t sum_size_written = sum_parts_size; size_t num_merges = 1; + size_t age_passed = 0; while (parts.size() > 1) { - IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0); + IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0 /*100ULL * 1024 * 1024 * 1024*/); if (selected_parts.empty()) { - std::cout << '.'; + ++age_passed; for (auto & part : parts) ++part.age; + + if (age_passed > 60 * 86400) + break; + + if (age_passed % 86400 == 0) + std::cout << "."; + continue; } - std::cout << '\n'; + std::cout << "Time passed: " << age_passed << '\n'; size_t sum_merged_size = 0; size_t start_index = 0; @@ -75,7 +84,7 @@ int main(int argc, char ** argv) start_index = i; } - std::cout << parts[i].size; + std::cout << (parts[i].size / 1024) << "_" << parts[i].level; if (in_range) { sum_merged_size += parts[i].size; @@ -101,6 +110,14 @@ int main(int argc, char ** argv) sum_size_written += sum_merged_size; ++num_merges; + + double time_to_merge = sum_merged_size / (1048576 * 10.0); + + age_passed += time_to_merge; + for (auto & part : parts) + part.age += time_to_merge; + + std::cout << "Time passed: " << age_passed << ", num parts: " << parts.size() << '\n'; } std::cout << std::fixed << std::setprecision(2)