Attempt to improve merge selecting algorithm [#METR-23632].

This commit is contained in:
Alexey Milovidov 2016-11-28 06:46:08 +03:00
parent cac8f9216f
commit 0a2d1ebbb3
3 changed files with 50 additions and 22 deletions

View File

@ -11,6 +11,9 @@ class SimpleMergeSelector : public IMergeSelector
public:
struct Settings
{
/// Zero means unlimited.
size_t max_parts_to_merge_at_once = 100;
/** Minimum ratio of size of one part to all parts in set of parts to merge (for usual cases).
* For example, if all parts have equal size, it means, that at least 'base' number of parts should be merged.
* If parts have non-uniform sizes, then the minimum number of parts to merge is effectively increased.
@ -35,7 +38,7 @@ public:
* 3. Sum size of parts participating in merge. If higher - then more age is required to lower base. So, base is lowered slower.
* It means: for small parts, it's worth to merge faster, even not so wide or balanced.
*
* We have multivariative dependency. Let it be logarithmic of size and multi-linear by other variables,
* We have a multivariate dependency. Let it be logarithmic in size and somewhat multi-linear in the other variables,
* between some boundary points, and constant outside.
*/
@ -53,8 +56,19 @@ public:
/// Add this to size before all calculations. It means: merging even very small parts has its fixed cost.
size_t size_fixed_cost_to_add = 5 * 1024 * 1024;
/// Zero means unlimited.
size_t max_parts_to_merge_at_once = 100;
/** Heuristic:
* Give some preference to ranges whose sum_size is similar (in terms of ratio) to the size of the previous part at the left.
*/
bool enable_heuristic_to_align_parts = true;
double heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part = 0.9;
double heuristic_to_align_parts_max_absolute_difference_in_powers_of_two = 0.5;
double heuristic_to_align_parts_max_score_adjustment = 0.75;
/** Heuristic:
* From the right side of the range, remove all parts whose size is less than the specified ratio of sum_size.
*/
bool enable_heuristic_to_remove_small_parts_at_right = true;
double heuristic_to_remove_small_parts_at_right_max_ratio = 0.01;
};
SimpleMergeSelector(const Settings & settings) : settings(settings) {}

View File

@ -17,24 +17,21 @@ struct Estimator
{
using Iterator = SimpleMergeSelector::PartsInPartition::const_iterator;
void consider(Iterator begin, Iterator end, size_t sum_size, size_t sum_size_fixed_cost, size_t size_prev_at_left)
void consider(Iterator begin, Iterator end, size_t sum_size, size_t size_prev_at_left, const SimpleMergeSelector::Settings & settings)
{
double current_score = score(end - begin, sum_size, sum_size_fixed_cost);
double current_score = score(end - begin, sum_size, settings.size_fixed_cost_to_add);
/** Heuristic:
* Give some preference to ranges whose sum_size is similar (in terms of ratio) to the size of the previous part at the left.
*/
if (size_prev_at_left > sum_size * 0.9)
if (settings.enable_heuristic_to_align_parts
&& size_prev_at_left > sum_size * settings.heuristic_to_align_parts_min_ratio_of_sum_size_to_prev_part)
{
double difference = std::abs(log2(static_cast<double>(sum_size) / size_prev_at_left));
if (difference < 0.5)
current_score *= 0.75 + difference * 0.5;
if (difference < settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two)
current_score *= interpolateLinear(settings.heuristic_to_align_parts_max_score_adjustment, 1,
difference / settings.heuristic_to_align_parts_max_absolute_difference_in_powers_of_two);
}
/** Heuristic:
* From right side of range, remove all parts, that size is less than 1% of sum_size.
*/
while (end >= begin + 3 && (end - 1)->size < 0.01 * sum_size)
if (settings.enable_heuristic_to_remove_small_parts_at_right)
while (end >= begin + 3 && (end - 1)->size < settings.heuristic_to_remove_small_parts_at_right_max_ratio * sum_size)
--end;
if (!min_score || current_score < min_score)
@ -175,8 +172,8 @@ void selectWithinPartition(
parts.begin() + begin,
parts.begin() + end,
sum_size,
settings.size_fixed_cost_to_add,
begin == 0 ? 0 : parts[begin - 1].size);
begin == 0 ? 0 : parts[begin - 1].size,
settings);
}
}
}

View File

@ -41,25 +41,34 @@ int main(int argc, char ** argv)
IMergeSelector::Part part;
in >> part.size >> "\t" >> part.age >> "\t" >> part.level >> "\t" >> part_names.back() >> "\n";
part.data = part_names.back().data();
part.level = 0;
parts.emplace_back(part);
sum_parts_size += part.size;
}
size_t sum_size_written = sum_parts_size;
size_t num_merges = 1;
size_t age_passed = 0;
while (parts.size() > 1)
{
IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0);
IMergeSelector::PartsInPartition selected_parts = selector.select(partitions, 0 /*100ULL * 1024 * 1024 * 1024*/);
if (selected_parts.empty())
{
std::cout << '.';
++age_passed;
for (auto & part : parts)
++part.age;
if (age_passed > 60 * 86400)
break;
if (age_passed % 86400 == 0)
std::cout << ".";
continue;
}
std::cout << '\n';
std::cout << "Time passed: " << age_passed << '\n';
size_t sum_merged_size = 0;
size_t start_index = 0;
@ -75,7 +84,7 @@ int main(int argc, char ** argv)
start_index = i;
}
std::cout << parts[i].size;
std::cout << (parts[i].size / 1024) << "_" << parts[i].level;
if (in_range)
{
sum_merged_size += parts[i].size;
@ -101,6 +110,14 @@ int main(int argc, char ** argv)
sum_size_written += sum_merged_size;
++num_merges;
double time_to_merge = sum_merged_size / (1048576 * 10.0);
age_passed += time_to_merge;
for (auto & part : parts)
part.age += time_to_merge;
std::cout << "Time passed: " << age_passed << ", num parts: " << parts.size() << '\n';
}
std::cout << std::fixed << std::setprecision(2)