Improve performance of quantileMerge #16640

This commit is contained in:
Alexey Milovidov 2020-11-03 23:26:55 +03:00
parent 2fae1c3c31
commit 5fe679324e

View File

@ -158,12 +158,25 @@ public:
}
else
{
randomShuffle(samples);
/// Replace each element of our reservoir with the corresponding element of b's reservoir
/// with probability b.total_values / (a.total_values + b.total_values).
/// Do it more roughly than true random sampling to save performance.
total_values += b.total_values;
for (size_t i = 0; i < sample_count; ++i)
/// Will replace every frequency'th element of a with the corresponding element of b.
double frequency = static_cast<double>(total_values) / b.total_values;
/// When replacements would be too rare (the step spans at least half of the reservoir), replace just one random element with the corresponding probability.
if (frequency * 2 >= sample_count)
{
UInt64 rnd = genRandom(total_values);
if (rnd < b.total_values)
UInt64 rnd = genRandom(frequency);
if (rnd < sample_count)
samples[rnd] = b.samples[rnd];
}
else
{
for (double i = 0; i < sample_count; i += frequency)
samples[i] = b.samples[i];
}
}
@ -222,15 +235,6 @@ private:
return (static_cast<UInt64>(rng()) * (static_cast<UInt64>(rng.max()) + 1ULL) + static_cast<UInt64>(rng())) % lim;
}
/// Shuffles `v` in place.
/// Walks positions 1 .. v.size() - 1 and swaps each one with a partner index
/// obtained from genRandom(position + 1), i.e. an index not exceeding the position itself.
/// Containers with fewer than two elements are left untouched.
void randomShuffle(Array & v)
{
    size_t pos = 1;
    while (pos < v.size())
    {
        /// The partner may equal pos, in which case the swap is a no-op.
        const size_t partner = genRandom(pos + 1);
        std::swap(v[pos], v[partner]);
        ++pos;
    }
}
void sortIfNeeded()
{
if (sorted)