mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-13 18:02:24 +00:00
improve performance
This commit is contained in:
parent
a478e57905
commit
ddba07b9c3
@ -1,7 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <base/sort.h>
|
||||||
|
#include <Common/RadixSort.h>
|
||||||
#include <IO/WriteBuffer.h>
|
#include <IO/WriteBuffer.h>
|
||||||
#include <IO/ReadBuffer.h>
|
#include <IO/ReadBuffer.h>
|
||||||
#include <IO/WriteHelpers.h>
|
#include <IO/WriteHelpers.h>
|
||||||
@ -53,6 +54,8 @@ public:
|
|||||||
, compressed(compressed_)
|
, compressed(compressed_)
|
||||||
, sampled(sampled_)
|
, sampled(sampled_)
|
||||||
{
|
{
|
||||||
|
sampled.reserve(compress_threshold);
|
||||||
|
backup_sampled.reserve(compress_threshold);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isCompressed() const { return compressed; }
|
bool isCompressed() const { return compressed; }
|
||||||
@ -113,9 +116,8 @@ public:
|
|||||||
void compress()
|
void compress()
|
||||||
{
|
{
|
||||||
withHeadBufferInserted();
|
withHeadBufferInserted();
|
||||||
auto compressed_samples = doCompress(sampled, 2 * relative_error * count);
|
|
||||||
|
|
||||||
std::swap(sampled, compressed_samples);
|
doCompress(2 * relative_error * count);
|
||||||
compressed = true;
|
compressed = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,8 +167,8 @@ public:
|
|||||||
// `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since
|
// `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since
|
||||||
// `max(g_ab + delta_ab) <= floor(2 * eps_a * n_a) + floor(2 * eps_b * n_b)`
|
// `max(g_ab + delta_ab) <= floor(2 * eps_a * n_a) + floor(2 * eps_b * n_b)`
|
||||||
// Finally, one can see how the `insert(x)` operation can be expressed as `merge([(x, 1, 0])`
|
// Finally, one can see how the `insert(x)` operation can be expressed as `merge([(x, 1, 0])`
|
||||||
std::vector<Stats> merged_sampled;
|
backup_sampled.clear();
|
||||||
merged_sampled.reserve(sampled.size() + other.sampled.size());
|
backup_sampled.reserve(sampled.size() + other.sampled.size());
|
||||||
double merged_relative_error = std::max(relative_error, other.relative_error);
|
double merged_relative_error = std::max(relative_error, other.relative_error);
|
||||||
size_t merged_count = count + other.count;
|
size_t merged_count = count + other.count;
|
||||||
Int64 additional_self_delta = static_cast<Int64>(std::floor(2 * other.relative_error * other.count));
|
Int64 additional_self_delta = static_cast<Int64>(std::floor(2 * other.relative_error * other.count));
|
||||||
@ -198,27 +200,28 @@ public:
|
|||||||
|
|
||||||
// Insert it
|
// Insert it
|
||||||
next_sample.delta += additional_delta;
|
next_sample.delta += additional_delta;
|
||||||
merged_sampled.push_back(next_sample);
|
backup_sampled.emplace_back(std::move(next_sample));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copy the remaining samples from the other list
|
// Copy the remaining samples from the other list
|
||||||
// (by construction, at most one `while` loop will run)
|
// (by construction, at most one `while` loop will run)
|
||||||
while (self_idx < sampled.size())
|
while (self_idx < sampled.size())
|
||||||
{
|
{
|
||||||
merged_sampled.push_back(sampled[self_idx]);
|
backup_sampled.emplace_back(sampled[self_idx]);
|
||||||
++self_idx;
|
++self_idx;
|
||||||
}
|
}
|
||||||
while (other_idx < other.sampled.size())
|
while (other_idx < other.sampled.size())
|
||||||
{
|
{
|
||||||
merged_sampled.emplace_back(other.sampled[other_idx]);
|
backup_sampled.emplace_back(other.sampled[other_idx]);
|
||||||
++other_idx;
|
++other_idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto compressed_sampled = doCompress(merged_sampled, 2 * merged_relative_error * merged_count);
|
std::swap(sampled, backup_sampled);
|
||||||
compress_threshold = other.compress_threshold;
|
|
||||||
relative_error = merged_relative_error;
|
relative_error = merged_relative_error;
|
||||||
std::swap(compressed_sampled, sampled);
|
|
||||||
count = merged_count;
|
count = merged_count;
|
||||||
|
compress_threshold = other.compress_threshold;
|
||||||
|
|
||||||
|
doCompress(2 * merged_relative_error * merged_count);
|
||||||
compressed = true;
|
compressed = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -287,13 +290,18 @@ private:
|
|||||||
if (head_sampled.empty())
|
if (head_sampled.empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
size_t current_count = count;
|
bool use_radix_sort = head_sampled.size() >= 256 && (is_arithmetic_v<T> && !is_big_int_v<T>);
|
||||||
std::sort(head_sampled.begin(), head_sampled.end());
|
if (use_radix_sort)
|
||||||
std::vector<Stats> new_samples;
|
RadixSort<RadixSortNumTraits<T>>::executeLSD(head_sampled.data(), head_sampled.size());
|
||||||
new_samples.reserve(sampled.size() + head_sampled.size());
|
else
|
||||||
|
::sort(head_sampled.begin(), head_sampled.end());
|
||||||
|
|
||||||
|
backup_sampled.clear();
|
||||||
|
backup_sampled.reserve(sampled.size() + head_sampled.size());
|
||||||
|
|
||||||
size_t sample_idx = 0;
|
size_t sample_idx = 0;
|
||||||
size_t ops_idx = 0;
|
size_t ops_idx = 0;
|
||||||
|
size_t current_count = count;
|
||||||
for (; ops_idx < head_sampled.size(); ++ops_idx)
|
for (; ops_idx < head_sampled.size(); ++ops_idx)
|
||||||
{
|
{
|
||||||
T current_sample = head_sampled[ops_idx];
|
T current_sample = head_sampled[ops_idx];
|
||||||
@ -301,47 +309,47 @@ private:
|
|||||||
// Add all the samples before the next observation.
|
// Add all the samples before the next observation.
|
||||||
while (sample_idx < sampled.size() && sampled[sample_idx].value <= current_sample)
|
while (sample_idx < sampled.size() && sampled[sample_idx].value <= current_sample)
|
||||||
{
|
{
|
||||||
new_samples.push_back(sampled[sample_idx]);
|
backup_sampled.emplace_back(sampled[sample_idx]);
|
||||||
++sample_idx;
|
++sample_idx;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If it is the first one to insert, of if it is the last one
|
// If it is the first one to insert, of if it is the last one
|
||||||
++current_count;
|
++current_count;
|
||||||
Int64 delta;
|
Int64 delta;
|
||||||
if (new_samples.empty() || (sample_idx == sampled.size() && ops_idx == (head_sampled.size() - 1)))
|
if (backup_sampled.empty() || (sample_idx == sampled.size() && ops_idx == (head_sampled.size() - 1)))
|
||||||
delta = 0;
|
delta = 0;
|
||||||
else
|
else
|
||||||
delta = static_cast<Int64>(std::floor(2 * relative_error * current_count));
|
delta = static_cast<Int64>(std::floor(2 * relative_error * current_count));
|
||||||
|
|
||||||
new_samples.emplace_back(current_sample, 1, delta);
|
backup_sampled.emplace_back(current_sample, 1, delta);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add all the remaining existing samples
|
// Add all the remaining existing samples
|
||||||
for (; sample_idx < sampled.size(); ++sample_idx)
|
for (; sample_idx < sampled.size(); ++sample_idx)
|
||||||
new_samples.push_back(sampled[sample_idx]);
|
backup_sampled.emplace_back(sampled[sample_idx]);
|
||||||
|
|
||||||
std::swap(sampled, new_samples);
|
std::swap(sampled, backup_sampled);
|
||||||
head_sampled.clear();
|
head_sampled.clear();
|
||||||
count = current_count;
|
count = current_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static std::vector<Stats> doCompress(const std::vector<Stats> & current_samples, double merge_threshold)
|
void doCompress(double merge_threshold)
|
||||||
{
|
{
|
||||||
if (current_samples.empty())
|
if (sampled.empty())
|
||||||
return {};
|
return;
|
||||||
|
|
||||||
std::vector<Stats> res;
|
backup_sampled.clear();
|
||||||
// Start for the last element, which is always part of the set.
|
// Start for the last element, which is always part of the set.
|
||||||
// The head contains the current new head, that may be merged with the current element.
|
// The head contains the current new head, that may be merged with the current element.
|
||||||
auto head = current_samples.back();
|
Stats head = sampled.back();
|
||||||
ssize_t i = current_samples.size() - 2;
|
ssize_t i = sampled.size() - 2;
|
||||||
|
|
||||||
// Do not compress the last element
|
// Do not compress the last element
|
||||||
while (i >= 1)
|
while (i >= 1)
|
||||||
{
|
{
|
||||||
// The current sample:
|
// The current sample:
|
||||||
const auto & sample1 = current_samples[i];
|
const auto & sample1 = sampled[i];
|
||||||
// Do we need to compress?
|
// Do we need to compress?
|
||||||
if (sample1.g + head.g + head.delta < merge_threshold)
|
if (sample1.g + head.g + head.delta < merge_threshold)
|
||||||
{
|
{
|
||||||
@ -351,22 +359,23 @@ private:
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Prepend the current head, and keep the current sample as target for merging.
|
// Prepend the current head, and keep the current sample as target for merging.
|
||||||
res.insert(res.begin(), head);
|
backup_sampled.push_back(head);
|
||||||
head = sample1;
|
head = sample1;
|
||||||
}
|
}
|
||||||
--i;
|
--i;
|
||||||
}
|
}
|
||||||
|
|
||||||
res.insert(res.begin(), head);
|
backup_sampled.push_back(head);
|
||||||
// If necessary, add the minimum element:
|
// If necessary, add the minimum element:
|
||||||
auto curr_head = current_samples.front();
|
auto curr_head = sampled.front();
|
||||||
|
|
||||||
// don't add the minimum element if `currentSamples` has only one element (both `currHead` and
|
// don't add the minimum element if `currentSamples` has only one element (both `currHead` and
|
||||||
// `head` point to the same element)
|
// `head` point to the same element)
|
||||||
if (curr_head.value <= head.value && current_samples.size() > 1)
|
if (curr_head.value <= head.value && sampled.size() > 1)
|
||||||
res.insert(res.begin(), current_samples.front());
|
backup_sampled.emplace_back(sampled.front());
|
||||||
|
|
||||||
return res;
|
std::reverse(backup_sampled.begin(), backup_sampled.end());
|
||||||
|
std::swap(sampled, backup_sampled);
|
||||||
}
|
}
|
||||||
|
|
||||||
double relative_error;
|
double relative_error;
|
||||||
@ -375,6 +384,8 @@ private:
|
|||||||
bool compressed;
|
bool compressed;
|
||||||
|
|
||||||
std::vector<Stats> sampled;
|
std::vector<Stats> sampled;
|
||||||
|
std::vector<Stats> backup_sampled;
|
||||||
|
|
||||||
std::vector<T> head_sampled;
|
std::vector<T> head_sampled;
|
||||||
|
|
||||||
static constexpr size_t default_compress_threshold = 10000;
|
static constexpr size_t default_compress_threshold = 10000;
|
||||||
|
Loading…
Reference in New Issue
Block a user