// Mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-12-14 18:32:29 +00:00); 148 lines, 4.5 KiB, C++.
|
#pragma once
|
||
|
|
||
|
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <pcg_random.hpp>
|
||
|
|
||
|
|
||
|
namespace LZ4
|
||
|
{
|
||
|
|
||
|
/** There are many implementation details of the LZ4 decompression loop that affect performance.
  * For example: copy by 8 or by 16 bytes at once; use shuffle instruction to replicate match or not.
  *
  * The optimal algorithm depends:
  *
  * - on CPU architecture
  * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle,
  *  but on Westmere using shuffle is worse and copy by 16 bytes is better only for high compression ratios)
  *
  * - on data distribution
  * (example: when compression ratio is higher than 10.20,
  *  it's usually better to copy by 16 bytes rather than 8).
  *
  * It's very difficult to test all combinations on different CPUs and to choose the correct rule to select the best variant.
  * (Even if you do this, you have a high chance to over-optimize for a specific CPU while downgrading performance on another.)
  *
  * Instead of this, we choose the best algorithm by using performance statistics
  * with something like the "Bayesian Bandits" method.
  */
|
||
|
|
||
|
|
||
|
/** Both buffers passed to the 'decompress' function must have
  * at least this many extra bytes after the end of data
  * that are allowed to be read/written.
  * This value is a slight overestimation.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 32;
|
||
|
|
||
|
|
||
|
/** When decompressing uniform sequence of blocks (for example, blocks from one file),
  * you can pass single PerformanceStatistics object to subsequent invocations of 'decompress' method.
  * It will accumulate statistics and use it as a feedback to choose best specialization of algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
struct PerformanceStatistics
{
    /// Running timing statistics of a single algorithm variant.
    struct Element
    {
        /// Number of accepted measurements, including the warm-up ones.
        double count = 0;
        /// Sum of seconds-per-byte over the measurements accepted after the warm-up.
        double sum = 0;

        /// Number of measurements that contribute to 'sum'. Not positive while warming up.
        double adjusted_count() const
        {
            return count - NUM_INVOCATIONS_TO_THROW_OFF;
        }

        /// Average seconds-per-byte. Only meaningful when adjusted_count() is positive.
        double mean() const
        {
            return sum / adjusted_count();
        }

        /// For better convergence, we don't use proper estimate of stddev.
        /// We want to eventually choose between two algorithms even in case
        /// when there is no statistical significant difference between them.
        double sigma() const
        {
            return mean() / std::sqrt(adjusted_count());
        }

        /// Record one measurement: 'seconds' were spent to process 'bytes'.
        void update(double seconds, double bytes)
        {
            /// A measurement without any processed bytes carries no information
            /// and would turn into a division by zero below.
            if (!(bytes > 0))
                return;

            /// A single inf/NaN would stay in 'sum' forever and break every later sample.
            const double seconds_per_byte = seconds / bytes;
            if (!std::isfinite(seconds_per_byte))
                return;

            ++count;

            if (count > NUM_INVOCATIONS_TO_THROW_OFF)
                sum += seconds_per_byte;
        }

        /// Draw an estimate of the cost of this variant. The caller picks the variant with the smallest value.
        double sample(pcg64 & rng) const
        {
            /// If there is a variant with not enough statistics, always choose it.
            /// And in that case prefer variant with less number of invocations.
            if (adjusted_count() < 2)
                return adjusted_count() - 1;

            /// std::normal_distribution requires a strictly positive stddev.
            /// It is not positive when every recorded cost is zero; then the mean is the only sensible estimate.
            const double stddev = sigma();
            if (!(stddev > 0))
                return mean();

            return std::normal_distribution<>(mean(), stddev)(rng);
        }
    };

    /// Number of different algorithms to select from.
    static constexpr size_t NUM_ELEMENTS = 4;

    /// Cold invocations may be affected by additional memory latencies. Don't take first invocations into account.
    static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2;

    /// Statistics per algorithm variant, indexed by the value returned from select().
    Element data[NUM_ELEMENTS];

    /// Random generator used to sample from the estimated distributions.
    pcg64 rng;

    /// To select from different algorithms we use a kind of "bandits" algorithm.
    /// Sample random values from estimated normal distributions and choose the minimal.
    /// Returns the index of the chosen variant in 'data'.
    size_t select()
    {
        double samples[NUM_ELEMENTS];
        for (size_t i = 0; i < NUM_ELEMENTS; ++i)
            samples[i] = data[i].sample(rng);

        return static_cast<size_t>(std::min_element(samples, samples + NUM_ELEMENTS) - samples);
    }
};
|
||
|
|
||
|
|
||
|
/** This method dispatch to one of different implementations depending on performance statistics.
|
||
|
*/
|
||
|
void decompress(
|
||
|
const char * const source,
|
||
|
char * const dest,
|
||
|
size_t source_size,
|
||
|
size_t dest_size,
|
||
|
PerformanceStatistics & statistics);
|
||
|
|
||
|
|
||
|
/** Obtain statistics about LZ4 block useful for development.
  *
  * All counters start from zero. They are presumably filled by 'literal' and 'match',
  * whose definitions are not in this header.
  */
struct StreamStatistics
{
    size_t num_tokens = 0;
    size_t sum_literal_lengths = 0;
    size_t sum_match_lengths = 0;
    size_t sum_match_offsets = 0;
    size_t count_match_offset_less_8 = 0;
    size_t count_match_offset_less_16 = 0;
    size_t count_match_replicate_itself = 0;

    /// Account a literal run of the given length.
    void literal(size_t length);
    /// Account a match of the given length located at the given offset.
    void match(size_t length, size_t offset);

    /// Output the collected counters. The destination is decided by the definition, which is not visible here.
    void print() const;
};
|
||
|
|
||
|
void statistics(
|
||
|
const char * const source,
|
||
|
char * const dest,
|
||
|
size_t dest_size,
|
||
|
StreamStatistics & stat);
|
||
|
|
||
|
}
|