ClickHouse/src/Compression/LZ4_decompress_faster.h

#pragma once

#include <algorithm>
#include <cmath>
#include <random>

#include <pcg_random.hpp>


namespace LZ4
{

/** There are many implementation details of the LZ4 decompression loop that affect performance.
  * For example: copy by 8 or by 16 (SSE2) bytes at once; use the shuffle (SSSE3) instruction to replicate a match or not.
  *
  * The optimal algorithm depends on:
  *
  * 1. CPU architecture.
  * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle,
  *  but on Westmere using shuffle is worse, and copying by 16 bytes is better only for high compression ratios)
  *
  * 2. Data distribution.
  * (example: when the compression ratio is higher than 10.20,
  *  it's usually better to copy by 16 bytes rather than 8)
  *
  * It's very difficult to test all combinations on different CPUs and to choose the correct rule for selecting the best variant.
  * (Even if you do this, you have a high chance of over-optimizing for a specific CPU while degrading performance on others.)
  *
  * Instead, we choose the best algorithm at runtime from performance statistics,
  * using something like the "Bayesian Bandits" method.
  */
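
/** Illustrative sketch (not part of the original header): the selection rule described
  * above, in isolation. Each variant is modelled by a normal distribution whose mean is
  * the measured time per byte and whose sigma shrinks as measurements accumulate.
  * We draw one sample per variant and run the variant with the smallest sampled value.
  * A poorly-measured variant has a wide distribution, so it still gets explored
  * occasionally even if its current mean looks worse. The numbers below are made up.
  *
  *     pcg64 rng;
  *     std::normal_distribution<> variant_0(0.9, 0.05);  /// well measured: narrow
  *     std::normal_distribution<> variant_1(1.0, 0.5);   /// barely measured: wide
  *     size_t chosen = variant_0(rng) <= variant_1(rng) ? 0 : 1;  /// usually 0, sometimes 1
  */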

/** Both buffers passed to the 'decompress' function must have
  * at least this many extra bytes after the end of the data
  * that are allowed to be read/written.
  * This value is a slight overestimate.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 64;
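
/** Usage sketch (illustrative; 'compressed_size' and 'uncompressed_size' are hypothetical
  * names): callers are expected to over-allocate both buffers, e.g.
  *
  *     std::vector<char> compressed(compressed_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER);
  *     std::vector<char> decompressed(uncompressed_size + LZ4::ADDITIONAL_BYTES_AT_END_OF_BUFFER);
  *
  * so that the decompression loop may read and write somewhat past the end of the payload
  * without extra bounds checks.
  */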

/** When decompressing a uniform sequence of blocks (for example, blocks from one file),
  * you can pass a single PerformanceStatistics object to subsequent invocations of the 'decompress' method.
  * It will accumulate statistics and use them as feedback to choose the best specialization of the algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
struct PerformanceStatistics
{
    struct Element
    {
        double count = 0;
        double sum = 0;

        double adjustedCount() const
        {
            return count - NUM_INVOCATIONS_TO_THROW_OFF;
        }

        double mean() const
        {
            return sum / adjustedCount();
        }

        /// For better convergence, we don't use a proper estimate of stddev.
        /// We want to eventually separate two algorithms even when
        /// there is no statistically significant difference between them.
        double sigma() const
        {
            return mean() / std::sqrt(adjustedCount());
        }

        void update(double seconds, double bytes)
        {
            ++count;

            if (count > NUM_INVOCATIONS_TO_THROW_OFF)
                sum += seconds / bytes;
        }

        double sample(pcg64 & stat_rng) const
        {
            /// If there is a variant with not enough statistics, always choose it.
            /// And in that case, prefer the variant with the smaller number of invocations.
            if (adjustedCount() < 2)
                return adjustedCount() - 1;
            else
                return std::normal_distribution<>(mean(), sigma())(stat_rng);
        }
    };
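
    /** Worked example (illustrative; the timings are made up and assumed identical per call):
      *
      *     PerformanceStatistics::Element e;
      *     for (int i = 0; i < 5; ++i)
      *         e.update(0.001, 1000000);  /// 0.001 s per 1e6 bytes = 1e-9 s/byte
      *
      * Afterwards count == 5; the first NUM_INVOCATIONS_TO_THROW_OFF == 2 calls do not
      * contribute to 'sum', so adjustedCount() == 3, mean() == 1e-9
      * and sigma() == mean() / sqrt(3).
      */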

    /// Number of different algorithms to select from.
    static constexpr size_t NUM_ELEMENTS = 5;

    /// Cold invocations may be affected by additional memory latencies. Don't take the first invocations into account.
    static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2;

    /// How to select the method to run:
    /// -1 - automatically, based on statistics (default);
    /// >= 0 - always choose the specified method (for performance testing);
    /// -2 - choose methods in round-robin fashion (for performance testing).
    ssize_t choose_method = -1;

    Element data[NUM_ELEMENTS];

    /// It's OK that the generator is not seeded.
    pcg64 rng;

    /// To select from different algorithms, we use a kind of "bandits" algorithm:
    /// sample random values from the estimated normal distributions and choose the minimum.
    size_t select(size_t max_method = NUM_ELEMENTS)
    {
        if (choose_method < 0)
        {
            /// max_method is at most NUM_ELEMENTS, so a fixed-size buffer suffices.
            double samples[NUM_ELEMENTS];
            for (size_t i = 0; i < max_method; ++i)
                samples[i] = choose_method == -1
                    ? data[i].sample(rng)
                    : data[i].adjustedCount();

            return std::min_element(samples, samples + max_method) - samples;
        }
        else
            return choose_method;
    }

    PerformanceStatistics() = default;
    explicit PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {}
};

/** This method dispatches to one of several implementations depending on performance statistics.
  */
bool decompress(
    const char * const source, /// NOLINT
    char * const dest, /// NOLINT
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics);
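
/** Usage sketch (illustrative; 'blocks' and its fields are hypothetical): one
  * PerformanceStatistics object is reused across all blocks of a file, so each call
  * both benefits from and refines the accumulated statistics.
  *
  *     LZ4::PerformanceStatistics stats;
  *     for (const auto & block : blocks)
  *         LZ4::decompress(block.compressed, block.dest, block.compressed_size, block.uncompressed_size, stats);
  */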

/** Obtain statistics about an LZ4 block that are useful for development.
  */
struct StreamStatistics
{
    size_t num_tokens = 0;
    size_t sum_literal_lengths = 0;
    size_t sum_match_lengths = 0;
    size_t sum_match_offsets = 0;
    size_t count_match_offset_less_8 = 0;
    size_t count_match_offset_less_16 = 0;
    size_t count_match_replicate_itself = 0;

    void literal(size_t length);
    void match(size_t length, size_t offset);

    void print() const;
};

void statistics(
    const char * const source, /// NOLINT
    char * const dest, /// NOLINT
    size_t dest_size,
    StreamStatistics & stat);
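
/** Usage sketch (illustrative; 'compressed', 'scratch' and 'uncompressed_size' are
  * hypothetical names, with the buffers padded as described above):
  *
  *     LZ4::StreamStatistics stat;
  *     LZ4::statistics(compressed, scratch, uncompressed_size, stat);
  *     stat.print();
  */
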
}