ClickHouse/dbms/src/IO/LZ4_decompress_faster.h

#pragma once

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <pcg_random.hpp>


namespace LZ4
{

/** There are many implementation details of the LZ4 decompression loop that affect performance.
  * For example: whether to copy by 8 or by 16 (SSE2) bytes at once; whether or not to use the shuffle (SSSE3) instruction to replicate a match.
  *
  * The optimal algorithm is dependent on:
  *
  * 1. CPU architecture.
  * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle,
  *  but on Westmere using shuffle is worse and copy by 16 bytes is better only for high compression ratios)
  *
  * 2. Data distribution.
  * (example: when compression ratio is higher than 10.20,
  *  it's usually better to copy by 16 bytes rather than 8).
  *
  * It's very difficult to test all combinations on different CPUs and to choose the correct rule for selecting the best variant.
  * (Even if you do this, there is a high chance of over-optimizing for a specific CPU while degrading performance on another.)
  *
  * Instead, we choose the best algorithm by using performance statistics
  *  with something like the "Bayesian Bandits" method.
  */


/** Both buffers passed to the 'decompress' function must have
  *  at least this many extra bytes after the end of the data
  *  that are allowed to be read/written.
  * This value is a slight overestimate.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 32;


/** When decompressing a uniform sequence of blocks (for example, blocks from one file),
  *  you can pass a single PerformanceStatistics object to subsequent invocations of the 'decompress' function.
  * It accumulates statistics and uses them as feedback to choose the best specialization of the algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
struct PerformanceStatistics
{
    /// Accumulated timing statistics for a single algorithm variant.
    struct Element
    {
        /// Number of recorded invocations, including the warm-up ones that are thrown off.
        double count = 0;
        /// Sum of (seconds / bytes) over the invocations that are taken into account.
        double sum = 0;

        /// Number of invocations that contribute to 'sum'. It is zero or negative during warm-up.
        double adjustedCount() const
        {
            return count - NUM_INVOCATIONS_TO_THROW_OFF;
        }

        /// Average time per byte. Meaningful only when adjustedCount() is positive.
        double mean() const
        {
            return sum / adjustedCount();
        }

        /// For better convergence, we don't use proper estimate of stddev.
        /// We want to eventually separate between two algorithms even in case
        ///  when there is no statistical significant difference between them.
        double sigma() const
        {
            return mean() / std::sqrt(adjustedCount());
        }

        /// Record one invocation that processed 'bytes' bytes in 'seconds' seconds.
        void update(double seconds, double bytes)
        {
            /// Time per byte is undefined for an empty block: seconds / 0 is inf or NaN,
            ///  and once added it would stay in 'sum' forever and spoil every later estimate.
            /// Such measurements are ignored completely (they are not counted either).
            if (!(bytes > 0))
                return;

            ++count;

            if (count > NUM_INVOCATIONS_TO_THROW_OFF)
                sum += seconds / bytes;
        }

        /// Draw a random estimate of time per byte for this variant. Lower is better.
        double sample(pcg64 & rng) const
        {
            /// If there is a variant with not enough statistics, always choose it.
            /// And in that case prefer variant with less number of invocations.
            if (adjustedCount() < 2)
                return adjustedCount() - 1;

            /// std::normal_distribution requires a strictly positive stddev.
            /// It is not positive when the accumulated mean is zero, negative or NaN;
            ///  in that case there is nothing to randomize, so return the mean as is.
            const double stddev = sigma();
            if (!(stddev > 0))
                return mean();

            return std::normal_distribution<>(mean(), stddev)(rng);
        }
    };

    /// Number of different algorithms to select from.
    static constexpr size_t NUM_ELEMENTS = 3;

    /// Cold invocations may be affected by additional memory latencies. Don't take first invocations into account.
    static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2;

    /// How to select method to run.
    /// -1 - automatically, based on statistics (default);
    /// 0..NUM_ELEMENTS-1 - always choose specified method (for performance testing);
    /// -2 - choose methods in round robin fashion (for performance testing).
    /// Note: any other negative value currently behaves like -2,
    ///  and non-negative values are returned by 'select' without a range check.
    ssize_t choose_method = -1;

    /// Statistics for every algorithm variant, indexed by the variant number.
    Element data[NUM_ELEMENTS];

    /// Source of randomness for sampling in 'select'. It is default-constructed.
    pcg64 rng;

    /// To select from different algorithms we use a kind of "bandits" algorithm.
    /// Sample random values from estimated normal distributions and choose the minimal.
    /// Returns the index of the variant to run.
    size_t select()
    {
        if (choose_method < 0)
        {
            double samples[NUM_ELEMENTS];
            for (size_t i = 0; i < NUM_ELEMENTS; ++i)
                samples[i] = choose_method == -1
                    ? data[i].sample(rng)
                    : data[i].adjustedCount();  /// Round robin: the least used variant wins, the first one on ties.

            return std::min_element(samples, samples + NUM_ELEMENTS) - samples;
        }
        else
            return choose_method;
    }

    PerformanceStatistics() {}
    PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {}
};


/** This function dispatches to one of several implementations depending on performance statistics.
  *
  * Only the declaration is visible here; the notes below follow from the other comments in this header.
  *
  * source, source_size - input buffer and its size (presumably the compressed LZ4 block - confirm against the definition).
  * dest, dest_size - output buffer and its size (presumably the size of the decompressed data - confirm against the definition).
  * statistics - feedback object used to choose the implementation; see PerformanceStatistics.
  *
  * Both buffers must have ADDITIONAL_BYTES_AT_END_OF_BUFFER extra bytes after the end of data.
  */
void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics);


/** Counters describing an LZ4 block. Useful for development.
  * Every counter starts from zero.
  */
struct StreamStatistics
{
    size_t num_tokens{0};
    size_t sum_literal_lengths{0};
    size_t sum_match_lengths{0};
    size_t sum_match_offsets{0};
    size_t count_match_offset_less_8{0};
    size_t count_match_offset_less_16{0};
    size_t count_match_replicate_itself{0};

    /// Callbacks for a literal run and for a match.
    /// They are defined outside of this header; presumably they update the counters above.
    void literal(size_t length);
    void match(size_t length, size_t offset);

    /// Defined outside of this header; presumably outputs the collected counters.
    void print() const;
};

/** Obtain statistics about an LZ4 block into 'stat' (see StreamStatistics).
  *
  * Only the declaration is visible here.
  * NOTE(review): unlike 'decompress', there is no 'source_size' parameter;
  *  presumably the input is processed until 'dest_size' bytes of output are accounted for - confirm against the definition.
  * NOTE(review): 'dest' is a non-const pointer, so the function may write to it - confirm against the definition.
  */
void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat);

}