#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int TOO_LARGE_ARRAY_SIZE; } } /** The algorithm was implemented by Alexei Borzenkov https: //███████████.yandex-team.ru/snaury * He owns the authorship of the code and half the comments in this namespace, * except for merging, serialization, and sorting, as well as selecting types and other changes. * We thank Alexei Borzenkov for writing the original code. */ namespace tdigest { /** * The centroid stores the weight of points around their mean value */ template struct Centroid { Value mean; Count count; Centroid() = default; explicit Centroid(Value mean, Count count = 1) : mean(mean) , count(count) {} Centroid & operator+=(const Centroid & other) { count += other.count; mean += other.count * (other.mean - mean) / count; return *this; } bool operator<(const Centroid & other) const { return mean < other.mean; } }; /** :param epsilon: value \delta from the article - error in the range * quantile 0.5 (default is 0.01, i.e. 1%) * :param max_unmerged: when accumulating count of new points beyond this * value centroid compression is triggered * (default is 2048, the higher the value - the * more memory is required, but amortization of execution time increases) */ template struct Params { Value epsilon = 0.01; size_t max_unmerged = 2048; }; /** Implementation of t-digest algorithm (https://github.com/tdunning/t-digest). * This option is very similar to MergingDigest on java, however the decision about * the union is accepted based on the original condition from the article * (via a size constraint, using the approximation of the quantile of each * centroid, not the distance on the curve of the position of their boundaries). MergingDigest * on java gives significantly fewer centroids than this variant, that * negatively affects accuracy with the same compression factor, but gives * size guarantees. The author himself on the proposal for this variant said that * the size of the digest grows like O(log(n)), while the version on java * does not depend on the expected number of points. Also an variant on java * uses asin, which slows down the algorithm a bit. */ template class MergingDigest { using Params = tdigest::Params; using Centroid = tdigest::Centroid; /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. static constexpr size_t bytes_in_arena = 64 - sizeof(DB::PODArray) - sizeof(TotalCount) - sizeof(uint32_t); using Summary = DB::PODArray, bytes_in_arena>>; Summary summary; TotalCount count = 0; uint32_t unmerged = 0; /** Linear interpolation at the point x on the line (x1, y1)..(x2, y2) */ static Value interpolate(Value x, Value x1, Value y1, Value x2, Value y2) { double k = (x - x1) / (x2 - x1); return y1 + k * (y2 - y1); } struct RadixSortTraits { using Element = Centroid; using Key = Value; using CountType = uint32_t; using KeyBits = uint32_t; static constexpr size_t PART_SIZE_BITS = 8; using Transform = RadixSortFloatTransform; using Allocator = RadixSortMallocAllocator; /// The function to get the key from an array element. static Key & extractKey(Element & elem) { return elem.mean; } }; public: /** Adds to the digest a change in `x` with a weight of `cnt` (default 1) */ void add(const Params & params, Value x, CentroidCount cnt = 1) { add(params, Centroid(x, cnt)); } /** Adds a centroid `c` to the digest */ void add(const Params & params, const Centroid & c) { summary.push_back(c); count += c.count; ++unmerged; if (unmerged >= params.max_unmerged) compress(params); } /** Performs compression of accumulated centroids * When merging, the invariant is retained to the maximum size of each * centroid that does not exceed `4 q (1 - q) \ delta N`. */ void compress(const Params & params) { if (unmerged > 0) { RadixSort::execute(&summary[0], summary.size()); if (summary.size() > 3) { /// A pair of consecutive bars of the histogram. auto l = summary.begin(); auto r = std::next(l); TotalCount sum = 0; while (r != summary.end()) { // we use quantile which gives us the smallest error /// The ratio of the part of the histogram to l, including the half l to the entire histogram. That is, what level quantile in position l. Value ql = (sum + l->count * 0.5) / count; Value err = ql * (1 - ql); /// The ratio of the portion of the histogram to l, including l and half r to the entire histogram. That is, what level is the quantile in position r. Value qr = (sum + l->count + r->count * 0.5) / count; Value err2 = qr * (1 - qr); if (err > err2) err = err2; Value k = 4 * count * err * params.epsilon; /** The ratio of the weight of the glued column pair to all values is not greater, * than epsilon multiply by a certain quadratic coefficient, which in the median is 1 (4 * 1/2 * 1/2), * and at the edges decreases and is approximately equal to the distance to the edge * 4. */ if (l->count + r->count <= k) { // it is possible to merge left and right /// The left column "eats" the right. *l += *r; } else { // not enough capacity, check the next pair sum += l->count; ++l; /// We skip all the values "eaten" earlier. if (l != r) *l = *r; } ++r; } /// At the end of the loop, all values to the right of l were "eaten". summary.resize(l - summary.begin() + 1); } unmerged = 0; } } /** Calculates the quantile q [0, 1] based on the digest. * For an empty digest returns NaN. */ Value getQuantile(const Params & params, Value q) { if (summary.empty()) return NAN; compress(params); if (summary.size() == 1) return summary.front().mean; Value x = q * count; TotalCount sum = 0; Value prev_mean = summary.front().mean; Value prev_x = 0; for (const auto & c : summary) { Value current_x = sum + c.count * 0.5; if (current_x >= x) return interpolate(x, prev_x, prev_mean, current_x, c.mean); sum += c.count; prev_mean = c.mean; prev_x = current_x; } return summary.back().mean; } /** Get multiple quantiles (`size` pieces). * levels - an array of levels of the desired quantiles. They are in a random order. * levels_permutation - array-permutation levels. The i-th position will be the index of the i-th ascending level in the `levels` array. * result - the array where the results are added, in order of `levels`, */ template void getManyQuantiles(const Params & params, const Value * levels, const size_t * levels_permutation, size_t size, ResultType * result) { if (summary.empty()) { for (size_t result_num = 0; result_num < size; ++result_num) result[result_num] = std::is_floating_point::value ? NAN : 0; return; } compress(params); if (summary.size() == 1) { for (size_t result_num = 0; result_num < size; ++result_num) result[result_num] = summary.front().mean; return; } Value x = levels[levels_permutation[0]] * count; TotalCount sum = 0; Value prev_mean = summary.front().mean; Value prev_x = 0; size_t result_num = 0; for (const auto & c : summary) { Value current_x = sum + c.count * 0.5; while (current_x >= x) { result[levels_permutation[result_num]] = interpolate(x, prev_x, prev_mean, current_x, c.mean); ++result_num; if (result_num >= size) return; x = levels[levels_permutation[result_num]] * count; } sum += c.count; prev_mean = c.mean; prev_x = current_x; } auto rest_of_results = summary.back().mean; for (; result_num < size; ++result_num) result[levels_permutation[result_num]] = rest_of_results; } /** Combine with another state. */ void merge(const Params & params, const MergingDigest & other) { for (const auto & c : other.summary) add(params, c); } /** Write to the stream. */ void write(const Params & params, DB::WriteBuffer & buf) { compress(params); DB::writeVarUInt(summary.size(), buf); buf.write(reinterpret_cast(&summary[0]), summary.size() * sizeof(summary[0])); } /** Read from the stream. */ void read(const Params & params, DB::ReadBuffer & buf) { size_t size = 0; DB::readVarUInt(size, buf); if (size > params.max_unmerged) throw DB::Exception("Too large t-digest summary size", DB::ErrorCodes::TOO_LARGE_ARRAY_SIZE); summary.resize(size); buf.read(reinterpret_cast(&summary[0]), size * sizeof(summary[0])); } }; } namespace DB { struct AggregateFunctionQuantileTDigestData { tdigest::MergingDigest digest; }; template class AggregateFunctionQuantileTDigest final : public IUnaryAggregateFunction> { private: Float32 level; tdigest::Params params; DataTypePtr type; public: AggregateFunctionQuantileTDigest(double level_ = 0.5) : level(level_) {} String getName() const override { return "quantileTDigest"; } DataTypePtr getReturnType() const override { return type; } void setArgument(const DataTypePtr & argument) { if (returns_float) type = std::make_shared(); else type = argument; } void setParameters(const Array & params) override { if (params.size() != 1) throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); level = applyVisitor(FieldVisitorConvertToNumber(), params[0]); } void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena *) const { this->data(place).digest.add(params, static_cast &>(column).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override { this->data(place).digest.merge(params, this->data(rhs).digest); } void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(const_cast(place)).digest.write(params, buf); } void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).digest.read(params, buf); } void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { auto quantile = this->data(const_cast(place)).digest.getQuantile(params, level); if (returns_float) static_cast(to).getData().push_back(quantile); else static_cast &>(to).getData().push_back(quantile); } }; template class AggregateFunctionQuantileTDigestWeighted final : public IBinaryAggregateFunction> { private: Float32 level; tdigest::Params params; DataTypePtr type; public: AggregateFunctionQuantileTDigestWeighted(double level_ = 0.5) : level(level_) {} String getName() const override { return "quantileTDigestWeighted"; } DataTypePtr getReturnType() const override { return type; } void setArgumentsImpl(const DataTypes & arguments) { if (returns_float) type = std::make_shared(); else type = arguments.at(0); } void setParameters(const Array & params) override { if (params.size() != 1) throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); level = applyVisitor(FieldVisitorConvertToNumber(), params[0]); } void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_weight, size_t row_num, Arena *) const { this->data(place).digest.add(params, static_cast &>(column_value).getData()[row_num], static_cast &>(column_weight).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override { this->data(place).digest.merge(params, this->data(rhs).digest); } void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(const_cast(place)).digest.write(params, buf); } void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).digest.read(params, buf); } void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { auto quantile = this->data(const_cast(place)).digest.getQuantile(params, level); if (returns_float) static_cast(to).getData().push_back(quantile); else static_cast &>(to).getData().push_back(quantile); } }; template class AggregateFunctionQuantilesTDigest final : public IUnaryAggregateFunction> { private: QuantileLevels levels; tdigest::Params params; DataTypePtr type; public: String getName() const override { return "quantilesTDigest"; } DataTypePtr getReturnType() const override { return std::make_shared(type); } void setArgument(const DataTypePtr & argument) { if (returns_float) type = std::make_shared(); else type = argument; } void setParameters(const Array & params) override { levels.set(params); } void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num, Arena *) const { this->data(place).digest.add(params, static_cast &>(column).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override { this->data(place).digest.merge(params, this->data(rhs).digest); } void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(const_cast(place)).digest.write(params, buf); } void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).digest.read(params, buf); } void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { ColumnArray & arr_to = static_cast(to); ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets(); size_t size = levels.size(); offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + size); if (!size) return; if (returns_float) { typename ColumnFloat32::Container_t & data_to = static_cast(arr_to.getData()).getData(); size_t old_size = data_to.size(); data_to.resize(data_to.size() + size); this->data(const_cast(place)).digest.getManyQuantiles( params, &levels.levels[0], &levels.permutation[0], size, &data_to[old_size]); } else { typename ColumnVector::Container_t & data_to = static_cast &>(arr_to.getData()).getData(); size_t old_size = data_to.size(); data_to.resize(data_to.size() + size); this->data(const_cast(place)).digest.getManyQuantiles( params, &levels.levels[0], &levels.permutation[0], size, &data_to[old_size]); } } }; template class AggregateFunctionQuantilesTDigestWeighted final : public IBinaryAggregateFunction> { private: QuantileLevels levels; tdigest::Params params; DataTypePtr type; public: String getName() const override { return "quantilesTDigest"; } DataTypePtr getReturnType() const override { return std::make_shared(type); } void setArgumentsImpl(const DataTypes & arguments) { if (returns_float) type = std::make_shared(); else type = arguments.at(0); } void setParameters(const Array & params) override { levels.set(params); } void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_weight, size_t row_num, Arena *) const { this->data(place).digest.add(params, static_cast &>(column_value).getData()[row_num], static_cast &>(column_weight).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override { this->data(place).digest.merge(params, this->data(rhs).digest); } void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(const_cast(place)).digest.write(params, buf); } void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).digest.read(params, buf); } void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { ColumnArray & arr_to = static_cast(to); ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets(); size_t size = levels.size(); offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + size); if (!size) return; if (returns_float) { typename ColumnFloat32::Container_t & data_to = static_cast(arr_to.getData()).getData(); size_t old_size = data_to.size(); data_to.resize(data_to.size() + size); this->data(const_cast(place)).digest.getManyQuantiles( params, &levels.levels[0], &levels.permutation[0], size, &data_to[old_size]); } else { typename ColumnVector::Container_t & data_to = static_cast &>(arr_to.getData()).getData(); size_t old_size = data_to.size(); data_to.resize(data_to.size() + size); this->data(const_cast(place)).digest.getManyQuantiles( params, &levels.levels[0], &levels.permutation[0], size, &data_to[old_size]); } } }; }