implement n log(n) compression algorigthm

2024-11-24 00:22:29 +00:00 · 2018-06-16 21:40:43 +03:00 · 2018-06-16 21:40:43 +03:00 · 984fe8e4fd
commit 984fe8e4fd
parent b09323c547
2 changed files with 105 additions and 59 deletions
--- a/dbms/src/AggregateFunctions/AggregateFunctionHistogram.cpp
+++ b/dbms/src/AggregateFunctions/AggregateFunctionHistogram.cpp
@ -7,27 +7,36 @@

 namespace DB {

+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int BAD_ARGUMENTS;
+}
+
+namespace {
+
 AggregateFunctionPtr createAggregateFunctionHistogram(const std::string & name, const DataTypes & arguments, const Array & params)
 {
    if (params.size() != 1)
-    {
-        throw Exception("Function " + name + " requires only bins count");
-    }
+        throw Exception("Function " + name + " requires bins count", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+
+    UInt32 bins_count = applyVisitor(FieldVisitorConvertToNumber<UInt32>(), params[0]);
+
+    if (bins_count == 0)
+        throw Exception("Bin count should be positive", ErrorCodes::BAD_ARGUMENTS);
+
    assertUnary(name, arguments);
-
-    UInt32 bins_count;
-
-#define READ(VAL, PARAM) \
-    VAL = applyVisitor(FieldVisitorConvertToNumber<decltype(VAL)>(), PARAM);
-
-    READ(bins_count, params[0]);
-#undef READ
-
    AggregateFunctionPtr res(createWithNumericType<AggregateFunctionHistogram>(*arguments[0], bins_count));

+    if (!res)
+        throw Exception("Illegal type " + arguments[0]->getName() + " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
    return res;
 }

+}
+
 void registerAggregateFunctionHistogram(AggregateFunctionFactory & factory)
 {
    factory.registerFunction("histogram", createAggregateFunctionHistogram);
--- a/dbms/src/AggregateFunctions/AggregateFunctionHistogram.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionHistogram.h
@ -2,7 +2,8 @@

 #include <Common/Arena.h>
 #include <Common/PODArray.h>
-#include <Common/RadixSort.h>
+#include <Common/AutoArray.h>
+
 #include <Columns/ColumnVector.h>
 #include <Columns/ColumnTuple.h>
 #include <Columns/ColumnArray.h>
@ -17,6 +18,8 @@

 #include <AggregateFunctions/IAggregateFunction.h>

+#include <queue>
+
 namespace DB {

 namespace ErrorCodes
@ -31,8 +34,8 @@ namespace ErrorCodes
 class AggregateFunctionHistogramData
 {
 public:
-    using Mean = Float32;
-    using Weight = Float32;
+    using Mean = Float64;
+    using Weight = Float64;

 private:
    struct WeightedValue {
@ -41,83 +44,113 @@ private:

        WeightedValue operator + (const WeightedValue& other)
        {
-            return {(other.mean * other.weight + mean * weight) / (other.weight + weight), other.weight + weight};
+            return {mean + other.weight * (other.mean - mean) / (other.weight + weight), other.weight + weight};
        }
    };

-    struct RadixSortTraits
-    {
-        using Element = WeightedValue;
-        using Key = Mean;
-        using CountType = UInt32;
-        using KeyBits = UInt32;
-
-        static constexpr size_t PART_SIZE_BITS = 8;
-
-        using Transform = RadixSortFloatTransform<KeyBits>;
-        using Allocator = RadixSortMallocAllocator;
-
-        static Key & extractKey(Element & elem) { return elem.mean; }
-    };
-
 private:
    UInt32 max_bins;
+
+    // quantity of stored weighted-values
    UInt32 size;
+
+    // calculated lower and upper bounds of seen points
    Mean lower_bound;
    Mean upper_bound;

-    static constexpr Float32 epsilon = 1e-8;
+    static constexpr Mean epsilon = 1e-8;

    // Weighted values representation of histogram.
    // We allow up to max_bins * 2 values stored in intermediate states
+    // Is allocated in arena, so there is no explicit management.
    WeightedValue* points;

 private:
    void sort()
    {
-        RadixSort<RadixSortTraits>::execute(points, size);
+        std::sort(points, points + size,
+                [](const WeightedValue& first, const WeightedValue& second)
+                {
+                    return first.mean < second.mean;
+                });
    }

    /**
-     * Repeatedly fuse until max_bins bins left
+     * Repeatedly fuse most close values until max_bins bins left
     */
    void compress()
    {
        sort();
-        while (size > max_bins)
-        {
-            size_t min_index = 0;
-            auto quality = [&](size_t i)
-            {
-                return points[i + 1].mean - points[i].mean;
-            };
-            for (size_t i = 0; i + 1 != size; ++i)
-            {
-                if (quality(min_index) > quality(i))
-                {
-                    min_index = i;
-                }
-            }
+        auto newsz = size;
+        if (size <= max_bins) return;

-            points[min_index] = points[min_index] + points[min_index + 1];
-            for (size_t i = min_index + 1; i + 1 < size; ++i)
-            {
-                points[i] = points[i + 1];
-            }
-            size--;
+        // Maintain doubly-linked list of "active" points
+        // and store neighbour pairs in priority queue by distance
+        AutoArray<UInt32> previous(size + 1);
+        AutoArray<UInt32> next(size + 1);
+        AutoArray<bool> active(size + 1, true);
+        active[size] = false;
+        auto delete_node = [&](UInt32 i)
+        {
+            previous[next[i]] = previous[i];
+            next[previous[i]] = next[i];
+            active[i] = false;
+        };
+        for (size_t i = 0; i <= size; i++)
+        {
+            previous[i] = i - 1;
+            next[i] = i + 1;
        }
+        next[size] = 0;
+        previous[0] = size;
+
+        using QueueItem = std::pair<Mean, UInt32>;
+
+        std::vector<QueueItem> storage;
+        storage.reserve(2 * size - max_bins);
+        std::priority_queue<QueueItem, std::vector<QueueItem>, std::greater<QueueItem>> queue;
+
+        auto quality = [&](UInt32 i) { return points[next[i]].mean - points[i].mean; };
+        for (size_t i = 0; i + 1 < size; i++)
+            queue.push({quality(i), i});
+
+        while (newsz > max_bins && !queue.empty())
+        {
+            auto min_item = queue.top();
+            queue.pop();
+            auto l = min_item.second;
+            auto r = next[l];
+            if (!active[l] || !active[r] || quality(l) > min_item.first)
+                continue;
+
+            points[l] = points[l] + points[r];
+
+            delete_node(r);
+            if (active[next[l]])
+                queue.push({quality(l), l});
+            if (active[previous[l]])
+                queue.push({quality(previous[l]), previous[l]});
+
+            newsz--;
+        }
+
+        size_t l = 0;
+        for (size_t r = 0; r < size; r++)
+            if (active[r])
+                points[l++] = points[r];
+        size = newsz;
    }

    /***
     * Delete too close points from histogram.
-     * Assume that points are sorted.
+     * Assumes that points are sorted.
     */
    void unique()
    {
        size_t l = 0;
-        for (auto r = l + 1; r != size; r++)
+        for (auto r = l + 1; r < size; r++)
        {
-            if (abs(points[l].mean - points[r].mean) < epsilon)
+            if (points[l].mean + epsilon >= points[r].mean)
            {
                points[l] = points[l] + points[r];
            }
@ -139,11 +172,15 @@ public:
    AggregateFunctionHistogramData(UInt32 max_bins)
        : max_bins(max_bins)
        , size(0)
+        , lower_bound(std::numeric_limits<Mean>::max())
+        , upper_bound(std::numeric_limits<Mean>::lowest())
        , points(nullptr)
    {
    }

    void insertResultInto(ColumnVector<Mean>& to_lower, ColumnVector<Mean>& to_upper, ColumnVector<Weight>& to_weights) {
+        if (!points) return;
+
        compress();
        unique();

@ -167,7 +204,7 @@ public:
        lower_bound = std::min(lower_bound, value);
        upper_bound = std::max(upper_bound, value);

-        if (size > max_bins * 2)
+        if (size >= max_bins * 2)
        {
            compress();
        }
@ -199,7 +236,7 @@ public:

        readVarUInt(size, buf);

-        if (size > max_bins)
+        if (size > max_bins * 2)
            throw Exception("Too many bins", ErrorCodes::TOO_LARGE_ARRAY_SIZE);

        if (!points)