Add requested changes. [#CLICKHOUSE-3606]

2024-11-24 08:32:02 +00:00 · 2018-02-26 04:27:33 +03:00 · 2018-02-26 04:27:33 +03:00 · 120530e44c
commit 120530e44c
parent 11b4cf3163
8 changed files with 144 additions and 117 deletions
--- a/dbms/src/Functions/FunctionsConsistentHashing.cpp
+++ b/dbms/src/Functions/FunctionsConsistentHashing.cpp
@ -8,7 +8,8 @@ namespace DB
 void registerFunctionsConsistentHashing(FunctionFactory & factory)
 {
    factory.registerFunction<FunctionYandexConsistentHash>();
-    factory.registerFunction<FunctionJumpConsistentHas>();
+    factory.registerFunction<FunctionJumpConsistentHash>();
+    factory.registerFunction<FunctionSumburConsistentHash>();
 }

 }
--- a/dbms/src/Functions/FunctionsConsistentHashing.h
+++ b/dbms/src/Functions/FunctionsConsistentHashing.h
@ -6,9 +6,8 @@
 #include <Columns/ColumnConst.h>
 #include <Functions/IFunction.h>
 #include <Functions/FunctionHelpers.h>
-
+#include <common/likely.h>
 #include <yandex/consistent_hashing.h>
-#include <iostream>


 namespace DB
@ -23,10 +22,12 @@ namespace ErrorCodes
 }


+/// An O(1) time and space consistent hash algorithm by Konstantin Oblakov
 struct YandexConsistentHashImpl
 {
-    static constexpr auto name = "YandexConsistentHash";
+    static constexpr auto name = "yandexConsistentHash";

+    using HashType = UInt64;
    /// Actually it supports UInt64, but it is effective only if n < 65536
    using ResultType = UInt32;
    using BucketsCountType = ResultType;
@ -51,8 +52,9 @@ static inline int32_t JumpConsistentHash(uint64_t key, int32_t num_buckets) {

 struct JumpConsistentHashImpl
 {
-    static constexpr auto name = "JumpConsistentHash";
+    static constexpr auto name = "jumpConsistentHash";

+    using HashType = UInt64;
    using ResultType = Int32;
    using BucketsCountType = ResultType;

@ -63,14 +65,57 @@ struct JumpConsistentHashImpl
 };


+/// Sumbur algorithm https://github.com/mailru/sumbur-ruby/blob/master/lib/sumbur/pure_ruby.rb
+static inline UInt32 sumburConsistentHash(UInt32 hashed_integer, UInt32 cluster_capacity)
+{
+    UInt32 l = 0xFFFFFFFF;
+    UInt32 part = l / cluster_capacity;
+
+    if (l - hashed_integer < part)
+        return 0;
+
+    UInt32 h = hashed_integer;
+    UInt32 n = 1;
+    UInt32 i = 2;
+    while (i <= cluster_capacity)
+    {
+        auto c = l / (i * (i - 1));
+        if (c <= h)
+            h -= c;
+        else
+        {
+            h += c * (i - n - 1);
+            n = i;
+            if (l / n - h < part)
+                break;
+        }
+        i += 1;
+    }
+
+    return n - 1;
+}
+
+struct SumburConsistentHashImpl
+{
+    static constexpr auto name = "sumburConsistentHash";
+
+    using HashType = UInt32;
+    using ResultType = UInt32;
+    using BucketsCountType = ResultType;
+
+    static inline ResultType apply(UInt32 hash, BucketsCountType n)
+    {
+        return sumburConsistentHash(hash, n);
+    }
+};
+
+
 template <typename Impl>
 class FunctionConsistentHashImpl : public IFunction
 {
 public:

    static constexpr auto name = Impl::name;
-    using ResultType = typename Impl::ResultType;
-    using BucketsType = typename Impl::BucketsCountType;

    static FunctionPtr create(const Context &) { return std::make_shared<FunctionConsistentHashImpl<Impl>>(); };

@ -84,6 +129,10 @@ public:
            throw Exception("Illegal type " + arguments[0]->getName() + " of the first argument of function " + getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

+        if (arguments[0]->getSizeOfValueInMemory() > sizeof(HashType))
+            throw Exception("Function " + getName() + " accepts " + std::to_string(sizeof(HashType) * 8) + "-bit integers at most"
+                            + ", got " + arguments[0]->getName(), ErrorCodes::BAD_ARGUMENTS);
+
        if (!arguments[1]->isInteger())
            throw Exception("Illegal type " + arguments[1]->getName() + " of the second argument of function " + getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -91,83 +140,91 @@ public:
        return std::make_shared<DataTypeNumber<ResultType>>();
    }

+    bool useDefaultImplementationForConstants() const override { return true; }
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
+
    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override
    {
-        auto buckets_col = block.getByPosition(arguments[1]).column.get();
-        if (!buckets_col->isColumnConst())
+        if (block.getByPosition(arguments[1]).column->isColumnConst())
+            executeConstBuckets(block, arguments, result);
+        else
            throw Exception("The second argument of function " + getName() + " (number of buckets) must be constant", ErrorCodes::BAD_ARGUMENTS);
-
-        constexpr UInt64 max_buckets = static_cast<UInt64>(std::numeric_limits<BucketsType>::max());
-        UInt64 num_buckets;
-
-        auto check_range = [&] (auto buckets)
-        {
-            if (buckets <= 0)
-                throw Exception("The second argument of function " + getName() + " (number of buckets) must be positive number",
-                                ErrorCodes::BAD_ARGUMENTS);
-
-            if (static_cast<UInt64>(buckets) > max_buckets)
-                throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " +
-                            DataTypeNumber<BucketsType>().getName(), ErrorCodes::BAD_ARGUMENTS);
-
-            num_buckets = static_cast<UInt64>(buckets);
-        };
-
-        Field buckets_field = (*buckets_col)[0];
-        if (buckets_field.getType() == Field::Types::Int64)
-            check_range(buckets_field.safeGet<Int64>());
-        else if (buckets_field.getType() == Field::Types::UInt64)
-            check_range(buckets_field.safeGet<UInt64>());
-        else
-            throw Exception("Illegal type " + String(buckets_field.getTypeName()) + " of the second argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-
-
-        const auto & hash_col_source = block.getByPosition(arguments[0]).column;
-        ColumnPtr hash_col = (hash_col_source->isColumnConst()) ? hash_col_source->convertToFullColumnIfConst() : hash_col_source;
-        ColumnPtr & res_col = block.getByPosition(result).column;
-
-
-        const IDataType * hash_type = block.getByPosition(arguments[0]).type.get();
-
-        if      (checkDataType<DataTypeUInt8>(hash_type)) executeType<UInt8>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeUInt16>(hash_type)) executeType<UInt16>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeUInt32>(hash_type)) executeType<UInt32>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeUInt64>(hash_type)) executeType<UInt64>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeInt8>(hash_type)) executeType<Int8>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeInt16>(hash_type)) executeType<Int16>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeInt32>(hash_type)) executeType<Int32>(hash_col, res_col, num_buckets);
-        else if (checkDataType<DataTypeInt64>(hash_type)) executeType<Int64>(hash_col, res_col, num_buckets);
-        else
-            throw Exception("Illegal type " + hash_type->getName() + " of the first argument of function " + getName(),
-                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

 private:

-    template <typename HashType>
-    void executeType(const ColumnPtr & col_hash_ptr, ColumnPtr & out_col_result, const UInt64 num_buckets)
+    using HashType = typename Impl::HashType;
+    using ResultType = typename Impl::ResultType;
+    using BucketsType = typename Impl::BucketsCountType;
+    static constexpr auto max_buckets = static_cast<UInt64>(std::numeric_limits<BucketsType>::max());
+
+    template <typename T>
+    inline BucketsType checkBucketsRange(T buckets)
    {
-        auto col_hash = checkAndGetColumn<ColumnVector<HashType>>(col_hash_ptr.get());
+        if (unlikely(buckets <= 0))
+            throw Exception("The second argument of function " + getName() + " (number of buckets) must be positive number",
+                            ErrorCodes::BAD_ARGUMENTS);
+
+        if (unlikely(static_cast<UInt64>(buckets) > max_buckets))
+            throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " +
+                        DataTypeNumber<BucketsType>().getName(), ErrorCodes::BAD_ARGUMENTS);
+
+        return static_cast<BucketsType>(buckets);
+    }
+
+    void executeConstBuckets(Block & block, const ColumnNumbers & arguments, size_t result)
+    {
+        Field buckets_field = (*block.getByPosition(arguments[1]).column)[0];
+        BucketsType num_buckets;
+
+        if (buckets_field.getType() == Field::Types::Int64)
+            num_buckets = checkBucketsRange(buckets_field.get<Int64>());
+        else if (buckets_field.getType() == Field::Types::UInt64)
+            num_buckets = checkBucketsRange(buckets_field.get<UInt64>());
+        else
+            throw Exception("Illegal type " + String(buckets_field.getTypeName()) + " of the second argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        const auto & hash_col = block.getByPosition(arguments[0]).column;
+        const IDataType * hash_type = block.getByPosition(arguments[0]).type.get();
+        auto res_col = ColumnVector<ResultType>::create();
+
+        if      (checkDataType<DataTypeUInt8>(hash_type)) executeType<UInt8>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeUInt16>(hash_type)) executeType<UInt16>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeUInt32>(hash_type)) executeType<UInt32>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeUInt64>(hash_type)) executeType<UInt64>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeInt8>(hash_type)) executeType<Int8>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeInt16>(hash_type)) executeType<Int16>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeInt32>(hash_type)) executeType<Int32>(hash_col, num_buckets, res_col.get());
+        else if (checkDataType<DataTypeInt64>(hash_type)) executeType<Int64>(hash_col, num_buckets, res_col.get());
+        else
+            throw Exception("Illegal type " + hash_type->getName() + " of the first argument of function " + getName(),
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        block.getByPosition(result).column = std::move(res_col);
+    }
+
+    template <typename CurrentHashType>
+    void executeType(const ColumnPtr & col_hash_ptr, BucketsType num_buckets, ColumnVector<ResultType> * col_result)
+    {
+        auto col_hash = checkAndGetColumn<ColumnVector<CurrentHashType>>(col_hash_ptr.get());
        if (!col_hash)
            throw Exception("Illegal type of the first argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
-
-        auto col_result = ColumnVector<ResultType>::create();
-        typename ColumnVector<ResultType>::Container & vec_result = col_result->getData();
+        
+        auto & vec_result = col_result->getData();
        const auto & vec_hash = col_hash->getData();

        size_t size = vec_hash.size();
        vec_result.resize(size);
        for (size_t i = 0; i < size; ++i)
-            vec_result[i] = Impl::apply(static_cast<UInt64>(vec_hash[i]), static_cast<BucketsType>(num_buckets));
-
-        out_col_result = std::move(col_result);
+            vec_result[i] = Impl::apply(static_cast<HashType>(vec_hash[i]), num_buckets);
    }
 };


 using FunctionYandexConsistentHash = FunctionConsistentHashImpl<YandexConsistentHashImpl>;
-using FunctionJumpConsistentHas = FunctionConsistentHashImpl<JumpConsistentHashImpl>;
+using FunctionJumpConsistentHash = FunctionConsistentHashImpl<JumpConsistentHashImpl>;
+using FunctionSumburConsistentHash = FunctionConsistentHashImpl<SumburConsistentHashImpl>;


 }
--- a/dbms/tests/integration/test_cluster_copier/task_month_to_week_description.xml
+++ b/dbms/tests/integration/test_cluster_copier/task_month_to_week_description.xml
@ -29,7 +29,7 @@
            <engine>ENGINE=ReplicatedMergeTree('/clickhouse/tables/cluster{cluster}/{shard}/b', '{replica}') PARTITION BY toMonday(date) ORDER BY d</engine>

            <!-- Which sarding key to use while copying -->
-            <sharding_key>JumpConsistentHash(intHash64(d), 2)</sharding_key>
+            <sharding_key>jumpConsistentHash(intHash64(d), 2)</sharding_key>

            <!-- Optional expression that filter copying data -->
            <!-- <where_condition></where_condition> -->
--- a/dbms/tests/integration/test_cluster_copier/test.py
+++ b/dbms/tests/integration/test_cluster_copier/test.py
@ -129,8 +129,8 @@ class Task2:
        assert TSV(self.cluster.instances['s0_0_0'].query("SELECT count() FROM cluster(cluster0, default, a)")) == TSV("85\n")
        assert TSV(self.cluster.instances['s1_0_0'].query("SELECT count(), uniqExact(date) FROM cluster(cluster1, default, b)")) == TSV("85\t85\n")

-        assert TSV(self.cluster.instances['s1_0_0'].query("SELECT DISTINCT JumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("0\n")
-        assert TSV(self.cluster.instances['s1_1_0'].query("SELECT DISTINCT JumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("1\n")
+        assert TSV(self.cluster.instances['s1_0_0'].query("SELECT DISTINCT jumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("0\n")
+        assert TSV(self.cluster.instances['s1_1_0'].query("SELECT DISTINCT jumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("1\n")

        assert TSV(self.cluster.instances['s1_0_0'].query("SELECT uniqExact(partition) IN (12, 13) FROM system.parts WHERE active AND database='default' AND table='b'")) == TSV("1\n")
        assert TSV(self.cluster.instances['s1_1_0'].query("SELECT uniqExact(partition) IN (12, 13) FROM system.parts WHERE active AND database='default' AND table='b'")) == TSV("1\n")
--- a/dbms/tests/queries/0_stateless/00580_consistent_hashing_functions.sql
+++ b/dbms/tests/queries/0_stateless/00580_consistent_hashing_functions.sql
@ -1,4 +1,4 @@
-SELECT JumpConsistentHash(1, 1), JumpConsistentHash(42, 57), JumpConsistentHash(256, 1024), JumpConsistentHash(3735883980, 1), JumpConsistentHash(3735883980, 666), JumpConsistentHash(16045690984833335023, 255);
-SELECT YandexConsistentHash(16045690984833335023, 1), YandexConsistentHash(16045690984833335023, 2), YandexConsistentHash(16045690984833335023, 3), YandexConsistentHash(16045690984833335023, 4), YandexConsistentHash(16045690984833335023, 173), YandexConsistentHash(16045690984833335023, 255);
-SELECT JumpConsistentHash(intHash64(number), 787) FROM system.numbers LIMIT 1000000, 2;
-SELECT YandexConsistentHash(16045690984833335023+number-number, 120) FROM system.numbers LIMIT 1000000, 2;
+SELECT jumpConsistentHash(1, 1), jumpConsistentHash(42, 57), jumpConsistentHash(256, 1024), jumpConsistentHash(3735883980, 1), jumpConsistentHash(3735883980, 666), jumpConsistentHash(16045690984833335023, 255);
+SELECT yandexConsistentHash(16045690984833335023, 1), yandexConsistentHash(16045690984833335023, 2), yandexConsistentHash(16045690984833335023, 3), yandexConsistentHash(16045690984833335023, 4), yandexConsistentHash(16045690984833335023, 173), yandexConsistentHash(16045690984833335023, 255);
+SELECT jumpConsistentHash(intHash64(number), 787) FROM system.numbers LIMIT 1000000, 2;
+SELECT yandexConsistentHash(16045690984833335023+number-number, 120) FROM system.numbers LIMIT 1000000, 2;
--- a/libs/yandex-consistent-hashing/yandex/bitops.h
+++ b/libs/yandex-consistent-hashing/yandex/bitops.h
@ -3,36 +3,33 @@
 #include <limits>
 #include <type_traits>

-// Assume little endian

-inline uint16_t & LO_16(uint32_t & x) { return reinterpret_cast<uint16_t *>(&x)[0]; }
-inline uint16_t & HI_16(uint32_t & x) { return reinterpret_cast<uint16_t *>(&x)[1]; }
+inline uint16_t LO_16(uint32_t x) { return static_cast<uint16_t>(x & 0x0000FFFF); }
+inline uint16_t HI_16(uint32_t x) { return static_cast<uint16_t>(x >> 16); }

-inline uint32_t & LO_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[0]; }
-inline uint32_t & HI_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[1]; }
+inline uint32_t LO_32(uint64_t x) { return static_cast<uint32_t>(x & 0x00000000FFFFFFFF); }
+inline uint32_t HI_32(uint64_t x) { return static_cast<uint32_t>(x >> 32); }


+/// Clang also defines __GNUC__
 #if defined(__GNUC__)
        inline unsigned GetValueBitCountImpl(unsigned int value) noexcept {
-            // Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
+            // NOTE: __builtin_clz* have undefined result for zero.
            return std::numeric_limits<unsigned int>::digits - __builtin_clz(value);
        }

        inline unsigned GetValueBitCountImpl(unsigned long value) noexcept {
-            // Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
            return std::numeric_limits<unsigned long>::digits - __builtin_clzl(value);
        }

        inline unsigned GetValueBitCountImpl(unsigned long long value) noexcept {
-            // Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
            return std::numeric_limits<unsigned long long>::digits - __builtin_clzll(value);
        }
 #else
-        /// Stupid realization for non-GCC. Can use BSR from x86 instructions set.
+        /// Stupid realization for non GCC-like compilers. Can use BSR from x86 instructions set.
        template <typename T>
        inline unsigned GetValueBitCountImpl(T value) noexcept {
-            // Y_ASSERT(value);     // because __builtin_clz* have undefined result for zero.
-            unsigned result = 1; // result == 0 - impossible value, see Y_ASSERT().
+            unsigned result = 1; // result == 0 - impossible value, since value cannot be zero
            value >>= 1;
            while (value) {
                value >>= 1;
@ -46,10 +43,10 @@ inline uint32_t & HI_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[

 /**
 * Returns the number of leading 0-bits in `value`, starting at the most significant bit position.
+ * NOTE: value cannot be zero
 */
 template <typename T>
 static inline unsigned GetValueBitCount(T value) noexcept {
-    // Y_ASSERT(value > 0);
    using TCvt = std::make_unsigned_t<std::decay_t<T>>;
    return GetValueBitCountImpl(static_cast<TCvt>(value));
 }
--- a/libs/yandex-consistent-hashing/yandex/consistent_hashing.h
+++ b/libs/yandex-consistent-hashing/yandex/consistent_hashing.h
@ -4,6 +4,8 @@
 #include <cstddef>

 /*
+ * Author: Konstantin Oblakov
+ *
 * Maps random ui64 x (in fact hash of some string) to n baskets/shards.
 * Output value is id of a basket. 0 <= ConsistentHashing(x, n) < n.
 * Probability of all baskets must be equal. Also, it should be consistent
--- a/libs/yandex-consistent-hashing/yandex/popcount.h
+++ b/libs/yandex-consistent-hashing/yandex/popcount.h
@ -11,12 +11,6 @@ using std::size_t;
 #include <intrin.h>
 #endif

-#ifdef __SSE2__
-constexpr bool HavePOPCNTInstr = true;
-#else
-constexpr bool HavePOPCNTInstr = false;
-#pragma GCC warning "SSE2 is not detected, PopCount function will be too slow"
-#endif

 static inline uint32_t PopCountImpl(uint8_t n) {
    extern uint8_t const* PopCountLUT8;
@ -35,21 +29,9 @@ static inline uint32_t PopCountImpl(uint16_t n) {
 static inline uint32_t PopCountImpl(uint32_t n) {
 #if defined(_MSC_VER)
    return __popcnt(n);
+#elif defined(__GNUC__) // it is true for Clang also
+    return __builtin_popcount(n);
 #else
-#if defined(__x86_64__)
-
-    if (HavePOPCNTInstr) {
-        uint32_t r;
-
-        __asm__("popcnt %1, %0;"
-                : "=r"(r)
-                : "r"(n)
-                :);
-
-        return r;
-    }
-#endif
-
    return PopCountImpl((uint16_t)LO_16(n)) + PopCountImpl((uint16_t)HI_16(n));
 #endif
 }
@ -57,21 +39,9 @@ static inline uint32_t PopCountImpl(uint32_t n) {
 static inline uint32_t PopCountImpl(uint64_t n) {
 #if defined(_MSC_VER) && !defined(_i386_)
    return __popcnt64(n);
+#elif defined(__GNUC__) // it is true for Clang also
+    return __builtin_popcountll(n);
 #else
-#if defined(__x86_64__)
-
-    if (HavePOPCNTInstr) {
-        uint64_t r;
-
-        __asm__("popcnt %1, %0;"
-                : "=r"(r)
-                : "r"(n)
-                :);
-
-        return r;
-    }
-#endif
-
    return PopCountImpl((uint32_t)LO_32(n)) + PopCountImpl((uint32_t)HI_32(n));
 #endif
 }