Add requested changes. [#CLICKHOUSE-3606]

This commit is contained in:
Vitaliy Lyudvichenko 2018-02-26 04:27:33 +03:00
parent 11b4cf3163
commit 120530e44c
8 changed files with 144 additions and 117 deletions

View File

@ -8,7 +8,8 @@ namespace DB
void registerFunctionsConsistentHashing(FunctionFactory & factory)
{
factory.registerFunction<FunctionYandexConsistentHash>();
factory.registerFunction<FunctionJumpConsistentHas>();
factory.registerFunction<FunctionJumpConsistentHash>();
factory.registerFunction<FunctionSumburConsistentHash>();
}
}

View File

@ -6,9 +6,8 @@
#include <Columns/ColumnConst.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <common/likely.h>
#include <yandex/consistent_hashing.h>
#include <iostream>
namespace DB
@ -23,10 +22,12 @@ namespace ErrorCodes
}
/// An O(1) time and space consistent hash algorithm by Konstantin Oblakov
struct YandexConsistentHashImpl
{
static constexpr auto name = "YandexConsistentHash";
static constexpr auto name = "yandexConsistentHash";
using HashType = UInt64;
/// Actually it supports UInt64, but it is effective only if n < 65536
using ResultType = UInt32;
using BucketsCountType = ResultType;
@ -51,8 +52,9 @@ static inline int32_t JumpConsistentHash(uint64_t key, int32_t num_buckets) {
struct JumpConsistentHashImpl
{
static constexpr auto name = "JumpConsistentHash";
static constexpr auto name = "jumpConsistentHash";
using HashType = UInt64;
using ResultType = Int32;
using BucketsCountType = ResultType;
@ -63,14 +65,57 @@ struct JumpConsistentHashImpl
};
/// Sumbur algorithm https://github.com/mailru/sumbur-ruby/blob/master/lib/sumbur/pure_ruby.rb
static inline UInt32 sumburConsistentHash(UInt32 hashed_integer, UInt32 cluster_capacity)
{
UInt32 l = 0xFFFFFFFF;
UInt32 part = l / cluster_capacity;
if (l - hashed_integer < part)
return 0;
UInt32 h = hashed_integer;
UInt32 n = 1;
UInt32 i = 2;
while (i <= cluster_capacity)
{
auto c = l / (i * (i - 1));
if (c <= h)
h -= c;
else
{
h += c * (i - n - 1);
n = i;
if (l / n - h < part)
break;
}
i += 1;
}
return n - 1;
}
struct SumburConsistentHashImpl
{
static constexpr auto name = "sumburConsistentHash";
using HashType = UInt32;
using ResultType = UInt32;
using BucketsCountType = ResultType;
static inline ResultType apply(UInt32 hash, BucketsCountType n)
{
return sumburConsistentHash(hash, n);
}
};
template <typename Impl>
class FunctionConsistentHashImpl : public IFunction
{
public:
static constexpr auto name = Impl::name;
using ResultType = typename Impl::ResultType;
using BucketsType = typename Impl::BucketsCountType;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionConsistentHashImpl<Impl>>(); };
@ -84,6 +129,10 @@ public:
throw Exception("Illegal type " + arguments[0]->getName() + " of the first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (arguments[0]->getSizeOfValueInMemory() > sizeof(HashType))
throw Exception("Function " + getName() + " accepts " + std::to_string(sizeof(HashType) * 8) + "-bit integers at most"
+ ", got " + arguments[0]->getName(), ErrorCodes::BAD_ARGUMENTS);
if (!arguments[1]->isInteger())
throw Exception("Illegal type " + arguments[1]->getName() + " of the second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -91,83 +140,91 @@ public:
return std::make_shared<DataTypeNumber<ResultType>>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) override
{
auto buckets_col = block.getByPosition(arguments[1]).column.get();
if (!buckets_col->isColumnConst())
if (block.getByPosition(arguments[1]).column->isColumnConst())
executeConstBuckets(block, arguments, result);
else
throw Exception("The second argument of function " + getName() + " (number of buckets) must be constant", ErrorCodes::BAD_ARGUMENTS);
constexpr UInt64 max_buckets = static_cast<UInt64>(std::numeric_limits<BucketsType>::max());
UInt64 num_buckets;
auto check_range = [&] (auto buckets)
{
if (buckets <= 0)
throw Exception("The second argument of function " + getName() + " (number of buckets) must be positive number",
ErrorCodes::BAD_ARGUMENTS);
if (static_cast<UInt64>(buckets) > max_buckets)
throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " +
DataTypeNumber<BucketsType>().getName(), ErrorCodes::BAD_ARGUMENTS);
num_buckets = static_cast<UInt64>(buckets);
};
Field buckets_field = (*buckets_col)[0];
if (buckets_field.getType() == Field::Types::Int64)
check_range(buckets_field.safeGet<Int64>());
else if (buckets_field.getType() == Field::Types::UInt64)
check_range(buckets_field.safeGet<UInt64>());
else
throw Exception("Illegal type " + String(buckets_field.getTypeName()) + " of the second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const auto & hash_col_source = block.getByPosition(arguments[0]).column;
ColumnPtr hash_col = (hash_col_source->isColumnConst()) ? hash_col_source->convertToFullColumnIfConst() : hash_col_source;
ColumnPtr & res_col = block.getByPosition(result).column;
const IDataType * hash_type = block.getByPosition(arguments[0]).type.get();
if (checkDataType<DataTypeUInt8>(hash_type)) executeType<UInt8>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeUInt16>(hash_type)) executeType<UInt16>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeUInt32>(hash_type)) executeType<UInt32>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeUInt64>(hash_type)) executeType<UInt64>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeInt8>(hash_type)) executeType<Int8>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeInt16>(hash_type)) executeType<Int16>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeInt32>(hash_type)) executeType<Int32>(hash_col, res_col, num_buckets);
else if (checkDataType<DataTypeInt64>(hash_type)) executeType<Int64>(hash_col, res_col, num_buckets);
else
throw Exception("Illegal type " + hash_type->getName() + " of the first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
private:
template <typename HashType>
void executeType(const ColumnPtr & col_hash_ptr, ColumnPtr & out_col_result, const UInt64 num_buckets)
using HashType = typename Impl::HashType;
using ResultType = typename Impl::ResultType;
using BucketsType = typename Impl::BucketsCountType;
static constexpr auto max_buckets = static_cast<UInt64>(std::numeric_limits<BucketsType>::max());
template <typename T>
inline BucketsType checkBucketsRange(T buckets)
{
auto col_hash = checkAndGetColumn<ColumnVector<HashType>>(col_hash_ptr.get());
if (unlikely(buckets <= 0))
throw Exception("The second argument of function " + getName() + " (number of buckets) must be positive number",
ErrorCodes::BAD_ARGUMENTS);
if (unlikely(static_cast<UInt64>(buckets) > max_buckets))
throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " +
DataTypeNumber<BucketsType>().getName(), ErrorCodes::BAD_ARGUMENTS);
return static_cast<BucketsType>(buckets);
}
void executeConstBuckets(Block & block, const ColumnNumbers & arguments, size_t result)
{
Field buckets_field = (*block.getByPosition(arguments[1]).column)[0];
BucketsType num_buckets;
if (buckets_field.getType() == Field::Types::Int64)
num_buckets = checkBucketsRange(buckets_field.get<Int64>());
else if (buckets_field.getType() == Field::Types::UInt64)
num_buckets = checkBucketsRange(buckets_field.get<UInt64>());
else
throw Exception("Illegal type " + String(buckets_field.getTypeName()) + " of the second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const auto & hash_col = block.getByPosition(arguments[0]).column;
const IDataType * hash_type = block.getByPosition(arguments[0]).type.get();
auto res_col = ColumnVector<ResultType>::create();
if (checkDataType<DataTypeUInt8>(hash_type)) executeType<UInt8>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeUInt16>(hash_type)) executeType<UInt16>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeUInt32>(hash_type)) executeType<UInt32>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeUInt64>(hash_type)) executeType<UInt64>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeInt8>(hash_type)) executeType<Int8>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeInt16>(hash_type)) executeType<Int16>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeInt32>(hash_type)) executeType<Int32>(hash_col, num_buckets, res_col.get());
else if (checkDataType<DataTypeInt64>(hash_type)) executeType<Int64>(hash_col, num_buckets, res_col.get());
else
throw Exception("Illegal type " + hash_type->getName() + " of the first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
block.getByPosition(result).column = std::move(res_col);
}
template <typename CurrentHashType>
void executeType(const ColumnPtr & col_hash_ptr, BucketsType num_buckets, ColumnVector<ResultType> * col_result)
{
auto col_hash = checkAndGetColumn<ColumnVector<CurrentHashType>>(col_hash_ptr.get());
if (!col_hash)
throw Exception("Illegal type of the first argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto col_result = ColumnVector<ResultType>::create();
typename ColumnVector<ResultType>::Container & vec_result = col_result->getData();
auto & vec_result = col_result->getData();
const auto & vec_hash = col_hash->getData();
size_t size = vec_hash.size();
vec_result.resize(size);
for (size_t i = 0; i < size; ++i)
vec_result[i] = Impl::apply(static_cast<UInt64>(vec_hash[i]), static_cast<BucketsType>(num_buckets));
out_col_result = std::move(col_result);
vec_result[i] = Impl::apply(static_cast<HashType>(vec_hash[i]), num_buckets);
}
};
using FunctionYandexConsistentHash = FunctionConsistentHashImpl<YandexConsistentHashImpl>;
using FunctionJumpConsistentHas = FunctionConsistentHashImpl<JumpConsistentHashImpl>;
using FunctionJumpConsistentHash = FunctionConsistentHashImpl<JumpConsistentHashImpl>;
using FunctionSumburConsistentHash = FunctionConsistentHashImpl<SumburConsistentHashImpl>;
}

View File

@ -29,7 +29,7 @@
<engine>ENGINE=ReplicatedMergeTree('/clickhouse/tables/cluster{cluster}/{shard}/b', '{replica}') PARTITION BY toMonday(date) ORDER BY d</engine>
<!-- Which sarding key to use while copying -->
<sharding_key>JumpConsistentHash(intHash64(d), 2)</sharding_key>
<sharding_key>jumpConsistentHash(intHash64(d), 2)</sharding_key>
<!-- Optional expression that filter copying data -->
<!-- <where_condition></where_condition> -->

View File

@ -129,8 +129,8 @@ class Task2:
assert TSV(self.cluster.instances['s0_0_0'].query("SELECT count() FROM cluster(cluster0, default, a)")) == TSV("85\n")
assert TSV(self.cluster.instances['s1_0_0'].query("SELECT count(), uniqExact(date) FROM cluster(cluster1, default, b)")) == TSV("85\t85\n")
assert TSV(self.cluster.instances['s1_0_0'].query("SELECT DISTINCT JumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("0\n")
assert TSV(self.cluster.instances['s1_1_0'].query("SELECT DISTINCT JumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("1\n")
assert TSV(self.cluster.instances['s1_0_0'].query("SELECT DISTINCT jumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("0\n")
assert TSV(self.cluster.instances['s1_1_0'].query("SELECT DISTINCT jumpConsistentHash(intHash64(d), 2) FROM b")) == TSV("1\n")
assert TSV(self.cluster.instances['s1_0_0'].query("SELECT uniqExact(partition) IN (12, 13) FROM system.parts WHERE active AND database='default' AND table='b'")) == TSV("1\n")
assert TSV(self.cluster.instances['s1_1_0'].query("SELECT uniqExact(partition) IN (12, 13) FROM system.parts WHERE active AND database='default' AND table='b'")) == TSV("1\n")

View File

@ -1,4 +1,4 @@
SELECT JumpConsistentHash(1, 1), JumpConsistentHash(42, 57), JumpConsistentHash(256, 1024), JumpConsistentHash(3735883980, 1), JumpConsistentHash(3735883980, 666), JumpConsistentHash(16045690984833335023, 255);
SELECT YandexConsistentHash(16045690984833335023, 1), YandexConsistentHash(16045690984833335023, 2), YandexConsistentHash(16045690984833335023, 3), YandexConsistentHash(16045690984833335023, 4), YandexConsistentHash(16045690984833335023, 173), YandexConsistentHash(16045690984833335023, 255);
SELECT JumpConsistentHash(intHash64(number), 787) FROM system.numbers LIMIT 1000000, 2;
SELECT YandexConsistentHash(16045690984833335023+number-number, 120) FROM system.numbers LIMIT 1000000, 2;
SELECT jumpConsistentHash(1, 1), jumpConsistentHash(42, 57), jumpConsistentHash(256, 1024), jumpConsistentHash(3735883980, 1), jumpConsistentHash(3735883980, 666), jumpConsistentHash(16045690984833335023, 255);
SELECT yandexConsistentHash(16045690984833335023, 1), yandexConsistentHash(16045690984833335023, 2), yandexConsistentHash(16045690984833335023, 3), yandexConsistentHash(16045690984833335023, 4), yandexConsistentHash(16045690984833335023, 173), yandexConsistentHash(16045690984833335023, 255);
SELECT jumpConsistentHash(intHash64(number), 787) FROM system.numbers LIMIT 1000000, 2;
SELECT yandexConsistentHash(16045690984833335023+number-number, 120) FROM system.numbers LIMIT 1000000, 2;

View File

@ -3,36 +3,33 @@
#include <limits>
#include <type_traits>
// Assume little endian
inline uint16_t & LO_16(uint32_t & x) { return reinterpret_cast<uint16_t *>(&x)[0]; }
inline uint16_t & HI_16(uint32_t & x) { return reinterpret_cast<uint16_t *>(&x)[1]; }
inline uint16_t LO_16(uint32_t x) { return static_cast<uint16_t>(x & 0x0000FFFF); }
inline uint16_t HI_16(uint32_t x) { return static_cast<uint16_t>(x >> 16); }
inline uint32_t & LO_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[0]; }
inline uint32_t & HI_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[1]; }
inline uint32_t LO_32(uint64_t x) { return static_cast<uint32_t>(x & 0x00000000FFFFFFFF); }
inline uint32_t HI_32(uint64_t x) { return static_cast<uint32_t>(x >> 32); }
/// Clang also defines __GNUC__
#if defined(__GNUC__)
inline unsigned GetValueBitCountImpl(unsigned int value) noexcept {
// Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
// NOTE: __builtin_clz* have undefined result for zero.
return std::numeric_limits<unsigned int>::digits - __builtin_clz(value);
}
inline unsigned GetValueBitCountImpl(unsigned long value) noexcept {
// Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
return std::numeric_limits<unsigned long>::digits - __builtin_clzl(value);
}
inline unsigned GetValueBitCountImpl(unsigned long long value) noexcept {
// Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
return std::numeric_limits<unsigned long long>::digits - __builtin_clzll(value);
}
#else
/// Stupid realization for non-GCC. Can use BSR from x86 instructions set.
/// Stupid realization for non GCC-like compilers. Can use BSR from x86 instructions set.
template <typename T>
inline unsigned GetValueBitCountImpl(T value) noexcept {
// Y_ASSERT(value); // because __builtin_clz* have undefined result for zero.
unsigned result = 1; // result == 0 - impossible value, see Y_ASSERT().
unsigned result = 1; // result == 0 - impossible value, since value cannot be zero
value >>= 1;
while (value) {
value >>= 1;
@ -46,10 +43,10 @@ inline uint32_t & HI_32(uint64_t & x) { return reinterpret_cast<uint32_t *>(&x)[
/**
* Returns the number of leading 0-bits in `value`, starting at the most significant bit position.
* NOTE: value cannot be zero
*/
template <typename T>
static inline unsigned GetValueBitCount(T value) noexcept {
// Y_ASSERT(value > 0);
using TCvt = std::make_unsigned_t<std::decay_t<T>>;
return GetValueBitCountImpl(static_cast<TCvt>(value));
}

View File

@ -4,6 +4,8 @@
#include <cstddef>
/*
* Author: Konstantin Oblakov
*
* Maps random ui64 x (in fact hash of some string) to n baskets/shards.
* Output value is id of a basket. 0 <= ConsistentHashing(x, n) < n.
* Probability of all baskets must be equal. Also, it should be consistent

View File

@ -11,12 +11,6 @@ using std::size_t;
#include <intrin.h>
#endif
#ifdef __SSE2__
constexpr bool HavePOPCNTInstr = true;
#else
constexpr bool HavePOPCNTInstr = false;
#pragma GCC warning "SSE2 is not detected, PopCount function will be too slow"
#endif
static inline uint32_t PopCountImpl(uint8_t n) {
extern uint8_t const* PopCountLUT8;
@ -35,21 +29,9 @@ static inline uint32_t PopCountImpl(uint16_t n) {
static inline uint32_t PopCountImpl(uint32_t n) {
#if defined(_MSC_VER)
return __popcnt(n);
#elif defined(__GNUC__) // it is true for Clang also
return __builtin_popcount(n);
#else
#if defined(__x86_64__)
if (HavePOPCNTInstr) {
uint32_t r;
__asm__("popcnt %1, %0;"
: "=r"(r)
: "r"(n)
:);
return r;
}
#endif
return PopCountImpl((uint16_t)LO_16(n)) + PopCountImpl((uint16_t)HI_16(n));
#endif
}
@ -57,21 +39,9 @@ static inline uint32_t PopCountImpl(uint32_t n) {
static inline uint32_t PopCountImpl(uint64_t n) {
#if defined(_MSC_VER) && !defined(_i386_)
return __popcnt64(n);
#elif defined(__GNUC__) // it is true for Clang also
return __builtin_popcountll(n);
#else
#if defined(__x86_64__)
if (HavePOPCNTInstr) {
uint64_t r;
__asm__("popcnt %1, %0;"
: "=r"(r)
: "r"(n)
:);
return r;
}
#endif
return PopCountImpl((uint32_t)LO_32(n)) + PopCountImpl((uint32_t)HI_32(n));
#endif
}