mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 07:01:59 +00:00
Support trailing zero bytes in string hash map.
This commit is contained in:
parent
e381246bc1
commit
cd76ba3c19
@ -25,9 +25,13 @@ struct StringHashMapCell<StringKey16, TMapped> : public HashMapCell<StringKey16,
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
|
||||
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
|
||||
static bool isZero(const StringKey16 & key, const HashTableNoState & /*state*/) { return key.low == 0; }
|
||||
void setZero() { this->value.first.low = 0; }
|
||||
|
||||
// Zero means unoccupied cells in hash table. Use key with last word = 0 as
|
||||
// zero keys, because such keys are unrepresentable (no way to encode length).
|
||||
static bool isZero(const StringKey16 & key, const HashTableNoState &)
|
||||
{ return key.high == 0; }
|
||||
void setZero() { this->value.first.high = 0; }
|
||||
|
||||
// external
|
||||
const StringRef getKey() const { return toStringRef(this->value.first); }
|
||||
// internal
|
||||
@ -42,9 +46,13 @@ struct StringHashMapCell<StringKey24, TMapped> : public HashMapCell<StringKey24,
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
|
||||
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
|
||||
static bool isZero(const StringKey24 & key, const HashTableNoState & /*state*/) { return key.a == 0; }
|
||||
void setZero() { this->value.first.a = 0; }
|
||||
|
||||
// Zero means unoccupied cells in hash table. Use key with last word = 0 as
|
||||
// zero keys, because such keys are unrepresentable (no way to encode length).
|
||||
static bool isZero(const StringKey24 & key, const HashTableNoState &)
|
||||
{ return key.c == 0; }
|
||||
void setZero() { this->value.first.c = 0; }
|
||||
|
||||
// external
|
||||
const StringRef getKey() const { return toStringRef(this->value.first); }
|
||||
// internal
|
||||
|
@ -18,14 +18,17 @@ struct StringKey24
|
||||
|
||||
/// Builds a StringRef that views the bytes of an 8-byte key in place (the
/// result points at the storage of `n`, nothing is copied).
/// The length is 8 minus the number of whole leading zero bytes of `n`
/// (its leading zero bit count divided by 8).
/// `n` must be non-zero: the assert guards the __builtin_clzll call, whose
/// result is undefined for a zero argument.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n)
{
    assert(n != 0);
    const auto * bytes = reinterpret_cast<const char *>(&n);
    const auto zero_bytes = __builtin_clzll(n) >> 3;
    return {bytes, 8ul - zero_bytes};
}
|
||||
/// Builds a StringRef that views the bytes of a 16-byte key in place (the
/// result points at the storage of `n`, nothing is copied).
/// The length is 16 minus the number of whole leading zero bytes of `n.high`
/// (its leading zero bit count divided by 8).
/// `n.high` must be non-zero: the assert guards the __builtin_clzll call,
/// whose result is undefined for a zero argument.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n)
{
    assert(n.high != 0);
    const auto * bytes = reinterpret_cast<const char *>(&n);
    const auto zero_bytes = __builtin_clzll(n.high) >> 3;
    return {bytes, 16ul - zero_bytes};
}
|
||||
/// Builds a StringRef that views the bytes of a 24-byte key in place (the
/// result points at the storage of `n`, nothing is copied).
/// The length is 24 minus the number of whole leading zero bytes of `n.c`
/// (its leading zero bit count divided by 8).
/// `n.c` must be non-zero: the assert guards the __builtin_clzll call, whose
/// result is undefined for a zero argument.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n)
{
    assert(n.c != 0);
    const auto * bytes = reinterpret_cast<const char *>(&n);
    const auto zero_bytes = __builtin_clzll(n.c) >> 3;
    return {bytes, 24ul - zero_bytes};
}
|
||||
|
||||
@ -229,6 +232,7 @@ public:
|
||||
template <typename Self, typename KeyHolder, typename Func>
|
||||
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
|
||||
{
|
||||
StringHashTableHash hash;
|
||||
const StringRef & x = keyHolderGetKey(key_holder);
|
||||
const size_t sz = x.size;
|
||||
if (sz == 0)
|
||||
@ -237,6 +241,13 @@ public:
|
||||
return func(self.m0, VoidKey{}, 0);
|
||||
}
|
||||
|
||||
if (x.data[sz - 1] == 0)
|
||||
{
|
||||
// Strings with trailing zeros are not representable as fixed-size
|
||||
// string keys. Put them to the generic table.
|
||||
return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
|
||||
}
|
||||
|
||||
const char * p = x.data;
|
||||
// pending bits that need to be shifted out
|
||||
const char s = (-sz & 7) * 8;
|
||||
@ -247,7 +258,6 @@ public:
|
||||
StringKey24 k24;
|
||||
UInt64 n[3];
|
||||
};
|
||||
StringHashTableHash hash;
|
||||
switch ((sz - 1) >> 3)
|
||||
{
|
||||
case 0: // 1..8 bytes
|
||||
|
@ -77,6 +77,7 @@ public:
|
||||
template <typename Self, typename Func, typename KeyHolder>
|
||||
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
|
||||
{
|
||||
StringHashTableHash hash;
|
||||
const StringRef & x = keyHolderGetKey(key_holder);
|
||||
const size_t sz = x.size;
|
||||
if (sz == 0)
|
||||
@ -85,6 +86,16 @@ public:
|
||||
return func(self.impls[0].m0, VoidKey{}, 0);
|
||||
}
|
||||
|
||||
if (x.data[x.size - 1] == 0)
|
||||
{
|
||||
// Strings with trailing zeros are not representable as fixed-size
|
||||
// string keys. Put them to the generic table.
|
||||
auto res = hash(x);
|
||||
auto buck = getBucketFromHash(res);
|
||||
return func(self.impls[buck].ms, std::forward<KeyHolder>(key_holder),
|
||||
res);
|
||||
}
|
||||
|
||||
const char * p = x.data;
|
||||
// pending bits that need to be shifted out
|
||||
const char s = (-sz & 7) * 8;
|
||||
@ -95,7 +106,6 @@ public:
|
||||
StringKey24 k24;
|
||||
UInt64 n[3];
|
||||
};
|
||||
StringHashTableHash hash;
|
||||
switch ((sz - 1) >> 3)
|
||||
{
|
||||
case 0:
|
||||
|
@ -0,0 +1 @@
|
||||
1
|
@ -0,0 +1,15 @@
|
||||
-- Test that the string hash map works properly with keys containing zero
|
||||
-- bytes.
|
||||
-- Keys with no central '1' are mostly duplicates. The unique keys
|
||||
-- in this group are '', '\0', ..., '\0' x 34, for a total of 35. All other
|
||||
-- keys are unique.
|
||||
select count(*) = 18 * 18 * 17 + 35
|
||||
from (
|
||||
select key
|
||||
from (
|
||||
with 18 as n
|
||||
select repeat('\0', number % n)
|
||||
|| repeat('1', intDiv(number, n) % n)
|
||||
|| repeat('\0', intDiv(number, n * n) % n) key
|
||||
from numbers(18 * 18 * 18))
|
||||
group by key);
|
Loading…
Reference in New Issue
Block a user