Support trailing zero bytes in string hash map.

This commit is contained in:
Alexander Kuzmenkov 2020-04-02 20:19:57 +03:00
parent e381246bc1
commit cd76ba3c19
5 changed files with 52 additions and 8 deletions

View File

@ -25,9 +25,13 @@ struct StringHashMapCell<StringKey16, TMapped> : public HashMapCell<StringKey16,
using Base::Base;
static constexpr bool need_zero_value_storage = false;
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
static bool isZero(const StringKey16 & key, const HashTableNoState & /*state*/) { return key.low == 0; }
void setZero() { this->value.first.low = 0; }
// A zero key marks an unoccupied cell in the hash table. Keys whose last word is 0
// serve as zero keys, because such keys are unrepresentable (no way to encode the length).
static bool isZero(const StringKey16 & key, const HashTableNoState &)
{ return key.high == 0; }
void setZero() { this->value.first.high = 0; }
// external
const StringRef getKey() const { return toStringRef(this->value.first); }
// internal
@ -42,9 +46,13 @@ struct StringHashMapCell<StringKey24, TMapped> : public HashMapCell<StringKey24,
using Base::Base;
static constexpr bool need_zero_value_storage = false;
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
static bool isZero(const StringKey24 & key, const HashTableNoState & /*state*/) { return key.a == 0; }
void setZero() { this->value.first.a = 0; }
// A zero key marks an unoccupied cell in the hash table. Keys whose last word is 0
// serve as zero keys, because such keys are unrepresentable (no way to encode the length).
static bool isZero(const StringKey24 & key, const HashTableNoState &)
{ return key.c == 0; }
void setZero() { this->value.first.c = 0; }
// external
const StringRef getKey() const { return toStringRef(this->value.first); }
// internal

View File

@ -18,14 +18,17 @@ struct StringKey24
// Views a non-zero 8-byte key as the string packed into it.
// The returned reference starts at the key's own storage, and its length is 8 minus
// the number of all-zero most-significant bytes of the word.
// NOTE(review): presumably relies on a little-endian layout so that the string bytes
// come first in memory — confirm.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n)
{
    // A zero word encodes no length, and __builtin_clzll(0) has an undefined result.
    assert(n != 0);
    const auto padding_bytes = __builtin_clzll(n) >> 3;
    const auto * bytes = reinterpret_cast<const char *>(&n);
    return {bytes, 8ul - padding_bytes};
}
// Views a 16-byte key as the string packed into it.
// The returned reference starts at the key's own storage, and its length is 16 minus
// the number of all-zero most-significant bytes of the `high` word.
// NOTE(review): presumably relies on a little-endian layout with `high` stored last —
// confirm against the StringKey16 definition.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n)
{
    // A zero `high` word encodes no length, and __builtin_clzll(0) has an undefined result.
    assert(n.high != 0);
    const auto padding_bytes = __builtin_clzll(n.high) >> 3;
    const auto * bytes = reinterpret_cast<const char *>(&n);
    return {bytes, 16ul - padding_bytes};
}
// Views a 24-byte key as the string packed into it.
// The returned reference starts at the key's own storage, and its length is 24 minus
// the number of all-zero most-significant bytes of the `c` word.
// NOTE(review): presumably relies on a little-endian layout with `c` stored last —
// confirm against the StringKey24 definition.
inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n)
{
    // A zero `c` word encodes no length, and __builtin_clzll(0) has an undefined result.
    assert(n.c != 0);
    const auto padding_bytes = __builtin_clzll(n.c) >> 3;
    const auto * bytes = reinterpret_cast<const char *>(&n);
    return {bytes, 24ul - padding_bytes};
}
@ -229,6 +232,7 @@ public:
template <typename Self, typename KeyHolder, typename Func>
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
{
StringHashTableHash hash;
const StringRef & x = keyHolderGetKey(key_holder);
const size_t sz = x.size;
if (sz == 0)
@ -237,6 +241,13 @@ public:
return func(self.m0, VoidKey{}, 0);
}
if (x.data[sz - 1] == 0)
{
// Strings with trailing zeros are not representable as fixed-size
// string keys. Put them to the generic table.
return func(self.ms, std::forward<KeyHolder>(key_holder), hash(x));
}
const char * p = x.data;
// pending bits that need to be shifted out
const char s = (-sz & 7) * 8;
@ -247,7 +258,6 @@ public:
StringKey24 k24;
UInt64 n[3];
};
StringHashTableHash hash;
switch ((sz - 1) >> 3)
{
case 0: // 1..8 bytes

View File

@ -77,6 +77,7 @@ public:
template <typename Self, typename Func, typename KeyHolder>
static auto ALWAYS_INLINE dispatch(Self & self, KeyHolder && key_holder, Func && func)
{
StringHashTableHash hash;
const StringRef & x = keyHolderGetKey(key_holder);
const size_t sz = x.size;
if (sz == 0)
@ -85,6 +86,16 @@ public:
return func(self.impls[0].m0, VoidKey{}, 0);
}
if (x.data[x.size - 1] == 0)
{
// Strings with trailing zeros are not representable as fixed-size
// string keys. Put them to the generic table.
auto res = hash(x);
auto buck = getBucketFromHash(res);
return func(self.impls[buck].ms, std::forward<KeyHolder>(key_holder),
res);
}
const char * p = x.data;
// pending bits that need to be shifted out
const char s = (-sz & 7) * 8;
@ -95,7 +106,6 @@ public:
StringKey24 k24;
UInt64 n[3];
};
StringHashTableHash hash;
switch ((sz - 1) >> 3)
{
case 0:

View File

@ -0,0 +1,15 @@
-- Test that the string hash map works properly with keys containing zero
-- bytes.
-- Each key is (0..17 zero bytes) || (0..17 '1' characters) || (0..17 zero bytes),
-- generated for all 18 * 18 * 18 combinations of the three lengths.
-- Keys with no central '1' are mostly duplicates: they consist of zero bytes only,
-- so the unique keys in this group are '', '\0', ..., '\0' x 34, 35 in total.
-- All other keys (18 * 18 * 17 of them) are unique.
-- The query prints 1 when the number of distinct keys matches this expectation.
select count(*) = 18 * 18 * 17 + 35
from (
select key
from (
with 18 as n
select repeat('\0', number % n)
|| repeat('1', intDiv(number, n) % n)
|| repeat('\0', intDiv(number, n * n) % n) key
from numbers(18 * 18 * 18))
group by key);