Fix bloom filters for String (data skipping indices)

The bloom filter was broken for the first element if all of the following
conditions were satisfied:
- the index is built on INSERT (in this case bloom filter hashing uses
  offsets; in the case of OPTIMIZE it does not, since it already has
  granules).
- the record is not the first in the block
- the record is the first per index_granularity (do not confuse this
  with data skipping index GRANULARITY).
- type of the field for indexing is "String" (not FixedString)

In this case both the length and the *data* computed for that string were incorrect.
This commit is contained in:
Azat Khuzhin 2020-06-12 21:17:06 +03:00
parent fc8da2abd3
commit 7f52b615e0
3 changed files with 15 additions and 4 deletions

View File

@ -196,18 +196,17 @@ struct BloomFilterHash
const ColumnString::Chars & data = index_column->getChars();
const ColumnString::Offsets & offsets = index_column->getOffsets();
ColumnString::Offset current_offset = pos;
for (size_t index = 0, size = vec.size(); index < size; ++index)
{
ColumnString::Offset current_offset = offsets[index + pos - 1];
size_t length = offsets[index + pos] - current_offset - 1 /* terminating zero */;
UInt64 city_hash = CityHash_v1_0_2::CityHash64(
reinterpret_cast<const char *>(&data[current_offset]), offsets[index + pos] - current_offset - 1);
reinterpret_cast<const char *>(&data[current_offset]), length);
if constexpr (is_first)
vec[index] = city_hash;
else
vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash));
current_offset = offsets[index + pos];
}
}
else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column))

View File

@ -0,0 +1,4 @@
1
1
1
1

View File

@ -0,0 +1,8 @@
-- Regression test: bloom_filter data skipping index on a String column.
-- With index_granularity = 2 and four rows (id 0..3, val = toString(id)),
-- the row with val = '2' is the third inserted row, i.e. not the first row
-- of the insert, and (assuming rows are grouped two per granule in id order)
-- the first row of the second granule.
DROP TABLE IF EXISTS test_01307;
CREATE TABLE test_01307 (id UInt64, val String, INDEX ind val TYPE bloom_filter() GRANULARITY 1) ENGINE = MergeTree() ORDER BY id SETTINGS index_granularity = 2;
INSERT INTO test_01307 (id, val) select number as id, toString(number) as val from numbers(4);
-- Checks against the index as written by INSERT. Exactly one inserted row has
-- val = '2', so each count() should be 1.
-- NOTE(review): wrapping the column in identity() presumably keeps the skipping
-- index from being applied, giving a baseline for the plain comparison; confirm.
SELECT count() FROM test_01307 WHERE identity(val) = '2';
SELECT count() FROM test_01307 WHERE val = '2';
-- Repeat the same two checks after OPTIMIZE ... FINAL has run on the table.
OPTIMIZE TABLE test_01307 FINAL;
SELECT count() FROM test_01307 WHERE identity(val) = '2';
SELECT count() FROM test_01307 WHERE val = '2';