Fix bloom filters for String (data skipping indices)

The bloom filter was broken for the first element when all of the following conditions were satisfied: - the index is built on INSERT (in this case bloom filter hashing uses offsets; in the case of OPTIMIZE it does not, since it already has granules). - the record is not the first in the block. - the record is the first per index_granularity (do not confuse this with the data skipping index GRANULARITY). - the type of the indexed field is "String" (not FixedString). In this case both the length and the *data* pointer computed for that string were incorrect.
2024-09-22 01:30:51 +00:00 · 2020-06-12 21:17:06 +03:00 · 2020-06-12 21:17:06 +03:00 · 7f52b615e0
commit 7f52b615e0
parent fc8da2abd3
3 changed files with 15 additions and 4 deletions
--- a/src/Interpreters/BloomFilterHash.h
+++ b/src/Interpreters/BloomFilterHash.h
@ -196,18 +196,17 @@ struct BloomFilterHash
            const ColumnString::Chars & data = index_column->getChars();
            const ColumnString::Offsets & offsets = index_column->getOffsets();

-            ColumnString::Offset current_offset = pos;
            for (size_t index = 0, size = vec.size(); index < size; ++index)
            {
+                ColumnString::Offset current_offset = offsets[index + pos - 1];
+                size_t length = offsets[index + pos] - current_offset - 1 /* terminating zero */;
                UInt64 city_hash = CityHash_v1_0_2::CityHash64(
-                    reinterpret_cast<const char *>(&data[current_offset]), offsets[index + pos] - current_offset - 1);
+                    reinterpret_cast<const char *>(&data[current_offset]), length);

                if constexpr (is_first)
                    vec[index] = city_hash;
                else
                    vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash));
-
-                current_offset = offsets[index + pos];
            }
        }
        else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column))
--- a/tests/queries/0_stateless/01307_data_skip_bloom_filter.reference
+++ b/tests/queries/0_stateless/01307_data_skip_bloom_filter.reference
@ -0,0 +1,4 @@
+1
+1
+1
+1
--- a/tests/queries/0_stateless/01307_data_skip_bloom_filter.sql
+++ b/tests/queries/0_stateless/01307_data_skip_bloom_filter.sql
@ -0,0 +1,8 @@
+DROP TABLE IF EXISTS test_01307;
+CREATE TABLE test_01307 (id UInt64, val String, INDEX ind val TYPE bloom_filter() GRANULARITY 1) ENGINE = MergeTree() ORDER BY id SETTINGS index_granularity = 2;
+INSERT INTO test_01307 (id, val) select number as id, toString(number) as val from numbers(4);
+SELECT count() FROM test_01307 WHERE identity(val) = '2';
+SELECT count() FROM test_01307 WHERE val = '2';
+OPTIMIZE TABLE test_01307 FINAL;
+SELECT count() FROM test_01307 WHERE identity(val) = '2';
+SELECT count() FROM test_01307 WHERE val = '2';