From 2fdde4ba5539f17496f9ba2b77c94d47325c6ada Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Fri, 30 Aug 2024 00:29:46 +0000
Subject: [PATCH] unroll loop in bloom filter

---
Reviewer notes (text in this section is ignored when the patch is applied):
 * Text that was enclosed in angle brackets appears to have been stripped from
   this copy of the patch: the author e-mail, the #include targets in
   BloomFilter.h and the XML element tags of the performance test are missing.
   They are left exactly as found and must be restored from the repository
   before the patch can be applied.
 * The template parameter list of addHashesImpl was restored from its uses
   (`num_hashes` in the body, `addHashesImpl<N>` at the call sites).
 * BloomFilter::add gained an explicit `case 0`: the loop being replaced set no
   bits for zero hashes, while the switch sent that value to `default`, which
   sets four.
 * Hunk sizes and the diffstat were recomputed for the added lines; the `index`
   blob ids are unchanged from the original commit.

 src/Interpreters/BloomFilter.cpp          | 44 +++++++++++++++++++++--
 src/Interpreters/BloomFilter.h            | 17 +++++++++
 tests/performance/ngram_filter_insert.xml |  9 ++++++
 3 files changed, 67 insertions(+), 3 deletions(-)
 create mode 100644 tests/performance/ngram_filter_insert.xml

diff --git a/src/Interpreters/BloomFilter.cpp b/src/Interpreters/BloomFilter.cpp
index 7bf50a0312b..0bb46b1f0a8 100644
--- a/src/Interpreters/BloomFilter.cpp
+++ b/src/Interpreters/BloomFilter.cpp
@@ -71,10 +71,48 @@ void BloomFilter::add(const char * data, size_t len)
     size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
     size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);
 
-    for (size_t i = 0; i < hashes; ++i)
+    /// Dispatch on the run-time hash count so that for 1..4 hashes the probe loop
+    /// has a compile-time trip count.
+    switch (hashes)
     {
-        size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
-        filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
+        case 0:
+        {
+            /// The loop this switch replaces did nothing for zero hashes; keep it that way
+            /// instead of reaching `default`, which would set four bits.
+            break;
+        }
+        case 1:
+        {
+            addHashesImpl<1>(hash1, hash2);
+            break;
+        }
+        case 2:
+        {
+            addHashesImpl<2>(hash1, hash2);
+            break;
+        }
+        case 3:
+        {
+            addHashesImpl<3>(hash1, hash2);
+            break;
+        }
+        case 4:
+        {
+            addHashesImpl<4>(hash1, hash2);
+            break;
+        }
+        default:
+        {
+            /// More than four hashes: fixed-count prefix for i = 0..3, generic loop for the rest.
+            addHashesImpl<4>(hash1, hash2);
+
+            for (size_t i = 4; i < hashes; ++i)
+            {
+                size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
+                filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
+            }
+            break;
+        }
     }
 }
 
diff --git a/src/Interpreters/BloomFilter.h b/src/Interpreters/BloomFilter.h
index 8ebdfd879e6..76a0a76e247 100644
--- a/src/Interpreters/BloomFilter.h
+++ b/src/Interpreters/BloomFilter.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 
 
 namespace DB
@@ -56,7 +57,23 @@ public:
     UInt64 isEmpty() const;
 
     friend bool operator== (const BloomFilter & a, const BloomFilter & b);
+
 private:
+    /// Sets the filter bits for probes i = 0 .. num_hashes - 1, where
+    /// pos_i = (hash1 + i * hash2 + i * i) % (8 * size).
+    /// num_hashes is a template parameter so that the loop bound is a compile-time
+    /// constant; add() picks the instantiation from the run-time `hashes` value.
+    template <size_t num_hashes>
+    ALWAYS_INLINE void addHashesImpl(size_t hash1, size_t hash2)
+    {
+        static_assert(num_hashes >= 1 && num_hashes <= 4);
+
+        for (size_t i = 0; i < num_hashes; ++i)
+        {
+            size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
+            filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
+        }
+    }
 
     size_t size;
     size_t hashes;
diff --git a/tests/performance/ngram_filter_insert.xml b/tests/performance/ngram_filter_insert.xml
new file mode 100644
index 00000000000..e426a64bb81
--- /dev/null
+++ b/tests/performance/ngram_filter_insert.xml
@@ -0,0 +1,9 @@
+
+    DROP TABLE IF EXISTS test_ngram
+    CREATE TABLE test_ngram (s String, INDEX idx_s s TYPE ngrambf_v1(5, 10000, 3, 0) GRANULARITY 1) ENGINE = MergeTree ORDER BY tuple()
+    SYSTEM STOP MERGES test_ngram
+
+    INSERT INTO test_ngram SELECT randomPrintableASCII(128) FROM numbers(100000)
+
+    DROP TABLE IF EXISTS test_ngram
+