From 6df2548417c46023aff87339f53691501380b48a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 19 Dec 2023 09:11:18 +0000 Subject: [PATCH] Some minor adjustments --- .../MergeTree/MergeTreeIndexFullText.cpp | 116 +++++++++--------- ...f_indexes_support_match_function.reference | 26 ++++ ...ngrambf_indexes_support_match_function.sql | 107 ++++++++++++++++ .../02943_tokenbf_support_match.reference | 10 -- .../02943_tokenbf_support_match.sql | 68 ---------- 5 files changed, 192 insertions(+), 135 deletions(-) create mode 100644 tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference create mode 100644 tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql delete mode 100644 tests/queries/0_stateless/02943_tokenbf_support_match.reference delete mode 100644 tests/queries/0_stateless/02943_tokenbf_support_match.sql diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index 85343aabd50..3dbc4e8a7f1 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -1,23 +1,23 @@ #include #include -#include +#include +#include #include -#include +#include #include +#include #include #include #include #include -#include -#include -#include #include #include -#include #include -#include -#include +#include +#include +#include +#include #include @@ -243,20 +243,6 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx /// Check like in KeyCondition. std::vector rpn_stack; - - auto multi_funtion_processor = [&rpn_stack, &granule] (const RPNElement & element) - { - std::vector result(element.set_bloom_filters.back().size(), true); - - const auto & bloom_filters = element.set_bloom_filters[0]; - - for (size_t row = 0; row < bloom_filters.size(); ++row) - result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]); - - rpn_stack.emplace_back( - std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); - }; - for (const auto & element : rpn) { if (element.function == RPNElement::FUNCTION_UNKNOWN) @@ -294,17 +280,32 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH || element.function == RPNElement::FUNCTION_HAS_ANY) { - multi_funtion_processor(element); + std::vector result(element.set_bloom_filters.back().size(), true); + + const auto & bloom_filters = element.set_bloom_filters[0]; + + for (size_t row = 0; row < bloom_filters.size(); ++row) + result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]); + + rpn_stack.emplace_back(std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); } else if (element.function == RPNElement::FUNCTION_MATCH) { if (!element.set_bloom_filters.empty()) { - multi_funtion_processor(element); + /// Alternative substrings + std::vector result(element.set_bloom_filters.back().size(), true); + + const auto & bloom_filters = element.set_bloom_filters[0]; + + for (size_t row = 0; row < bloom_filters.size(); ++row) + result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]); + + rpn_stack.emplace_back(std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true); } - // If set_bloom_filters is not empty means we got alternative substring else if (element.bloom_filter) { + /// Required substrings rpn_stack.emplace_back(granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true); } } @@ -535,38 +536,6 @@ bool MergeTreeConditionFullText::traverseTreeEquals( return true; } - if (function_name == "match") - { - out.key_column = *key_index; - out.function = RPNElement::FUNCTION_MATCH; - out.bloom_filter = std::make_unique(params); - - auto & string_view = const_value.get(); - String required_substring; - std::vector alternatives; - bool tmp_var; - OptimizedRegularExpression::analyze(string_view, required_substring, tmp_var, tmp_var, alternatives); - - if (required_substring.empty() && alternatives.empty()) - return false; - - if (!alternatives.empty()) - { - std::vector> bloom_filters; - bloom_filters.emplace_back(); - for (const auto & alternative : alternatives) - { - bloom_filters.back().emplace_back(params); - token_extractor->stringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back()); - } - out.set_bloom_filters = std::move(bloom_filters); - } - else - token_extractor->stringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter); - - return true; - } - else if (function_name == "has") { out.key_column = *key_index; @@ -654,6 +623,39 @@ bool MergeTreeConditionFullText::traverseTreeEquals( out.set_bloom_filters = std::move(bloom_filters); return true; } + else if (function_name == "match") + { + out.key_column = *key_index; + out.function = RPNElement::FUNCTION_MATCH; + out.bloom_filter = std::make_unique(params); + + auto & value = const_value.get(); + String required_substring; + bool dummy_is_trivial, dummy_required_substring_is_prefix; + std::vector alternatives; + OptimizedRegularExpression::analyze(value, required_substring, dummy_is_trivial, dummy_required_substring_is_prefix, alternatives); + + if (required_substring.empty() && alternatives.empty()) + return false; + + /// out.set_bloom_filters means alternatives exist + /// out.bloom_filter means required_substring exists + if (!alternatives.empty()) + { + std::vector> bloom_filters; + bloom_filters.emplace_back(); + for (const auto & alternative : alternatives) + { + bloom_filters.back().emplace_back(params); + token_extractor->stringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back()); + } + out.set_bloom_filters = std::move(bloom_filters); + } + else + token_extractor->stringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter); + + return true; + } return false; } diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference new file mode 100644 index 00000000000..41ca02e3877 --- /dev/null +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference @@ -0,0 +1,26 @@ +1 Hello ClickHouse +2 Hello World +1 Hello ClickHouse +2 Hello World + Granules: 6/6 + Granules: 2/6 + Granules: 6/6 + Granules: 2/6 +--- +1 Hello ClickHouse +2 Hello World +6 World Champion +1 Hello ClickHouse +2 Hello World +6 World Champion + Granules: 6/6 + Granules: 3/6 + Granules: 6/6 + Granules: 3/6 +--- +5 OLAP Database +5 OLAP Database + Granules: 6/6 + Granules: 1/6 + Granules: 6/6 + Granules: 1/6 diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql new file mode 100644 index 00000000000..7378df41b8d --- /dev/null +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql @@ -0,0 +1,107 @@ +DROP TABLE IF EXISTS tokenbf_tab; +DROP TABLE IF EXISTS ngrambf_tab; + +CREATE TABLE tokenbf_tab +( + id UInt32, + str String, + INDEX idx str TYPE tokenbf_v1(256, 2, 0) +) +ENGINE = MergeTree +ORDER BY id +SETTINGS index_granularity = 1; + +CREATE TABLE ngrambf_tab +( + id UInt32, + str String, + INDEX idx str TYPE ngrambf_v1(3, 256, 2, 0) +) +ENGINE = MergeTree +ORDER BY id +SETTINGS index_granularity = 1; + +INSERT INTO tokenbf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); +INSERT INTO ngrambf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); + +SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; +SELECT * FROM ngrambf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; + +-- Skip 2/6 granules +-- Required string: 'Hello ' +-- Alternatives: 'Hello ClickHouse', 'Hello World' + +SELECT * +FROM +( + EXPLAIN PLAN indexes=1 + SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +SELECT * +FROM +( + EXPLAIN PLAN indexes=1 + SELECT * FROM ngrambf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +SELECT '---'; + +SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; +SELECT * FROM ngrambf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; + +-- Skip 3/6 granules +-- Required string: - +-- Alternatives: 'ClickHouse', 'World' + +SELECT * +FROM +( + EXPLAIN PLAN indexes = 1 + SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +SELECT * +FROM +( + EXPLAIN PLAN indexes = 1 + SELECT * FROM ngrambf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +SELECT '---'; + +SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP.*') ORDER BY id; +SELECT * FROM ngrambf_tab WHERE match(str, 'OLAP.*') ORDER BY id; + +-- Skip 5/6 granules +-- Required string: 'OLAP' +-- Alternatives: - + +SELECT * +FROM +( + EXPLAIN PLAN indexes = 1 + SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +SELECT * +FROM +( + EXPLAIN PLAN indexes = 1 + SELECT * FROM ngrambf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id +) +WHERE + explain LIKE '%Granules: %'; + +DROP TABLE tokenbf_tab; +DROP TABLE ngrambf_tab; diff --git a/tests/queries/0_stateless/02943_tokenbf_support_match.reference b/tests/queries/0_stateless/02943_tokenbf_support_match.reference deleted file mode 100644 index d02011eb2a1..00000000000 --- a/tests/queries/0_stateless/02943_tokenbf_support_match.reference +++ /dev/null @@ -1,10 +0,0 @@ - Granules: 5/5 - Granules: 2/5 - - - Granules: 5/5 - Granules: 2/5 - - - Granules: 5/5 - Granules: 1/5 diff --git a/tests/queries/0_stateless/02943_tokenbf_support_match.sql b/tests/queries/0_stateless/02943_tokenbf_support_match.sql deleted file mode 100644 index b48eb45c0d0..00000000000 --- a/tests/queries/0_stateless/02943_tokenbf_support_match.sql +++ /dev/null @@ -1,68 +0,0 @@ --- Tags: no-parallel - -DROP DATABASE IF EXISTS test_tokenbf_match; - -CREATE DATABASE test_tokenbf_match; - -CREATE TABLE test_tokenbf_match.test_tokenbf -( - `id` UInt32, - `str` String, - INDEX str_idx str TYPE tokenbf_v1(256, 2, 0) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY id -SETTINGS index_granularity = 1; - -INSERT INTO test_tokenbf_match.test_tokenbf VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Hello Github'), (4, 'Hello Cloud'), (5, 'OLAP Database'); - ---SKIP 3 GRANUS ---Required String: Hello ---Alternative String: Hello ClickHouse ---Alternative String: Hello World -SELECT - * -FROM -( - EXPLAIN indexes=1 - SELECT * FROM test_tokenbf_match.test_tokenbf WHERE match(str, 'Hello (ClickHouse|World)') -) -WHERE - explain like '%Granules%'; - - -SELECT ''; -SELECT ''; - - ---SKIP 3 GRANUS ---No Required String ---Alternative String: ClickHouse ---Alternative String: World -SELECT - * -FROM -( - EXPLAIN indexes = 1 - SELECT * FROM test_tokenbf_match.test_tokenbf where match(str, '(.*?)* (ClickHouse|World)') -) -WHERE - explain like '%Granules%'; - -SELECT ''; -SELECT ''; - ---SKIP 4 GRANUS ---Required String: OLAP ---No Alternative String -SELECT - * -FROM -( - EXPLAIN indexes = 1 - SELECT * FROM test_tokenbf_match.test_tokenbf where match(str, 'OLAP (.*?)*') -) -WHERE - explain like '%Granules%'; - -DROP DATABASE IF EXISTS test_tokenbf_match;