Merge pull request #57878 from Jpnock/master

Apply full-text skipping index when using `hasAny()`
2024-11-10 01:25:21 +00:00 · 2023-12-17 11:06:29 +01:00 · 2023-12-17 11:06:29 +01:00 · 7f675ddf80
commit 7f675ddf80
parent fc67d2c0e9 2c24e438aa
7 changed files with 76 additions and 7 deletions
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -520,7 +520,7 @@ Indexes of type `set` can be utilized by all functions. The other index types ar
 | [empty](/docs/en/sql-reference/functions/array-functions#function-empty)                                   | ✔           | ✔      | ✗          | ✗          | ✗            | ✗        |
 | [notEmpty](/docs/en/sql-reference/functions/array-functions#function-notempty)                             | ✔           | ✔      | ✗          | ✗          | ✗            | ✗        |
 | [has](/docs/en/sql-reference/functions/array-functions#function-has)                                       | ✗           | ✗      | ✔          | ✔          | ✔            | ✔        |
-| [hasAny](/docs/en/sql-reference/functions/array-functions#function-hasAny)                                 | ✗           | ✗      | ✗          | ✗          | ✔            | ✗        |
+| [hasAny](/docs/en/sql-reference/functions/array-functions#function-hasAny)                                 | ✗           | ✗      | ✔          | ✔          | ✔            | ✗        |
 | [hasAll](/docs/en/sql-reference/functions/array-functions#function-hasAll)                                 | ✗           | ✗      | ✗          | ✗          | ✔            | ✗        |
 | hasToken                                                                                                   | ✗           | ✗      | ✗          | ✔          | ✗            | ✔        |
 | hasTokenOrNull                                                                                             | ✗           | ✗      | ✗          | ✔          | ✗            | ✔        |
--- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md
@ -369,6 +369,9 @@ INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARIT
 | [greaterOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#greaterorequals) | ✔           | ✔      | ✗           | ✗           | ✗             |
 | [empty](../../../sql-reference/functions/array-functions.md#function-empty)                                | ✔           | ✔      | ✗           | ✗           | ✗             |
 | [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty)                          | ✔           | ✔      | ✗           | ✗           | ✗             |
+| [has](../../../sql-reference/functions/array-functions.md#function-has)                                    | ✗           | ✗      | ✔           | ✔           | ✔             |
+| [hasAny](../../../sql-reference/functions/array-functions.md#function-hasAny)                              | ✗           | ✗      | ✔           | ✔           | ✔             |
+| [hasAll](../../../sql-reference/functions/array-functions.md#function-hasAll)                              | ✗           | ✗      | ✗           | ✗           | ✔             |
 | hasToken                                                                                                   | ✗           | ✗      | ✗           | ✔           | ✗             |

 Функции с постоянным аргументом, который меньше, чем размер ngram, не могут использовать индекс `ngrambf_v1` для оптимизации запроса.
--- a/docs/zh/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/zh/engines/table-engines/mergetree-family/mergetree.md
@ -364,6 +364,9 @@ WHERE 子句中的条件可以包含对某列数据进行运算的函数表达
 | [greaterOrEquals (\>=)](../../../sql-reference/functions/comparison-functions.md#greaterorequals) | ✔           | ✔      | ✗          | ✗          | ✗            |
 | [empty](../../../sql-reference/functions/array-functions.md#function-empty) | ✔           | ✔      | ✗          | ✗          | ✗            |
 | [notEmpty](../../../sql-reference/functions/array-functions.md#function-notempty) | ✔           | ✔      | ✗          | ✗          | ✗            |
+| [has](../../../sql-reference/functions/array-functions.md#function-has)                                       | ✗           | ✗      | ✔          | ✔          | ✔            |
+| [hasAny](../../../sql-reference/functions/array-functions.md#function-hasAny)                                 | ✗           | ✗      | ✔          | ✔          | ✔            |
+| [hasAll](../../../sql-reference/functions/array-functions.md#function-hasAll)                                 | ✗           | ✗      | ✗          | ✗          | ✔            |
 | hasToken                                                     | ✗           | ✗      | ✗          | ✔          | ✗            |

 常量参数小于 ngram 大小的函数不能使用 `ngrambf_v1` 进行查询优化。
--- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
@ -201,6 +201,7 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
             || element.function == RPNElement::FUNCTION_IN
             || element.function == RPNElement::FUNCTION_NOT_IN
             || element.function == RPNElement::FUNCTION_MULTI_SEARCH
+             || element.function == RPNElement::FUNCTION_HAS_ANY
             || element.function == RPNElement::ALWAYS_FALSE)
        {
            rpn_stack.push_back(false);
@ -274,7 +275,8 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
            if (element.function == RPNElement::FUNCTION_NOT_IN)
                rpn_stack.back() = !rpn_stack.back();
        }
-        else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH)
+        else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH
+            || element.function == RPNElement::FUNCTION_HAS_ANY)
        {
            std::vector<bool> result(element.set_bloom_filters.back().size(), true);

@ -395,7 +397,8 @@ bool MergeTreeConditionFullText::extractAtomFromTree(const RPNBuilderTreeNode &
                 function_name.starts_with("hasToken") ||
                 function_name == "startsWith" ||
                 function_name == "endsWith" ||
-                 function_name == "multiSearchAny")
+                 function_name == "multiSearchAny" ||
+                 function_name == "hasAny")
        {
            Field const_value;
            DataTypePtr const_type;
@ -574,10 +577,13 @@ bool MergeTreeConditionFullText::traverseTreeEquals(
        token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter);
        return true;
    }
-    else if (function_name == "multiSearchAny")
+    else if (function_name == "multiSearchAny"
+        || function_name == "hasAny")
    {
        out.key_column = *key_index;
-        out.function = RPNElement::FUNCTION_MULTI_SEARCH;
+        out.function = function_name == "multiSearchAny" ?
+            RPNElement::FUNCTION_MULTI_SEARCH :
+            RPNElement::FUNCTION_HAS_ANY;

        /// 2d vector is not needed here but is used because already exists for FUNCTION_IN
        std::vector<std::vector<BloomFilter>> bloom_filters;
--- a/src/Storages/MergeTree/MergeTreeIndexFullText.h
+++ b/src/Storages/MergeTree/MergeTreeIndexFullText.h
@ -92,6 +92,7 @@ private:
            FUNCTION_IN,
            FUNCTION_NOT_IN,
            FUNCTION_MULTI_SEARCH,
+            FUNCTION_HAS_ANY,
            FUNCTION_UNKNOWN, /// Can take any value.
            /// Operators of the logical expression.
            FUNCTION_NOT,
@ -107,13 +108,13 @@ private:
                : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}

        Function function = FUNCTION_UNKNOWN;
-        /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
+        /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, FUNCTION_MULTI_SEARCH and FUNCTION_HAS_ANY
        size_t key_column;

        /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
        std::unique_ptr<BloomFilter> bloom_filter;

-        /// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
+        /// For FUNCTION_IN, FUNCTION_NOT_IN, FUNCTION_MULTI_SEARCH and FUNCTION_HAS_ANY
        std::vector<std::vector<BloomFilter>> set_bloom_filters;

        /// For FUNCTION_IN and FUNCTION_NOT_IN
--- a/tests/queries/0_stateless/02943_use_full_text_skip_index_with_has_any.reference
+++ b/tests/queries/0_stateless/02943_use_full_text_skip_index_with_has_any.reference
@ -0,0 +1,17 @@
+1	['this is a test','example.com']
+--
+1	['this is a test','example.com']
+--
+2	['another test','another example']
+--
+1	['this is a test','example.com']
+2	['another test','another example']
+--
+1	['this is a test','example.com']
+--
+1	['this is a test','example.com']
+--
+2	['another test','another example']
+--
+1	['this is a test','example.com']
+2	['another test','another example']
--- a/tests/queries/0_stateless/02943_use_full_text_skip_index_with_has_any.sql
+++ b/tests/queries/0_stateless/02943_use_full_text_skip_index_with_has_any.sql
@ -0,0 +1,39 @@
+-- Exercises hasAny() on Array(String) columns that carry a full-text skipping index.
+-- One fixture table per index type (tokenbf_v1, ngrambf_v1); both hold the same two rows.
+-- Every query names its index in force_data_skipping_indices; per the ClickHouse
+-- documentation for that setting, the query is expected to fail if the index is not used.
+DROP TABLE IF EXISTS tokenbf_v1_hasany_test;
+DROP TABLE IF EXISTS ngrambf_v1_hasany_test;
+
+CREATE TABLE tokenbf_v1_hasany_test
+(
+    id UInt32,
+    array Array(String),
+    INDEX idx_array_tokenbf_v1 array TYPE tokenbf_v1(512, 3, 0) GRANULARITY 1
+)
+ENGINE = MergeTree()
+ORDER BY id
+SETTINGS index_granularity = 1;
+
+CREATE TABLE ngrambf_v1_hasany_test
+(
+    id UInt32,
+    array Array(String),
+    INDEX idx_array_ngrambf_v1 array TYPE ngrambf_v1(3, 512, 3, 0) GRANULARITY 1
+)
+ENGINE = MergeTree()
+ORDER BY id
+SETTINGS index_granularity = 1;
+
+INSERT INTO tokenbf_v1_hasany_test VALUES
+    (1, ['this is a test', 'example.com']),
+    (2, ['another test', 'another example']);
+INSERT INTO ngrambf_v1_hasany_test VALUES
+    (1, ['this is a test', 'example.com']),
+    (2, ['another test', 'another example']);
+
+-- tokenbf_v1: three single-needle lookups, then a two-needle lookup matching both rows.
+SELECT *
+FROM tokenbf_v1_hasany_test
+WHERE hasAny(array, ['this is a test'])
+SETTINGS force_data_skipping_indices = 'idx_array_tokenbf_v1';
+SELECT '--';
+SELECT *
+FROM tokenbf_v1_hasany_test
+WHERE hasAny(array, ['example.com'])
+SETTINGS force_data_skipping_indices = 'idx_array_tokenbf_v1';
+SELECT '--';
+SELECT *
+FROM tokenbf_v1_hasany_test
+WHERE hasAny(array, ['another test'])
+SETTINGS force_data_skipping_indices = 'idx_array_tokenbf_v1';
+SELECT '--';
+SELECT *
+FROM tokenbf_v1_hasany_test
+WHERE hasAny(array, ['another example', 'example.com'])
+ORDER BY id ASC
+SETTINGS force_data_skipping_indices = 'idx_array_tokenbf_v1';
+SELECT '--';
+
+-- ngrambf_v1: the same four lookups against the n-gram index.
+SELECT *
+FROM ngrambf_v1_hasany_test
+WHERE hasAny(array, ['this is a test'])
+SETTINGS force_data_skipping_indices = 'idx_array_ngrambf_v1';
+SELECT '--';
+SELECT *
+FROM ngrambf_v1_hasany_test
+WHERE hasAny(array, ['example.com'])
+SETTINGS force_data_skipping_indices = 'idx_array_ngrambf_v1';
+SELECT '--';
+SELECT *
+FROM ngrambf_v1_hasany_test
+WHERE hasAny(array, ['another test'])
+SETTINGS force_data_skipping_indices = 'idx_array_ngrambf_v1';
+SELECT '--';
+SELECT *
+FROM ngrambf_v1_hasany_test
+WHERE hasAny(array, ['another example', 'example.com'])
+ORDER BY id ASC
+SETTINGS force_data_skipping_indices = 'idx_array_ngrambf_v1';
+
+-- Clean up the fixtures.
+DROP TABLE tokenbf_v1_hasany_test;
+DROP TABLE ngrambf_v1_hasany_test;