Full-text bloom filter indexes: support Array(String) / Array(FixedString) columns

Summary of the changes in this patch:
  * MergeTreeIndexAggregatorFullText::update adds every element of an Array
    column to the granule's bloom filter; non-array columns are handled as before.
  * has(...) becomes an index atom (RPNElement::FUNCTION_HAS); on a granule it is
    checked against the bloom filter in the same branch as equals / notEquals.
  * bloomFilterIndexValidator accepts an Array column when its nested type is
    String or FixedString.
  * New stateless test 2022_array_full_text_bloom_filter_index covering
    tokenbf_v1 and ngrambf_v1 indexes.

NOTE(review): the header names on the `#include` lines of the first hunk are
missing from this copy of the patch and could not be recovered; restore them
from the upstream file before applying. Indentation was reconstructed and
should be confirmed against the target files.

diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
index 1c71d77b334..8f43b1606cb 100644
--- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -155,13 +156,40 @@ void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos,
 
     for (size_t col = 0; col < index_columns.size(); ++col)
     {
-        const auto & column = block.getByName(index_columns[col]).column;
-        for (size_t i = 0; i < rows_read; ++i)
+        const auto & column_with_type = block.getByName(index_columns[col]);
+        const auto & column = column_with_type.column;
+        size_t current_position = *pos;
+
+        if (isArray(column_with_type.type))
         {
-            auto ref = column->getDataAt(*pos + i);
-            columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]);
+            const auto & column_array = assert_cast<const ColumnArray &>(*column);
+            const auto & column_offsets = column_array.getOffsets();
+            const auto & column_key = column_array.getData();
+
+            for (size_t i = 0; i < rows_read; ++i)
+            {
+                size_t element_start_row = column_offsets[current_position - 1];
+                size_t elements_size = column_offsets[current_position] - element_start_row;
+
+                for (size_t row_num = 0; row_num < elements_size; row_num++)
+                {
+                    auto ref = column_key.getDataAt(element_start_row + row_num);
+                    columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]);
+                }
+
+                current_position += 1;
+            }
+        }
+        else
+        {
+            for (size_t i = 0; i < rows_read; ++i)
+            {
+                auto ref = column->getDataAt(current_position + i);
+                columnToBloomFilter(ref.data, ref.size, token_extractor, granule->bloom_filters[col]);
+            }
         }
     }
+
     granule->has_elems = true;
     *pos += rows_read;
 }
@@ -202,6 +230,7 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
         }
         else if (element.function == RPNElement::FUNCTION_EQUALS
              || element.function == RPNElement::FUNCTION_NOT_EQUALS
+             || element.function == RPNElement::FUNCTION_HAS
              || element.function == RPNElement::FUNCTION_IN
              || element.function == RPNElement::FUNCTION_NOT_IN
              || element.function == RPNElement::FUNCTION_MULTI_SEARCH
@@ -251,7 +280,8 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
             rpn_stack.emplace_back(true, true);
         }
         else if (element.function == RPNElement::FUNCTION_EQUALS
-             || element.function == RPNElement::FUNCTION_NOT_EQUALS)
+             || element.function == RPNElement::FUNCTION_NOT_EQUALS
+             || element.function == RPNElement::FUNCTION_HAS)
         {
             rpn_stack.emplace_back(granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true);
 
@@ -378,6 +408,15 @@ bool MergeTreeConditionFullText::atomFromAST(
     else if (!token_extractor->supportLike() && (func_name == "like" || func_name == "notLike"))
         return false;
 
+    if (func_name == "has")
+    {
+        out.key_column = key_column_num;
+        out.function = RPNElement::FUNCTION_HAS;
+        out.bloom_filter = std::make_unique<BloomFilter>(params);
+        stringToBloomFilter(const_value.get<String>(), token_extractor, *out.bloom_filter);
+
+        return true;
+    }
     if (func_name == "notEquals")
     {
         out.key_column = key_column_num;
@@ -837,10 +876,18 @@ MergeTreeIndexPtr bloomFilterIndexCreator(
 
 void bloomFilterIndexValidator(const IndexDescription & index, bool /*attach*/)
 {
-    for (const auto & data_type : index.data_types)
+    for (const auto & index_data_type : index.data_types)
     {
-        if (data_type->getTypeId() != TypeIndex::String && data_type->getTypeId() != TypeIndex::FixedString)
-            throw Exception("Bloom filter index can be used only with `String` or `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
+        WhichDataType data_type(index_data_type);
+
+        if (data_type.isArray())
+        {
+            const auto & array_type = assert_cast<const DataTypeArray &>(*index_data_type);
+            data_type = WhichDataType(array_type.getNestedType());
+        }
+
+        if (!data_type.isString() && !data_type.isFixedString())
+            throw Exception("Bloom filter index can be used only with `String`, `FixedString` column or Array with `String` or `FixedString` values column.", ErrorCodes::INCORRECT_QUERY);
     }
 
     if (index.type == NgramTokenExtractor::getName())
diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.h b/src/Storages/MergeTree/MergeTreeIndexFullText.h
index d34cbc61da2..b1c70a9c04f 100644
--- a/src/Storages/MergeTree/MergeTreeIndexFullText.h
+++ b/src/Storages/MergeTree/MergeTreeIndexFullText.h
@@ -112,6 +112,7 @@ private:
         /// Atoms of a Boolean expression.
         FUNCTION_EQUALS,
         FUNCTION_NOT_EQUALS,
+        FUNCTION_HAS,
         FUNCTION_IN,
         FUNCTION_NOT_IN,
         FUNCTION_MULTI_SEARCH,
diff --git a/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.reference b/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.reference
new file mode 100644
index 00000000000..f61dedd9bd2
--- /dev/null
+++ b/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.reference
@@ -0,0 +1,8 @@
+1	['K1 K1']	['K1 K1']
+2	['K2 K2']	['K2 K2']
+1	['K1 K1']	['K1 K1']
+2	['K2 K2']	['K2 K2']
+1	['K1 K1']	['K1 K1']
+2	['K2 K2']	['K2 K2']
+1	['K1 K1']	['K1 K1']
+2	['K2 K2']	['K2 K2']
diff --git a/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.sql b/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.sql
new file mode 100644
index 00000000000..6a2a00674cb
--- /dev/null
+++ b/tests/queries/0_stateless/2022_array_full_text_bloom_filter_index.sql
@@ -0,0 +1,42 @@
+DROP TABLE IF EXISTS bf_tokenbf_array_test;
+DROP TABLE IF EXISTS bf_ngram_array_test;
+
+CREATE TABLE bf_tokenbf_array_test
+(
+    row_id UInt32,
+    array Array(String),
+    array_fixed Array(FixedString(5)),
+    INDEX array_bf_tokenbf array TYPE tokenbf_v1(256,2,0) GRANULARITY 1,
+    INDEX array_fixed_bf_tokenbf array_fixed TYPE tokenbf_v1(256,2,0) GRANULARITY 1
+) Engine=MergeTree() ORDER BY row_id SETTINGS index_granularity = 2;
+
+CREATE TABLE bf_ngram_array_test
+(
+    row_id UInt32,
+    array Array(String),
+    array_fixed Array(FixedString(5)),
+    INDEX array_ngram array TYPE ngrambf_v1(4,256,2,0) GRANULARITY 1,
+    INDEX array_fixed_ngram array_fixed TYPE ngrambf_v1(4,256,2,0) GRANULARITY 1
+) Engine=MergeTree() ORDER BY row_id SETTINGS index_granularity = 2;
+
+INSERT INTO bf_tokenbf_array_test VALUES (1, ['K1 K1'], ['K1 K1']), (2, ['K2 K2'], ['K2 K2']);
+INSERT INTO bf_ngram_array_test VALUES (1, ['K1 K1'], ['K1 K1']), (2, ['K2 K2'], ['K2 K2']);
+
+SELECT * FROM bf_tokenbf_array_test WHERE has(array, 'K1 K1');
+SELECT * FROM bf_tokenbf_array_test WHERE has(array, 'K2 K2');
+SELECT * FROM bf_tokenbf_array_test WHERE has(array, 'K3 K3');
+
+SELECT * FROM bf_tokenbf_array_test WHERE has(array_fixed, 'K1 K1');
+SELECT * FROM bf_tokenbf_array_test WHERE has(array_fixed, 'K2 K2');
+SELECT * FROM bf_tokenbf_array_test WHERE has(array_fixed, 'K3 K3');
+
+SELECT * FROM bf_ngram_array_test WHERE has(array, 'K1 K1');
+SELECT * FROM bf_ngram_array_test WHERE has(array, 'K2 K2');
+SELECT * FROM bf_ngram_array_test WHERE has(array, 'K3 K3');
+
+SELECT * FROM bf_ngram_array_test WHERE has(array_fixed, 'K1 K1');
+SELECT * FROM bf_ngram_array_test WHERE has(array_fixed, 'K2 K2');
+SELECT * FROM bf_ngram_array_test WHERE has(array_fixed, 'K3 K3');
+
+DROP TABLE bf_tokenbf_array_test;
+DROP TABLE bf_ngram_array_test;