diff --git a/src/Interpreters/GinFilter.cpp b/src/Interpreters/GinFilter.cpp index 4662128e8ab..e60d54026eb 100644 --- a/src/Interpreters/GinFilter.cpp +++ b/src/Interpreters/GinFilter.cpp @@ -21,14 +21,18 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -GinFilterParameters::GinFilterParameters(size_t ngrams_, Float64 density_) +GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_) : ngrams(ngrams_) - , density(density_) + , max_rows_in_postings_list(max_rows_) { + /// 0 indicates no limitation of postings list's size + if (max_rows_in_postings_list == 0) + max_rows_in_postings_list = std::numeric_limits<UInt64>::max(); + if (ngrams > 8) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8"); - if (density <= 0 || density > 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The density inverted index gin filter must be between 0 and 1"); + if (max_rows_in_postings_list < MIN_ROWS_IN_POSTINGS_LIST) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST); } GinFilter::GinFilter(const GinFilterParameters & params_) @@ -36,7 +40,7 @@ GinFilter::GinFilter(const GinFilterParameters & params_) { } -void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const +void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const { if (len > FST::MAX_TERM_LENGTH) return; @@ -51,8 +55,7 @@ void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePt } else { - UInt64 size_limit = std::lround(limit * params.density); - auto builder = std::make_shared<GinIndexPostingsBuilder>(size_limit); + auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_in_postings_list); builder->add(rowID); store->setPostingsBuilder(term, builder); diff --git a/src/Interpreters/GinFilter.h b/src/Interpreters/GinFilter.h index 8985d84f215..3e57a07ecd4 100644 --- a/src/Interpreters/GinFilter.h
+++ b/src/Interpreters/GinFilter.h @@ -8,13 +8,15 @@ namespace DB { static inline constexpr auto INVERTED_INDEX_NAME = "inverted"; +static inline constexpr UInt64 MIN_ROWS_IN_POSTINGS_LIST = 8 * 1024; +static inline constexpr UInt64 DEFAULT_ROWS_IN_POSTINGS_LIST = 64 * 1024; struct GinFilterParameters { - GinFilterParameters(size_t ngrams_, Float64 density_); + GinFilterParameters(size_t ngrams_, UInt64 max_rows_); size_t ngrams; - Float64 density; + UInt64 max_rows_in_postings_list; }; struct GinSegmentWithRowIdRange @@ -42,7 +44,7 @@ public: /// Add term (located at 'data' with length 'len') and its row ID to the postings list builder /// for building inverted index for the given store. - void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const; + void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const; /// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd); diff --git a/src/Storages/MergeTree/MergeTreeIndexInverted.cpp b/src/Storages/MergeTree/MergeTreeIndexInverted.cpp index 04ef7339a0e..169ae768b31 100644 --- a/src/Storages/MergeTree/MergeTreeIndexInverted.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexInverted.cpp @@ -109,14 +109,14 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorInverted::getGranuleAndReset() return new_granule; } -void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit) +void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter) { size_t cur = 0; size_t token_start = 0; size_t token_len = 0; while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len)) - gin_filter.add(data + token_start, token_len, rowID, store, limit); + gin_filter.add(data + token_start, token_len, 
rowID, store); } void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit) @@ -150,7 +150,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, for (size_t row_num = 0; row_num < elements_size; ++row_num) { auto ref = column_key.getDataAt(element_start_row + row_num); - addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read); + addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]); store->incrementCurrentSizeBy(ref.size); } current_position += 1; @@ -165,7 +165,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, for (size_t i = 0; i < rows_read; ++i) { auto ref = column->getDataAt(current_position + i); - addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read); + addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]); store->incrementCurrentSizeBy(ref.size); row_id++; if (store->needToWrite()) @@ -735,8 +735,8 @@ MergeTreeIndexPtr invertedIndexCreator( const IndexDescription & index) { size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>(); - Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>(); - GinFilterParameters params(n, density); + UInt64 max_rows = index.arguments.size() < 2 ? 
DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>(); + GinFilterParameters params(n, max_rows); /// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor if (n > 0) @@ -780,13 +780,14 @@ void invertedIndexValidator(const IndexDescription & index, bool /*attach*/) if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64) throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer."); - if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::Float64 || index.arguments[1].get<Float64>() <= 0 || index.arguments[1].get<Float64>() > 1)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be a float between 0 and 1."); + if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::UInt64 || + (index.arguments[1].get<UInt64>() != 0 && index.arguments[1].get<UInt64>() < MIN_ROWS_IN_POSTINGS_LIST))) + throw Exception(ErrorCodes::INCORRECT_QUERY, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST); /// Just validate size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>(); - Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>(); - GinFilterParameters params(ngrams, density); + UInt64 max_rows = index.arguments.size() < 2 ? 
DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>(); + GinFilterParameters params(ngrams, max_rows); } } diff --git a/src/Storages/MergeTree/MergeTreeIndexInverted.h b/src/Storages/MergeTree/MergeTreeIndexInverted.h index 96a87c2e2ba..96d12128bb4 100644 --- a/src/Storages/MergeTree/MergeTreeIndexInverted.h +++ b/src/Storages/MergeTree/MergeTreeIndexInverted.h @@ -48,7 +48,7 @@ struct MergeTreeIndexAggregatorInverted final : IMergeTreeIndexAggregator void update(const Block & block, size_t * pos, size_t limit) override; - void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit); + void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter); GinIndexStorePtr store; Names index_columns; diff --git a/tests/queries/0_stateless/02346_full_text_search.sql b/tests/queries/0_stateless/02346_full_text_search.sql index c8536976377..7d3337b9407 100644 --- a/tests/queries/0_stateless/02346_full_text_search.sql +++ b/tests/queries/0_stateless/02346_full_text_search.sql @@ -277,74 +277,3 @@ SELECT * FROM tab WHERE str == 'b' AND 1.0; -- AND result_rows==1 -- LIMIT 1; -- --- ---------------------------------------------------- --- SELECT 'Test density==1'; --- --- DROP TABLE IF EXISTS tab; --- --- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 1.0)) --- Engine=MergeTree --- ORDER BY (k) --- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512 --- AS --- SELECT number, if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number))) --- FROM numbers(1024); --- --- -- check inverted index was created --- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1; --- --- -- search inverted index, no row has 'happy birthday' --- SELECT count() == 0 FROM tab WHERE s =='happy birthday'; --- --- -- check the query only skip all granules (0 row total; each granule has 512 rows) --- SYSTEM FLUSH 
LOGS; --- SELECT read_rows==0 from system.query_log --- WHERE query_kind ='Select' --- AND current_database = currentDatabase() --- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s ==\'happy birthday\';') --- AND type='QueryFinish' --- AND result_rows==1 --- LIMIT 1; --- --- ---------------------------------------------------- --- SELECT 'Test density==0.1'; --- --- DROP TABLE IF EXISTS tab; --- --- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 0.1)) --- Engine=MergeTree --- ORDER BY (k) --- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512 --- AS --- SELECT number, if(number==1023, 'happy new year', if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number)))) --- FROM numbers(1024); --- --- -- check inverted index was created --- --- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1; --- --- -- search inverted index, no row has 'happy birthday' --- SELECT count() == 0 FROM tab WHERE s == 'happy birthday'; --- --- -- check the query does not skip any of the 2 granules(1024 rows total; each granule has 512 rows) --- SYSTEM FLUSH LOGS; --- SELECT read_rows==1024 from system.query_log --- WHERE query_kind ='Select' --- AND current_database = currentDatabase() --- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s == \'happy birthday\';') --- AND type='QueryFinish' --- AND result_rows==1 --- LIMIT 1; --- --- -- search inverted index, no row has 'happy new year' --- SELECT count() == 1 FROM tab WHERE s == 'happy new year'; --- --- -- check the query only read 1 granule because of density (1024 rows total; each granule has 512 rows) --- SYSTEM FLUSH LOGS; --- SELECT read_rows==512 from system.query_log --- WHERE query_kind ='Select' --- AND current_database = currentDatabase() --- AND endsWith(trimRight(query), 'SELECT count() == 1 FROM tab WHERE s == \'happy new year\';') --- AND type='QueryFinish' --- AND 
result_rows==1 --- LIMIT 1;