Replace the density setting with max_rows_in_postings_list for full-text search

This commit is contained in:
HarryLeeIBM 2023-10-13 10:31:21 -07:00
parent dd0751d324
commit 25545d504d
5 changed files with 27 additions and 92 deletions

View File

@ -21,14 +21,18 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS; extern const int BAD_ARGUMENTS;
} }
// NOTE(review): this is a side-by-side diff rendering; on changed lines the old text
// (density-based) appears first and the new text (max_rows_in_postings_list-based) second.
//
// Constructor of GinFilterParameters. On the new side it:
//  - stores ngrams_ and max_rows_ into the members ngrams and max_rows_in_postings_list;
//  - replaces a max_rows_in_postings_list of 0 with std::numeric_limits<UInt64>::max();
//  - throws Exception(ErrorCodes::BAD_ARGUMENTS) when ngrams > 8;
//  - throws Exception(ErrorCodes::BAD_ARGUMENTS) when max_rows_in_postings_list is below
//    MIN_ROWS_IN_POSTINGS_LIST (a 0 input never reaches this branch because it was already
//    replaced by the maximum UInt64 value above).
GinFilterParameters::GinFilterParameters(size_t ngrams_, Float64 density_) GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_)
: ngrams(ngrams_) : ngrams(ngrams_)
, density(density_) , max_rows_in_postings_list(max_rows_)
{ {
/// 0 indicates no limitation of postings list's size
if (max_rows_in_postings_list == 0)
max_rows_in_postings_list = std::numeric_limits<UInt64>::max();
if (ngrams > 8) if (ngrams > 8)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8"); throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
if (density <= 0 || density > 1) if (max_rows_in_postings_list < MIN_ROWS_IN_POSTINGS_LIST)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The density inverted index gin filter must be between 0 and 1"); throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
} }
GinFilter::GinFilter(const GinFilterParameters & params_) GinFilter::GinFilter(const GinFilterParameters & params_)
@ -36,7 +40,7 @@ GinFilter::GinFilter(const GinFilterParameters & params_)
{ {
} }
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
{ {
if (len > FST::MAX_TERM_LENGTH) if (len > FST::MAX_TERM_LENGTH)
return; return;
@ -51,8 +55,7 @@ void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePt
} }
else else
{ {
UInt64 size_limit = std::lround(limit * params.density); auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_in_postings_list);
auto builder = std::make_shared<GinIndexPostingsBuilder>(size_limit);
builder->add(rowID); builder->add(rowID);
store->setPostingsBuilder(term, builder); store->setPostingsBuilder(term, builder);

View File

@ -8,13 +8,15 @@ namespace DB
{ {
static inline constexpr auto INVERTED_INDEX_NAME = "inverted"; static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
/// Smallest accepted non-zero value for max_rows_in_postings_list (8 * 1024);
/// smaller non-zero values are rejected with an exception.
static inline constexpr UInt64 MIN_ROWS_IN_POSTINGS_LIST = 8 * 1024;
/// Value used for the maximum rows in a postings list when the index is declared
/// with fewer than two arguments (64 * 1024).
static inline constexpr UInt64 DEFAULT_ROWS_IN_POSTINGS_LIST = 64 * 1024;
/// Parameters of a GinFilter.
/// NOTE(review): side-by-side diff rendering; the old member `density` (Float64) is on the
/// left and its replacement `max_rows_in_postings_list` (UInt64) is on the right.
struct GinFilterParameters struct GinFilterParameters
{ {
/// Validates its arguments and throws BAD_ARGUMENTS on invalid values.
GinFilterParameters(size_t ngrams_, Float64 density_); GinFilterParameters(size_t ngrams_, UInt64 max_rows_);
/// N-gram size; the constructor rejects values greater than 8.
size_t ngrams; size_t ngrams;
/// New side: value handed to GinIndexPostingsBuilder when a postings list is created;
/// a constructor argument of 0 is stored as the maximum UInt64 value (no limit).
Float64 density; UInt64 max_rows_in_postings_list;
}; };
struct GinSegmentWithRowIdRange struct GinSegmentWithRowIdRange
@ -42,7 +44,7 @@ public:
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder /// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
/// for building inverted index for the given store. /// for building inverted index for the given store.
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const; void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const;
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index /// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd); void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);

View File

@ -109,14 +109,14 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorInverted::getGranuleAndReset()
return new_granule; return new_granule;
} }
// NOTE(review): side-by-side diff rendering; the new (right-hand) signature drops the
// trailing `UInt64 limit` parameter and no longer forwards it to GinFilter::add.
//
// Splits the buffer [data, data + length) into tokens using
// token_extractor->nextInStringPadded and, for every token found, calls
// gin_filter.add(token pointer, token length, rowID, store) with the aggregator's `store`.
// The loop ends when `cur` reaches `length` or the extractor returns false.
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit) void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter)
{ {
size_t cur = 0; size_t cur = 0;
size_t token_start = 0; size_t token_start = 0;
size_t token_len = 0; size_t token_len = 0;
while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len)) while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len))
gin_filter.add(data + token_start, token_len, rowID, store, limit); gin_filter.add(data + token_start, token_len, rowID, store);
} }
void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit) void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit)
@ -150,7 +150,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t row_num = 0; row_num < elements_size; ++row_num) for (size_t row_num = 0; row_num < elements_size; ++row_num)
{ {
auto ref = column_key.getDataAt(element_start_row + row_num); auto ref = column_key.getDataAt(element_start_row + row_num);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read); addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size); store->incrementCurrentSizeBy(ref.size);
} }
current_position += 1; current_position += 1;
@ -165,7 +165,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t i = 0; i < rows_read; ++i) for (size_t i = 0; i < rows_read; ++i)
{ {
auto ref = column->getDataAt(current_position + i); auto ref = column->getDataAt(current_position + i);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read); addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size); store->incrementCurrentSizeBy(ref.size);
row_id++; row_id++;
if (store->needToWrite()) if (store->needToWrite())
@ -735,8 +735,8 @@ MergeTreeIndexPtr invertedIndexCreator(
const IndexDescription & index) const IndexDescription & index)
{ {
size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>(); size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>(); UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(n, density); GinFilterParameters params(n, max_rows);
/// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor /// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor
if (n > 0) if (n > 0)
@ -780,13 +780,14 @@ void invertedIndexValidator(const IndexDescription & index, bool /*attach*/)
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64) if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer."); throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer.");
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::Float64 || index.arguments[1].get<Float64>() <= 0 || index.arguments[1].get<Float64>() > 1)) if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::UInt64 ||
throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be a float between 0 and 1."); (index.arguments[1].get<UInt64>() != 0 && index.arguments[1].get<UInt64>() < MIN_ROWS_IN_POSTINGS_LIST)))
throw Exception(ErrorCodes::INCORRECT_QUERY, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
/// Just validate /// Just validate
size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>(); size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>(); UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(ngrams, density); GinFilterParameters params(ngrams, max_rows);
} }
} }

View File

@ -48,7 +48,7 @@ struct MergeTreeIndexAggregatorInverted final : IMergeTreeIndexAggregator
void update(const Block & block, size_t * pos, size_t limit) override; void update(const Block & block, size_t * pos, size_t limit) override;
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit); void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter);
GinIndexStorePtr store; GinIndexStorePtr store;
Names index_columns; Names index_columns;

View File

@ -277,74 +277,3 @@ SELECT * FROM tab WHERE str == 'b' AND 1.0;
-- AND result_rows==1 -- AND result_rows==1
-- LIMIT 1; -- LIMIT 1;
-- --
-- ----------------------------------------------------
-- SELECT 'Test density==1';
--
-- DROP TABLE IF EXISTS tab;
--
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 1.0))
-- Engine=MergeTree
-- ORDER BY (k)
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
-- AS
-- SELECT number, if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number)))
-- FROM numbers(1024);
--
-- -- check inverted index was created
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
--
-- -- search inverted index, no row has 'happy birthday'
-- SELECT count() == 0 FROM tab WHERE s =='happy birthday';
--
-- -- check the query skips all granules (0 rows read in total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==0 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s ==\'happy birthday\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;
--
-- ----------------------------------------------------
-- SELECT 'Test density==0.1';
--
-- DROP TABLE IF EXISTS tab;
--
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 0.1))
-- Engine=MergeTree
-- ORDER BY (k)
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
-- AS
-- SELECT number, if(number==1023, 'happy new year', if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number))))
-- FROM numbers(1024);
--
-- -- check inverted index was created
--
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
--
-- -- search inverted index, no row has 'happy birthday'
-- SELECT count() == 0 FROM tab WHERE s == 'happy birthday';
--
-- -- check the query does not skip any of the 2 granules(1024 rows total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==1024 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s == \'happy birthday\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;
--
-- -- search inverted index, exactly one row has 'happy new year'
-- SELECT count() == 1 FROM tab WHERE s == 'happy new year';
--
-- -- check the query only read 1 granule because of density (1024 rows total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==512 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 1 FROM tab WHERE s == \'happy new year\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;