Replace the density setting with max_rows_in_postings_list for full text search

This commit is contained in:
HarryLeeIBM 2023-10-13 10:31:21 -07:00
parent dd0751d324
commit 25545d504d
5 changed files with 27 additions and 92 deletions

View File

@ -21,14 +21,18 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
/// Constructs and validates the inverted (GIN) index filter parameters.
/// NOTE(review): this is a diff view with old and new lines interleaved. Per the commit title, the
/// `Float64 density_` signature, the `density` initializer and the density range check appear to be
/// the removed lines; the `UInt64 max_rows_` / `max_rows_in_postings_list` lines are their replacements.
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS when a parameter fails one of the checks below.
GinFilterParameters::GinFilterParameters(size_t ngrams_, Float64 density_)
GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_)
: ngrams(ngrams_)
, density(density_)
, max_rows_in_postings_list(max_rows_)
{
/// A value of 0 means the postings list size is not limited, so it is mapped to the largest UInt64.
/// This happens before the lower-bound check below, so 0 itself is never rejected by that check.
if (max_rows_in_postings_list == 0)
max_rows_in_postings_list = std::numeric_limits<UInt64>::max();
/// n-gram sizes above 8 are rejected.
if (ngrams > 8)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
/// Density had to lie in (0, 1].
if (density <= 0 || density > 1)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The density inverted index gin filter must be between 0 and 1");
/// Any explicit (non-zero) limit below MIN_ROWS_IN_POSTINGS_LIST is rejected.
if (max_rows_in_postings_list < MIN_ROWS_IN_POSTINGS_LIST)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
}
GinFilter::GinFilter(const GinFilterParameters & params_)
@ -36,7 +40,7 @@ GinFilter::GinFilter(const GinFilterParameters & params_)
{
}
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
{
if (len > FST::MAX_TERM_LENGTH)
return;
@ -51,8 +55,7 @@ void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePt
}
else
{
UInt64 size_limit = std::lround(limit * params.density);
auto builder = std::make_shared<GinIndexPostingsBuilder>(size_limit);
auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_in_postings_list);
builder->add(rowID);
store->setPostingsBuilder(term, builder);

View File

@ -8,13 +8,15 @@ namespace DB
{
static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
/// Lower bound for an explicit postings-list row limit: the GinFilterParameters constructor and
/// invertedIndexValidator reject non-zero values below it.
static inline constexpr UInt64 MIN_ROWS_IN_POSTINGS_LIST = 8 * 1024;
/// Row limit used by invertedIndexCreator and invertedIndexValidator when the index is declared
/// with fewer than two arguments.
static inline constexpr UInt64 DEFAULT_ROWS_IN_POSTINGS_LIST = 64 * 1024;
/// Parameters of the inverted (GIN) index filter.
/// NOTE(review): this is a diff view with old and new lines interleaved. Per the commit title, the
/// `Float64 density_` constructor and the `density` member appear to be the removed lines; the
/// `UInt64 max_rows_` constructor and `max_rows_in_postings_list` member are their replacements.
struct GinFilterParameters
{
GinFilterParameters(size_t ngrams_, Float64 density_);
GinFilterParameters(size_t ngrams_, UInt64 max_rows_);
/// n-gram size; the constructor rejects values greater than 8.
size_t ngrams;
Float64 density;
/// Row limit handed to each newly created GinIndexPostingsBuilder in GinFilter::add.
/// The constructor replaces 0 with the largest UInt64 value (no limit).
UInt64 max_rows_in_postings_list;
};
struct GinSegmentWithRowIdRange
@ -42,7 +44,7 @@ public:
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
/// for building inverted index for the given store.
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const;
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const;
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);

View File

@ -109,14 +109,14 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorInverted::getGranuleAndReset()
return new_granule;
}
/// Splits the `length` bytes at `data` into tokens with `token_extractor` and adds every token,
/// together with `rowID`, to `gin_filter` via the aggregator's `store`.
/// NOTE(review): this is a diff view with old and new lines interleaved. The signature and the
/// `gin_filter.add` call that carry the extra `limit` argument appear to be the removed lines;
/// the variants without `limit` are their replacements.
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit)
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter)
{
size_t cur = 0;
size_t token_start = 0;
size_t token_len = 0;
/// The extractor advances `cur` and reports each token's offset and length through the out-parameters;
/// the loop stops once `cur` reaches `length` or the extractor returns false.
while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len))
gin_filter.add(data + token_start, token_len, rowID, store, limit);
gin_filter.add(data + token_start, token_len, rowID, store);
}
void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit)
@ -150,7 +150,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t row_num = 0; row_num < elements_size; ++row_num)
{
auto ref = column_key.getDataAt(element_start_row + row_num);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size);
}
current_position += 1;
@ -165,7 +165,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t i = 0; i < rows_read; ++i)
{
auto ref = column->getDataAt(current_position + i);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size);
row_id++;
if (store->needToWrite())
@ -735,8 +735,8 @@ MergeTreeIndexPtr invertedIndexCreator(
const IndexDescription & index)
{
size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
GinFilterParameters params(n, density);
UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(n, max_rows);
/// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor
if (n > 0)
@ -780,13 +780,14 @@ void invertedIndexValidator(const IndexDescription & index, bool /*attach*/)
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer.");
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::Float64 || index.arguments[1].get<Float64>() <= 0 || index.arguments[1].get<Float64>() > 1))
throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be a float between 0 and 1.");
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::UInt64 ||
(index.arguments[1].get<UInt64>() != 0 && index.arguments[1].get<UInt64>() < MIN_ROWS_IN_POSTINGS_LIST)))
throw Exception(ErrorCodes::INCORRECT_QUERY, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
/// Just validate
size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
GinFilterParameters params(ngrams, density);
UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(ngrams, max_rows);
}
}

View File

@ -48,7 +48,7 @@ struct MergeTreeIndexAggregatorInverted final : IMergeTreeIndexAggregator
void update(const Block & block, size_t * pos, size_t limit) override;
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit);
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter);
GinIndexStorePtr store;
Names index_columns;

View File

@ -277,74 +277,3 @@ SELECT * FROM tab WHERE str == 'b' AND 1.0;
-- AND result_rows==1
-- LIMIT 1;
--
-- ----------------------------------------------------
-- SELECT 'Test density==1';
--
-- DROP TABLE IF EXISTS tab;
--
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 1.0))
-- Engine=MergeTree
-- ORDER BY (k)
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
-- AS
-- SELECT number, if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number)))
-- FROM numbers(1024);
--
-- -- check inverted index was created
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
--
-- -- search inverted index, no row has 'happy birthday'
-- SELECT count() == 0 FROM tab WHERE s =='happy birthday';
--
-- -- check that the query skips all granules (0 rows read in total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==0 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s ==\'happy birthday\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;
--
-- ----------------------------------------------------
-- SELECT 'Test density==0.1';
--
-- DROP TABLE IF EXISTS tab;
--
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 0.1))
-- Engine=MergeTree
-- ORDER BY (k)
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
-- AS
-- SELECT number, if(number==1023, 'happy new year', if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number))))
-- FROM numbers(1024);
--
-- -- check inverted index was created
--
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
--
-- -- search inverted index, no row has 'happy birthday'
-- SELECT count() == 0 FROM tab WHERE s == 'happy birthday';
--
-- -- check the query does not skip any of the 2 granules(1024 rows total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==1024 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s == \'happy birthday\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;
--
-- -- search inverted index, exactly one row has 'happy new year'
-- SELECT count() == 1 FROM tab WHERE s == 'happy new year';
--
-- -- check the query only read 1 granule because of density (1024 rows total; each granule has 512 rows)
-- SYSTEM FLUSH LOGS;
-- SELECT read_rows==512 from system.query_log
-- WHERE query_kind ='Select'
-- AND current_database = currentDatabase()
-- AND endsWith(trimRight(query), 'SELECT count() == 1 FROM tab WHERE s == \'happy new year\';')
-- AND type='QueryFinish'
-- AND result_rows==1
-- LIMIT 1;