mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-18 05:32:52 +00:00
Replace the density setting with max_rows_in_postings_list for full-text search
This commit is contained in:
parent
dd0751d324
commit
25545d504d
@ -21,14 +21,18 @@ namespace ErrorCodes
|
|||||||
extern const int BAD_ARGUMENTS;
|
extern const int BAD_ARGUMENTS;
|
||||||
}
|
}
|
||||||
|
|
||||||
GinFilterParameters::GinFilterParameters(size_t ngrams_, Float64 density_)
|
GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_)
|
||||||
: ngrams(ngrams_)
|
: ngrams(ngrams_)
|
||||||
, density(density_)
|
, max_rows_in_postings_list(max_rows_)
|
||||||
{
|
{
|
||||||
|
/// 0 indicates no limitation of postings list's size
|
||||||
|
if (max_rows_in_postings_list == 0)
|
||||||
|
max_rows_in_postings_list = std::numeric_limits<UInt64>::max();
|
||||||
|
|
||||||
if (ngrams > 8)
|
if (ngrams > 8)
|
||||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
|
||||||
if (density <= 0 || density > 1)
|
if (max_rows_in_postings_list < MIN_ROWS_IN_POSTINGS_LIST)
|
||||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The density inverted index gin filter must be between 0 and 1");
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
|
||||||
}
|
}
|
||||||
|
|
||||||
GinFilter::GinFilter(const GinFilterParameters & params_)
|
GinFilter::GinFilter(const GinFilterParameters & params_)
|
||||||
@ -36,7 +40,7 @@ GinFilter::GinFilter(const GinFilterParameters & params_)
|
|||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const
|
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
|
||||||
{
|
{
|
||||||
if (len > FST::MAX_TERM_LENGTH)
|
if (len > FST::MAX_TERM_LENGTH)
|
||||||
return;
|
return;
|
||||||
@ -51,8 +55,7 @@ void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePt
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
UInt64 size_limit = std::lround(limit * params.density);
|
auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_in_postings_list);
|
||||||
auto builder = std::make_shared<GinIndexPostingsBuilder>(size_limit);
|
|
||||||
builder->add(rowID);
|
builder->add(rowID);
|
||||||
|
|
||||||
store->setPostingsBuilder(term, builder);
|
store->setPostingsBuilder(term, builder);
|
||||||
|
@ -8,13 +8,15 @@ namespace DB
|
|||||||
{
|
{
|
||||||
|
|
||||||
static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
|
static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
|
||||||
|
static inline constexpr UInt64 MIN_ROWS_IN_POSTINGS_LIST = 8 * 1024;
|
||||||
|
static inline constexpr UInt64 DEFAULT_ROWS_IN_POSTINGS_LIST = 64 * 1024;
|
||||||
|
|
||||||
struct GinFilterParameters
|
struct GinFilterParameters
|
||||||
{
|
{
|
||||||
GinFilterParameters(size_t ngrams_, Float64 density_);
|
GinFilterParameters(size_t ngrams_, UInt64 max_rows_);
|
||||||
|
|
||||||
size_t ngrams;
|
size_t ngrams;
|
||||||
Float64 density;
|
UInt64 max_rows_in_postings_list;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct GinSegmentWithRowIdRange
|
struct GinSegmentWithRowIdRange
|
||||||
@ -42,7 +44,7 @@ public:
|
|||||||
|
|
||||||
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
|
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
|
||||||
/// for building inverted index for the given store.
|
/// for building inverted index for the given store.
|
||||||
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const;
|
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const;
|
||||||
|
|
||||||
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
|
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
|
||||||
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
|
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
|
||||||
|
@ -109,14 +109,14 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorInverted::getGranuleAndReset()
|
|||||||
return new_granule;
|
return new_granule;
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit)
|
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter)
|
||||||
{
|
{
|
||||||
size_t cur = 0;
|
size_t cur = 0;
|
||||||
size_t token_start = 0;
|
size_t token_start = 0;
|
||||||
size_t token_len = 0;
|
size_t token_len = 0;
|
||||||
|
|
||||||
while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len))
|
while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len))
|
||||||
gin_filter.add(data + token_start, token_len, rowID, store, limit);
|
gin_filter.add(data + token_start, token_len, rowID, store);
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit)
|
void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit)
|
||||||
@ -150,7 +150,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
|
|||||||
for (size_t row_num = 0; row_num < elements_size; ++row_num)
|
for (size_t row_num = 0; row_num < elements_size; ++row_num)
|
||||||
{
|
{
|
||||||
auto ref = column_key.getDataAt(element_start_row + row_num);
|
auto ref = column_key.getDataAt(element_start_row + row_num);
|
||||||
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
|
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
|
||||||
store->incrementCurrentSizeBy(ref.size);
|
store->incrementCurrentSizeBy(ref.size);
|
||||||
}
|
}
|
||||||
current_position += 1;
|
current_position += 1;
|
||||||
@ -165,7 +165,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
|
|||||||
for (size_t i = 0; i < rows_read; ++i)
|
for (size_t i = 0; i < rows_read; ++i)
|
||||||
{
|
{
|
||||||
auto ref = column->getDataAt(current_position + i);
|
auto ref = column->getDataAt(current_position + i);
|
||||||
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
|
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
|
||||||
store->incrementCurrentSizeBy(ref.size);
|
store->incrementCurrentSizeBy(ref.size);
|
||||||
row_id++;
|
row_id++;
|
||||||
if (store->needToWrite())
|
if (store->needToWrite())
|
||||||
@ -735,8 +735,8 @@ MergeTreeIndexPtr invertedIndexCreator(
|
|||||||
const IndexDescription & index)
|
const IndexDescription & index)
|
||||||
{
|
{
|
||||||
size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
|
size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
|
||||||
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
|
UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
|
||||||
GinFilterParameters params(n, density);
|
GinFilterParameters params(n, max_rows);
|
||||||
|
|
||||||
/// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor
|
/// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor
|
||||||
if (n > 0)
|
if (n > 0)
|
||||||
@ -780,13 +780,14 @@ void invertedIndexValidator(const IndexDescription & index, bool /*attach*/)
|
|||||||
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64)
|
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64)
|
||||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer.");
|
throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer.");
|
||||||
|
|
||||||
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::Float64 || index.arguments[1].get<Float64>() <= 0 || index.arguments[1].get<Float64>() > 1))
|
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::UInt64 ||
|
||||||
throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be a float between 0 and 1.");
|
(index.arguments[1].get<UInt64>() != 0 && index.arguments[1].get<UInt64>() < MIN_ROWS_IN_POSTINGS_LIST)))
|
||||||
|
throw Exception(ErrorCodes::INCORRECT_QUERY, "The maximum rows in postings list must be no less than {}", MIN_ROWS_IN_POSTINGS_LIST);
|
||||||
|
|
||||||
/// Just validate
|
/// Just validate
|
||||||
size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
|
size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
|
||||||
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
|
UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_ROWS_IN_POSTINGS_LIST : index.arguments[1].get<UInt64>();
|
||||||
GinFilterParameters params(ngrams, density);
|
GinFilterParameters params(ngrams, max_rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -48,7 +48,7 @@ struct MergeTreeIndexAggregatorInverted final : IMergeTreeIndexAggregator
|
|||||||
|
|
||||||
void update(const Block & block, size_t * pos, size_t limit) override;
|
void update(const Block & block, size_t * pos, size_t limit) override;
|
||||||
|
|
||||||
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit);
|
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter);
|
||||||
|
|
||||||
GinIndexStorePtr store;
|
GinIndexStorePtr store;
|
||||||
Names index_columns;
|
Names index_columns;
|
||||||
|
@ -277,74 +277,3 @@ SELECT * FROM tab WHERE str == 'b' AND 1.0;
|
|||||||
-- AND result_rows==1
|
-- AND result_rows==1
|
||||||
-- LIMIT 1;
|
-- LIMIT 1;
|
||||||
--
|
--
|
||||||
-- ----------------------------------------------------
|
|
||||||
-- SELECT 'Test density==1';
|
|
||||||
--
|
|
||||||
-- DROP TABLE IF EXISTS tab;
|
|
||||||
--
|
|
||||||
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 1.0))
|
|
||||||
-- Engine=MergeTree
|
|
||||||
-- ORDER BY (k)
|
|
||||||
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
|
|
||||||
-- AS
|
|
||||||
-- SELECT number, if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number)))
|
|
||||||
-- FROM numbers(1024);
|
|
||||||
--
|
|
||||||
-- -- check inverted index was created
|
|
||||||
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
|
|
||||||
--
|
|
||||||
-- -- search inverted index, no row has 'happy birthday'
|
|
||||||
-- SELECT count() == 0 FROM tab WHERE s =='happy birthday';
|
|
||||||
--
|
|
||||||
-- -- check that the query skips all granules (0 rows read in total; each granule has 512 rows)
|
|
||||||
-- SYSTEM FLUSH LOGS;
|
|
||||||
-- SELECT read_rows==0 from system.query_log
|
|
||||||
-- WHERE query_kind ='Select'
|
|
||||||
-- AND current_database = currentDatabase()
|
|
||||||
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s ==\'happy birthday\';')
|
|
||||||
-- AND type='QueryFinish'
|
|
||||||
-- AND result_rows==1
|
|
||||||
-- LIMIT 1;
|
|
||||||
--
|
|
||||||
-- ----------------------------------------------------
|
|
||||||
-- SELECT 'Test density==0.1';
|
|
||||||
--
|
|
||||||
-- DROP TABLE IF EXISTS tab;
|
|
||||||
--
|
|
||||||
-- CREATE TABLE tab(k UInt64, s String, INDEX af(s) TYPE inverted(0, 0.1))
|
|
||||||
-- Engine=MergeTree
|
|
||||||
-- ORDER BY (k)
|
|
||||||
-- SETTINGS max_digestion_size_per_segment = 1, index_granularity = 512
|
|
||||||
-- AS
|
|
||||||
-- SELECT number, if(number==1023, 'happy new year', if(number%2, format('happy {}', hex(number)), format('birthday {}', hex(number))))
|
|
||||||
-- FROM numbers(1024);
|
|
||||||
--
|
|
||||||
-- -- check inverted index was created
|
|
||||||
--
|
|
||||||
-- SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1;
|
|
||||||
--
|
|
||||||
-- -- search inverted index, no row has 'happy birthday'
|
|
||||||
-- SELECT count() == 0 FROM tab WHERE s == 'happy birthday';
|
|
||||||
--
|
|
||||||
-- -- check the query does not skip any of the 2 granules(1024 rows total; each granule has 512 rows)
|
|
||||||
-- SYSTEM FLUSH LOGS;
|
|
||||||
-- SELECT read_rows==1024 from system.query_log
|
|
||||||
-- WHERE query_kind ='Select'
|
|
||||||
-- AND current_database = currentDatabase()
|
|
||||||
-- AND endsWith(trimRight(query), 'SELECT count() == 0 FROM tab WHERE s == \'happy birthday\';')
|
|
||||||
-- AND type='QueryFinish'
|
|
||||||
-- AND result_rows==1
|
|
||||||
-- LIMIT 1;
|
|
||||||
--
|
|
||||||
-- -- search inverted index, exactly one row has 'happy new year'
|
|
||||||
-- SELECT count() == 1 FROM tab WHERE s == 'happy new year';
|
|
||||||
--
|
|
||||||
-- -- check that the query reads only 1 granule because of density (1024 rows total; each granule has 512 rows)
|
|
||||||
-- SYSTEM FLUSH LOGS;
|
|
||||||
-- SELECT read_rows==512 from system.query_log
|
|
||||||
-- WHERE query_kind ='Select'
|
|
||||||
-- AND current_database = currentDatabase()
|
|
||||||
-- AND endsWith(trimRight(query), 'SELECT count() == 1 FROM tab WHERE s == \'happy new year\';')
|
|
||||||
-- AND type='QueryFinish'
|
|
||||||
-- AND result_rows==1
|
|
||||||
-- LIMIT 1;
|
|
||||||
|
Loading…
Reference in New Issue
Block a user