ClickHouse/src/Interpreters/GinFilter.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

177 lines
5.4 KiB
C++
Raw Normal View History

2022-06-24 01:56:15 +00:00
#include <Columns/ColumnArray.h>
#include <Columns/ColumnLowCardinality.h>
2023-01-20 09:32:36 +00:00
#include <Columns/ColumnNullable.h>
2022-06-24 01:56:15 +00:00
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeLowCardinality.h>
2023-01-20 09:32:36 +00:00
#include <DataTypes/DataTypeNullable.h>
2022-06-24 01:56:15 +00:00
#include <Disks/DiskLocal.h>
#include <Interpreters/GinFilter.h>
2023-01-20 09:32:36 +00:00
#include <Storages/MergeTree/GinIndexStore.h>
#include <Storages/MergeTree/MergeTreeIndexFullText.h>
#include <Storages/MergeTree/MergeTreeIndexInverted.h>
2023-01-20 09:32:36 +00:00
#include <string>
#include <algorithm>
#include <city.h>
2022-06-24 01:56:15 +00:00
namespace DB
{
2023-01-20 09:32:36 +00:00
2022-06-24 01:56:15 +00:00
/// Error codes referenced in this file. They are only declared here (`extern`);
/// BAD_ARGUMENTS is thrown by the GinFilterParameters constructor.
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
2023-01-20 09:32:36 +00:00
/// Constructs the parameter set of an inverted index filter.
///
/// @param ngrams_ n-gram size; values above 8 are rejected with BAD_ARGUMENTS.
/// @param max_rows_per_postings_list_ limit stored for postings lists; the special value
///        UNLIMITED_ROWS_PER_POSTINGS_LIST is replaced by the largest UInt64.
GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_per_postings_list_)
    : ngrams(ngrams_)
    , max_rows_per_postings_list(max_rows_per_postings_list_)
{
    /// Translate the "unlimited" marker into the maximum representable limit.
    const bool is_unlimited = (max_rows_per_postings_list == UNLIMITED_ROWS_PER_POSTINGS_LIST);
    if (is_unlimited)
    {
        max_rows_per_postings_list = std::numeric_limits<UInt64>::max();
    }

    const bool ngrams_too_large = (ngrams > 8);
    if (ngrams_too_large)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
    }
}
/// Creates a filter that keeps its own copy of the given parameters.
GinFilter::GinFilter(const GinFilterParameters & params_) : params(params_) {}
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
2022-06-24 01:56:15 +00:00
{
if (len > FST::MAX_TERM_LENGTH)
2022-06-24 01:56:15 +00:00
return;
2023-01-10 16:26:27 +00:00
String term(data, len);
2023-01-20 09:32:36 +00:00
auto it = store->getPostingsListBuilder().find(term);
2022-06-24 01:56:15 +00:00
2023-01-20 09:32:36 +00:00
if (it != store->getPostingsListBuilder().end())
2022-06-24 01:56:15 +00:00
{
if (!it->second->contains(rowID))
it->second->add(rowID);
}
else
{
auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_per_postings_list);
2022-06-24 01:56:15 +00:00
builder->add(rowID);
2023-01-10 16:26:27 +00:00
store->setPostingsBuilder(term, builder);
2022-06-24 01:56:15 +00:00
}
}
/// This method assumes segmentIDs are in increasing order, which is true since rows are
/// digested sequentially and segments are created sequentially too.
2022-06-24 01:56:15 +00:00
void GinFilter::addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd)
{
2023-01-10 16:26:27 +00:00
/// check segment ids are monotonic increasing
assert(rowid_ranges.empty() || rowid_ranges.back().segment_id <= segmentID);
if (!rowid_ranges.empty())
2022-06-24 01:56:15 +00:00
{
/// Try to merge the rowID range with the last one in the container
2023-01-20 09:32:36 +00:00
GinSegmentWithRowIdRange & last_rowid_range = rowid_ranges.back();
2023-01-10 16:26:27 +00:00
if (last_rowid_range.segment_id == segmentID &&
last_rowid_range.range_end+1 == rowIDStart)
2022-06-24 01:56:15 +00:00
{
2023-01-10 16:26:27 +00:00
last_rowid_range.range_end = rowIDEnd;
2022-06-24 01:56:15 +00:00
return;
}
}
2023-01-10 16:26:27 +00:00
rowid_ranges.push_back({segmentID, rowIDStart, rowIDEnd});
2022-06-24 01:56:15 +00:00
}
void GinFilter::clear()
{
2023-01-20 09:32:36 +00:00
query_string.clear();
terms.clear();
2023-01-10 16:26:27 +00:00
rowid_ranges.clear();
2022-06-24 01:56:15 +00:00
}
bool GinFilter::contains(const GinFilter & filter, PostingsCacheForStore & cache_store) const
{
if (filter.getTerms().empty())
return true;
GinPostingsCachePtr postings_cache = cache_store.getPostings(filter.getQueryString());
if (postings_cache == nullptr)
{
GinIndexStoreDeserializer reader(cache_store.store);
postings_cache = reader.createPostingsCacheFromTerms(filter.getTerms());
cache_store.cache[filter.getQueryString()] = postings_cache;
}
return match(*postings_cache);
}
2023-01-20 09:32:36 +00:00
namespace
{
/// Helper method for checking if postings list cache is empty
bool hasEmptyPostingsList(const GinPostingsCache & postings_cache)
2022-06-24 01:56:15 +00:00
{
2023-01-10 16:26:27 +00:00
if (postings_cache.empty())
2022-06-24 01:56:15 +00:00
return true;
2023-01-20 09:32:36 +00:00
for (const auto & term_postings : postings_cache)
2022-06-24 01:56:15 +00:00
{
const GinSegmentedPostingsListContainer & container = term_postings.second;
2022-07-19 20:15:59 +00:00
if (container.empty())
2022-06-24 01:56:15 +00:00
return true;
}
return false;
}
2023-01-20 09:32:36 +00:00
/// Helper method to check if the postings list cache has intersection with given row ID range
bool matchInRange(const GinPostingsCache & postings_cache, UInt32 segment_id, UInt32 range_start, UInt32 range_end)
2022-06-24 01:56:15 +00:00
{
2023-01-20 09:32:36 +00:00
/// Check for each term
2022-06-24 01:56:15 +00:00
GinIndexPostingsList intersection_result;
bool intersection_result_init = false;
2023-01-20 09:32:36 +00:00
for (const auto & term_postings : postings_cache)
2022-06-24 01:56:15 +00:00
{
2023-01-10 17:09:11 +00:00
/// Check if it is in the same segment by searching for segment_id
const GinSegmentedPostingsListContainer & container = term_postings.second;
2023-01-10 16:26:27 +00:00
auto container_it = container.find(segment_id);
2022-07-03 12:18:51 +00:00
if (container_it == container.cend())
return false;
auto min_in_container = container_it->second->minimum();
2022-07-03 12:44:34 +00:00
auto max_in_container = container_it->second->maximum();
2023-01-05 04:08:28 +00:00
//check if the postings list has always match flag
if (container_it->second->cardinality() == 1 && UINT32_MAX == min_in_container)
continue; //always match
2022-07-03 12:18:51 +00:00
if (range_start > max_in_container || min_in_container > range_end)
return false;
/// Delay initialization as late as possible
if (!intersection_result_init)
{
intersection_result_init = true;
intersection_result.addRange(range_start, range_end+1);
}
intersection_result &= *container_it->second;
if (intersection_result.cardinality() == 0)
return false;
2022-06-24 01:56:15 +00:00
}
return true;
}
2023-01-20 09:32:36 +00:00
}
bool GinFilter::match(const GinPostingsCache & postings_cache) const
2022-06-24 01:56:15 +00:00
{
2022-07-03 12:18:51 +00:00
if (hasEmptyPostingsList(postings_cache))
2022-06-24 01:56:15 +00:00
return false;
2022-07-03 12:18:51 +00:00
/// Check for each row ID ranges
2023-01-20 09:32:36 +00:00
for (const auto & rowid_range: rowid_ranges)
if (matchInRange(postings_cache, rowid_range.segment_id, rowid_range.range_start, rowid_range.range_end))
return true;
return false;
2022-06-24 01:56:15 +00:00
}
}