2022-06-24 01:56:15 +00:00
|
|
|
#include <Columns/ColumnArray.h>
|
|
|
|
#include <Columns/ColumnLowCardinality.h>
|
2023-01-20 09:32:36 +00:00
|
|
|
#include <Columns/ColumnNullable.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <DataTypes/DataTypeArray.h>
|
|
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
2023-01-20 09:32:36 +00:00
|
|
|
#include <DataTypes/DataTypeNullable.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <Disks/DiskLocal.h>
|
|
|
|
#include <Interpreters/GinFilter.h>
|
2023-01-20 09:32:36 +00:00
|
|
|
#include <Storages/MergeTree/GinIndexStore.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeIndexFullText.h>
|
2023-01-20 11:40:19 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeIndexInverted.h>
|
2023-01-20 09:32:36 +00:00
|
|
|
#include <string>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <city.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2023-01-20 09:32:36 +00:00
|
|
|
|
2022-06-24 01:56:15 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int BAD_ARGUMENTS;
|
|
|
|
}
|
2023-01-20 09:32:36 +00:00
|
|
|
|
2023-10-17 00:52:39 +00:00
|
|
|
/// Builds the parameter set of an inverted index filter.
///
/// - `ngrams_` is stored as-is; values greater than 8 are rejected with BAD_ARGUMENTS.
/// - `max_rows_per_postings_list_` is stored as-is, except that the sentinel
///   UNLIMITED_ROWS_PER_POSTINGS_LIST is replaced by the largest UInt64 value.
GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_per_postings_list_)
    : ngrams(ngrams_)
    , max_rows_per_postings_list(max_rows_per_postings_list_)
{
    /// Validate first: a rejected object is discarded anyway, so the order of the two checks is not observable.
    if (ngrams > 8)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
    }

    const bool is_unlimited = (max_rows_per_postings_list == UNLIMITED_ROWS_PER_POSTINGS_LIST);
    if (is_unlimited)
    {
        max_rows_per_postings_list = std::numeric_limits<UInt64>::max();
    }
}
|
|
|
|
|
2023-01-05 03:42:45 +00:00
|
|
|
/// Creates a filter whose `params` member is initialized from `params_`.
/// No other state is set up here.
GinFilter::GinFilter(const GinFilterParameters & params_)
    : params(params_)
{}
|
|
|
|
|
2023-10-13 17:31:21 +00:00
|
|
|
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-09-07 18:22:09 +00:00
|
|
|
if (len > FST::MAX_TERM_LENGTH)
|
2022-06-24 01:56:15 +00:00
|
|
|
return;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
String term(data, len);
|
2023-01-20 09:32:36 +00:00
|
|
|
auto it = store->getPostingsListBuilder().find(term);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
if (it != store->getPostingsListBuilder().end())
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
|
|
|
if (!it->second->contains(rowID))
|
|
|
|
it->second->add(rowID);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2023-10-17 00:52:39 +00:00
|
|
|
auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_per_postings_list);
|
2022-06-24 01:56:15 +00:00
|
|
|
builder->add(rowID);
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
store->setPostingsBuilder(term, builder);
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-25 23:29:30 +00:00
|
|
|
/// This method assumes segmentIDs are in increasing order, which is true since rows are
|
|
|
|
/// digested sequentially and segments are created sequentially too.
|
2022-06-24 01:56:15 +00:00
|
|
|
void GinFilter::addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd)
|
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
/// check segment ids are monotonic increasing
|
|
|
|
assert(rowid_ranges.empty() || rowid_ranges.back().segment_id <= segmentID);
|
|
|
|
|
|
|
|
if (!rowid_ranges.empty())
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
|
|
|
/// Try to merge the rowID range with the last one in the container
|
2023-01-20 09:32:36 +00:00
|
|
|
GinSegmentWithRowIdRange & last_rowid_range = rowid_ranges.back();
|
2023-01-10 16:26:27 +00:00
|
|
|
|
|
|
|
if (last_rowid_range.segment_id == segmentID &&
|
|
|
|
last_rowid_range.range_end+1 == rowIDStart)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
last_rowid_range.range_end = rowIDEnd;
|
2022-06-24 01:56:15 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2023-01-10 16:26:27 +00:00
|
|
|
rowid_ranges.push_back({segmentID, rowIDStart, rowIDEnd});
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void GinFilter::clear()
|
|
|
|
{
|
2023-01-20 09:32:36 +00:00
|
|
|
query_string.clear();
|
2022-09-25 23:29:30 +00:00
|
|
|
terms.clear();
|
2023-01-10 16:26:27 +00:00
|
|
|
rowid_ranges.clear();
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|
|
|
|
|
2023-01-20 10:56:20 +00:00
|
|
|
bool GinFilter::contains(const GinFilter & filter, PostingsCacheForStore & cache_store) const
|
|
|
|
{
|
|
|
|
if (filter.getTerms().empty())
|
|
|
|
return true;
|
|
|
|
|
2023-01-20 11:18:40 +00:00
|
|
|
GinPostingsCachePtr postings_cache = cache_store.getPostings(filter.getQueryString());
|
2023-01-20 10:56:20 +00:00
|
|
|
if (postings_cache == nullptr)
|
|
|
|
{
|
|
|
|
GinIndexStoreDeserializer reader(cache_store.store);
|
|
|
|
postings_cache = reader.createPostingsCacheFromTerms(filter.getTerms());
|
|
|
|
cache_store.cache[filter.getQueryString()] = postings_cache;
|
|
|
|
}
|
|
|
|
|
|
|
|
return match(*postings_cache);
|
|
|
|
}
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Helper method for checking if postings list cache is empty
|
2023-01-20 11:18:40 +00:00
|
|
|
bool hasEmptyPostingsList(const GinPostingsCache & postings_cache)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-10 16:26:27 +00:00
|
|
|
if (postings_cache.empty())
|
2022-06-24 01:56:15 +00:00
|
|
|
return true;
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
for (const auto & term_postings : postings_cache)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-20 11:18:40 +00:00
|
|
|
const GinSegmentedPostingsListContainer & container = term_postings.second;
|
2022-07-19 20:15:59 +00:00
|
|
|
if (container.empty())
|
2022-06-24 01:56:15 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Helper method to check if the postings list cache has intersection with given row ID range
|
2023-01-20 11:18:40 +00:00
|
|
|
bool matchInRange(const GinPostingsCache & postings_cache, UInt32 segment_id, UInt32 range_start, UInt32 range_end)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Check for each term
|
2022-06-24 01:56:15 +00:00
|
|
|
GinIndexPostingsList intersection_result;
|
|
|
|
bool intersection_result_init = false;
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
for (const auto & term_postings : postings_cache)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2023-01-10 17:09:11 +00:00
|
|
|
/// Check if it is in the same segment by searching for segment_id
|
2023-01-20 11:18:40 +00:00
|
|
|
const GinSegmentedPostingsListContainer & container = term_postings.second;
|
2023-01-10 16:26:27 +00:00
|
|
|
auto container_it = container.find(segment_id);
|
2022-07-03 12:18:51 +00:00
|
|
|
if (container_it == container.cend())
|
|
|
|
return false;
|
|
|
|
auto min_in_container = container_it->second->minimum();
|
2022-07-03 12:44:34 +00:00
|
|
|
auto max_in_container = container_it->second->maximum();
|
2023-01-05 03:42:45 +00:00
|
|
|
|
2023-01-05 04:08:28 +00:00
|
|
|
//check if the postings list has always match flag
|
2023-01-05 03:42:45 +00:00
|
|
|
if (container_it->second->cardinality() == 1 && UINT32_MAX == min_in_container)
|
|
|
|
continue; //always match
|
|
|
|
|
2022-07-03 12:18:51 +00:00
|
|
|
if (range_start > max_in_container || min_in_container > range_end)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/// Delay initialization as late as possible
|
|
|
|
if (!intersection_result_init)
|
|
|
|
{
|
|
|
|
intersection_result_init = true;
|
|
|
|
intersection_result.addRange(range_start, range_end+1);
|
|
|
|
}
|
|
|
|
intersection_result &= *container_it->second;
|
|
|
|
if (intersection_result.cardinality() == 0)
|
|
|
|
return false;
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
}
|
|
|
|
|
2023-01-20 11:18:40 +00:00
|
|
|
bool GinFilter::match(const GinPostingsCache & postings_cache) const
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-07-03 12:18:51 +00:00
|
|
|
if (hasEmptyPostingsList(postings_cache))
|
2022-06-24 01:56:15 +00:00
|
|
|
return false;
|
|
|
|
|
2022-07-03 12:18:51 +00:00
|
|
|
/// Check for each row ID ranges
|
2023-01-20 09:32:36 +00:00
|
|
|
for (const auto & rowid_range: rowid_ranges)
|
2022-09-25 23:29:30 +00:00
|
|
|
if (matchInRange(postings_cache, rowid_range.segment_id, rowid_range.range_start, rowid_range.range_end))
|
|
|
|
return true;
|
|
|
|
return false;
|
2022-06-24 01:56:15 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|