2022-06-24 01:56:15 +00:00
|
|
|
#pragma once
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
#include <Storages/MergeTree/GinIndexStore.h>
|
2022-06-24 01:56:15 +00:00
|
|
|
#include <vector>
|
|
|
|
#include <memory>
|
2023-01-20 09:32:36 +00:00
|
|
|
|
2022-06-24 01:56:15 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2023-01-20 09:32:36 +00:00
|
|
|
|
|
|
|
/// Name string for the inverted index; its value is the literal "inverted".
static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
|
|
|
|
|
2022-06-24 01:56:15 +00:00
|
|
|
struct GinFilterParameters
|
|
|
|
{
|
2023-01-20 09:32:36 +00:00
|
|
|
GinFilterParameters(size_t ngrams_, Float64 density_);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
size_t ngrams;
|
2023-01-05 03:42:45 +00:00
|
|
|
Float64 density;
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
struct GinSegmentWithRowIdRange
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-07-03 12:18:51 +00:00
|
|
|
/// Segment ID of the row ID range
|
2022-06-24 01:56:15 +00:00
|
|
|
UInt32 segment_id;
|
|
|
|
|
|
|
|
/// First row ID in the range
|
|
|
|
UInt32 range_start;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Last row ID in the range (inclusive)
|
2022-06-24 01:56:15 +00:00
|
|
|
UInt32 range_end;
|
|
|
|
};
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// A sequence of (segment ID, row ID range) entries.
using GinSegmentWithRowIdRangeVector = std::vector<GinSegmentWithRowIdRange>;
|
|
|
|
|
2023-01-10 17:09:11 +00:00
|
|
|
/// GinFilter provides underlying functionalities for building inverted index and also
|
2023-01-10 16:26:27 +00:00
|
|
|
/// it does filtering the unmatched rows according to its query string.
|
|
|
|
/// It also builds and uses skipping index which stores (segmentID, RowIDStart, RowIDEnd) triples.
|
2022-06-24 01:56:15 +00:00
|
|
|
class GinFilter
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
explicit GinFilter(const GinFilterParameters & params_);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
|
|
|
|
/// for building inverted index for the given store.
|
|
|
|
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
|
2022-06-24 01:56:15 +00:00
|
|
|
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Clear the content
|
2022-06-24 01:56:15 +00:00
|
|
|
void clear();
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Check if the filter (built from query string) contains any rows in given filter by using
|
2023-01-10 16:26:27 +00:00
|
|
|
/// given postings list cache
|
2023-01-20 09:32:36 +00:00
|
|
|
bool contains(const GinFilter & filter, PostingsCacheForStore & cache_store) const;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Set the query string of the filter
|
2023-01-20 09:32:36 +00:00
|
|
|
void setQueryString(const char * data, size_t len)
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
|
|
|
query_string = String(data, len);
|
|
|
|
}
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Add term which are tokens generated from the query string
|
2023-01-20 09:32:36 +00:00
|
|
|
void addTerm(const char * data, size_t len)
|
2022-09-28 14:28:28 +00:00
|
|
|
{
|
|
|
|
if (len > FST::MAX_TERM_LENGTH)
|
|
|
|
return;
|
|
|
|
terms.push_back(String(data, len));
|
|
|
|
}
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Getter
|
|
|
|
const String & getQueryString() const { return query_string; }
|
|
|
|
const std::vector<String> & getTerms() const { return terms;}
|
|
|
|
const GinSegmentWithRowIdRangeVector & getFilter() const { return rowid_ranges; }
|
|
|
|
GinSegmentWithRowIdRangeVector & getFilter() { return rowid_ranges; }
|
2022-09-28 14:28:28 +00:00
|
|
|
|
2022-06-24 01:56:15 +00:00
|
|
|
private:
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Filter parameters
|
2023-01-20 09:32:36 +00:00
|
|
|
const GinFilterParameters & params;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Query string of the filter
|
2022-06-24 01:56:15 +00:00
|
|
|
String query_string;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Tokenized terms from query string
|
2022-06-24 01:56:15 +00:00
|
|
|
std::vector<String> terms;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Row ID ranges which are (segmentID, RowIDStart, RowIDEnd)
|
2023-01-20 09:32:36 +00:00
|
|
|
GinSegmentWithRowIdRangeVector rowid_ranges;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// Check if the given postings list cache has matched rows by using the filter
|
|
|
|
bool match(const PostingsCache & postings_cache) const;
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
|
|
|
|
2023-01-20 09:32:36 +00:00
|
|
|
/// A sequence of GinFilter objects.
using GinFilters = std::vector<GinFilter>;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
}
|