2022-06-24 01:56:15 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <vector>
|
|
|
|
#include <memory>
|
|
|
|
#include <Storages/MergeTree/GinIndexStore.h>
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
struct GinFilterParameters
|
|
|
|
{
|
2023-01-05 03:42:45 +00:00
|
|
|
explicit GinFilterParameters(size_t ngrams_, Float64 density_);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
|
|
|
size_t ngrams;
|
2023-01-05 03:42:45 +00:00
|
|
|
Float64 density;
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
struct GinSegmentWithRowIDRange
|
2022-06-24 01:56:15 +00:00
|
|
|
{
|
2022-07-03 12:18:51 +00:00
|
|
|
/// Segment ID of the row ID range
|
2022-06-24 01:56:15 +00:00
|
|
|
UInt32 segment_id;
|
|
|
|
|
|
|
|
/// First row ID in the range
|
|
|
|
UInt32 range_start;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Last row ID in the range (inclusive)
|
2022-06-24 01:56:15 +00:00
|
|
|
UInt32 range_end;
|
|
|
|
};
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// GinFilter provides underlying funtionalities for building inverted index and also
|
|
|
|
/// it does filtering the unmatched rows according to its query string.
|
|
|
|
/// It also builds and uses skipping index which stores (segmentID, RowIDStart, RowIDEnd) triples.
|
2022-06-24 01:56:15 +00:00
|
|
|
class GinFilter
|
|
|
|
{
|
|
|
|
public:
|
2023-01-10 16:26:27 +00:00
|
|
|
using GinSegmentWithRowIDRanges = std::vector<GinSegmentWithRowIDRange>;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-05 03:42:45 +00:00
|
|
|
explicit GinFilter(const GinFilterParameters& params_);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Add term(which length is 'len' and located at 'data') and its row ID to
|
|
|
|
/// the postings list builder for building inverted index for the given store.
|
2023-01-05 03:42:45 +00:00
|
|
|
void add(const char* data, size_t len, UInt32 rowID, GinIndexStorePtr& store, UInt64 limit);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
|
2022-06-24 01:56:15 +00:00
|
|
|
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Clear the content
|
2022-06-24 01:56:15 +00:00
|
|
|
void clear();
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Check if the filter(built from query string) contains any rows in given filter 'af' by using
|
|
|
|
/// given postings list cache
|
2022-07-19 20:15:59 +00:00
|
|
|
bool contains(const GinFilter& af, PostingsCacheForStore &store) const;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Const getter for the row ID ranges
|
|
|
|
const GinSegmentWithRowIDRanges& getFilter() const { return rowid_ranges; }
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Mutable getter for the row ID ranges
|
|
|
|
GinSegmentWithRowIDRanges& getFilter() { return rowid_ranges; }
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Set the query string of the filter
|
2022-06-24 01:56:15 +00:00
|
|
|
void setQueryString(const char* data, size_t len)
|
|
|
|
{
|
|
|
|
query_string = String(data, len);
|
|
|
|
}
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Const getter of the query string
|
2022-09-25 23:29:30 +00:00
|
|
|
const String &getQueryString() const { return query_string; }
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Add term which are tokens generated from the query string
|
2022-09-28 14:28:28 +00:00
|
|
|
void addTerm(const char* data, size_t len)
|
|
|
|
{
|
|
|
|
if (len > FST::MAX_TERM_LENGTH)
|
|
|
|
return;
|
|
|
|
terms.push_back(String(data, len));
|
|
|
|
}
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Const getter of terms(generated from the query string)
|
2022-06-24 01:56:15 +00:00
|
|
|
const std::vector<String>& getTerms() const { return terms;}
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Check if the given postings list cache has matched rows by using the filter
|
|
|
|
bool match(const PostingsCache& postings_cache) const;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Get filter name ("inverted")
|
2022-09-25 23:29:30 +00:00
|
|
|
static String getName();
|
2022-09-28 14:28:28 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Constant of filter name
|
2022-09-28 14:28:28 +00:00
|
|
|
static constexpr auto FilterName = "inverted";
|
2022-06-24 01:56:15 +00:00
|
|
|
private:
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Filter parameters
|
2023-01-05 03:42:45 +00:00
|
|
|
const GinFilterParameters& params;
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Query string of the filter
|
2022-06-24 01:56:15 +00:00
|
|
|
String query_string;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Tokenized terms from query string
|
2022-06-24 01:56:15 +00:00
|
|
|
std::vector<String> terms;
|
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Row ID ranges which are (segmentID, RowIDStart, RowIDEnd)
|
|
|
|
GinSegmentWithRowIDRanges rowid_ranges;
|
2022-09-25 23:29:30 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Helper method for checking if postings list cache is empty
|
|
|
|
static bool hasEmptyPostingsList(const PostingsCache& postings_cache);
|
2022-06-24 01:56:15 +00:00
|
|
|
|
2023-01-10 16:26:27 +00:00
|
|
|
/// Helper method to check if the postings list cache has intersection with given row ID range
|
|
|
|
static bool matchInRange(const PostingsCache& postings_cache, UInt32 segment_id, UInt32 range_start, UInt32 range_end);
|
2022-06-24 01:56:15 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/// Shared-ownership pointer to a GinFilter.
using GinFilterPtr = std::shared_ptr<GinFilter>;
|
|
|
|
|
|
|
|
}
|