ClickHouse/src/Interpreters/GinFilter.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

109 lines
3.4 KiB
C++
Raw Normal View History

2022-06-24 01:56:15 +00:00
#pragma once
#include <vector>
#include <memory>
#include <Storages/MergeTree/GinIndexStore.h>
namespace DB
{
/// Parameter bundle passed to GinFilter on construction.
struct GinFilterParameters
{
    /// Defined out of line.
    /// NOTE(review): presumably stores ngrams_ and density_ in the members below — confirm in the .cpp.
    explicit GinFilterParameters(size_t ngrams_, Float64 density_);

    /// NOTE(review): presumably the n-gram size used when tokenizing — confirm against the tokenizer.
    size_t ngrams;

    /// NOTE(review): exact meaning of the density setting is not visible in this header — confirm at the use site.
    Float64 density;
};
2023-01-10 16:26:27 +00:00
struct GinSegmentWithRowIDRange
2022-06-24 01:56:15 +00:00
{
2022-07-03 12:18:51 +00:00
/// Segment ID of the row ID range
2022-06-24 01:56:15 +00:00
UInt32 segment_id;
/// First row ID in the range
UInt32 range_start;
2023-01-10 16:26:27 +00:00
/// Last row ID in the range (inclusive)
2022-06-24 01:56:15 +00:00
UInt32 range_end;
};
2023-01-10 16:26:27 +00:00
/// GinFilter provides the underlying functionality for building an inverted index, and
/// it also filters out unmatched rows according to its query string.
/// In addition, it builds and uses a skipping index which stores (segmentID, RowIDStart, RowIDEnd) triples.
2022-06-24 01:56:15 +00:00
class GinFilter
{
public:
2023-01-10 16:26:27 +00:00
using GinSegmentWithRowIDRanges = std::vector<GinSegmentWithRowIDRange>;
2022-06-24 01:56:15 +00:00
explicit GinFilter(const GinFilterParameters& params_);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Add term(which length is 'len' and located at 'data') and its row ID to
/// the postings list builder for building inverted index for the given store.
void add(const char* data, size_t len, UInt32 rowID, GinIndexStorePtr& store, UInt64 limit);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
2022-06-24 01:56:15 +00:00
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
2023-01-10 16:26:27 +00:00
/// Clear the content
2022-06-24 01:56:15 +00:00
void clear();
2023-01-10 16:26:27 +00:00
/// Check if the filter(built from query string) contains any rows in given filter 'af' by using
/// given postings list cache
2022-07-19 20:15:59 +00:00
bool contains(const GinFilter& af, PostingsCacheForStore &store) const;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Const getter for the row ID ranges
const GinSegmentWithRowIDRanges& getFilter() const { return rowid_ranges; }
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Mutable getter for the row ID ranges
GinSegmentWithRowIDRanges& getFilter() { return rowid_ranges; }
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Set the query string of the filter
2022-06-24 01:56:15 +00:00
void setQueryString(const char* data, size_t len)
{
query_string = String(data, len);
}
2023-01-10 16:26:27 +00:00
/// Const getter of the query string
const String &getQueryString() const { return query_string; }
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Add term which are tokens generated from the query string
void addTerm(const char* data, size_t len)
{
if (len > FST::MAX_TERM_LENGTH)
return;
terms.push_back(String(data, len));
}
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Const getter of terms(generated from the query string)
2022-06-24 01:56:15 +00:00
const std::vector<String>& getTerms() const { return terms;}
2023-01-10 16:26:27 +00:00
/// Check if the given postings list cache has matched rows by using the filter
bool match(const PostingsCache& postings_cache) const;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Get filter name ("inverted")
static String getName();
2023-01-10 16:26:27 +00:00
/// Constant of filter name
static constexpr auto FilterName = "inverted";
2022-06-24 01:56:15 +00:00
private:
2023-01-10 16:26:27 +00:00
/// Filter parameters
const GinFilterParameters& params;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Query string of the filter
2022-06-24 01:56:15 +00:00
String query_string;
2023-01-10 16:26:27 +00:00
/// Tokenized terms from query string
2022-06-24 01:56:15 +00:00
std::vector<String> terms;
2023-01-10 16:26:27 +00:00
/// Row ID ranges which are (segmentID, RowIDStart, RowIDEnd)
GinSegmentWithRowIDRanges rowid_ranges;
2023-01-10 16:26:27 +00:00
/// Helper method for checking if postings list cache is empty
static bool hasEmptyPostingsList(const PostingsCache& postings_cache);
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Helper method to check if the postings list cache has intersection with given row ID range
static bool matchInRange(const PostingsCache& postings_cache, UInt32 segment_id, UInt32 range_start, UInt32 range_end);
2022-06-24 01:56:15 +00:00
};
/// Shared-ownership pointer to a GinFilter.
using GinFilterPtr = std::shared_ptr<GinFilter>;
}