ClickHouse/src/Interpreters/GinFilter.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

97 lines
2.8 KiB
C++
Raw Normal View History

2022-06-24 01:56:15 +00:00
#pragma once
2023-01-20 09:32:36 +00:00
#include <Storages/MergeTree/GinIndexStore.h>
2022-06-24 01:56:15 +00:00
#include <vector>
#include <memory>
2023-01-20 09:32:36 +00:00
2022-06-24 01:56:15 +00:00
namespace DB
{
2023-01-20 09:32:36 +00:00
/// Identifier string of the inverted index: "inverted".
static inline constexpr const char * INVERTED_INDEX_NAME = "inverted";
2022-06-24 01:56:15 +00:00
struct GinFilterParameters
{
2023-01-20 09:32:36 +00:00
GinFilterParameters(size_t ngrams_, Float64 density_);
2022-06-24 01:56:15 +00:00
size_t ngrams;
Float64 density;
2022-06-24 01:56:15 +00:00
};
2023-01-20 09:32:36 +00:00
struct GinSegmentWithRowIdRange
2022-06-24 01:56:15 +00:00
{
2022-07-03 12:18:51 +00:00
/// Segment ID of the row ID range
2022-06-24 01:56:15 +00:00
UInt32 segment_id;
/// First row ID in the range
UInt32 range_start;
2023-01-10 16:26:27 +00:00
/// Last row ID in the range (inclusive)
2022-06-24 01:56:15 +00:00
UInt32 range_end;
};
2023-01-20 09:32:36 +00:00
/// Sequence of GinSegmentWithRowIdRange entries.
using GinSegmentWithRowIdRangeVector = std::vector<GinSegmentWithRowIdRange>;
2023-01-10 17:09:11 +00:00
/// GinFilter provides underlying functionalities for building inverted index and also
2023-01-10 16:26:27 +00:00
/// it does filtering the unmatched rows according to its query string.
/// It also builds and uses skipping index which stores (segmentID, RowIDStart, RowIDEnd) triples.
2022-06-24 01:56:15 +00:00
class GinFilter
{
public:
2023-01-20 09:32:36 +00:00
explicit GinFilter(const GinFilterParameters & params_);
2022-06-24 01:56:15 +00:00
2023-01-20 09:32:36 +00:00
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
/// for building inverted index for the given store.
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
2022-06-24 01:56:15 +00:00
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);
2023-01-10 16:26:27 +00:00
/// Clear the content
2022-06-24 01:56:15 +00:00
void clear();
2023-01-20 09:32:36 +00:00
/// Check if the filter (built from query string) contains any rows in given filter by using
2023-01-10 16:26:27 +00:00
/// given postings list cache
2023-01-20 09:32:36 +00:00
bool contains(const GinFilter & filter, PostingsCacheForStore & cache_store) const;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Set the query string of the filter
2023-01-20 09:32:36 +00:00
void setQueryString(const char * data, size_t len)
2022-06-24 01:56:15 +00:00
{
query_string = String(data, len);
}
2023-01-10 16:26:27 +00:00
/// Add term which are tokens generated from the query string
2023-01-20 09:32:36 +00:00
void addTerm(const char * data, size_t len)
{
if (len > FST::MAX_TERM_LENGTH)
return;
terms.push_back(String(data, len));
}
2022-06-24 01:56:15 +00:00
2023-01-20 09:32:36 +00:00
/// Getter
const String & getQueryString() const { return query_string; }
const std::vector<String> & getTerms() const { return terms; }
2023-01-20 09:32:36 +00:00
const GinSegmentWithRowIdRangeVector & getFilter() const { return rowid_ranges; }
GinSegmentWithRowIdRangeVector & getFilter() { return rowid_ranges; }
2022-06-24 01:56:15 +00:00
private:
2023-01-10 16:26:27 +00:00
/// Filter parameters
2023-01-20 09:32:36 +00:00
const GinFilterParameters & params;
2022-06-24 01:56:15 +00:00
2023-01-10 16:26:27 +00:00
/// Query string of the filter
2022-06-24 01:56:15 +00:00
String query_string;
2023-01-10 16:26:27 +00:00
/// Tokenized terms from query string
2022-06-24 01:56:15 +00:00
std::vector<String> terms;
2023-01-10 16:26:27 +00:00
/// Row ID ranges which are (segmentID, RowIDStart, RowIDEnd)
2023-01-20 09:32:36 +00:00
GinSegmentWithRowIdRangeVector rowid_ranges;
2022-06-24 01:56:15 +00:00
2023-01-20 09:32:36 +00:00
/// Check if the given postings list cache has matched rows by using the filter
bool match(const GinPostingsCache & postings_cache) const;
2022-06-24 01:56:15 +00:00
};
2023-01-20 09:32:36 +00:00
/// Sequence of GinFilter objects.
using GinFilters = std::vector<GinFilter>;
2022-06-24 01:56:15 +00:00
}