2019-02-20 11:22:07 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <Interpreters/BloomFilter.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeIndices.h>
|
2019-02-20 16:24:46 +00:00
|
|
|
#include <Storages/MergeTree/KeyCondition.h>
|
2019-02-20 11:22:07 +00:00
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
/// Interface for string parsers.
|
|
|
|
struct ITokenExtractor
|
|
|
|
{
|
|
|
|
virtual ~ITokenExtractor() = default;
|
2021-04-01 02:37:19 +00:00
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
/// Fast inplace implementation for regular use.
|
|
|
|
/// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
|
|
|
|
/// Returns false if parsing is finished, otherwise returns true.
|
2021-04-01 02:37:19 +00:00
|
|
|
virtual bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0;
|
|
|
|
|
|
|
|
/// Optimized version that can assume at least 15 padding bytes after data + len (as our Columns provide).
|
|
|
|
virtual bool nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
|
|
|
|
{
|
|
|
|
return nextInField(data, len, pos, token_start, token_len);
|
|
|
|
}
|
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
/// Special implementation for creating bloom filter for LIKE function.
|
|
|
|
/// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
|
|
|
|
virtual bool nextLike(const String & str, size_t * pos, String & out) const = 0;
|
|
|
|
|
|
|
|
virtual bool supportLike() const = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Raw pointer to an immutable token extractor.
/// NOTE(review): presumably non-owning (MergeTreeIndexFullText keeps its extractor in a std::unique_ptr) — confirm lifetime at the call sites.
using TokenExtractorPtr = const ITokenExtractor *;
|
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
struct MergeTreeIndexGranuleFullText final : public IMergeTreeIndexGranule
|
2019-02-20 11:22:07 +00:00
|
|
|
{
|
2019-05-10 03:42:28 +00:00
|
|
|
explicit MergeTreeIndexGranuleFullText(
|
2020-05-28 12:37:05 +00:00
|
|
|
const String & index_name_,
|
|
|
|
size_t columns_number,
|
|
|
|
const BloomFilterParameters & params_);
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
~MergeTreeIndexGranuleFullText() override = default;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
|
|
|
void serializeBinary(WriteBuffer & ostr) const override;
|
2021-08-05 18:09:17 +00:00
|
|
|
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-02-26 20:36:15 +00:00
|
|
|
bool empty() const override { return !has_elems; }
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
String index_name;
|
|
|
|
BloomFilterParameters params;
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
std::vector<BloomFilter> bloom_filters;
|
2019-02-20 12:48:50 +00:00
|
|
|
bool has_elems;
|
2019-02-20 11:22:07 +00:00
|
|
|
};
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
/// Shared-ownership handle to a full-text index granule.
using MergeTreeIndexGranuleFullTextPtr = std::shared_ptr<MergeTreeIndexGranuleFullText>;
|
2019-03-11 17:59:36 +00:00
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
struct MergeTreeIndexAggregatorFullText final : IMergeTreeIndexAggregator
|
2019-03-11 17:59:36 +00:00
|
|
|
{
|
2020-05-28 12:37:05 +00:00
|
|
|
explicit MergeTreeIndexAggregatorFullText(
|
|
|
|
const Names & index_columns_,
|
|
|
|
const String & index_name_,
|
|
|
|
const BloomFilterParameters & params_,
|
|
|
|
TokenExtractorPtr token_extractor_);
|
2019-03-11 17:59:36 +00:00
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
~MergeTreeIndexAggregatorFullText() override = default;
|
2019-03-11 17:59:36 +00:00
|
|
|
|
|
|
|
bool empty() const override { return !granule || granule->empty(); }
|
|
|
|
MergeTreeIndexGranulePtr getGranuleAndReset() override;
|
|
|
|
|
|
|
|
void update(const Block & block, size_t * pos, size_t limit) override;
|
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
Names index_columns;
|
|
|
|
String index_name;
|
|
|
|
BloomFilterParameters params;
|
|
|
|
TokenExtractorPtr token_extractor;
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
MergeTreeIndexGranuleFullTextPtr granule;
|
2019-03-11 17:59:36 +00:00
|
|
|
};
|
|
|
|
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
class MergeTreeConditionFullText final : public IMergeTreeIndexCondition
|
2019-02-20 11:22:07 +00:00
|
|
|
{
|
|
|
|
public:
|
2019-05-10 03:42:28 +00:00
|
|
|
MergeTreeConditionFullText(
|
2019-02-20 16:24:46 +00:00
|
|
|
const SelectQueryInfo & query_info,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr context,
|
2020-05-28 12:37:05 +00:00
|
|
|
const Block & index_sample_block,
|
|
|
|
const BloomFilterParameters & params_,
|
|
|
|
TokenExtractorPtr token_extactor_);
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
~MergeTreeConditionFullText() override = default;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-02-20 16:24:46 +00:00
|
|
|
bool alwaysUnknownOrTrue() const override;
|
|
|
|
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
|
2019-02-20 11:22:07 +00:00
|
|
|
private:
|
2019-02-24 21:17:52 +00:00
|
|
|
struct KeyTuplePositionMapping
|
|
|
|
{
|
|
|
|
KeyTuplePositionMapping(size_t tuple_index_, size_t key_index_) : tuple_index(tuple_index_), key_index(key_index_) {}
|
|
|
|
|
|
|
|
size_t tuple_index;
|
|
|
|
size_t key_index;
|
|
|
|
};
|
2019-02-20 16:24:46 +00:00
|
|
|
/// Uses RPN like KeyCondition
|
|
|
|
struct RPNElement
|
|
|
|
{
|
|
|
|
enum Function
|
|
|
|
{
|
|
|
|
/// Atoms of a Boolean expression.
|
|
|
|
FUNCTION_EQUALS,
|
|
|
|
FUNCTION_NOT_EQUALS,
|
2019-02-24 21:17:52 +00:00
|
|
|
FUNCTION_IN,
|
|
|
|
FUNCTION_NOT_IN,
|
2019-07-12 11:35:17 +00:00
|
|
|
FUNCTION_MULTI_SEARCH,
|
2019-02-20 16:24:46 +00:00
|
|
|
FUNCTION_UNKNOWN, /// Can take any value.
|
|
|
|
/// Operators of the logical expression.
|
|
|
|
FUNCTION_NOT,
|
|
|
|
FUNCTION_AND,
|
|
|
|
FUNCTION_OR,
|
|
|
|
/// Constants
|
|
|
|
ALWAYS_FALSE,
|
|
|
|
ALWAYS_TRUE,
|
|
|
|
};
|
|
|
|
|
|
|
|
RPNElement(
|
2019-07-12 11:35:17 +00:00
|
|
|
Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
|
|
|
|
: function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
|
2019-02-20 16:24:46 +00:00
|
|
|
|
|
|
|
Function function = FUNCTION_UNKNOWN;
|
2019-07-12 11:35:17 +00:00
|
|
|
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
|
2019-02-20 16:24:46 +00:00
|
|
|
size_t key_column;
|
2019-07-12 11:35:17 +00:00
|
|
|
|
|
|
|
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
|
2019-05-10 03:42:28 +00:00
|
|
|
std::unique_ptr<BloomFilter> bloom_filter;
|
2019-07-12 11:35:17 +00:00
|
|
|
|
|
|
|
/// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
|
2019-05-10 03:42:28 +00:00
|
|
|
std::vector<std::vector<BloomFilter>> set_bloom_filters;
|
2019-07-12 11:35:17 +00:00
|
|
|
|
|
|
|
/// For FUNCTION_IN and FUNCTION_NOT_IN
|
2019-02-25 18:38:57 +00:00
|
|
|
std::vector<size_t> set_key_position;
|
2019-02-20 16:24:46 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
using RPN = std::vector<RPNElement>;
|
|
|
|
|
2019-02-25 18:23:21 +00:00
|
|
|
bool atomFromAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out);
|
2019-02-20 16:24:46 +00:00
|
|
|
|
|
|
|
bool getKey(const ASTPtr & node, size_t & key_column_num);
|
2019-02-25 18:23:21 +00:00
|
|
|
bool tryPrepareSetBloomFilter(const ASTs & args, RPNElement & out);
|
2019-02-20 16:24:46 +00:00
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
static bool createFunctionEqualsCondition(
|
|
|
|
RPNElement & out, const Field & value, const BloomFilterParameters & params, TokenExtractorPtr token_extractor);
|
2019-07-16 11:40:11 +00:00
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
Names index_columns;
|
|
|
|
DataTypes index_data_types;
|
|
|
|
BloomFilterParameters params;
|
|
|
|
TokenExtractorPtr token_extractor;
|
2019-02-20 16:24:46 +00:00
|
|
|
RPN rpn;
|
2019-02-22 19:59:40 +00:00
|
|
|
/// Sets from syntax analyzer.
|
|
|
|
PreparedSets prepared_sets;
|
2019-02-20 11:22:07 +00:00
|
|
|
};
|
|
|
|
|
2019-02-23 13:06:23 +00:00
|
|
|
|
|
|
|
/// Parser extracting all ngrams from string.
|
2021-04-01 02:37:19 +00:00
|
|
|
struct NgramTokenExtractor final : public ITokenExtractor
|
2019-02-21 21:29:24 +00:00
|
|
|
{
|
|
|
|
NgramTokenExtractor(size_t n_) : n(n_) {}
|
|
|
|
|
2019-03-20 14:52:05 +00:00
|
|
|
static String getName() { return "ngrambf_v1"; }
|
2019-02-21 21:29:24 +00:00
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
|
2019-02-21 21:29:24 +00:00
|
|
|
bool nextLike(const String & str, size_t * pos, String & token) const override;
|
|
|
|
|
2019-02-26 19:37:07 +00:00
|
|
|
bool supportLike() const override { return true; }
|
2019-02-25 18:04:25 +00:00
|
|
|
|
2019-02-21 21:29:24 +00:00
|
|
|
size_t n;
|
2019-02-21 20:32:36 +00:00
|
|
|
};
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-02-25 14:23:19 +00:00
|
|
|
/// Parser extracting tokens (sequences of numbers and ascii letters).
|
2021-04-01 02:37:19 +00:00
|
|
|
struct SplitTokenExtractor final : public ITokenExtractor
|
2019-02-22 10:51:19 +00:00
|
|
|
{
|
2019-03-20 14:52:05 +00:00
|
|
|
static String getName() { return "tokenbf_v1"; }
|
2019-02-22 10:51:19 +00:00
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
bool nextInField(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
|
|
|
|
bool nextInColumn(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
|
2019-02-22 10:51:19 +00:00
|
|
|
bool nextLike(const String & str, size_t * pos, String & token) const override;
|
2019-02-25 18:04:25 +00:00
|
|
|
|
2019-03-06 15:30:27 +00:00
|
|
|
bool supportLike() const override { return true; }
|
2019-02-25 14:23:19 +00:00
|
|
|
};
|
2019-02-22 10:51:19 +00:00
|
|
|
|
2019-02-24 18:55:56 +00:00
|
|
|
|
2021-04-01 02:37:19 +00:00
|
|
|
class MergeTreeIndexFullText final : public IMergeTreeIndex
|
2019-02-20 11:22:07 +00:00
|
|
|
{
|
|
|
|
public:
|
2019-05-10 03:42:28 +00:00
|
|
|
MergeTreeIndexFullText(
|
2020-05-28 13:09:03 +00:00
|
|
|
const IndexDescription & index_,
|
2020-05-28 12:37:05 +00:00
|
|
|
const BloomFilterParameters & params_,
|
|
|
|
std::unique_ptr<ITokenExtractor> && token_extractor_)
|
|
|
|
: IMergeTreeIndex(index_)
|
|
|
|
, params(params_)
|
|
|
|
, token_extractor(std::move(token_extractor_)) {}
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
~MergeTreeIndexFullText() override = default;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
|
|
|
MergeTreeIndexGranulePtr createIndexGranule() const override;
|
2019-03-11 17:59:36 +00:00
|
|
|
MergeTreeIndexAggregatorPtr createIndexAggregator() const override;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-06-19 15:30:48 +00:00
|
|
|
MergeTreeIndexConditionPtr createIndexCondition(
|
2021-04-10 23:33:54 +00:00
|
|
|
const SelectQueryInfo & query, ContextPtr context) const override;
|
2019-02-20 11:22:07 +00:00
|
|
|
|
2019-02-25 08:43:19 +00:00
|
|
|
bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;
|
|
|
|
|
2020-05-28 12:37:05 +00:00
|
|
|
BloomFilterParameters params;
|
2019-10-20 04:43:54 +00:00
|
|
|
/// Function for selecting next token.
|
2020-05-28 12:37:05 +00:00
|
|
|
std::unique_ptr<ITokenExtractor> token_extractor;
|
2019-02-20 11:22:07 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|