ClickHouse/src/Storages/MergeTree/MergeTreeIndexFullText.h

176 lines
5.4 KiB
C++
Raw Normal View History

2019-02-20 11:22:07 +00:00
#pragma once
2021-10-06 20:22:29 +00:00
#include <memory>
2019-02-20 11:22:07 +00:00
#include <Storages/MergeTree/MergeTreeIndices.h>
2019-02-20 16:24:46 +00:00
#include <Storages/MergeTree/KeyCondition.h>
2021-10-06 20:22:29 +00:00
#include <Interpreters/BloomFilter.h>
#include <Interpreters/ITokenExtractor.h>
2019-02-20 11:22:07 +00:00
namespace DB
{
struct MergeTreeIndexGranuleFullText final : public IMergeTreeIndexGranule
2019-02-20 11:22:07 +00:00
{
2019-05-10 03:42:28 +00:00
explicit MergeTreeIndexGranuleFullText(
2020-05-28 12:37:05 +00:00
const String & index_name_,
size_t columns_number,
const BloomFilterParameters & params_);
2019-02-20 11:22:07 +00:00
2019-05-10 03:42:28 +00:00
~MergeTreeIndexGranuleFullText() override = default;
2019-02-20 11:22:07 +00:00
void serializeBinary(WriteBuffer & ostr) const override;
void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override;
2019-02-20 11:22:07 +00:00
2019-02-26 20:36:15 +00:00
bool empty() const override { return !has_elems; }
2019-02-20 11:22:07 +00:00
2020-05-28 12:37:05 +00:00
String index_name;
BloomFilterParameters params;
2019-05-10 03:42:28 +00:00
std::vector<BloomFilter> bloom_filters;
2019-02-20 12:48:50 +00:00
bool has_elems;
2019-02-20 11:22:07 +00:00
};
2019-05-10 03:42:28 +00:00
using MergeTreeIndexGranuleFullTextPtr = std::shared_ptr<MergeTreeIndexGranuleFullText>;
2019-03-11 17:59:36 +00:00
struct MergeTreeIndexAggregatorFullText final : IMergeTreeIndexAggregator
2019-03-11 17:59:36 +00:00
{
2020-05-28 12:37:05 +00:00
explicit MergeTreeIndexAggregatorFullText(
const Names & index_columns_,
const String & index_name_,
const BloomFilterParameters & params_,
TokenExtractorPtr token_extractor_);
2019-03-11 17:59:36 +00:00
2019-05-10 03:42:28 +00:00
~MergeTreeIndexAggregatorFullText() override = default;
2019-03-11 17:59:36 +00:00
bool empty() const override { return !granule || granule->empty(); }
MergeTreeIndexGranulePtr getGranuleAndReset() override;
void update(const Block & block, size_t * pos, size_t limit) override;
2020-05-28 12:37:05 +00:00
Names index_columns;
String index_name;
BloomFilterParameters params;
TokenExtractorPtr token_extractor;
2019-05-10 03:42:28 +00:00
MergeTreeIndexGranuleFullTextPtr granule;
2019-03-11 17:59:36 +00:00
};
2019-02-20 11:22:07 +00:00
class MergeTreeConditionFullText final : public IMergeTreeIndexCondition
2019-02-20 11:22:07 +00:00
{
public:
2019-05-10 03:42:28 +00:00
MergeTreeConditionFullText(
2019-02-20 16:24:46 +00:00
const SelectQueryInfo & query_info,
ContextPtr context,
2020-05-28 12:37:05 +00:00
const Block & index_sample_block,
const BloomFilterParameters & params_,
TokenExtractorPtr token_extactor_);
2019-02-20 11:22:07 +00:00
2019-05-10 03:42:28 +00:00
~MergeTreeConditionFullText() override = default;
2019-02-20 11:22:07 +00:00
2019-02-20 16:24:46 +00:00
bool alwaysUnknownOrTrue() const override;
bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
2019-02-20 11:22:07 +00:00
private:
2019-02-24 21:17:52 +00:00
struct KeyTuplePositionMapping
{
KeyTuplePositionMapping(size_t tuple_index_, size_t key_index_) : tuple_index(tuple_index_), key_index(key_index_) {}
size_t tuple_index;
size_t key_index;
};
2019-02-20 16:24:46 +00:00
/// Uses RPN like KeyCondition
struct RPNElement
{
enum Function
{
/// Atoms of a Boolean expression.
FUNCTION_EQUALS,
FUNCTION_NOT_EQUALS,
FUNCTION_HAS,
2019-02-24 21:17:52 +00:00
FUNCTION_IN,
FUNCTION_NOT_IN,
FUNCTION_MULTI_SEARCH,
2019-02-20 16:24:46 +00:00
FUNCTION_UNKNOWN, /// Can take any value.
/// Operators of the logical expression.
FUNCTION_NOT,
FUNCTION_AND,
FUNCTION_OR,
/// Constants
ALWAYS_FALSE,
ALWAYS_TRUE,
};
RPNElement(
Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
: function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
2019-02-20 16:24:46 +00:00
Function function = FUNCTION_UNKNOWN;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
2019-02-20 16:24:46 +00:00
size_t key_column;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
2019-05-10 03:42:28 +00:00
std::unique_ptr<BloomFilter> bloom_filter;
/// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
2019-05-10 03:42:28 +00:00
std::vector<std::vector<BloomFilter>> set_bloom_filters;
/// For FUNCTION_IN and FUNCTION_NOT_IN
2019-02-25 18:38:57 +00:00
std::vector<size_t> set_key_position;
2019-02-20 16:24:46 +00:00
};
using RPN = std::vector<RPNElement>;
bool traverseAtomAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out);
bool traverseASTEquals(
const String & function_name,
const ASTPtr & key_ast,
const DataTypePtr & value_type,
const Field & value_field,
RPNElement & out);
2019-02-20 16:24:46 +00:00
bool getKey(const std::string & key_column_name, size_t & key_column_num);
2019-02-25 18:23:21 +00:00
bool tryPrepareSetBloomFilter(const ASTs & args, RPNElement & out);
2019-02-20 16:24:46 +00:00
2020-05-28 12:37:05 +00:00
static bool createFunctionEqualsCondition(
RPNElement & out, const Field & value, const BloomFilterParameters & params, TokenExtractorPtr token_extractor);
2019-07-16 11:40:11 +00:00
2020-05-28 12:37:05 +00:00
Names index_columns;
DataTypes index_data_types;
BloomFilterParameters params;
TokenExtractorPtr token_extractor;
2019-02-20 16:24:46 +00:00
RPN rpn;
2019-02-22 19:59:40 +00:00
/// Sets from syntax analyzer.
PreparedSets prepared_sets;
2019-02-20 11:22:07 +00:00
};
class MergeTreeIndexFullText final : public IMergeTreeIndex
2019-02-20 11:22:07 +00:00
{
public:
2019-05-10 03:42:28 +00:00
MergeTreeIndexFullText(
2020-05-28 13:09:03 +00:00
const IndexDescription & index_,
2020-05-28 12:37:05 +00:00
const BloomFilterParameters & params_,
std::unique_ptr<ITokenExtractor> && token_extractor_)
: IMergeTreeIndex(index_)
, params(params_)
, token_extractor(std::move(token_extractor_)) {}
2019-02-20 11:22:07 +00:00
2019-05-10 03:42:28 +00:00
~MergeTreeIndexFullText() override = default;
2019-02-20 11:22:07 +00:00
MergeTreeIndexGranulePtr createIndexGranule() const override;
2019-03-11 17:59:36 +00:00
MergeTreeIndexAggregatorPtr createIndexAggregator() const override;
2019-02-20 11:22:07 +00:00
2019-06-19 15:30:48 +00:00
MergeTreeIndexConditionPtr createIndexCondition(
const SelectQueryInfo & query, ContextPtr context) const override;
2019-02-20 11:22:07 +00:00
2019-02-25 08:43:19 +00:00
bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;
2020-05-28 12:37:05 +00:00
BloomFilterParameters params;
2019-10-20 04:43:54 +00:00
/// Function for selecting next token.
2020-05-28 12:37:05 +00:00
std::unique_ptr<ITokenExtractor> token_extractor;
2019-02-20 11:22:07 +00:00
};
}