Mirror of https://github.com/ClickHouse/ClickHouse.git
Merge pull request #4499 from nikvas0/nikvas0/bloom_filter_index

Bloom filter indices

Commit 2b33e9b7d5
dbms/src/Interpreters/BloomFilter.cpp (new file, 76 lines)
@@ -0,0 +1,76 @@
#include <Interpreters/BloomFilter.h>

#include <city.h>


namespace DB
{

static constexpr UInt64 SEED_GEN_A = 845897321;
static constexpr UInt64 SEED_GEN_B = 217728422;


StringBloomFilter::StringBloomFilter(size_t size_, size_t hashes_, size_t seed_)
    : size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0) {}

StringBloomFilter::StringBloomFilter(const StringBloomFilter & bloom_filter)
    : size(bloom_filter.size), hashes(bloom_filter.hashes), seed(bloom_filter.seed), words(bloom_filter.words), filter(bloom_filter.filter) {}

bool StringBloomFilter::find(const char * data, size_t len)
{
    size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
    size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);

    for (size_t i = 0; i < hashes; ++i)
    {
        size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
        if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))))
            return false;
    }
    return true;
}

void StringBloomFilter::add(const char * data, size_t len)
{
    size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed);
    size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B);

    for (size_t i = 0; i < hashes; ++i)
    {
        size_t pos = (hash1 + i * hash2 + i * i) % (8 * size);
        filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType))));
    }
}

void StringBloomFilter::clear()
{
    filter.assign(words, 0);
}

bool StringBloomFilter::contains(const StringBloomFilter & bf)
{
    for (size_t i = 0; i < words; ++i)
    {
        if ((filter[i] & bf.filter[i]) != bf.filter[i])
            return false;
    }
    return true;
}

UInt64 StringBloomFilter::isEmpty() const
{
    for (size_t i = 0; i < words; ++i)
        if (filter[i] != 0)
            return false;
    return true;
}

bool operator== (const StringBloomFilter & a, const StringBloomFilter & b)
{
    for (size_t i = 0; i < a.words; ++i)
        if (a.filter[i] != b.filter[i])
            return false;
    return true;
}

}
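A minimal usage sketch of the StringBloomFilter API above (not part of the commit). It assumes it is compiled inside the ClickHouse source tree so that the new <Interpreters/BloomFilter.h> header is available, and it uses the same 512-byte, 2-hash, seed-0 shape as the ngrambf_v1(3, 512, 2, 0) index in the test script below.

```cpp
// Hypothetical standalone check, not part of the patch.
#include <Interpreters/BloomFilter.h>
#include <iostream>

int main()
{
    DB::StringBloomFilter filter(512, 2, 0);   /// 512 bytes, 2 hash functions, seed 0.

    filter.add("clickhouse", 10);
    filter.add("bloom", 5);

    /// find() can return a false positive but never a false negative.
    std::cout << filter.find("clickhouse", 10) << '\n';   // 1
    std::cout << filter.find("absent", 6) << '\n';        // 0 in the common case, 1 on a collision

    /// contains() answers "is the other filter a subset of this one";
    /// both filters must be built with the same size and seed.
    DB::StringBloomFilter probe(512, 2, 0);
    probe.add("bloom", 5);
    std::cout << filter.contains(probe) << '\n';          // 1

    return 0;
}
```

This one-sided guarantee is what the skipping index relies on: a granule may only be skipped when find()/contains() definitely rules the searched tokens out.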
dbms/src/Interpreters/BloomFilter.h (new file, 50 lines)
@@ -0,0 +1,50 @@
#pragma once

#include <Core/Types.h>
#include <vector>


namespace DB
{

/// Bloom filter for strings.
class StringBloomFilter
{
public:
    using UnderType = UInt64;
    using Container = std::vector<UnderType>;

    /// size -- size of filter in bytes.
    /// hashes -- number of used hash functions.
    /// seed -- random seed for hash functions generation.
    StringBloomFilter(size_t size_, size_t hashes_, size_t seed_);
    StringBloomFilter(const StringBloomFilter & bloom_filter);

    bool find(const char * data, size_t len);
    void add(const char * data, size_t len);
    void clear();

    /// Checks if this contains everything from another bloom filter.
    /// Bloom filters must have equal size and seed.
    bool contains(const StringBloomFilter & bf);

    const Container & getFilter() const { return filter; }
    Container & getFilter() { return filter; }

    /// For debug.
    UInt64 isEmpty() const;

    friend bool operator== (const StringBloomFilter & a, const StringBloomFilter & b);
private:

    size_t size;
    size_t hashes;
    size_t seed;
    size_t words;
    Container filter;
};


bool operator== (const StringBloomFilter & a, const StringBloomFilter & b);

}
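As an aside (standard Bloom-filter theory, not stated in the patch): the false-positive probability that the size and hashes parameters above trade off against each other can be estimated with m = 8 * size bits, k = hashes hash functions, and n distinct tokens inserted into one granule's filter:

```latex
p \approx \left(1 - e^{-kn/m}\right)^{k},
\qquad
k_{\mathrm{opt}} \approx \frac{m}{n}\,\ln 2 .
```

For example, with the 512-byte, 2-hash filter used in the test below (m = 4096 bits, k = 2) and roughly 500 distinct trigrams per granule, p comes out near 0.05, which is usually acceptable for an index whose only job is to skip granules.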
@@ -317,7 +317,7 @@ bool KeyCondition::addCondition(const String & column, const Range & range)

/** Computes value of constant expression and its data type.
  * Returns false, if expression isn't constant.
  */
-static bool getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type)
+bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type)
{
    String column_name = expr->getColumnName();

@@ -266,6 +266,11 @@ public:
      */
    using MonotonicFunctionsChain = std::vector<FunctionBasePtr>;

    /** Computes value of constant expression and its data type.
      * Returns false, if expression isn't constant.
      */
    static bool getConstant(
        const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type);

    static Block getBlockWithConstants(
        const ASTPtr & query, const SyntaxAnalyzerResultPtr & syntax_analyzer_result, const Context & context);

dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.cpp (new file, 710 lines)
@@ -0,0 +1,710 @@
#include <Storages/MergeTree/MergeTreeBloomFilterIndex.h>

#include <Common/StringUtils/StringUtils.h>
#include <Common/UTF8Helpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/SyntaxAnalyzer.h>
#include <Interpreters/QueryNormalizer.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/RPNBuilder.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSubquery.h>

#include <Poco/Logger.h>

#include <boost/algorithm/string.hpp>


namespace DB
{

namespace ErrorCodes
{
    extern const int INCORRECT_QUERY;
}

/// Adds all tokens from string to bloom filter.
static void stringToBloomFilter(
    const char * data, size_t size, const std::unique_ptr<ITokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
{
    size_t cur = 0;
    size_t token_start = 0;
    size_t token_len = 0;
    while (cur < size && token_extractor->next(data, size, &cur, &token_start, &token_len))
        bloom_filter.add(data + token_start, token_len);
}

/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
static void likeStringToBloomFilter(
    const String & data, const std::unique_ptr<ITokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
{
    size_t cur = 0;
    String token;
    while (cur < data.size() && token_extractor->nextLike(data, &cur, token))
        bloom_filter.add(token.c_str(), token.size());
}

MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
    : IMergeTreeIndexGranule()
    , index(index)
    , bloom_filters(
        index.columns.size(), StringBloomFilter(index.bloom_filter_size, index.bloom_filter_hashes, index.seed))
    , has_elems(false) {}

void MergeTreeBloomFilterIndexGranule::serializeBinary(WriteBuffer & ostr) const
{
    if (empty())
        throw Exception(
            "Attempt to write empty bloom filter index `" + index.name + "`", ErrorCodes::LOGICAL_ERROR);

    for (const auto & bloom_filter : bloom_filters)
        ostr.write(reinterpret_cast<const char *>(bloom_filter.getFilter().data()), index.bloom_filter_size);
}

void MergeTreeBloomFilterIndexGranule::deserializeBinary(ReadBuffer & istr)
{
    for (auto & bloom_filter : bloom_filters)
    {
        istr.read(reinterpret_cast<char *>(bloom_filter.getFilter().data()), index.bloom_filter_size);
    }
    has_elems = true;
}

MergeTreeBloomFilterIndexAggregator::MergeTreeBloomFilterIndexAggregator(const MergeTreeBloomFilterIndex & index)
    : index(index), granule(std::make_shared<MergeTreeBloomFilterIndexGranule>(index)) {}

MergeTreeIndexGranulePtr MergeTreeBloomFilterIndexAggregator::getGranuleAndReset()
{
    auto new_granule = std::make_shared<MergeTreeBloomFilterIndexGranule>(index);
    new_granule.swap(granule);
    return new_granule;
}

void MergeTreeBloomFilterIndexAggregator::update(const Block & block, size_t * pos, size_t limit)
{
    if (*pos >= block.rows())
        throw Exception(
            "The provided position is not less than the number of block rows. Position: "
            + toString(*pos) + ", Block rows: " + toString(block.rows()) + ".", ErrorCodes::LOGICAL_ERROR);

    size_t rows_read = std::min(limit, block.rows() - *pos);

    for (size_t col = 0; col < index.columns.size(); ++col)
    {
        const auto & column = block.getByName(index.columns[col]).column;
        for (size_t i = 0; i < rows_read; ++i)
        {
            auto ref = column->getDataAt(*pos + i);
            stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, granule->bloom_filters[col]);
        }
    }
    granule->has_elems = true;
    *pos += rows_read;
}

const BloomFilterCondition::AtomMap BloomFilterCondition::atom_map
{
    {
        "notEquals",
        [] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
        {
            out.function = RPNElement::FUNCTION_NOT_EQUALS;
            out.bloom_filter = std::make_unique<StringBloomFilter>(
                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);

            const auto & str = value.get<String>();
            stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
            return true;
        }
    },
    {
        "equals",
        [] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
        {
            out.function = RPNElement::FUNCTION_EQUALS;
            out.bloom_filter = std::make_unique<StringBloomFilter>(
                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);

            const auto & str = value.get<String>();
            stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
            return true;
        }
    },
    {
        "like",
        [] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
        {
            out.function = RPNElement::FUNCTION_LIKE;
            out.bloom_filter = std::make_unique<StringBloomFilter>(
                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);

            const auto & str = value.get<String>();
            likeStringToBloomFilter(str, idx.token_extractor_func, *out.bloom_filter);
            return true;
        }
    },
    {
        "notIn",
        [] (RPNElement & out, const Field &, const MergeTreeBloomFilterIndex &)
        {
            out.function = RPNElement::FUNCTION_NOT_IN;
            return true;
        }
    },
    {
        "in",
        [] (RPNElement & out, const Field &, const MergeTreeBloomFilterIndex &)
        {
            out.function = RPNElement::FUNCTION_IN;
            return true;
        }
    },
};

BloomFilterCondition::BloomFilterCondition(
    const SelectQueryInfo & query_info,
    const Context & context,
    const MergeTreeBloomFilterIndex & index_) : index(index_), prepared_sets(query_info.sets)
{
    rpn = std::move(
        RPNBuilder<RPNElement>(
            query_info, context,
            [this] (const ASTPtr & node,
                    const Context & /* context */,
                    Block & block_with_constants,
                    RPNElement & out) -> bool
            {
                return this->atomFromAST(node, block_with_constants, out);
            }).extractRPN());
}

bool BloomFilterCondition::alwaysUnknownOrTrue() const
{
    /// Check like in KeyCondition.
    std::vector<bool> rpn_stack;

    for (const auto & element : rpn)
    {
        if (element.function == RPNElement::FUNCTION_UNKNOWN
            || element.function == RPNElement::ALWAYS_TRUE)
        {
            rpn_stack.push_back(true);
        }
        else if (element.function == RPNElement::FUNCTION_EQUALS
            || element.function == RPNElement::FUNCTION_NOT_EQUALS
            || element.function == RPNElement::FUNCTION_LIKE
            || element.function == RPNElement::FUNCTION_NOT_LIKE
            || element.function == RPNElement::FUNCTION_IN
            || element.function == RPNElement::FUNCTION_NOT_IN
            || element.function == RPNElement::ALWAYS_FALSE)
        {
            rpn_stack.push_back(false);
        }
        else if (element.function == RPNElement::FUNCTION_NOT)
        {
            // do nothing
        }
        else if (element.function == RPNElement::FUNCTION_AND)
        {
            auto arg1 = rpn_stack.back();
            rpn_stack.pop_back();
            auto arg2 = rpn_stack.back();
            rpn_stack.back() = arg1 && arg2;
        }
        else if (element.function == RPNElement::FUNCTION_OR)
        {
            auto arg1 = rpn_stack.back();
            rpn_stack.pop_back();
            auto arg2 = rpn_stack.back();
            rpn_stack.back() = arg1 || arg2;
        }
        else
            throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
    }

    return rpn_stack[0];
}

bool BloomFilterCondition::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const
{
    std::shared_ptr<MergeTreeBloomFilterIndexGranule> granule
        = std::dynamic_pointer_cast<MergeTreeBloomFilterIndexGranule>(idx_granule);
    if (!granule)
        throw Exception(
            "BloomFilter index condition got a granule with the wrong type.", ErrorCodes::LOGICAL_ERROR);

    /// Check like in KeyCondition.
    std::vector<BoolMask> rpn_stack;
    for (const auto & element : rpn)
    {
        if (element.function == RPNElement::FUNCTION_UNKNOWN)
        {
            rpn_stack.emplace_back(true, true);
        }
        else if (element.function == RPNElement::FUNCTION_EQUALS
            || element.function == RPNElement::FUNCTION_NOT_EQUALS)
        {
            rpn_stack.emplace_back(
                granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true);

            if (element.function == RPNElement::FUNCTION_NOT_EQUALS)
                rpn_stack.back() = !rpn_stack.back();
        }
        else if (element.function == RPNElement::FUNCTION_LIKE
            || element.function == RPNElement::FUNCTION_NOT_LIKE)
        {
            rpn_stack.emplace_back(
                granule->bloom_filters[element.key_column].contains(*element.bloom_filter), true);

            if (element.function == RPNElement::FUNCTION_NOT_LIKE)
                rpn_stack.back() = !rpn_stack.back();
        }
        else if (element.function == RPNElement::FUNCTION_IN
            || element.function == RPNElement::FUNCTION_NOT_IN)
        {
            std::vector<bool> result(element.set_bloom_filters.back().size(), true);

            for (size_t column = 0; column < element.set_key_position.size(); ++column)
            {
                const size_t key_idx = element.set_key_position[column];

                const auto & bloom_filters = element.set_bloom_filters[column];
                for (size_t row = 0; row < bloom_filters.size(); ++row)
                    result[row] = result[row] && granule->bloom_filters[key_idx].contains(bloom_filters[row]);
            }

            rpn_stack.emplace_back(
                std::find(std::cbegin(result), std::cend(result), true) != std::cend(result), true);
            if (element.function == RPNElement::FUNCTION_NOT_IN)
                rpn_stack.back() = !rpn_stack.back();
        }
        else if (element.function == RPNElement::FUNCTION_NOT)
        {
            rpn_stack.back() = !rpn_stack.back();
        }
        else if (element.function == RPNElement::FUNCTION_AND)
        {
            auto arg1 = rpn_stack.back();
            rpn_stack.pop_back();
            auto arg2 = rpn_stack.back();
            rpn_stack.back() = arg1 & arg2;
        }
        else if (element.function == RPNElement::FUNCTION_OR)
        {
            auto arg1 = rpn_stack.back();
            rpn_stack.pop_back();
            auto arg2 = rpn_stack.back();
            rpn_stack.back() = arg1 | arg2;
        }
        else if (element.function == RPNElement::ALWAYS_FALSE)
        {
            rpn_stack.emplace_back(false, true);
        }
        else if (element.function == RPNElement::ALWAYS_TRUE)
        {
            rpn_stack.emplace_back(true, false);
        }
        else
            throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
    }

    if (rpn_stack.size() != 1)
        throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR);

    return rpn_stack[0].can_be_true;
}

bool BloomFilterCondition::getKey(const ASTPtr & node, size_t & key_column_num)
{
    auto it = std::find(index.columns.begin(), index.columns.end(), node->getColumnName());
    if (it == index.columns.end())
        return false;

    key_column_num = static_cast<size_t>(it - index.columns.begin());
    return true;
}

bool BloomFilterCondition::atomFromAST(
    const ASTPtr & node, Block & block_with_constants, RPNElement & out)
{
    Field const_value;
    DataTypePtr const_type;
    if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
    {
        const ASTs & args = typeid_cast<const ASTExpressionList &>(*func->arguments).children;

        if (args.size() != 2)
            return false;

        size_t key_arg_pos;          /// Position of argument with key column (non-const argument)
        size_t key_column_num = -1;  /// Number of a key column (inside key_column_names array)

        if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out))
        {
            key_arg_pos = 0;
        }
        else if (KeyCondition::getConstant(args[1], block_with_constants, const_value, const_type) && getKey(args[0], key_column_num))
        {
            key_arg_pos = 0;
        }
        else if (KeyCondition::getConstant(args[0], block_with_constants, const_value, const_type) && getKey(args[1], key_column_num))
        {
            key_arg_pos = 1;
        }
        else
            return false;

        if (const_type && const_type->getTypeId() != TypeIndex::String && const_type->getTypeId() != TypeIndex::FixedString)
            return false;

        if (key_arg_pos == 1 && (func->name != "equals" && func->name != "notEquals"))
            return false;
        else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike"))
            return false;
        else
            key_arg_pos = 0;

        const auto atom_it = atom_map.find(func->name);
        if (atom_it == std::end(atom_map))
            return false;

        out.key_column = key_column_num;
        return atom_it->second(out, const_value, index);
    }
    else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
    {
        /// Check constant like in KeyCondition
        if (const_value.getType() == Field::Types::UInt64
            || const_value.getType() == Field::Types::Int64
            || const_value.getType() == Field::Types::Float64)
        {
            /// Zero in all types is represented in memory the same way as in UInt64.
            out.function = const_value.get<UInt64>()
                ? RPNElement::ALWAYS_TRUE
                : RPNElement::ALWAYS_FALSE;

            return true;
        }
    }

    return false;
}

bool BloomFilterCondition::tryPrepareSetBloomFilter(
    const ASTs & args,
    RPNElement & out)
{
    const ASTPtr & left_arg = args[0];
    const ASTPtr & right_arg = args[1];

    std::vector<KeyTuplePositionMapping> key_tuple_mapping;
    DataTypes data_types;

    const auto * left_arg_tuple = typeid_cast<const ASTFunction *>(left_arg.get());
    if (left_arg_tuple && left_arg_tuple->name == "tuple")
    {
        const auto & tuple_elements = left_arg_tuple->arguments->children;
        for (size_t i = 0; i < tuple_elements.size(); ++i)
        {
            size_t key = 0;
            if (getKey(tuple_elements[i], key))
            {
                key_tuple_mapping.emplace_back(i, key);
                data_types.push_back(index.data_types[key]);
            }
        }
    }
    else
    {
        size_t key = 0;
        if (getKey(left_arg, key))
        {
            key_tuple_mapping.emplace_back(0, key);
            data_types.push_back(index.data_types[key]);
        }
    }

    if (key_tuple_mapping.empty())
        return false;

    PreparedSetKey set_key;
    if (typeid_cast<const ASTSubquery *>(right_arg.get()) || typeid_cast<const ASTIdentifier *>(right_arg.get()))
        set_key = PreparedSetKey::forSubquery(*right_arg);
    else
        set_key = PreparedSetKey::forLiteral(*right_arg, data_types);

    auto set_it = prepared_sets.find(set_key);
    if (set_it == prepared_sets.end())
        return false;

    const SetPtr & prepared_set = set_it->second;
    if (!prepared_set->hasExplicitSetElements())
        return false;

    for (const auto & data_type : prepared_set->getDataTypes())
        if (data_type->getTypeId() != TypeIndex::String && data_type->getTypeId() != TypeIndex::FixedString)
            return false;

    std::vector<std::vector<StringBloomFilter>> bloom_filters;
    std::vector<size_t> key_position;

    const auto & columns = prepared_set->getSetElements();
    for (size_t col = 0; col < key_tuple_mapping.size(); ++col)
    {
        bloom_filters.emplace_back();
        key_position.push_back(key_tuple_mapping[col].key_index);

        size_t tuple_idx = key_tuple_mapping[col].tuple_index;
        const auto & column = columns[tuple_idx];
        for (size_t row = 0; row < prepared_set->getTotalRowCount(); ++row)
        {
            bloom_filters.back().emplace_back(index.bloom_filter_size, index.bloom_filter_hashes, index.seed);
            auto ref = column->getDataAt(row);
            stringToBloomFilter(ref.data, ref.size, index.token_extractor_func, bloom_filters.back().back());
        }
    }

    out.set_key_position = std::move(key_position);
    out.set_bloom_filters = std::move(bloom_filters);

    return true;
}


MergeTreeIndexGranulePtr MergeTreeBloomFilterIndex::createIndexGranule() const
{
    return std::make_shared<MergeTreeBloomFilterIndexGranule>(*this);
}

MergeTreeIndexAggregatorPtr MergeTreeBloomFilterIndex::createIndexAggregator() const
{
    return std::make_shared<MergeTreeBloomFilterIndexAggregator>(*this);
}

IndexConditionPtr MergeTreeBloomFilterIndex::createIndexCondition(
    const SelectQueryInfo & query, const Context & context) const
{
    return std::make_shared<BloomFilterCondition>(query, context, *this);
};

bool MergeTreeBloomFilterIndex::mayBenefitFromIndexForIn(const ASTPtr & node) const
{
    return std::find(std::cbegin(columns), std::cend(columns), node->getColumnName()) != std::cend(columns);
}


bool NgramTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
{
    *token_start = *pos;
    *token_len = 0;
    size_t code_points = 0;
    for (; code_points < n && *token_start + *token_len < len; ++code_points)
    {
        size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*token_start + *token_len]));
        *token_len += sz;
    }
    *pos += UTF8::seqLength(static_cast<UInt8>(data[*pos]));
    return code_points == n;
}

bool NgramTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
{
    token.clear();

    size_t code_points = 0;
    bool escaped = false;
    for (size_t i = *pos; i < str.size();)
    {
        if (escaped && (str[i] == '%' || str[i] == '_' || str[i] == '\\'))
        {
            token += str[i];
            ++code_points;
            escaped = false;
            ++i;
        }
        else if (!escaped && (str[i] == '%' || str[i] == '_'))
        {
            /// This token is too small, go to the next.
            token.clear();
            code_points = 0;
            escaped = false;
            *pos = ++i;
        }
        else if (!escaped && str[i] == '\\')
        {
            escaped = true;
            ++i;
        }
        else
        {
            const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[i]));
            for (size_t j = 0; j < sz; ++j)
                token += str[i + j];
            i += sz;
            ++code_points;
            escaped = false;
        }

        if (code_points == n)
        {
            *pos += UTF8::seqLength(static_cast<UInt8>(str[*pos]));
            return true;
        }
    }

    return false;
}

bool SplitTokenExtractor::next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const
{
    *token_start = *pos;
    *token_len = 0;
    while (*pos < len)
    {
        if (isASCII(data[*pos]) && !isAlphaNumericASCII(data[*pos]))
        {
            if (*token_len > 0)
                return true;
            *token_start = ++*pos;
        }
        else
        {
            const size_t sz = UTF8::seqLength(static_cast<UInt8>(data[*pos]));
            *pos += sz;
            *token_len += sz;
        }
    }
    return *token_len > 0;
}

bool SplitTokenExtractor::nextLike(const String & str, size_t * pos, String & token) const
{
    token.clear();
    bool bad_token = false; // % or _ before token
    bool escaped = false;
    while (*pos < str.size())
    {
        if (!escaped && (str[*pos] == '%' || str[*pos] == '_'))
        {
            token.clear();
            bad_token = true;
            ++*pos;
        }
        else if (!escaped && str[*pos] == '\\')
        {
            escaped = true;
            ++*pos;
        }
        else if (isASCII(str[*pos]) && !isAlphaNumericASCII(str[*pos]))
        {
            if (!bad_token && !token.empty())
                return true;

            token.clear();
            bad_token = false;
            escaped = false;
            ++*pos;
        }
        else
        {
            const size_t sz = UTF8::seqLength(static_cast<UInt8>(str[*pos]));
            for (size_t j = 0; j < sz; ++j)
            {
                token += str[*pos];
                ++*pos;
            }
            escaped = false;
        }
    }

    return !bad_token && !token.empty();
}


std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
    const NamesAndTypesList & new_columns,
    std::shared_ptr<ASTIndexDeclaration> node,
    const Context & context)
{
    if (node->name.empty())
        throw Exception("Index must have unique name", ErrorCodes::INCORRECT_QUERY);

    ASTPtr expr_list = MergeTreeData::extractKeyExpressionList(node->expr->clone());

    auto syntax = SyntaxAnalyzer(context, {}).analyze(
        expr_list, new_columns);
    auto index_expr = ExpressionAnalyzer(expr_list, syntax, context).getActions(false);

    auto sample = ExpressionAnalyzer(expr_list, syntax, context)
        .getActions(true)->getSampleBlock();

    Names columns;
    DataTypes data_types;

    for (size_t i = 0; i < expr_list->children.size(); ++i)
    {
        const auto & column = sample.getByPosition(i);

        columns.emplace_back(column.name);
        data_types.emplace_back(column.type);

        if (data_types.back()->getTypeId() != TypeIndex::String
            && data_types.back()->getTypeId() != TypeIndex::FixedString)
            throw Exception("Bloom filter index can be used only with `String` or `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
    }

    boost::algorithm::to_lower(node->type->name);
    if (node->type->name == NgramTokenExtractor::getName())
    {
        if (!node->type->arguments || node->type->arguments->children.size() != 4)
            throw Exception("`ngrambf` index must have exactly 4 arguments.", ErrorCodes::INCORRECT_QUERY);

        size_t n = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[0]).value.get<size_t>();
        size_t bloom_filter_size = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[1]).value.get<size_t>();
        size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[2]).value.get<size_t>();
        size_t seed = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[3]).value.get<size_t>();

        auto tokenizer = std::make_unique<NgramTokenExtractor>(n);

        return std::make_unique<MergeTreeBloomFilterIndex>(
            node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
            bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer));
    }
    else if (node->type->name == SplitTokenExtractor::getName())
    {
        if (!node->type->arguments || node->type->arguments->children.size() != 3)
            throw Exception("`tokenbf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);

        size_t bloom_filter_size = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[0]).value.get<size_t>();
        size_t bloom_filter_hashes = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[1]).value.get<size_t>();
        size_t seed = typeid_cast<const ASTLiteral &>(
            *node->type->arguments->children[2]).value.get<size_t>();

        auto tokenizer = std::make_unique<SplitTokenExtractor>();

        return std::make_unique<MergeTreeBloomFilterIndex>(
            node->name, std::move(index_expr), columns, data_types, sample, node->granularity,
            bloom_filter_size, bloom_filter_hashes, seed, std::move(tokenizer));
    }
    else
    {
        throw Exception("Unknown index type: `" + node->type->name + "`.", ErrorCodes::LOGICAL_ERROR);
    }
}

}
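Both condition methods above fold the RPN program with a small stack: push a value for each atom, and for NOT/AND/OR pop one argument and combine it with the new top. A self-contained sketch of the same discipline (illustration only, plain bool instead of BoolMask, all names hypothetical):

```cpp
// Illustration of the RPN stack evaluation used by the condition above,
// reduced to plain booleans for the expression (A AND B) OR NOT C.
#include <iostream>
#include <vector>

enum class Op { PUSH_A, PUSH_B, PUSH_C, NOT, AND, OR };

bool evalRPN(const std::vector<Op> & rpn, bool a, bool b, bool c)
{
    std::vector<bool> stack;
    for (Op op : rpn)
    {
        switch (op)
        {
            case Op::PUSH_A: stack.push_back(a); break;
            case Op::PUSH_B: stack.push_back(b); break;
            case Op::PUSH_C: stack.push_back(c); break;
            case Op::NOT:
                stack.back() = !stack.back();
                break;
            case Op::AND:
            case Op::OR:
            {
                // Same pattern as in the methods above: pop one argument,
                // combine it with the element that is now on top.
                bool arg1 = stack.back();
                stack.pop_back();
                bool arg2 = stack.back();
                stack.back() = (op == Op::AND) ? (arg1 && arg2) : (arg1 || arg2);
                break;
            }
        }
    }
    return stack.back();   // a well-formed RPN leaves exactly one value
}

int main()
{
    // (A AND B) OR NOT C  ==>  A B AND C NOT OR
    std::vector<Op> rpn{Op::PUSH_A, Op::PUSH_B, Op::AND, Op::PUSH_C, Op::NOT, Op::OR};
    std::cout << evalRPN(rpn, true, false, true) << '\n';  // 0
    std::cout << evalRPN(rpn, true, true, true) << '\n';   // 1
}
```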
dbms/src/Storages/MergeTree/MergeTreeBloomFilterIndex.h (new file, 207 lines)
@@ -0,0 +1,207 @@
#pragma once

#include <Interpreters/BloomFilter.h>
#include <Storages/MergeTree/MergeTreeIndices.h>
#include <Storages/MergeTree/KeyCondition.h>

#include <memory>


namespace DB
{

class MergeTreeBloomFilterIndex;


struct MergeTreeBloomFilterIndexGranule : public IMergeTreeIndexGranule
{
    explicit MergeTreeBloomFilterIndexGranule(
        const MergeTreeBloomFilterIndex & index);

    ~MergeTreeBloomFilterIndexGranule() override = default;

    void serializeBinary(WriteBuffer & ostr) const override;
    void deserializeBinary(ReadBuffer & istr) override;

    bool empty() const override { return !has_elems; }

    const MergeTreeBloomFilterIndex & index;
    std::vector<StringBloomFilter> bloom_filters;
    bool has_elems;
};

using MergeTreeBloomFilterIndexGranulePtr = std::shared_ptr<MergeTreeBloomFilterIndexGranule>;


struct MergeTreeBloomFilterIndexAggregator : IMergeTreeIndexAggregator
{
    explicit MergeTreeBloomFilterIndexAggregator(const MergeTreeBloomFilterIndex & index);

    ~MergeTreeBloomFilterIndexAggregator() override = default;

    bool empty() const override { return !granule || granule->empty(); }
    MergeTreeIndexGranulePtr getGranuleAndReset() override;

    void update(const Block & block, size_t * pos, size_t limit) override;

    const MergeTreeBloomFilterIndex & index;
    MergeTreeBloomFilterIndexGranulePtr granule;
};


class BloomFilterCondition : public IIndexCondition
{
public:
    BloomFilterCondition(
        const SelectQueryInfo & query_info,
        const Context & context,
        const MergeTreeBloomFilterIndex & index_);

    ~BloomFilterCondition() override = default;

    bool alwaysUnknownOrTrue() const override;

    bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
private:
    struct KeyTuplePositionMapping
    {
        KeyTuplePositionMapping(size_t tuple_index_, size_t key_index_) : tuple_index(tuple_index_), key_index(key_index_) {}

        size_t tuple_index;
        size_t key_index;
    };
    /// Uses RPN like KeyCondition
    struct RPNElement
    {
        enum Function
        {
            /// Atoms of a Boolean expression.
            FUNCTION_EQUALS,
            FUNCTION_NOT_EQUALS,
            FUNCTION_LIKE,
            FUNCTION_NOT_LIKE,
            FUNCTION_IN,
            FUNCTION_NOT_IN,
            FUNCTION_UNKNOWN, /// Can take any value.
            /// Operators of the logical expression.
            FUNCTION_NOT,
            FUNCTION_AND,
            FUNCTION_OR,
            /// Constants
            ALWAYS_FALSE,
            ALWAYS_TRUE,
        };

        RPNElement(
            Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<StringBloomFilter> && const_bloom_filter_ = nullptr)
            : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}

        Function function = FUNCTION_UNKNOWN;
        /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS, FUNCTION_LIKE, FUNCTION_NOT_LIKE.
        size_t key_column;
        std::unique_ptr<StringBloomFilter> bloom_filter;
        /// For FUNCTION_IN and FUNCTION_NOT_IN
        std::vector<std::vector<StringBloomFilter>> set_bloom_filters;
        std::vector<size_t> set_key_position;
    };

    using AtomMap = std::unordered_map<std::string, bool(*)(RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)>;
    using RPN = std::vector<RPNElement>;

    bool atomFromAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out);

    bool getKey(const ASTPtr & node, size_t & key_column_num);
    bool tryPrepareSetBloomFilter(const ASTs & args, RPNElement & out);

    static const AtomMap atom_map;

    const MergeTreeBloomFilterIndex & index;
    RPN rpn;
    /// Sets from syntax analyzer.
    PreparedSets prepared_sets;
};


/// Interface for string parsers.
struct ITokenExtractor
{
    virtual ~ITokenExtractor() = default;
    /// Fast inplace implementation for regular use.
    /// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
    /// Returns false if parsing is finished, otherwise returns true.
    virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0;
    /// Special implementation for creating bloom filter for LIKE function.
    /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
    virtual bool nextLike(const String & str, size_t * pos, String & out) const = 0;

    virtual bool supportLike() const = 0;
};

/// Parser extracting all ngrams from string.
struct NgramTokenExtractor : public ITokenExtractor
{
    NgramTokenExtractor(size_t n_) : n(n_) {}

    static String getName() { return "ngrambf_v1"; }

    bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
    bool nextLike(const String & str, size_t * pos, String & token) const override;

    bool supportLike() const override { return true; }

    size_t n;
};

/// Parser extracting tokens (sequences of numbers and ascii letters).
struct SplitTokenExtractor : public ITokenExtractor
{
    static String getName() { return "tokenbf_v1"; }

    bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
    bool nextLike(const String & str, size_t * pos, String & token) const override;

    bool supportLike() const override { return true; }
};


class MergeTreeBloomFilterIndex : public IMergeTreeIndex
{
public:
    MergeTreeBloomFilterIndex(
        String name_,
        ExpressionActionsPtr expr_,
        const Names & columns_,
        const DataTypes & data_types_,
        const Block & header_,
        size_t granularity_,
        size_t bloom_filter_size_,
        size_t bloom_filter_hashes_,
        size_t seed_,
        std::unique_ptr<ITokenExtractor> && token_extractor_func_)
        : IMergeTreeIndex(name_, expr_, columns_, data_types_, header_, granularity_)
        , bloom_filter_size(bloom_filter_size_)
        , bloom_filter_hashes(bloom_filter_hashes_)
        , seed(seed_)
        , token_extractor_func(std::move(token_extractor_func_)) {}

    ~MergeTreeBloomFilterIndex() override = default;

    MergeTreeIndexGranulePtr createIndexGranule() const override;
    MergeTreeIndexAggregatorPtr createIndexAggregator() const override;

    IndexConditionPtr createIndexCondition(
        const SelectQueryInfo & query, const Context & context) const override;

    bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;

    /// Bloom filter size in bytes.
    size_t bloom_filter_size;
    /// Number of bloom filter hash functions.
    size_t bloom_filter_hashes;
    /// Bloom filter seed.
    size_t seed;
    /// Function for selecting next token.
    std::unique_ptr<ITokenExtractor> token_extractor_func;
};

}
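For intuition, the following is roughly what the two extractors declared above feed into the bloom filter for a pure-ASCII value. This is an illustration only, not the real extractors (which are UTF-8 aware and additionally handle LIKE escaping); all names in it are hypothetical.

```cpp
// Simplified, ASCII-only illustration of ngrambf_v1 (n = 3) and tokenbf_v1 token extraction.
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// All overlapping 3-grams of the string, the way ngrambf_v1(3, ...) would see ASCII input.
std::vector<std::string> ngrams3(const std::string & s)
{
    std::vector<std::string> out;
    for (size_t i = 0; i + 3 <= s.size(); ++i)
        out.push_back(s.substr(i, 3));
    return out;
}

// Maximal alphanumeric runs, the way tokenbf_v1 would see ASCII input.
std::vector<std::string> tokens(const std::string & s)
{
    std::vector<std::string> out;
    std::string cur;
    for (char c : s)
    {
        if (std::isalnum(static_cast<unsigned char>(c)))
            cur += c;
        else if (!cur.empty())
        {
            out.push_back(cur);
            cur.clear();
        }
    }
    if (!cur.empty())
        out.push_back(cur);
    return out;
}

int main()
{
    const std::string s = "column-oriented DBMS";
    for (const auto & g : ngrams3(s)) std::cout << g << ' ';   // col olu lum umn mn- n-o ...
    std::cout << '\n';
    for (const auto & t : tokens(s)) std::cout << t << ' ';    // column oriented DBMS
    std::cout << '\n';
}
```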
@@ -2517,14 +2517,22 @@ bool MergeTreeData::mayBenefitFromIndexForIn(const ASTPtr & left_in_operand) const
    if (left_in_operand_tuple && left_in_operand_tuple->name == "tuple")
    {
        for (const auto & item : left_in_operand_tuple->arguments->children)
        {
            if (isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(item))
                return true;

            for (const auto & index : skip_indices)
                if (index->mayBenefitFromIndexForIn(item))
                    return true;
        }
        /// The tuple itself may be part of the primary key, so check that as a last resort.
        return isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(left_in_operand);
    }
    else
    {
        for (const auto & index : skip_indices)
            if (index->mayBenefitFromIndexForIn(left_in_operand))
                return true;

        return isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(left_in_operand);
    }
}

@@ -65,10 +65,18 @@ std::unique_ptr<IMergeTreeIndex> setIndexCreator(
    std::shared_ptr<ASTIndexDeclaration> node,
    const Context & context);

std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
    const NamesAndTypesList & columns,
    std::shared_ptr<ASTIndexDeclaration> node,
    const Context & context);


MergeTreeIndexFactory::MergeTreeIndexFactory()
{
    registerIndex("minmax", minmaxIndexCreator);
    registerIndex("set", setIndexCreator);
    registerIndex("ngrambf_v1", bloomFilterIndexCreator);
    registerIndex("tokenbf_v1", bloomFilterIndexCreator);
}

}

@@ -95,6 +95,9 @@ public:
    /// gets filename without extension
    String getFileName() const { return INDEX_FILE_PREFIX + name; }

    /// Checks whether the column is in data skipping index.
    virtual bool mayBenefitFromIndexForIn(const ASTPtr & node) const = 0;

    virtual MergeTreeIndexGranulePtr createIndexGranule() const = 0;
    virtual MergeTreeIndexAggregatorPtr createIndexAggregator() const = 0;

@@ -133,6 +133,20 @@ IndexConditionPtr MergeTreeMinMaxIndex::createIndexCondition(
    return std::make_shared<MinMaxCondition>(query, context, *this);
};

bool MergeTreeMinMaxIndex::mayBenefitFromIndexForIn(const ASTPtr & node) const
{
    const String column_name = node->getColumnName();

    for (const auto & name : columns)
        if (column_name == name)
            return true;

    if (const auto * func = typeid_cast<const ASTFunction *>(node.get()))
        if (func->arguments->children.size() == 1)
            return mayBenefitFromIndexForIn(func->arguments->children.front());

    return false;
}

std::unique_ptr<IMergeTreeIndex> minmaxIndexCreator(
    const NamesAndTypesList & new_columns,

@@ -82,6 +82,7 @@ public:
    IndexConditionPtr createIndexCondition(
        const SelectQueryInfo & query, const Context & context) const override;

    bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;
};

}

@@ -462,11 +462,16 @@ IndexConditionPtr MergeTreeSetSkippingIndex::createIndexCondition(
    return std::make_shared<SetIndexCondition>(query, context, *this);
};

bool MergeTreeSetSkippingIndex::mayBenefitFromIndexForIn(const ASTPtr &) const
{
    return false;
}


std::unique_ptr<IMergeTreeIndex> setIndexCreator(
    const NamesAndTypesList & new_columns,
    std::shared_ptr<ASTIndexDeclaration> node,
    const Context & context)
{
    if (node->name.empty())
        throw Exception("Index must have unique name", ErrorCodes::INCORRECT_QUERY);

@@ -112,6 +112,8 @@ public:
    IndexConditionPtr createIndexCondition(
        const SelectQueryInfo & query, const Context & context) const override;

    bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;

    size_t max_rows = 0;
};

dbms/src/Storages/MergeTree/RPNBuilder.h (new file, 130 lines)
@@ -0,0 +1,130 @@
#pragma once

#include <Common/typeid_cast.h>
#include <Core/Block.h>
#include <DataTypes/DataTypesNumber.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTFunction.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/KeyCondition.h>


namespace DB
{

/// Builds reverse polish notation
template <typename RPNElement>
class RPNBuilder
{
public:
    using RPN = std::vector<RPNElement>;
    using AtomFromASTFunc = std::function<
        bool(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out)>;

    RPNBuilder(
        const SelectQueryInfo & query_info,
        const Context & context_,
        const AtomFromASTFunc & atomFromAST_)
        : context(context_), atomFromAST(atomFromAST_)
    {
        /** Evaluation of expressions that depend only on constants.
          * For the index to be used, if it is written, for example `WHERE Date = toDate(now())`.
          */
        block_with_constants = KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context);

        /// Transform WHERE section to Reverse Polish notation
        const ASTSelectQuery & select = typeid_cast<const ASTSelectQuery &>(*query_info.query);
        if (select.where_expression)
        {
            traverseAST(select.where_expression);

            if (select.prewhere_expression)
            {
                traverseAST(select.prewhere_expression);
                rpn.emplace_back(RPNElement::FUNCTION_AND);
            }
        }
        else if (select.prewhere_expression)
        {
            traverseAST(select.prewhere_expression);
        }
        else
        {
            rpn.emplace_back(RPNElement::FUNCTION_UNKNOWN);
        }
    }

    RPN && extractRPN() { return std::move(rpn); }

private:
    void traverseAST(const ASTPtr & node)
    {
        RPNElement element;

        if (ASTFunction * func = typeid_cast<ASTFunction *>(&*node))
        {
            if (operatorFromAST(func, element))
            {
                auto & args = typeid_cast<ASTExpressionList &>(*func->arguments).children;
                for (size_t i = 0, size = args.size(); i < size; ++i)
                {
                    traverseAST(args[i]);

                    /** The first part of the condition is for the correct support of `and` and `or` functions of arbitrary arity
                      * - in this case `n - 1` elements are added (where `n` is the number of arguments).
                      */
                    if (i != 0 || element.function == RPNElement::FUNCTION_NOT)
                        rpn.emplace_back(std::move(element));
                }

                return;
            }
        }

        if (!atomFromAST(node, context, block_with_constants, element))
        {
            element.function = RPNElement::FUNCTION_UNKNOWN;
        }

        rpn.emplace_back(std::move(element));
    }

    bool operatorFromAST(const ASTFunction * func, RPNElement & out)
    {
        /// Functions AND, OR, NOT.
        /** Also a special function `indexHint` - works as if instead of calling a function there are just parentheses
          * (or, the same thing - calling the function `and` from one argument).
          */
        const ASTs & args = typeid_cast<const ASTExpressionList &>(*func->arguments).children;

        if (func->name == "not")
        {
            if (args.size() != 1)
                return false;

            out.function = RPNElement::FUNCTION_NOT;
        }
        else
        {
            if (func->name == "and" || func->name == "indexHint")
                out.function = RPNElement::FUNCTION_AND;
            else if (func->name == "or")
                out.function = RPNElement::FUNCTION_OR;
            else
                return false;
        }

        return true;
    }

    const Context & context;
    const AtomFromASTFunc & atomFromAST;
    Block block_with_constants;
    RPN rpn;
};

}
@@ -0,0 +1,41 @@
8 aбвгдеёж
"rows_read": 2,
8 aбвгдеёж
"rows_read": 2,
13 abc
"rows_read": 1,
1 ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
"rows_read": 2,
1 ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
"rows_read": 2,
0 ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).
"rows_read": 2,
0 ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).
5 еще строка
"rows_read": 4,
12 <div> странный <strong>html</strong> </div>
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
8 aбвгдеёж
13 abc
"rows_read": 3,
8 aбвгдеёж
"rows_read": 2,
1 column-oriented
2 column-oriented
"rows_read": 4,
0 ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
"rows_read": 2,
1 column-oriented
2 column-oriented
"rows_read": 4,
6 some string
13 abc
"rows_read": 3,
dbms/tests/queries/0_stateless/00908_bloom_filter_index.sh (new executable file, 143 lines)
@@ -0,0 +1,143 @@
#!/usr/bin/env bash

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh

$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.bloom_filter_idx;"
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.bloom_filter_idx2;"
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS test.bloom_filter_idx3;"


# NGRAM BF
$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE test.bloom_filter_idx
(
    k UInt64,
    s String,
    INDEX bf (s, lower(s)) TYPE ngrambf_v1(3, 512, 2, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"

$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE test.bloom_filter_idx2
(
    k UInt64,
    s FixedString(15),
    INDEX bf (s, lower(s)) TYPE ngrambf_v1(3, 512, 2, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"


$CLICKHOUSE_CLIENT --query="INSERT INTO test.bloom_filter_idx VALUES
(0, 'ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).'),
(1, 'ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).'),
(2, 'column-oriented database management system'),
(3, 'столбцовая система управления базами данных'),
(4, 'какая-то строка'),
(5, 'еще строка'),
(6, 'some string'),
(7, 'another string'),
(8, 'aбвгдеёж'),
(9, '2_2%2_2\\\\'),
(11, '!@#$%^&*0123456789'),
(12, '<div> странный <strong>html</strong> </div>'),
(13, 'abc')"

$CLICKHOUSE_CLIENT --query="INSERT INTO test.bloom_filter_idx2 VALUES
(0, 'ClickHouse'),
(1, 'column-oriented'),
(2, 'column-oriented'),
(6, 'some string'),
(8, 'aбвгдеёж'),
(9, '2_2%2_2\\\\'),
(13, 'abc')"

# EQUAL
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx2 WHERE lower(s) = 'aбвгдеёж' OR s = 'aбвгдеёж' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx2 WHERE lower(s) = 'aбвгдеёж' OR s = 'aбвгдеёж' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'aбвгдеёж' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'aбвгдеёж' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE lower(s) = 'abc' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE lower(s) = 'abc' ORDER BY k FORMAT JSON" | grep "rows_read"

# LIKE
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND s LIKE '%ClickHouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND s LIKE '%ClickHouse%' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND lower(s) LIKE '%clickhouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND lower(s) LIKE '%clickhouse%' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%базами данных%' AND s LIKE '%ClickHouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%базами данных%' AND s LIKE '%ClickHouse%' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s LIKE '%базами данных%' AND s LIKE '%ClickHouse%') OR s LIKE '____строка' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s LIKE '%базами данных%' AND s LIKE '%ClickHouse%') OR s LIKE '____строка' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%%<div>_%_%_</div>%%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%%<div>_%_%_</div>%%' ORDER BY k FORMAT JSON" | grep "rows_read"


$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\\\%2%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\\\%2%' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%_\\\\%2\\\\__\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%_\\\\%2\\\\__\\\\' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2\\\\' ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2_' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2_' ORDER BY k FORMAT JSON" | grep "rows_read"

# IN
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s IN ('aбвгдеёж', 'abc') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s IN ('aбвгдеёж', 'abc') ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k FORMAT JSON" | grep "rows_read"


# TOKEN BF
$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE test.bloom_filter_idx3
(
    k UInt64,
    s String,
    INDEX bf (s, lower(s)) TYPE tokenbf_v1(512, 3, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"

$CLICKHOUSE_CLIENT --query="INSERT INTO test.bloom_filter_idx3 VALUES
(0, 'ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).'),
(1, 'column-oriented'),
(2, 'column-oriented'),
(6, 'some string'),
(8, 'column with ints'),
(9, '2_2%2_2\\\\'),
(13, 'abc')"

# EQUAL
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE lower(s) = 'column-oriented' OR s = 'column-oriented' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE lower(s) = 'column-oriented' OR s = 'column-oriented' ORDER BY k FORMAT JSON" | grep "rows_read"

# LIKE
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE lower(s) LIKE '%(dbms)%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE lower(s) LIKE '%(dbms)%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE s LIKE 'column-%' AND s LIKE '%-oriented' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE s LIKE 'column-%' AND s LIKE '%-oriented' ORDER BY k FORMAT JSON" | grep "rows_read"

# IN
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE s IN ('some string', 'abc') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx3 WHERE s IN ('some string', 'abc') ORDER BY k FORMAT JSON" | grep "rows_read"

$CLICKHOUSE_CLIENT --query="DROP TABLE test.bloom_filter_idx"
$CLICKHOUSE_CLIENT --query="DROP TABLE test.bloom_filter_idx2"
$CLICKHOUSE_CLIENT --query="DROP TABLE test.bloom_filter_idx3"
@@ -241,7 +241,7 @@ INDEX index_name expr TYPE type(...) GRANULARITY granularity_value

For tables from the `*MergeTree` family, data skipping indices can be specified.

-These indices aggregate some information about the specified expression on blocks, which consist of `granularity_value` granules,
+These indices aggregate some information about the specified expression on blocks, which consist of `granularity_value` granules (the size of the granule is specified using the `index_granularity` setting in the table engine),
then these aggregates are used in `SELECT` queries for reducing the amount of data to read from the disk by skipping big blocks of data where the `WHERE` query cannot be satisfied.

@@ -273,9 +273,21 @@ Stores extremes of the specified expression (if the expression is `tuple`, then
* `set(max_rows)`
Stores unique values of the specified expression (no more than `max_rows` rows, `max_rows=0` means "no limits"), and uses them to check if the `WHERE` expression is not satisfiable on a block of the data.

* `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`
Stores a [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with strings.
Can be used for optimization of `equals`, `like` and `in` expressions.
`n` -- ngram size,
`size_of_bloom_filter_in_bytes` -- bloom filter size in bytes (you can use big values here, for example 256 or 512, because it can be compressed well),
`number_of_hash_functions` -- number of hash functions used in the bloom filter,
`random_seed` -- seed for the bloom filter hash functions.

* `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`
The same as `ngrambf_v1`, but instead of ngrams it stores tokens, which are sequences separated by non-alphanumeric characters.

```sql
INDEX sample_index (u64 * length(s)) TYPE minmax GRANULARITY 4
-INDEX b (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4
+INDEX sample_index2 (u64 * length(str), i32 + f64 * 100, date, str) TYPE set(100) GRANULARITY 4
+INDEX sample_index3 (lower(str), str) TYPE ngrambf_v1(3, 256, 2, 0) GRANULARITY 4
```