2019-05-10 03:42:28 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h>
|
|
|
|
#include <Interpreters/QueryNormalizer.h>
|
|
|
|
#include <Interpreters/BloomFilterHash.h>
|
|
|
|
#include <Common/HashTable/ClearableHashMap.h>
|
|
|
|
#include <Storages/MergeTree/RPNBuilder.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h>
|
|
|
|
#include <DataTypes/DataTypeTuple.h>
|
|
|
|
#include <Columns/ColumnConst.h>
|
|
|
|
#include <ext/bit_cast.h>
|
|
|
|
#include <Parsers/ASTSubquery.h>
|
|
|
|
#include <Parsers/ASTIdentifier.h>
|
|
|
|
#include <Columns/ColumnTuple.h>
|
2019-06-19 10:50:37 +00:00
|
|
|
#include <Interpreters/castColumn.h>
|
|
|
|
#include <Interpreters/convertFieldToType.h>
|
2019-05-10 03:42:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Builds the key under which the prepared set for `node` is looked up.
/// Subqueries and identifiers are keyed by the node alone; anything else is keyed as a
/// literal together with the element types derived from `data_type`.
PreparedSetKey getPreparedSetKey(const ASTPtr & node, const DataTypePtr & data_type)
{
    const bool keyed_by_subquery = node->as<ASTSubquery>() || node->as<ASTIdentifier>();
    if (keyed_by_subquery)
        return PreparedSetKey::forSubquery(*node);

    /// If the data type is a tuple, unbox it once and key the literal by its element types.
    const auto * tuple_type = typeid_cast<const DataTypeTuple *>(data_type.get());
    if (tuple_type)
        return PreparedSetKey::forLiteral(*node, tuple_type->getElements());

    /// Otherwise the literal is keyed by a single-element type list.
    return PreparedSetKey::forLiteral(*node, DataTypes(1, data_type));
}
|
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
/// Packs the elements of a prepared set into one column/type pair (the name is a placeholder).
/// A set with exactly one data type is returned as that column and type; otherwise all
/// element columns are wrapped into a ColumnTuple with a matching DataTypeTuple.
ColumnWithTypeAndName getPreparedSetInfo(const SetPtr & prepared_set)
{
    const auto & set_types = prepared_set->getDataTypes();

    if (set_types.size() != 1)
    {
        auto tuple_column = ColumnTuple::create(prepared_set->getSetElements());
        auto tuple_type = std::make_shared<DataTypeTuple>(set_types);
        return {tuple_column, tuple_type, "dummy"};
    }

    return {prepared_set->getSetElements()[0], set_types[0], "dummy"};
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions)
|
|
|
|
{
|
|
|
|
const auto const_column = typeid_cast<const ColumnConst *>(hash_column);
|
|
|
|
const auto non_const_column = typeid_cast<const ColumnUInt64 *>(hash_column);
|
|
|
|
|
|
|
|
if (!const_column && !non_const_column)
|
|
|
|
throw Exception("LOGICAL ERROR: hash column must be Const Column or UInt64 Column.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
if (const_column)
|
|
|
|
{
|
|
|
|
for (size_t index = 0; index < hash_functions; ++index)
|
2019-06-19 15:09:07 +00:00
|
|
|
if (!bloom_filter->findHashWithSeed(const_column->getValue<UInt64>(), BloomFilterHash::bf_hash_seed[index]))
|
2019-05-10 03:42:28 +00:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bool missing_rows = true;
|
|
|
|
const ColumnUInt64::Container & data = non_const_column->getData();
|
|
|
|
|
|
|
|
for (size_t index = 0, size = data.size(); missing_rows && index < size; ++index)
|
|
|
|
{
|
|
|
|
bool match_row = true;
|
|
|
|
for (size_t hash_index = 0; match_row && hash_index < hash_functions; ++hash_index)
|
2019-06-19 15:09:07 +00:00
|
|
|
match_row = bloom_filter->findHashWithSeed(data[index], BloomFilterHash::bf_hash_seed[hash_index]);
|
2019-05-10 03:42:28 +00:00
|
|
|
|
|
|
|
missing_rows = !match_row;
|
|
|
|
}
|
|
|
|
|
|
|
|
return !missing_rows;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Stores the index header, context, query info and hash-function count, then builds the
/// RPN form of the query condition. Each atom of the condition is translated by
/// traverseAtomAST(); the second argument supplied by the builder is not used here.
MergeTreeIndexConditionBloomFilter::MergeTreeIndexConditionBloomFilter(
    const SelectQueryInfo & info_, const Context & context_, const Block & header_, size_t hash_functions_)
    : header(header_)
    , context(context_)
    , query_info(info_)
    , hash_functions(hash_functions_)
{
    auto atom_translator = [this](auto & atom_node, auto &, auto & block_with_constants, auto & rpn_element)
    {
        return traverseAtomAST(atom_node, block_with_constants, rpn_element);
    };

    rpn = std::move(RPNBuilder<RPNElement>(info_, context, atom_translator).extractRPN());
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const
|
|
|
|
{
|
|
|
|
std::vector<bool> rpn_stack;
|
|
|
|
|
|
|
|
for (const auto & element : rpn)
|
|
|
|
{
|
|
|
|
if (element.function == RPNElement::FUNCTION_UNKNOWN
|
|
|
|
|| element.function == RPNElement::ALWAYS_TRUE)
|
|
|
|
{
|
|
|
|
rpn_stack.push_back(true);
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_EQUALS
|
|
|
|
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
|
|
|
|
|| element.function == RPNElement::FUNCTION_IN
|
|
|
|
|| element.function == RPNElement::FUNCTION_NOT_IN
|
|
|
|
|| element.function == RPNElement::ALWAYS_FALSE)
|
|
|
|
{
|
|
|
|
rpn_stack.push_back(false);
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_NOT)
|
|
|
|
{
|
|
|
|
// do nothing
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_AND)
|
|
|
|
{
|
|
|
|
auto arg1 = rpn_stack.back();
|
|
|
|
rpn_stack.pop_back();
|
|
|
|
auto arg2 = rpn_stack.back();
|
|
|
|
rpn_stack.back() = arg1 && arg2;
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_OR)
|
|
|
|
{
|
|
|
|
auto arg1 = rpn_stack.back();
|
|
|
|
rpn_stack.pop_back();
|
|
|
|
auto arg2 = rpn_stack.back();
|
|
|
|
rpn_stack.back() = arg1 || arg2;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
return rpn_stack[0];
|
|
|
|
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const
|
|
|
|
{
|
|
|
|
std::vector<BoolMask> rpn_stack;
|
|
|
|
const auto & filters = granule->getFilters();
|
|
|
|
|
|
|
|
for (const auto & element : rpn)
|
|
|
|
{
|
|
|
|
if (element.function == RPNElement::FUNCTION_UNKNOWN)
|
|
|
|
{
|
|
|
|
rpn_stack.emplace_back(true, true);
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_IN
|
|
|
|
|| element.function == RPNElement::FUNCTION_NOT_IN
|
|
|
|
|| element.function == RPNElement::FUNCTION_EQUALS
|
|
|
|
|| element.function == RPNElement::FUNCTION_NOT_EQUALS)
|
|
|
|
{
|
|
|
|
bool match_rows = true;
|
|
|
|
const auto & predicate = element.predicate;
|
|
|
|
for (size_t index = 0; match_rows && index < predicate.size(); ++index)
|
|
|
|
{
|
|
|
|
const auto & query_index_hash = predicate[index];
|
|
|
|
const auto & filter = filters[query_index_hash.first];
|
|
|
|
const ColumnPtr & hash_column = query_index_hash.second;
|
|
|
|
match_rows = maybeTrueOnBloomFilter(&*hash_column, filter, hash_functions);
|
|
|
|
}
|
|
|
|
|
|
|
|
rpn_stack.emplace_back(match_rows, !match_rows);
|
|
|
|
if (element.function == RPNElement::FUNCTION_NOT_EQUALS || element.function == RPNElement::FUNCTION_NOT_IN)
|
|
|
|
rpn_stack.back() = !rpn_stack.back();
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_NOT)
|
|
|
|
{
|
|
|
|
rpn_stack.back() = !rpn_stack.back();
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_OR)
|
|
|
|
{
|
|
|
|
auto arg1 = rpn_stack.back();
|
|
|
|
rpn_stack.pop_back();
|
|
|
|
auto arg2 = rpn_stack.back();
|
|
|
|
rpn_stack.back() = arg1 | arg2;
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::FUNCTION_AND)
|
|
|
|
{
|
|
|
|
auto arg1 = rpn_stack.back();
|
|
|
|
rpn_stack.pop_back();
|
|
|
|
auto arg2 = rpn_stack.back();
|
|
|
|
rpn_stack.back() = arg1 & arg2;
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::ALWAYS_TRUE)
|
|
|
|
{
|
|
|
|
rpn_stack.emplace_back(true, false);
|
|
|
|
}
|
|
|
|
else if (element.function == RPNElement::ALWAYS_FALSE)
|
|
|
|
{
|
|
|
|
rpn_stack.emplace_back(false, true);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rpn_stack.size() != 1)
|
|
|
|
throw Exception("Unexpected stack size in KeyCondition::mayBeTrueInRange", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
return rpn_stack[0].can_be_true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexConditionBloomFilter::traverseAtomAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
Field const_value;
|
|
|
|
DataTypePtr const_type;
|
|
|
|
if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
|
|
|
|
{
|
|
|
|
if (const_value.getType() == Field::Types::UInt64 || const_value.getType() == Field::Types::Int64 ||
|
|
|
|
const_value.getType() == Field::Types::Float64)
|
|
|
|
{
|
|
|
|
/// Zero in all types is represented in memory the same way as in UInt64.
|
|
|
|
out.function = const_value.get<UInt64>() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (const auto * function = node->as<ASTFunction>())
|
|
|
|
{
|
|
|
|
const ASTs & arguments = function->arguments->children;
|
|
|
|
|
|
|
|
if (arguments.size() != 2)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (functionIsInOrGlobalInOperator(function->name))
|
2019-06-19 08:51:35 +00:00
|
|
|
{
|
|
|
|
if (const auto & prepared_set = getPreparedSet(arguments[1]))
|
|
|
|
return traverseASTIn(function->name, arguments[0], prepared_set, out);
|
|
|
|
}
|
|
|
|
else if (function->name == "equals" || function->name == "notEquals")
|
2019-05-10 03:42:28 +00:00
|
|
|
{
|
|
|
|
Field const_value;
|
|
|
|
DataTypePtr const_type;
|
|
|
|
if (KeyCondition::getConstant(arguments[1], block_with_constants, const_value, const_type))
|
2019-06-19 08:51:35 +00:00
|
|
|
return traverseASTEquals(function->name, arguments[0], const_type, const_value, out);
|
2019-05-10 03:42:28 +00:00
|
|
|
else if (KeyCondition::getConstant(arguments[0], block_with_constants, const_value, const_type))
|
2019-06-19 08:51:35 +00:00
|
|
|
return traverseASTEquals(function->name, arguments[1], const_type, const_value, out);
|
2019-05-10 03:42:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
bool MergeTreeIndexConditionBloomFilter::traverseASTIn(
|
|
|
|
const String & function_name, const ASTPtr & key_ast, const SetPtr & prepared_set, RPNElement & out)
|
|
|
|
{
|
|
|
|
const auto & prepared_info = getPreparedSetInfo(prepared_set);
|
|
|
|
return traverseASTIn(function_name, key_ast, prepared_info.type, prepared_info.column, out);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool MergeTreeIndexConditionBloomFilter::traverseASTIn(
|
|
|
|
const String & function_name, const ASTPtr & key_ast, const DataTypePtr & type, const ColumnPtr & column, RPNElement & out)
|
2019-05-10 03:42:28 +00:00
|
|
|
{
|
|
|
|
if (header.has(key_ast->getColumnName()))
|
|
|
|
{
|
2019-06-19 08:51:35 +00:00
|
|
|
size_t row_size = column->size();
|
|
|
|
size_t position = header.getPositionByName(key_ast->getColumnName());
|
2019-06-19 10:50:37 +00:00
|
|
|
const DataTypePtr & index_type = header.getByPosition(position).type;
|
|
|
|
const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, index_type, context);
|
|
|
|
out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(index_type, converted_column, 0, row_size)));
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
if (function_name == "in" || function_name == "globalIn")
|
|
|
|
out.function = RPNElement::FUNCTION_IN;
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
if (function_name == "notIn" || function_name == "globalNotIn")
|
|
|
|
out.function = RPNElement::FUNCTION_NOT_IN;
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
return true;
|
|
|
|
}
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
if (const auto * function = key_ast->as<ASTFunction>())
|
|
|
|
{
|
|
|
|
WhichDataType which(type);
|
|
|
|
|
|
|
|
if (which.isTuple() && function->name == "tuple")
|
|
|
|
{
|
|
|
|
const auto & tuple_column = typeid_cast<const ColumnTuple *>(column.get());
|
|
|
|
const auto & tuple_data_type = typeid_cast<const DataTypeTuple *>(type.get());
|
|
|
|
const ASTs & arguments = typeid_cast<const ASTExpressionList &>(*function->arguments).children;
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
if (tuple_data_type->getElements().size() != arguments.size() || tuple_column->getColumns().size() != arguments.size())
|
|
|
|
throw Exception("Illegal types of arguments of function " + function_name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
|
|
|
|
bool match_with_subtype = false;
|
|
|
|
const auto & sub_columns = tuple_column->getColumns();
|
|
|
|
const auto & sub_data_types = tuple_data_type->getElements();
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
for (size_t index = 0; index < arguments.size(); ++index)
|
|
|
|
match_with_subtype |= traverseASTIn(function_name, arguments[index], sub_data_types[index], sub_columns[index], out);
|
2019-05-10 03:42:28 +00:00
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
return match_with_subtype;
|
2019-05-10 03:42:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
bool MergeTreeIndexConditionBloomFilter::traverseASTEquals(
|
2019-05-10 03:42:28 +00:00
|
|
|
const String & function_name, const ASTPtr & key_ast, const DataTypePtr & value_type, const Field & value_field, RPNElement & out)
|
|
|
|
{
|
|
|
|
if (header.has(key_ast->getColumnName()))
|
|
|
|
{
|
|
|
|
size_t position = header.getPositionByName(key_ast->getColumnName());
|
2019-06-19 10:50:37 +00:00
|
|
|
const DataTypePtr & index_type = header.getByPosition(position).type;
|
|
|
|
Field converted_field = convertFieldToType(value_field, *index_type, &*value_type);
|
|
|
|
out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(&*index_type, converted_field)));
|
2019-05-10 03:42:28 +00:00
|
|
|
out.function = function_name == "equals" ? RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (const auto * function = key_ast->as<ASTFunction>())
|
|
|
|
{
|
|
|
|
WhichDataType which(value_type);
|
|
|
|
|
|
|
|
if (which.isTuple() && function->name == "tuple")
|
|
|
|
{
|
|
|
|
const TupleBackend & tuple = get<const Tuple &>(value_field).toUnderType();
|
|
|
|
const auto value_tuple_data_type = typeid_cast<const DataTypeTuple *>(value_type.get());
|
|
|
|
const ASTs & arguments = typeid_cast<const ASTExpressionList &>(*function->arguments).children;
|
|
|
|
|
|
|
|
if (tuple.size() != arguments.size())
|
|
|
|
throw Exception("Illegal types of arguments of function " + function_name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
|
|
|
|
|
|
|
bool match_with_subtype = false;
|
|
|
|
const DataTypes & subtypes = value_tuple_data_type->getElements();
|
|
|
|
|
|
|
|
for (size_t index = 0; index < tuple.size(); ++index)
|
2019-06-19 08:51:35 +00:00
|
|
|
match_with_subtype |= traverseASTEquals(function_name, arguments[index], subtypes[index], tuple[index], out);
|
2019-05-10 03:42:28 +00:00
|
|
|
|
|
|
|
return match_with_subtype;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-06-19 08:51:35 +00:00
|
|
|
/// Finds the prepared set that belongs to `node` among `query_info.sets`.
///
/// When the node's column name is present in the index header, the set is looked up by the
/// key built from the node and that column's type. Otherwise the sets are scanned for an
/// entry whose AST hash equals the node's tree hash. In both cases only a set with explicit
/// elements is accepted.
///
/// Returns an empty SetPtr when no suitable set is found.
SetPtr MergeTreeIndexConditionBloomFilter::getPreparedSet(const ASTPtr & node)
{
    const String node_name = node->getColumnName();

    if (!header.has(node_name))
    {
        const auto node_hash = node->getTreeHash();

        for (const auto & key_and_set : query_info.sets)
        {
            if (key_and_set.first.ast_hash == node_hash && key_and_set.second->hasExplicitSetElements())
                return key_and_set.second;
        }

        return SetPtr();
    }

    const auto & header_column = header.getByName(node_name);
    const auto found = query_info.sets.find(getPreparedSetKey(node, header_column.type));

    if (found == query_info.sets.end() || !found->second->hasExplicitSetElements())
        return SetPtr();

    return found->second;
}
|
|
|
|
|
2019-05-10 03:42:28 +00:00
|
|
|
}
|