#include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INCORRECT_QUERY; } /// 0b11 -- can be true and false at the same time static const Field UNKNOWN_FIELD(3u); MergeTreeIndexGranuleSet::MergeTreeIndexGranuleSet( const String & index_name_, const Block & index_sample_block_, size_t max_rows_) : index_name(index_name_) , max_rows(max_rows_) , index_sample_block(index_sample_block_) , block(index_sample_block) { } MergeTreeIndexGranuleSet::MergeTreeIndexGranuleSet( const String & index_name_, const Block & index_sample_block_, size_t max_rows_, MutableColumns && mutable_columns_) : index_name(index_name_) , max_rows(max_rows_) , index_sample_block(index_sample_block_) , block(index_sample_block.cloneWithColumns(std::move(mutable_columns_))) { } void MergeTreeIndexGranuleSet::serializeBinary(WriteBuffer & ostr) const { if (empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty set index {}.", backQuote(index_name)); const auto & size_type = DataTypePtr(std::make_shared()); auto size_serialization = size_type->getDefaultSerialization(); if (max_rows != 0 && size() > max_rows) { size_serialization->serializeBinary(0, ostr, {}); return; } size_serialization->serializeBinary(size(), ostr, {}); for (size_t i = 0; i < index_sample_block.columns(); ++i) { const auto & type = index_sample_block.getByPosition(i).type; ISerialization::SerializeBinaryBulkSettings settings; settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; settings.position_independent_encoding = false; settings.low_cardinality_max_dictionary_size = 0; auto serialization = type->getDefaultSerialization(); ISerialization::SerializeBinaryBulkStatePtr state; const auto & column = *block.getByPosition(i).column; serialization->serializeBinaryBulkStatePrefix(column, settings, state); serialization->serializeBinaryBulkWithMultipleStreams(column, 0, size(), settings, state); serialization->serializeBinaryBulkStateSuffix(settings, state); } } void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) { if (version != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version); block.clear(); Field field_rows; const auto & size_type = DataTypePtr(std::make_shared()); size_type->getDefaultSerialization()->deserializeBinary(field_rows, istr, {}); size_t rows_to_read = field_rows.get(); if (rows_to_read == 0) return; for (size_t i = 0; i < index_sample_block.columns(); ++i) { const auto & column = index_sample_block.getByPosition(i); const auto & type = column.type; ColumnPtr new_column = type->createColumn(); ISerialization::DeserializeBinaryBulkSettings settings; settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; settings.position_independent_encoding = false; ISerialization::DeserializeBinaryBulkStatePtr state; auto serialization = type->getDefaultSerialization(); serialization->deserializeBinaryBulkStatePrefix(settings, state); serialization->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state, nullptr); block.insert(ColumnWithTypeAndName(new_column, type, column.name)); } } MergeTreeIndexAggregatorSet::MergeTreeIndexAggregatorSet(const String & index_name_, const Block & index_sample_block_, size_t max_rows_) : index_name(index_name_) , max_rows(max_rows_) , index_sample_block(index_sample_block_) , columns(index_sample_block_.cloneEmptyColumns()) { ColumnRawPtrs column_ptrs; column_ptrs.reserve(index_sample_block.columns()); Columns materialized_columns; for (const auto & column : index_sample_block.getColumns()) { materialized_columns.emplace_back(column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality()); column_ptrs.emplace_back(materialized_columns.back().get()); } data.init(ClearableSetVariants::chooseMethod(column_ptrs, key_sizes)); columns = index_sample_block.cloneEmptyColumns(); } void MergeTreeIndexAggregatorSet::update(const Block & block, size_t * pos, size_t limit) { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " "Position: {}, Block rows: {}.", toString(*pos), toString(block.rows())); size_t rows_read = std::min(limit, block.rows() - *pos); if (max_rows && size() > max_rows) { *pos += rows_read; return; } ColumnRawPtrs index_column_ptrs; index_column_ptrs.reserve(index_sample_block.columns()); Columns materialized_columns; const Names index_columns = index_sample_block.getNames(); for (const auto & column_name : index_columns) { materialized_columns.emplace_back( block.getByName(column_name).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality()); index_column_ptrs.emplace_back(materialized_columns.back().get()); } IColumn::Filter filter(block.rows(), 0); bool has_new_data = false; switch (data.type) { case ClearableSetVariants::Type::EMPTY: break; #define M(NAME) \ case ClearableSetVariants::Type::NAME: \ has_new_data = buildFilter(*data.NAME, index_column_ptrs, filter, *pos, rows_read, data); \ break; APPLY_FOR_SET_VARIANTS(M) #undef M } if (has_new_data) { for (size_t i = 0; i < columns.size(); ++i) { auto filtered_column = block.getByName(index_columns[i]).column->filter(filter, block.rows()); columns[i]->insertRangeFrom(*filtered_column, 0, filtered_column->size()); } } *pos += rows_read; } template bool MergeTreeIndexAggregatorSet::buildFilter( Method & method, const ColumnRawPtrs & column_ptrs, IColumn::Filter & filter, size_t pos, size_t limit, ClearableSetVariants & variants) const { /// Like DistinctSortedTransform. typename Method::State state(column_ptrs, key_sizes, nullptr); bool has_new_data = false; for (size_t i = 0; i < limit; ++i) { auto emplace_result = state.emplaceKey(method.data, pos + i, variants.string_pool); if (emplace_result.isInserted()) has_new_data = true; /// Emit the record if there is no such key in the current set yet. /// Skip it otherwise. filter[pos + i] = emplace_result.isInserted(); } return has_new_data; } MergeTreeIndexGranulePtr MergeTreeIndexAggregatorSet::getGranuleAndReset() { auto granule = std::make_shared(index_name, index_sample_block, max_rows, std::move(columns)); switch (data.type) { case ClearableSetVariants::Type::EMPTY: break; #define M(NAME) \ case ClearableSetVariants::Type::NAME: \ data.NAME->data.clear(); \ break; APPLY_FOR_SET_VARIANTS(M) #undef M } columns = index_sample_block.cloneEmptyColumns(); return granule; } MergeTreeIndexConditionSet::MergeTreeIndexConditionSet( const String & index_name_, const Block & index_sample_block, size_t max_rows_, const SelectQueryInfo & query_info, ContextPtr context) : index_name(index_name_) , max_rows(max_rows_) { for (const auto & name : index_sample_block.getNames()) if (!key_columns.contains(name)) key_columns.insert(name); ASTPtr ast_filter_node = buildFilterNode(query_info.query); if (!ast_filter_node) return; if (context->getSettingsRef().allow_experimental_analyzer) { if (!query_info.filter_actions_dag) return; if (checkDAGUseless(*query_info.filter_actions_dag->getOutputs().at(0), context)) return; const auto * filter_node = query_info.filter_actions_dag->getOutputs().at(0); auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG({filter_node}, {}, context); const auto * filter_actions_dag_node = filter_actions_dag->getOutputs().at(0); std::unordered_map node_to_result_node; filter_actions_dag->getOutputs()[0] = &traverseDAG(*filter_actions_dag_node, filter_actions_dag, context, node_to_result_node); filter_actions_dag->removeUnusedActions(); actions = std::make_shared(filter_actions_dag); } else { if (checkASTUseless(ast_filter_node)) return; auto expression_ast = ast_filter_node->clone(); /// Replace logical functions with bit functions. /// Working with UInt8: last bit = can be true, previous = can be false (Like src/Storages/MergeTree/BoolMask.h). traverseAST(expression_ast); auto syntax_analyzer_result = TreeRewriter(context).analyze(expression_ast, index_sample_block.getNamesAndTypesList()); actions = ExpressionAnalyzer(expression_ast, syntax_analyzer_result, context).getActions(true); } } bool MergeTreeIndexConditionSet::alwaysUnknownOrTrue() const { return isUseless(); } bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const { if (isUseless()) return true; auto granule = std::dynamic_pointer_cast(idx_granule); if (!granule) throw Exception(ErrorCodes::LOGICAL_ERROR, "Set index condition got a granule with the wrong type"); if (isUseless() || granule->empty() || (max_rows != 0 && granule->size() > max_rows)) return true; Block result = granule->block; actions->execute(result); const auto & filter_node_name = actions->getActionsDAG().getOutputs().at(0)->result_name; auto column = result.getByName(filter_node_name).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality(); if (column->onlyNull()) return false; const auto * col_uint8 = typeid_cast(column.get()); const NullMap * null_map = nullptr; if (const auto * col_nullable = checkAndGetColumn(*column)) { col_uint8 = typeid_cast(&col_nullable->getNestedColumn()); null_map = &col_nullable->getNullMapData(); } if (!col_uint8) throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnUInt8 expected as Set index condition result"); const auto & condition = col_uint8->getData(); size_t column_size = column->size(); for (size_t i = 0; i < column_size; ++i) if ((!null_map || (*null_map)[i] == 0) && condition[i] & 1) return true; return false; } const ActionsDAG::Node & MergeTreeIndexConditionSet::traverseDAG(const ActionsDAG::Node & node, ActionsDAGPtr & result_dag, const ContextPtr & context, std::unordered_map & node_to_result_node) const { auto result_node_it = node_to_result_node.find(&node); if (result_node_it != node_to_result_node.end()) return *result_node_it->second; const ActionsDAG::Node * result_node = nullptr; if (const auto * operator_node_ptr = operatorFromDAG(node, result_dag, context, node_to_result_node)) { result_node = operator_node_ptr; } else if (const auto * atom_node_ptr = atomFromDAG(node, result_dag, context)) { result_node = atom_node_ptr; if (atom_node_ptr->type == ActionsDAG::ActionType::INPUT || atom_node_ptr->type == ActionsDAG::ActionType::FUNCTION) { auto bit_wrapper_function = FunctionFactory::instance().get("__bitWrapperFunc", context); result_node = &result_dag->addFunction(bit_wrapper_function, {atom_node_ptr}, {}); } } else { ColumnWithTypeAndName unknown_field_column_with_type; unknown_field_column_with_type.name = calculateConstantActionNodeName(UNKNOWN_FIELD); unknown_field_column_with_type.type = std::make_shared(); unknown_field_column_with_type.column = unknown_field_column_with_type.type->createColumnConst(1, UNKNOWN_FIELD); result_node = &result_dag->addColumn(unknown_field_column_with_type); } node_to_result_node.emplace(&node, result_node); return *result_node; } const ActionsDAG::Node * MergeTreeIndexConditionSet::atomFromDAG(const ActionsDAG::Node & node, ActionsDAGPtr & result_dag, const ContextPtr & context) const { /// Function, literal or column const auto * node_to_check = &node; while (node_to_check->type == ActionsDAG::ActionType::ALIAS) node_to_check = node_to_check->children[0]; if (node_to_check->column && isColumnConst(*node_to_check->column)) return &node; RPNBuilderTreeContext tree_context(context); RPNBuilderTreeNode tree_node(node_to_check, tree_context); auto column_name = tree_node.getColumnName(); if (key_columns.contains(column_name)) { const auto * result_node = node_to_check; if (node.type != ActionsDAG::ActionType::INPUT) result_node = &result_dag->addInput(column_name, node.result_type); return result_node; } if (node.type != ActionsDAG::ActionType::FUNCTION) return nullptr; const auto & arguments = node.children; size_t arguments_size = arguments.size(); ActionsDAG::NodeRawConstPtrs children(arguments_size); for (size_t i = 0; i < arguments_size; ++i) { children[i] = atomFromDAG(*arguments[i], result_dag, context); if (!children[i]) return nullptr; } return &result_dag->addFunction(node.function_base, children, {}); } const ActionsDAG::Node * MergeTreeIndexConditionSet::operatorFromDAG(const ActionsDAG::Node & node, ActionsDAGPtr & result_dag, const ContextPtr & context, std::unordered_map & node_to_result_node) const { /// Functions AND, OR, NOT. Replace with bit*. const auto * node_to_check = &node; while (node_to_check->type == ActionsDAG::ActionType::ALIAS) node_to_check = node_to_check->children[0]; if (node_to_check->column && isColumnConst(*node_to_check->column)) return nullptr; if (node_to_check->type != ActionsDAG::ActionType::FUNCTION) return nullptr; auto function_name = node_to_check->function->getName(); const auto & arguments = node_to_check->children; size_t arguments_size = arguments.size(); if (function_name == "not") { if (arguments_size != 1) return nullptr; auto bit_swap_last_two_function = FunctionFactory::instance().get("__bitSwapLastTwo", context); return &result_dag->addFunction(bit_swap_last_two_function, {arguments[0]}, {}); } else if (function_name == "and" || function_name == "indexHint" || function_name == "or") { if (arguments_size < 2) return nullptr; ActionsDAG::NodeRawConstPtrs children; children.resize(arguments_size); for (size_t i = 0; i < arguments_size; ++i) children[i] = &traverseDAG(*arguments[i], result_dag, context, node_to_result_node); FunctionOverloadResolverPtr function; if (function_name == "and" || function_name == "indexHint") function = FunctionFactory::instance().get("__bitBoolMaskAnd", context); else function = FunctionFactory::instance().get("__bitBoolMaskOr", context); const auto * last_argument = children.back(); children.pop_back(); const auto * before_last_argument = children.back(); children.pop_back(); while (true) { last_argument = &result_dag->addFunction(function, {before_last_argument, last_argument}, {}); if (children.empty()) break; before_last_argument = children.back(); children.pop_back(); } return last_argument; } return nullptr; } bool MergeTreeIndexConditionSet::checkDAGUseless(const ActionsDAG::Node & node, const ContextPtr & context, bool atomic) const { const auto * node_to_check = &node; while (node_to_check->type == ActionsDAG::ActionType::ALIAS) node_to_check = node_to_check->children[0]; RPNBuilderTreeContext tree_context(context); RPNBuilderTreeNode tree_node(node_to_check, tree_context); if (node.column && isColumnConst(*node.column)) { Field literal; node.column->get(0, literal); return !atomic && literal.safeGet(); } else if (node.type == ActionsDAG::ActionType::FUNCTION) { auto column_name = tree_node.getColumnName(); if (key_columns.contains(column_name)) return false; auto function_name = node.function_base->getName(); const auto & arguments = node.children; if (function_name == "and" || function_name == "indexHint") return std::all_of(arguments.begin(), arguments.end(), [&, atomic](const auto & arg) { return checkDAGUseless(*arg, context, atomic); }); else if (function_name == "or") return std::any_of(arguments.begin(), arguments.end(), [&, atomic](const auto & arg) { return checkDAGUseless(*arg, context, atomic); }); else if (function_name == "not") return checkDAGUseless(*arguments.at(0), context, atomic); else return std::any_of(arguments.begin(), arguments.end(), [&](const auto & arg) { return checkDAGUseless(*arg, context, true /*atomic*/); }); } auto column_name = tree_node.getColumnName(); return !key_columns.contains(column_name); } void MergeTreeIndexConditionSet::traverseAST(ASTPtr & node) const { if (operatorFromAST(node)) { auto & args = node->as()->arguments->children; for (auto & arg : args) traverseAST(arg); return; } if (atomFromAST(node)) { if (node->as() || node->as()) node = makeASTFunction("__bitWrapperFunc", node); } else node = std::make_shared(UNKNOWN_FIELD); } bool MergeTreeIndexConditionSet::atomFromAST(ASTPtr & node) const { /// Function, literal or column if (node->as()) return true; if (const auto * identifier = node->as()) return key_columns.contains(identifier->getColumnName()); if (auto * func = node->as()) { if (key_columns.contains(func->getColumnName())) { /// Function is already calculated. node = std::make_shared(func->getColumnName()); return true; } auto & args = func->arguments->children; for (auto & arg : args) if (!atomFromAST(arg)) return false; return true; } return false; } bool MergeTreeIndexConditionSet::operatorFromAST(ASTPtr & node) { /// Functions AND, OR, NOT. Replace with bit*. auto * func = node->as(); if (!func) return false; auto & args = func->arguments->children; if (func->name == "not") { if (args.size() != 1) return false; func->name = "__bitSwapLastTwo"; } else if (func->name == "and" || func->name == "indexHint") { auto last_arg = args.back(); args.pop_back(); ASTPtr new_func; if (args.size() > 1) new_func = makeASTFunction( "__bitBoolMaskAnd", node, last_arg); else new_func = makeASTFunction( "__bitBoolMaskAnd", args.back(), last_arg); node = new_func; } else if (func->name == "or") { auto last_arg = args.back(); args.pop_back(); ASTPtr new_func; if (args.size() > 1) new_func = makeASTFunction( "__bitBoolMaskOr", node, last_arg); else new_func = makeASTFunction( "__bitBoolMaskOr", args.back(), last_arg); node = new_func; } else return false; return true; } bool MergeTreeIndexConditionSet::checkASTUseless(const ASTPtr & node, bool atomic) const { if (!node) return true; if (const auto * func = node->as()) { if (key_columns.contains(func->getColumnName())) return false; const ASTs & args = func->arguments->children; if (func->name == "and" || func->name == "indexHint") return std::all_of(args.begin(), args.end(), [this, atomic](const auto & arg) { return checkASTUseless(arg, atomic); }); else if (func->name == "or") return std::any_of(args.begin(), args.end(), [this, atomic](const auto & arg) { return checkASTUseless(arg, atomic); }); else if (func->name == "not") return checkASTUseless(args[0], atomic); else return std::any_of(args.begin(), args.end(), [this](const auto & arg) { return checkASTUseless(arg, true); }); } else if (const auto * literal = node->as()) return !atomic && literal->value.safeGet(); else if (const auto * identifier = node->as()) return !key_columns.contains(identifier->getColumnName()); else return true; } MergeTreeIndexGranulePtr MergeTreeIndexSet::createIndexGranule() const { return std::make_shared(index.name, index.sample_block, max_rows); } MergeTreeIndexAggregatorPtr MergeTreeIndexSet::createIndexAggregator() const { return std::make_shared(index.name, index.sample_block, max_rows); } MergeTreeIndexConditionPtr MergeTreeIndexSet::createIndexCondition( const SelectQueryInfo & query, ContextPtr context) const { return std::make_shared(index.name, index.sample_block, max_rows, query, context); } bool MergeTreeIndexSet::mayBenefitFromIndexForIn(const ASTPtr &) const { return false; } MergeTreeIndexPtr setIndexCreator(const IndexDescription & index) { size_t max_rows = index.arguments[0].get(); return std::make_shared(index, max_rows); } void setIndexValidator(const IndexDescription & index, bool /*attach*/) { if (index.arguments.size() != 1) throw Exception(ErrorCodes::INCORRECT_QUERY, "Set index must have exactly one argument."); else if (index.arguments[0].getType() != Field::Types::UInt64) throw Exception(ErrorCodes::INCORRECT_QUERY, "Set index argument must be positive integer."); } }