diff --git a/dbms/src/Columns/IColumn.cpp b/dbms/src/Columns/IColumn.cpp index a3ed0885651..df56823b6aa 100644 --- a/dbms/src/Columns/IColumn.cpp +++ b/dbms/src/Columns/IColumn.cpp @@ -9,6 +9,11 @@ namespace DB { +Field IColumn::get(size_t n) const +{ + return (*this)[n]; +} + String IColumn::dumpStructure() const { WriteBufferFromOwnString res; diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 090537d6770..aa9455fc6cc 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -70,6 +70,7 @@ public: /// Returns value of n-th element in universal Field representation. /// Is used in rare cases, since creation of Field instance is expensive usually. virtual Field operator[](size_t n) const = 0; + Field get(size_t n) const; /// Like the previous one, but avoids extra copying if Field is in a container, for example. virtual void get(size_t n, Field & res) const = 0; diff --git a/dbms/src/Core/Block.cpp b/dbms/src/Core/Block.cpp index a18d34af994..bc29a74f4eb 100644 --- a/dbms/src/Core/Block.cpp +++ b/dbms/src/Core/Block.cpp @@ -181,25 +181,25 @@ const ColumnWithTypeAndName & Block::safeGetByPosition(size_t position) const } -ColumnWithTypeAndName & Block::getByName(const std::string & name) +const ColumnWithTypeAndName * Block::findByName(const std::string & name) const { auto it = index_by_name.find(name); if (index_by_name.end() == it) - throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() - , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); - - return data[it->second]; + { + return nullptr; + } + return &data[it->second]; } const ColumnWithTypeAndName & Block::getByName(const std::string & name) const { - auto it = index_by_name.find(name); - if (index_by_name.end() == it) + auto * result = findByName(name); + if (!result) throw Exception("Not found column " + name + " in block. There are only columns: " + dumpNames() , ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); - return data[it->second]; + return *result; } diff --git a/dbms/src/Core/Block.h b/dbms/src/Core/Block.h index 82b60c83efb..ce804ddc0b5 100644 --- a/dbms/src/Core/Block.h +++ b/dbms/src/Core/Block.h @@ -28,7 +28,7 @@ class Block { private: using Container = ColumnsWithTypeAndName; - using IndexByName = std::map; + using IndexByName = std::unordered_map; Container data; IndexByName index_by_name; @@ -64,7 +64,20 @@ public: ColumnWithTypeAndName & safeGetByPosition(size_t position); const ColumnWithTypeAndName & safeGetByPosition(size_t position) const; - ColumnWithTypeAndName & getByName(const std::string & name); + ColumnWithTypeAndName* findByName(const std::string & name) + { + return const_cast( + const_cast(this)->findByName(name)); + } + + const ColumnWithTypeAndName* findByName(const std::string & name) const; + + ColumnWithTypeAndName & getByName(const std::string & name) + { + return const_cast( + const_cast(this)->getByName(name)); + } + const ColumnWithTypeAndName & getByName(const std::string & name) const; Container::iterator begin() { return data.begin(); } diff --git a/dbms/src/Interpreters/ActionsVisitor.cpp b/dbms/src/Interpreters/ActionsVisitor.cpp index eb92ee76973..94c6cc000f9 100644 --- a/dbms/src/Interpreters/ActionsVisitor.cpp +++ b/dbms/src/Interpreters/ActionsVisitor.cpp @@ -195,17 +195,17 @@ SetPtr makeExplicitSet( return set; } -static String getUniqueName(const Block & block, const String & prefix) +static String getUniqueName(ActionsVisitor::Data & data, const String & prefix) { + auto & block = data.getSampleBlock(); auto result = prefix; if (block.has(result)) { - int i = 1; do { - result = prefix + "_" + toString(i); - ++i; + result = prefix + "_" + toString(data.next_unique_suffix); + ++data.next_unique_suffix; } while (block.has(result)); } @@ -468,7 +468,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & /// If the argument is a set given by an enumeration of values (so, the set was already built), give it a unique name, /// so that sets with the same literal representation do not fuse together (they can have different types). if (!prepared_set->empty()) - column.name = getUniqueName(data.getSampleBlock(), "__set"); + column.name = getUniqueName(data, "__set"); else column.name = child->getColumnName(); @@ -496,7 +496,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & ColumnWithTypeAndName column( ColumnConst::create(std::move(column_string), 1), std::make_shared(), - getUniqueName(data.getSampleBlock(), "__joinGet")); + getUniqueName(data, "__joinGet")); data.addAction(ExpressionAction::addColumn(column)); argument_types.push_back(column.type); argument_names.push_back(column.name); @@ -577,7 +577,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & /// We can not name `getColumnName()`, /// because it does not uniquely define the expression (the types of arguments can be different). - String lambda_name = getUniqueName(data.getSampleBlock(), "__lambda"); + String lambda_name = getUniqueName(data, "__lambda"); auto function_capture = std::make_unique( lambda_actions, captured, lambda_arguments, result_type, result_name); @@ -612,16 +612,50 @@ void ActionsMatcher::visit(const ASTLiteral & literal, const ASTPtr & /* ast */, Data & data) { DataTypePtr type = applyVisitor(FieldToDataType(), literal.value); + const auto value = convertFieldToType(literal.value, *type); + + // FIXME why do we have a second pass with a clean sample block over the same + // AST here? Anyway, do not modify the column name if it is set already. + if (literal.unique_column_name.empty()) + { + const auto default_name = literal.getColumnName(); + auto & block = data.getSampleBlock(); + auto * existing_column = block.findByName(default_name); + + /* + * To approximate CSE, build all identical literals to a single temporary + * columns. We try to find the column by its default name, but after that + * we have to check that it contains the correct data. This might not be + * the case if it is a user-supplied column, or it is from under a join, + * etc. + * Overall, this is a hack around a generally poor name-based notion of + * column identity we currently use. + */ + if (existing_column + && existing_column->column + && isColumnConst(*existing_column->column) + && existing_column->column->size() == 1 + && existing_column->column->get(0) == value) + { + const_cast(literal).unique_column_name = default_name; + } + else + { + const_cast(literal).unique_column_name + = getUniqueName(data, default_name); + } + } + + if (data.hasColumn(literal.unique_column_name)) + { + return; + } ColumnWithTypeAndName column; - column.column = type->createColumnConst(1, convertFieldToType(literal.value, *type)); + column.name = literal.unique_column_name; + column.column = type->createColumnConst(1, value); column.type = type; - // Always create columns for literals with a unique name. Otherwise, there - // may be some weird clashes, see 01101_literal_column_clash. - column.name = getUniqueName(data.getSampleBlock(), literal.getColumnName()); - const_cast(literal).unique_column_name = column.name; - data.addAction(ExpressionAction::addColumn(column)); } diff --git a/dbms/src/Interpreters/ActionsVisitor.h b/dbms/src/Interpreters/ActionsVisitor.h index f6db551ff33..e67e181e009 100644 --- a/dbms/src/Interpreters/ActionsVisitor.h +++ b/dbms/src/Interpreters/ActionsVisitor.h @@ -42,6 +42,7 @@ struct ScopeStack const Context & context; +public: ScopeStack(const ExpressionActionsPtr & actions, const Context & context_); void pushLevel(const NamesAndTypesList & input_columns); @@ -80,6 +81,13 @@ public: size_t visit_depth; ScopeStack actions_stack; + /* + * Remember the last unique column suffix to avoid quadratic behavior + * when we add lots of column with same prefix. One counter for all + * prefixes is good enough. + */ + int next_unique_suffix; + Data(const Context & context_, SizeLimits set_size_limit_, size_t subquery_depth_, const NamesAndTypesList & source_columns_, const ExpressionActionsPtr & actions, PreparedSets & prepared_sets_, SubqueriesForSets & subqueries_for_sets_, @@ -95,7 +103,8 @@ public: only_consts(only_consts_), no_storage_or_local(no_storage_or_local_), visit_depth(0), - actions_stack(actions, context) + actions_stack(actions, context), + next_unique_suffix(actions_stack.getSampleBlock().columns() + 1) {} void updateActions(ExpressionActionsPtr & actions) diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index cad4b3bd188..dc362542df9 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -855,7 +855,35 @@ void SelectQueryExpressionAnalyzer::appendProjectResult(ExpressionActionsChain & String result_name = ast->getAliasOrColumnName(); if (required_result_columns.empty() || required_result_columns.count(result_name)) { - result_columns.emplace_back(ast->getColumnName(), result_name); + std::string source_name = ast->getColumnName(); + + /* + * For temporary columns created by ExpressionAnalyzer for literals, + * use the correct source column. Using the default display name + * returned by getColumnName is not enough, and we have to use the + * column id set by EA. In principle, this logic applies to all kinds + * of columns, not only literals. Literals are especially problematic + * for two reasons: + * 1) confusing different literal columns leads to weird side + * effects (see 01101_literal_columns_clash); + * 2) the disambiguation mechanism in SyntaxAnalyzer, that, among + * other things, creates unique aliases for columns with same + * names from different tables, is applied before these temporary + * columns are created by ExpressionAnalyzer. + * Similar problems should also manifest for function columns, which + * are likewise created at a later stage by EA. + * In general, we need to have explicit separation between display + * names and identifiers for columns. This code is a workaround for + * a particular subclass of problems, and not a proper solution. + */ + if (auto as_literal = dynamic_cast(ast.get()); + as_literal) + { + source_name = as_literal->unique_column_name; + assert(!source_name.empty()); + } + + result_columns.emplace_back(source_name, result_name); step.required_output.push_back(result_columns.back().second); } }