From d3ee3a693cb899098d2c2f20b171176f2c25e7d6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 8 May 2019 04:41:05 +0300 Subject: [PATCH] Batch insert Values with expressions (draft) --- dbms/src/Common/ErrorCodes.cpp | 3 + .../Formats/ConstantExpressionTemplate.cpp | 166 ++++++++++++++++++ dbms/src/Formats/ConstantExpressionTemplate.h | 35 ++++ dbms/src/Formats/ValuesBlockInputStream.cpp | 127 ++++++++++---- dbms/src/Formats/ValuesBlockInputStream.h | 8 + 5 files changed, 306 insertions(+), 33 deletions(-) create mode 100644 dbms/src/Formats/ConstantExpressionTemplate.cpp create mode 100644 dbms/src/Formats/ConstantExpressionTemplate.h diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index 958a6b1fa2b..88c9353a443 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -448,6 +448,9 @@ namespace ErrorCodes extern const int SETTINGS_ARE_NOT_SUPPORTED = 471; extern const int READONLY_SETTING = 472; extern const int INVALID_TEMPLATE_FORMAT = 473; + extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE = 474; + extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE = 475; + extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE = 476; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Formats/ConstantExpressionTemplate.cpp b/dbms/src/Formats/ConstantExpressionTemplate.cpp new file mode 100644 index 00000000000..777947e72cd --- /dev/null +++ b/dbms/src/Formats/ConstantExpressionTemplate.cpp @@ -0,0 +1,166 @@ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE; + extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE; + extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE; +} + + +ConstantExpressionTemplate::ConstantExpressionTemplate(const IDataType & result_column_type, TokenIterator begin, TokenIterator end, + const Context & context) +{ + std::pair expr_template = replaceLiteralsWithDummyIdentifiers(begin, end); + for (const auto & col : expr_template.second) + literals.insert({nullptr, col.type, col.name}); + columns = literals.cloneEmptyColumns(); + + ParserExpression parser; + Expected expected; + Tokens template_tokens(expr_template.first.data(), expr_template.first.data() + expr_template.first.size()); + TokenIterator token_iterator1(template_tokens); + + ASTPtr ast_template; + if (!parser.parse(token_iterator1, ast_template, expected)) + throw Exception("Cannot parse template after replacing literals: ", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE); + + addNodesToCastResult(result_column_type, ast_template); + result_column_name = ast_template->getColumnName(); + + auto syntax_result = SyntaxAnalyzer(context).analyze(ast_template, expr_template.second); + + actions_on_literals = ExpressionAnalyzer(ast_template, syntax_result, context).getActions(false); +} + +void ConstantExpressionTemplate::parseExpression(ReadBuffer & istr, const FormatSettings & settings) +{ + size_t cur_column = 0; + try + { + size_t cur_token = 0; + while (cur_column < literals.columns()) + { + size_t skip_tokens_until = token_after_literal_idx[cur_column]; + while (cur_token < skip_tokens_until) + { + // TODO skip comments + skipWhitespaceIfAny(istr); + assertString(tokens[cur_token++], istr); + } + skipWhitespaceIfAny(istr); + const IDataType & type = *literals.getByPosition(cur_column).type; + type.deserializeAsTextQuoted(*columns[cur_column], istr, settings); + ++cur_column; + } + while (cur_token < tokens.size()) + { + skipWhitespaceIfAny(istr); + assertString(tokens[cur_token++], istr); + } + } catch (DB::Exception & e) + { + for (size_t i = 0; i < cur_column; ++i) + columns[i]->popBack(1); + + if (!isParseError(e.code())) + throw; + } + throw DB::Exception("Cannot parse expression using template", ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE); +} + +ColumnPtr ConstantExpressionTemplate::evaluateAll() +{ + Block evaluated = literals.cloneWithColumns(std::move(columns)); + columns = literals.cloneEmptyColumns(); + actions_on_literals->execute(evaluated); + + if (!evaluated || evaluated.rows() == 0) + throw Exception("Logical error: empty block after evaluation of batch of constant expressions", + ErrorCodes::LOGICAL_ERROR); + + if (!evaluated.has(result_column_name)) + throw Exception("Cannot evaluate template " + result_column_name + ", block structure:\n" + evaluated.dumpStructure(), + ErrorCodes::CANNOT_EVALUATE_EXPRESSION_TEMPLATE); + + return evaluated.getByName(result_column_name).column; +} + +std::pair +ConstantExpressionTemplate::replaceLiteralsWithDummyIdentifiers(TokenIterator & begin, TokenIterator & end) +{ + NamesAndTypesList dummy_columns; + ParserLiteral parser; + String result; + size_t token_idx = 0; + while (begin != end) + { + const Token & t = *begin; + if (t.isError()) + throw DB::Exception("Error in tokens", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE); + + // TODO don't convert constant string arguments of functions such as CAST(x, 'type') + // TODO process Array as one literal to make possible parsing constant arrays of different size + if (t.type == TokenType::Number || t.type == TokenType::StringLiteral) + { + Expected expected; + ASTPtr ast; + if (!parser.parse(begin, ast, expected)) + throw DB::Exception("Cannot determine literal type", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE); + + // TODO use nullable type if necessary (e.g. value is not NULL, but result_column_type is nullable and next rows may contain NULLs) + // TODO parse numbers more carefully: sign is a separate token before number + Field & value = ast->as().value; + DataTypePtr type = DataTypeFactory::instance().get(value.getTypeName()); + // TODO ensure dummy_col_name is unique (there was no _dummy_x identifier in expression) + String dummy_col_name = "_dummy_" + std::to_string(dummy_columns.size()); + dummy_columns.push_back(NameAndTypePair(dummy_col_name, type)); + token_after_literal_idx.push_back(token_idx); + result.append(dummy_col_name); + } + else + { + tokens.emplace_back(t.begin, t.size()); + result.append(tokens.back()); + ++begin; + ++token_idx; + } + result.append(" "); + } + if (dummy_columns.empty()) // TODO + throw DB::Exception("not implemented yet", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE); + return std::make_pair(result, dummy_columns); +} + +void ConstantExpressionTemplate::addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr) +{ + auto result_type = std::make_shared(result_column_type.getName()); + + auto arguments = std::make_shared(); + arguments->children.push_back(std::move(expr)); + arguments->children.push_back(std::move(result_type)); + + auto cast = std::make_shared(); + cast->name = "CAST"; + cast->arguments = std::move(arguments); + cast->children.push_back(cast->arguments); + + expr = std::move(cast); +} + +} diff --git a/dbms/src/Formats/ConstantExpressionTemplate.h b/dbms/src/Formats/ConstantExpressionTemplate.h new file mode 100644 index 00000000000..37038244da5 --- /dev/null +++ b/dbms/src/Formats/ConstantExpressionTemplate.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class ConstantExpressionTemplate +{ +public: + ConstantExpressionTemplate(const IDataType & result_column_type, TokenIterator begin, TokenIterator end, const Context & context); + + void parseExpression(ReadBuffer & istr, const FormatSettings & settings); + + ColumnPtr evaluateAll(); + +private: + std::pair replaceLiteralsWithDummyIdentifiers(TokenIterator & begin, TokenIterator & end); + + static void addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr); + +private: + std::vector tokens; + std::vector token_after_literal_idx; + String result_column_name; + ExpressionActionsPtr actions_on_literals; + Block literals; + MutableColumns columns; + +}; + +} diff --git a/dbms/src/Formats/ValuesBlockInputStream.cpp b/dbms/src/Formats/ValuesBlockInputStream.cpp index c902905afc4..89d29e811c4 100644 --- a/dbms/src/Formats/ValuesBlockInputStream.cpp +++ b/dbms/src/Formats/ValuesBlockInputStream.cpp @@ -26,6 +26,9 @@ namespace ErrorCodes extern const int CANNOT_PARSE_DATE; extern const int SYNTAX_ERROR; extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; + extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE; + extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE; + extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE; } @@ -34,6 +37,7 @@ ValuesBlockInputStream::ValuesBlockInputStream(ReadBuffer & istr_, const Block & : istr(istr_), header(header_), context(std::make_unique(context_)), format_settings(format_settings), max_block_size(max_block_size_) { + templates.resize(header.columns()); /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. skipBOMIfExists(istr); } @@ -52,7 +56,6 @@ bool ValuesBlockInputStream::read(MutableColumns & columns) * But as an exception, it also supports processing arbitrary expressions instead of values. * This is very inefficient. But if there are no expressions, then there is no overhead. */ - ParserExpression parser; assertChar('(', istr); @@ -66,8 +69,16 @@ bool ValuesBlockInputStream::read(MutableColumns & columns) bool rollback_on_exception = false; try { - header.getByPosition(i).type->deserializeAsTextQuoted(*columns[i], istr, format_settings); - rollback_on_exception = true; + if (templates[i]) + { + templates[i].value().parseExpression(istr, format_settings); + } + else + { + header.getByPosition(i).type->deserializeAsTextQuoted(*columns[i], istr, format_settings); + rollback_on_exception = true; + } + skipWhitespaceIfAny(istr); if (i != num_columns - 1) @@ -89,7 +100,8 @@ bool ValuesBlockInputStream::read(MutableColumns & columns) || e.code() == ErrorCodes::CANNOT_PARSE_NUMBER || e.code() == ErrorCodes::CANNOT_PARSE_DATE || e.code() == ErrorCodes::CANNOT_PARSE_DATETIME - || e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT) + || e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT + || e.code() == ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE) { /// TODO Case when the expression does not fit entirely in the buffer. @@ -100,35 +112,19 @@ bool ValuesBlockInputStream::read(MutableColumns & columns) if (rollback_on_exception) columns[i]->popBack(1); - const IDataType & type = *header.getByPosition(i).type; - - Expected expected; - - Tokens tokens(prev_istr_position, istr.buffer().end()); - TokenIterator token_iterator(tokens); - - ASTPtr ast; - if (!parser.parse(token_iterator, ast, expected)) - throw Exception("Cannot parse expression of type " + type.getName() + " here: " - + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)), - ErrorCodes::SYNTAX_ERROR); - - istr.position() = const_cast(token_iterator->begin); - - std::pair value_raw = evaluateConstantExpression(ast, *context); - Field value = convertFieldToType(value_raw.first, type, value_raw.second.get()); - - /// Check that we are indeed allowed to insert a NULL. - if (value.isNull()) + // TODO read(MutableColumns & columns) should not know number of rows in block an should not assign to columns + if (likely(rows_in_block)) { - if (!type.isNullable()) - throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value) - + ", that is out of range of type " + type.getName() - + ", at: " + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)), - ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE}; + if (e.code() == ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE) + { + /// Expression in the current row is not match generated on the first row template. + /// Evaluate expressions, which were parsed using this template. + columns[i] = std::move(*templates[i].value().evaluateAll()).mutate(); + /// And do not use the template anymore. + templates[i].reset(); + } } - - columns[i]->insert(value); + parseExpression(prev_istr_position, columns, i, rows_in_block == 0); skipWhitespaceIfAny(istr); @@ -153,13 +149,13 @@ Block ValuesBlockInputStream::readImpl() { MutableColumns columns = header.cloneEmptyColumns(); - for (size_t rows = 0; rows < max_block_size; ++rows) + for (rows_in_block = 0; rows_in_block < max_block_size; ++rows_in_block) { try { - ++total_rows; if (!read(columns)) break; + ++total_rows; } catch (Exception & e) { @@ -169,12 +165,77 @@ Block ValuesBlockInputStream::readImpl() } } + /// Evaluate expressions, which were parsed using this template, if any + for (size_t i = 0; i < columns.size(); ++i) + { + if (templates[i]) + { + columns[i] = std::move(*templates[i].value().evaluateAll()).mutate(); + templates[i].reset(); + } + } + if (columns.empty() || columns[0]->empty()) return {}; return header.cloneWithColumns(std::move(columns)); } +Field +ValuesBlockInputStream::parseExpression(char * prev_istr_position, MutableColumns & columns, size_t column_idx, bool generate_template) +{ + const IDataType & type = *header.getByPosition(column_idx).type; + + Expected expected; + + Tokens tokens(prev_istr_position, istr.buffer().end()); + TokenIterator token_iterator(tokens); + + ASTPtr ast; + if (!parser.parse(token_iterator, ast, expected)) + throw Exception("Cannot parse expression of type " + type.getName() + " here: " + + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)), + ErrorCodes::SYNTAX_ERROR); + + istr.position() = const_cast(token_iterator->begin); + + std::pair value_raw = evaluateConstantExpression(ast, *context); + Field value = convertFieldToType(value_raw.first, type, value_raw.second.get()); + + /// Check that we are indeed allowed to insert a NULL. + if (value.isNull()) + { + if (!type.isNullable()) + throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value) + + ", that is out of range of type " + type.getName() + + ", at: " + + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)), + ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE}; + } + + if (generate_template) + { + try + { + templates[column_idx] = ConstantExpressionTemplate(type, TokenIterator(tokens), token_iterator, *context); + istr.position() = prev_istr_position; + templates[column_idx].value().parseExpression(istr, format_settings); + } + catch (DB::Exception &) + { + /// Continue parsing without template + templates[column_idx].reset(); + columns[column_idx]->insert(value); + istr.position() = const_cast(token_iterator->begin); + } + } + else + { + columns[column_idx]->insert(value); + } + return value; +} + void registerInputFormatValues(FormatFactory & factory) { diff --git a/dbms/src/Formats/ValuesBlockInputStream.h b/dbms/src/Formats/ValuesBlockInputStream.h index 27fd67bd47e..e8e3dd2a330 100644 --- a/dbms/src/Formats/ValuesBlockInputStream.h +++ b/dbms/src/Formats/ValuesBlockInputStream.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -33,15 +34,22 @@ public: bool read(MutableColumns & columns); private: + typedef std::vector> ConstantExpressionTemplates; + Block readImpl() override; + Field parseExpression(char * prev_istr_position, MutableColumns & columns, size_t column_idx, bool generate_template); + private: ReadBuffer & istr; Block header; std::unique_ptr context; /// pimpl const FormatSettings format_settings; UInt64 max_block_size; + UInt64 rows_in_block = 0; size_t total_rows = 0; + ParserExpression parser; + ConstantExpressionTemplates templates; }; }