mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-29 13:10:48 +00:00
Batch insert Values with expressions (draft)
This commit is contained in:
parent
804c911d3f
commit
d3ee3a693c
@ -448,6 +448,9 @@ namespace ErrorCodes
|
||||
extern const int SETTINGS_ARE_NOT_SUPPORTED = 471;
|
||||
extern const int READONLY_SETTING = 472;
|
||||
extern const int INVALID_TEMPLATE_FORMAT = 473;
|
||||
extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE = 474;
|
||||
extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE = 475;
|
||||
extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE = 476;
|
||||
|
||||
extern const int KEEPER_EXCEPTION = 999;
|
||||
extern const int POCO_EXCEPTION = 1000;
|
||||
|
166
dbms/src/Formats/ConstantExpressionTemplate.cpp
Normal file
166
dbms/src/Formats/ConstantExpressionTemplate.cpp
Normal file
@ -0,0 +1,166 @@
|
||||
|
||||
#include <DataTypes/DataTypeFactory.h>
|
||||
#include <Formats/BlockInputStreamFromRowInputStream.h>
|
||||
#include <Interpreters/ExpressionAnalyzer.h>
|
||||
#include <Interpreters/SyntaxAnalyzer.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <Parsers/ASTExpressionList.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Formats/ConstantExpressionTemplate.h>
|
||||
#include <Parsers/ExpressionElementParsers.h>
|
||||
#include <Parsers/ExpressionListParsers.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Error codes used in this translation unit; the values are defined elsewhere.
namespace ErrorCodes
{
    extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE;
    extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE;
    extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE;
    /// Thrown by evaluateAll() when the evaluated block turns out to be empty.
    extern const int LOGICAL_ERROR;
}
|
||||
|
||||
|
||||
/** Builds a template from the expression tokens in [begin, end).
  *
  * Steps:
  *  1. Every Number / StringLiteral token is replaced with a dummy identifier
  *     (see replaceLiteralsWithDummyIdentifiers); the remaining tokens are remembered in `tokens`.
  *  2. `literals` becomes a header block with one (initially column-less) entry per dummy identifier,
  *     and `columns` holds empty mutable columns of the same structure.
  *  3. The rewritten text is parsed as an expression and wrapped into CAST(expr, '<result type name>').
  *  4. The expression is analyzed against the dummy columns and compiled into `actions_on_literals`.
  *
  * Throws CANNOT_CREATE_EXPRESSION_TEMPLATE if the rewritten text cannot be parsed
  * (or if replaceLiteralsWithDummyIdentifiers fails).
  */
ConstantExpressionTemplate::ConstantExpressionTemplate(const IDataType & result_column_type, TokenIterator begin, TokenIterator end,
                                                       const Context & context)
{
    std::pair<String, NamesAndTypesList> expr_template = replaceLiteralsWithDummyIdentifiers(begin, end);
    for (const auto & col : expr_template.second)
        literals.insert({nullptr, col.type, col.name});
    columns = literals.cloneEmptyColumns();

    ParserExpression parser;
    Expected expected;
    Tokens template_tokens(expr_template.first.data(), expr_template.first.data() + expr_template.first.size());
    TokenIterator token_iterator1(template_tokens);

    ASTPtr ast_template;
    if (!parser.parse(token_iterator1, ast_template, expected))
        /// Include the rewritten text: the message used to end with a dangling ": ".
        throw Exception("Cannot parse template after replacing literals: " + expr_template.first,
                        ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE);

    addNodesToCastResult(result_column_type, ast_template);
    /// Remember the name of the CAST node: evaluateAll() looks the result column up by this name.
    result_column_name = ast_template->getColumnName();

    auto syntax_result = SyntaxAnalyzer(context).analyze(ast_template, expr_template.second);

    actions_on_literals = ExpressionAnalyzer(ast_template, syntax_result, context).getActions(false);
}
|
||||
|
||||
/** Reads one expression from `istr` by matching it against this template.
  *
  * The non-literal tokens stored in `tokens` must appear in the input verbatim (whitespace before each
  * of them is skipped), and at every literal position a value is deserialized (as quoted text) into the
  * corresponding column of `columns`, using the type recorded in `literals`.
  *
  * If a DB::Exception is thrown while matching, one value is removed from every column that was already
  * filled for this expression. Parse errors (as classified by isParseError) are then reported as
  * CANNOT_PARSE_EXPRESSION_USING_TEMPLATE; any other exception is rethrown unchanged.
  */
void ConstantExpressionTemplate::parseExpression(ReadBuffer & istr, const FormatSettings & settings)
{
    size_t cur_column = 0;
    try
    {
        size_t cur_token = 0;
        while (cur_column < literals.columns())
        {
            /// Match all fixed tokens that precede the literal number `cur_column`.
            size_t skip_tokens_until = token_after_literal_idx[cur_column];
            while (cur_token < skip_tokens_until)
            {
                // TODO skip comments
                skipWhitespaceIfAny(istr);
                assertString(tokens[cur_token++], istr);
            }
            skipWhitespaceIfAny(istr);
            const IDataType & type = *literals.getByPosition(cur_column).type;
            type.deserializeAsTextQuoted(*columns[cur_column], istr, settings);
            ++cur_column;
        }
        /// Match the fixed tokens that follow the last literal.
        while (cur_token < tokens.size())
        {
            skipWhitespaceIfAny(istr);
            assertString(tokens[cur_token++], istr);
        }
    }
    catch (DB::Exception & e)
    {
        /// Roll back the values appended for this expression.
        /// NOTE(review): assumes a failed deserializeAsTextQuoted leaves its own column unchanged - confirm.
        for (size_t i = 0; i < cur_column; ++i)
            columns[i]->popBack(1);

        if (!isParseError(e.code()))
            throw;

        /// This throw must stay inside the handler: placed after the try/catch it was executed
        /// on the success path as well, so every call failed.
        throw DB::Exception("Cannot parse expression using template", ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE);
    }
}
|
||||
|
||||
/** Evaluates the template on all literal values accumulated by parseExpression().
  *
  * The accumulated columns are moved into a block with the structure of `literals`, and `columns` is
  * replaced with fresh empty columns. `actions_on_literals` is then executed on that block and the column
  * named `result_column_name` is returned.
  *
  * Throws LOGICAL_ERROR if the block is empty after evaluation and
  * CANNOT_EVALUATE_EXPRESSION_TEMPLATE if the result column is missing from it.
  */
ColumnPtr ConstantExpressionTemplate::evaluateAll()
{
    Block block = literals.cloneWithColumns(std::move(columns));
    /// `columns` was moved from; give it the empty structure back before anything below can throw.
    columns = literals.cloneEmptyColumns();

    actions_on_literals->execute(block);

    const bool nothing_evaluated = !block || block.rows() == 0;
    if (nothing_evaluated)
    {
        throw Exception("Logical error: empty block after evaluation of batch of constant expressions",
                        ErrorCodes::LOGICAL_ERROR);
    }

    if (block.has(result_column_name))
        return block.getByName(result_column_name).column;

    throw Exception("Cannot evaluate template " + result_column_name + ", block structure:\n" + block.dumpStructure(),
                    ErrorCodes::CANNOT_EVALUATE_EXPRESSION_TEMPLATE);
}
|
||||
|
||||
std::pair<String, NamesAndTypesList>
|
||||
ConstantExpressionTemplate::replaceLiteralsWithDummyIdentifiers(TokenIterator & begin, TokenIterator & end)
|
||||
{
|
||||
NamesAndTypesList dummy_columns;
|
||||
ParserLiteral parser;
|
||||
String result;
|
||||
size_t token_idx = 0;
|
||||
while (begin != end)
|
||||
{
|
||||
const Token & t = *begin;
|
||||
if (t.isError())
|
||||
throw DB::Exception("Error in tokens", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE);
|
||||
|
||||
// TODO don't convert constant string arguments of functions such as CAST(x, 'type')
|
||||
// TODO process Array as one literal to make possible parsing constant arrays of different size
|
||||
if (t.type == TokenType::Number || t.type == TokenType::StringLiteral)
|
||||
{
|
||||
Expected expected;
|
||||
ASTPtr ast;
|
||||
if (!parser.parse(begin, ast, expected))
|
||||
throw DB::Exception("Cannot determine literal type", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE);
|
||||
|
||||
// TODO use nullable type if necessary (e.g. value is not NULL, but result_column_type is nullable and next rows may contain NULLs)
|
||||
// TODO parse numbers more carefully: sign is a separate token before number
|
||||
Field & value = ast->as<ASTLiteral &>().value;
|
||||
DataTypePtr type = DataTypeFactory::instance().get(value.getTypeName());
|
||||
// TODO ensure dummy_col_name is unique (there was no _dummy_x identifier in expression)
|
||||
String dummy_col_name = "_dummy_" + std::to_string(dummy_columns.size());
|
||||
dummy_columns.push_back(NameAndTypePair(dummy_col_name, type));
|
||||
token_after_literal_idx.push_back(token_idx);
|
||||
result.append(dummy_col_name);
|
||||
}
|
||||
else
|
||||
{
|
||||
tokens.emplace_back(t.begin, t.size());
|
||||
result.append(tokens.back());
|
||||
++begin;
|
||||
++token_idx;
|
||||
}
|
||||
result.append(" ");
|
||||
}
|
||||
if (dummy_columns.empty()) // TODO
|
||||
throw DB::Exception("not implemented yet", ErrorCodes::CANNOT_CREATE_EXPRESSION_TEMPLATE);
|
||||
return std::make_pair(result, dummy_columns);
|
||||
}
|
||||
|
||||
void ConstantExpressionTemplate::addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr)
|
||||
{
|
||||
auto result_type = std::make_shared<ASTLiteral>(result_column_type.getName());
|
||||
|
||||
auto arguments = std::make_shared<ASTExpressionList>();
|
||||
arguments->children.push_back(std::move(expr));
|
||||
arguments->children.push_back(std::move(result_type));
|
||||
|
||||
auto cast = std::make_shared<ASTFunction>();
|
||||
cast->name = "CAST";
|
||||
cast->arguments = std::move(arguments);
|
||||
cast->children.push_back(cast->arguments);
|
||||
|
||||
expr = std::move(cast);
|
||||
}
|
||||
|
||||
}
|
35
dbms/src/Formats/ConstantExpressionTemplate.h
Normal file
35
dbms/src/Formats/ConstantExpressionTemplate.h
Normal file
@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Interpreters/ExpressionActions.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
#include <Parsers/TokenIterator.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ConstantExpressionTemplate
|
||||
{
|
||||
public:
|
||||
ConstantExpressionTemplate(const IDataType & result_column_type, TokenIterator begin, TokenIterator end, const Context & context);
|
||||
|
||||
void parseExpression(ReadBuffer & istr, const FormatSettings & settings);
|
||||
|
||||
ColumnPtr evaluateAll();
|
||||
|
||||
private:
|
||||
std::pair<String, NamesAndTypesList> replaceLiteralsWithDummyIdentifiers(TokenIterator & begin, TokenIterator & end);
|
||||
|
||||
static void addNodesToCastResult(const IDataType & result_column_type, ASTPtr & expr);
|
||||
|
||||
private:
|
||||
std::vector<String> tokens;
|
||||
std::vector<size_t> token_after_literal_idx;
|
||||
String result_column_name;
|
||||
ExpressionActionsPtr actions_on_literals;
|
||||
Block literals;
|
||||
MutableColumns columns;
|
||||
|
||||
};
|
||||
|
||||
}
|
@ -26,6 +26,9 @@ namespace ErrorCodes
|
||||
extern const int CANNOT_PARSE_DATE;
|
||||
extern const int SYNTAX_ERROR;
|
||||
extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE;
|
||||
extern const int CANNOT_CREATE_EXPRESSION_TEMPLATE;
|
||||
extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE;
|
||||
extern const int CANNOT_EVALUATE_EXPRESSION_TEMPLATE;
|
||||
}
|
||||
|
||||
|
||||
@ -34,6 +37,7 @@ ValuesBlockInputStream::ValuesBlockInputStream(ReadBuffer & istr_, const Block &
|
||||
: istr(istr_), header(header_), context(std::make_unique<Context>(context_)),
|
||||
format_settings(format_settings), max_block_size(max_block_size_)
|
||||
{
|
||||
templates.resize(header.columns());
|
||||
/// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
|
||||
skipBOMIfExists(istr);
|
||||
}
|
||||
@ -52,7 +56,6 @@ bool ValuesBlockInputStream::read(MutableColumns & columns)
|
||||
* But as an exception, it also supports processing arbitrary expressions instead of values.
|
||||
* This is very inefficient. But if there are no expressions, then there is no overhead.
|
||||
*/
|
||||
ParserExpression parser;
|
||||
|
||||
assertChar('(', istr);
|
||||
|
||||
@ -66,8 +69,16 @@ bool ValuesBlockInputStream::read(MutableColumns & columns)
|
||||
bool rollback_on_exception = false;
|
||||
try
|
||||
{
|
||||
header.getByPosition(i).type->deserializeAsTextQuoted(*columns[i], istr, format_settings);
|
||||
rollback_on_exception = true;
|
||||
if (templates[i])
|
||||
{
|
||||
templates[i].value().parseExpression(istr, format_settings);
|
||||
}
|
||||
else
|
||||
{
|
||||
header.getByPosition(i).type->deserializeAsTextQuoted(*columns[i], istr, format_settings);
|
||||
rollback_on_exception = true;
|
||||
}
|
||||
|
||||
skipWhitespaceIfAny(istr);
|
||||
|
||||
if (i != num_columns - 1)
|
||||
@ -89,7 +100,8 @@ bool ValuesBlockInputStream::read(MutableColumns & columns)
|
||||
|| e.code() == ErrorCodes::CANNOT_PARSE_NUMBER
|
||||
|| e.code() == ErrorCodes::CANNOT_PARSE_DATE
|
||||
|| e.code() == ErrorCodes::CANNOT_PARSE_DATETIME
|
||||
|| e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT)
|
||||
|| e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT
|
||||
|| e.code() == ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE)
|
||||
{
|
||||
/// TODO Case when the expression does not fit entirely in the buffer.
|
||||
|
||||
@ -100,35 +112,19 @@ bool ValuesBlockInputStream::read(MutableColumns & columns)
|
||||
if (rollback_on_exception)
|
||||
columns[i]->popBack(1);
|
||||
|
||||
const IDataType & type = *header.getByPosition(i).type;
|
||||
|
||||
Expected expected;
|
||||
|
||||
Tokens tokens(prev_istr_position, istr.buffer().end());
|
||||
TokenIterator token_iterator(tokens);
|
||||
|
||||
ASTPtr ast;
|
||||
if (!parser.parse(token_iterator, ast, expected))
|
||||
throw Exception("Cannot parse expression of type " + type.getName() + " here: "
|
||||
+ String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)),
|
||||
ErrorCodes::SYNTAX_ERROR);
|
||||
|
||||
istr.position() = const_cast<char *>(token_iterator->begin);
|
||||
|
||||
std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
|
||||
Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());
|
||||
|
||||
/// Check that we are indeed allowed to insert a NULL.
|
||||
if (value.isNull())
|
||||
// TODO read(MutableColumns & columns) should not know number of rows in block and should not assign to columns
|
||||
if (likely(rows_in_block))
|
||||
{
|
||||
if (!type.isNullable())
|
||||
throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value)
|
||||
+ ", that is out of range of type " + type.getName()
|
||||
+ ", at: " + String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position)),
|
||||
ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
|
||||
if (e.code() == ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE)
|
||||
{
|
||||
/// The expression in the current row does not match the template generated from the first row.
|
||||
/// Evaluate expressions, which were parsed using this template.
|
||||
columns[i] = std::move(*templates[i].value().evaluateAll()).mutate();
|
||||
/// And do not use the template anymore.
|
||||
templates[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
columns[i]->insert(value);
|
||||
parseExpression(prev_istr_position, columns, i, rows_in_block == 0);
|
||||
|
||||
skipWhitespaceIfAny(istr);
|
||||
|
||||
@ -153,13 +149,13 @@ Block ValuesBlockInputStream::readImpl()
|
||||
{
|
||||
MutableColumns columns = header.cloneEmptyColumns();
|
||||
|
||||
for (size_t rows = 0; rows < max_block_size; ++rows)
|
||||
for (rows_in_block = 0; rows_in_block < max_block_size; ++rows_in_block)
|
||||
{
|
||||
try
|
||||
{
|
||||
++total_rows;
|
||||
if (!read(columns))
|
||||
break;
|
||||
++total_rows;
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
@ -169,12 +165,77 @@ Block ValuesBlockInputStream::readImpl()
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluate expressions, which were parsed using this template, if any
|
||||
for (size_t i = 0; i < columns.size(); ++i)
|
||||
{
|
||||
if (templates[i])
|
||||
{
|
||||
columns[i] = std::move(*templates[i].value().evaluateAll()).mutate();
|
||||
templates[i].reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (columns.empty() || columns[0]->empty())
|
||||
return {};
|
||||
|
||||
return header.cloneWithColumns(std::move(columns));
|
||||
}
|
||||
|
||||
/** Slow path: parses the text starting at prev_istr_position as an SQL expression, evaluates it as
  * a constant and converts the result to the type of column `column_idx`.
  *
  * If generate_template is false, the value is inserted into columns[column_idx].
  * If it is true, a ConstantExpressionTemplate is built from the parsed tokens and the expression is
  * re-read through it (the value is then kept inside the template instead of the column); if that
  * fails with a DB::Exception, the template is dropped and the value is inserted directly.
  *
  * Leaves istr positioned after the expression and returns the evaluated value.
  * Throws SYNTAX_ERROR if the text is not an expression and VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE
  * if the value is NULL while the column type is not nullable.
  */
Field
ValuesBlockInputStream::parseExpression(char * prev_istr_position, MutableColumns & columns, size_t column_idx, bool generate_template)
{
    const IDataType & type = *header.getByPosition(column_idx).type;

    /// Beginning of the offending input, for error messages.
    const auto input_snippet = [&]
    {
        return String(prev_istr_position, std::min(SHOW_CHARS_ON_SYNTAX_ERROR, istr.buffer().end() - prev_istr_position));
    };

    Expected expected;
    Tokens tokens(prev_istr_position, istr.buffer().end());
    TokenIterator token_iterator(tokens);

    ASTPtr ast;
    const bool parsed = parser.parse(token_iterator, ast, expected);
    if (!parsed)
        throw Exception("Cannot parse expression of type " + type.getName() + " here: "
                        + input_snippet(),
                        ErrorCodes::SYNTAX_ERROR);

    /// The parser stopped at the first token after the expression.
    char * const expression_end = const_cast<char *>(token_iterator->begin);
    istr.position() = expression_end;

    std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
    Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());

    /// Check that we are indeed allowed to insert a NULL.
    if (value.isNull() && !type.isNullable())
        throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value)
                        + ", that is out of range of type " + type.getName()
                        + ", at: " +
                        input_snippet(),
                        ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};

    if (!generate_template)
    {
        columns[column_idx]->insert(value);
        return value;
    }

    auto & column_template = templates[column_idx];
    try
    {
        column_template = ConstantExpressionTemplate(type, TokenIterator(tokens), token_iterator, *context);
        /// Re-read the same expression through the template, so that its literals get stored there.
        istr.position() = prev_istr_position;
        column_template.value().parseExpression(istr, format_settings);
    }
    catch (DB::Exception &)
    {
        /// Continue parsing without template
        column_template.reset();
        columns[column_idx]->insert(value);
        istr.position() = expression_end;
    }

    return value;
}
|
||||
|
||||
|
||||
void registerInputFormatValues(FormatFactory & factory)
|
||||
{
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include <Core/Block.h>
|
||||
#include <DataStreams/IBlockInputStream.h>
|
||||
#include <Formats/FormatSettings.h>
|
||||
#include <Formats/ConstantExpressionTemplate.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -33,15 +34,22 @@ public:
|
||||
bool read(MutableColumns & columns);
|
||||
|
||||
private:
|
||||
typedef std::vector<std::optional<ConstantExpressionTemplate>> ConstantExpressionTemplates;
|
||||
|
||||
Block readImpl() override;
|
||||
|
||||
Field parseExpression(char * prev_istr_position, MutableColumns & columns, size_t column_idx, bool generate_template);
|
||||
|
||||
private:
|
||||
ReadBuffer & istr;
|
||||
Block header;
|
||||
std::unique_ptr<Context> context; /// pimpl
|
||||
const FormatSettings format_settings;
|
||||
UInt64 max_block_size;
|
||||
UInt64 rows_in_block = 0;
|
||||
size_t total_rows = 0;
|
||||
ParserExpression parser;
|
||||
ConstantExpressionTemplates templates;
|
||||
};
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user