2019-05-07 22:39:18 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <Interpreters/evaluateConstantExpression.h>
|
|
|
|
#include <Interpreters/Context.h>
|
|
|
|
#include <Interpreters/convertFieldToType.h>
|
|
|
|
#include <Parsers/TokenIterator.h>
|
|
|
|
#include <Parsers/ExpressionListParsers.h>
|
2019-09-02 16:26:22 +00:00
|
|
|
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
|
2019-05-07 22:39:18 +00:00
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
#include <Common/FieldVisitors.h>
|
|
|
|
#include <Core/Block.h>
|
|
|
|
#include <Common/typeid_cast.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes referenced by this translation unit. The values are defined elsewhere;
/// only the declarations are needed here.
namespace ErrorCodes
{
    /// Failures of the fast streaming parser that allow a fallback to the SQL expression parser.
    extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
    extern const int CANNOT_PARSE_QUOTED_STRING;
    extern const int CANNOT_PARSE_NUMBER;
    extern const int CANNOT_PARSE_DATE;
    extern const int CANNOT_PARSE_DATETIME;
    extern const int CANNOT_READ_ARRAY_FROM_TEXT;

    /// Thrown when the SQL parser cannot parse an expression either.
    extern const int SYNTAX_ERROR;
    /// Thrown when an expression evaluates to NULL for a non-Nullable column.
    extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE;
    /// Signals that an expression does not match a previously deduced template.
    extern const int CANNOT_PARSE_EXPRESSION_USING_TEMPLATE;
    /// Thrown when a template for a column unexpectedly exists already.
    extern const int LOGICAL_ERROR;
}
|
|
|
|
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
/// Creates a reader of the Values format on top of `in_`.
///
/// `in_` is wrapped into the member `buf`, which is the buffer handed to the base class.
/// `context_` is copied into an owned Context. Per-column template bookkeeping
/// (attempt counters, parsed-row counters and the templates themselves) is sized
/// to the number of columns in `header_`.
ValuesBlockInputFormat::ValuesBlockInputFormat(
    ReadBuffer & in_,
    const Block & header_,
    const RowInputFormatParams & params_,
    const Context & context_,
    const FormatSettings & format_settings_)
    : IInputFormat(header_, buf)
    , buf(in_)
    , params(params_)
    , context(std::make_unique<Context>(context_))
    , format_settings(format_settings_)
    , num_columns(header_.columns())
    , attempts_to_generate_template(num_columns)
    , rows_parsed_using_template(num_columns)
    , templates(num_columns)
{
    /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it.
    skipBOMIfExists(buf);

    /// TODO remove before merge
    /// NOTE(review): this unconditionally overrides the user-supplied `interpret_expressions` setting,
    /// and writes through a const_cast - confirm whether `format_settings` is declared const in the header.
    auto & mutable_settings = const_cast<FormatSettings &>(format_settings);
    mutable_settings.values.interpret_expressions = false;
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
/// Reads at most `params.max_block_size` rows of the form `(v1, v2, ...)` and returns them as a Chunk.
///
/// Reading stops early at end of input, at a ';', or when the time limit is exceeded / the
/// format is cancelled (checked once per `params.rows_portion_size` rows).
/// Returns an empty Chunk (after calling readSuffix()) when no rows were read.
/// Parse errors are annotated with the current row number before being rethrown.
Chunk ValuesBlockInputFormat::generate()
{
    const Block & header = getPort().getHeader();
    MutableColumns columns = header.cloneEmptyColumns();

    size_t rows_since_limit_check = 0;
    for (size_t row = 0; row < params.max_block_size; ++row, ++rows_since_limit_check)
    {
        const bool portion_finished = params.rows_portion_size && rows_since_limit_check == params.rows_portion_size;
        if (portion_finished)
        {
            rows_since_limit_check = 0;
            if (!checkTimeLimit(params, total_stopwatch) || isCancelled())
                break;
        }

        try
        {
            skipWhitespaceIfAny(buf);
            const bool no_more_rows = buf.eof() || *buf.position() == ';';
            if (no_more_rows)
                break;

            assertChar('(', buf);

            for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
            {
                skipWhitespaceIfAny(buf);
                /// Remember the start of the value, so parsers can roll back and retry it differently.
                PeekableReadBufferCheckpoint checkpoint{buf};

                /// Fast path: the value matches the template deduced for this column earlier.
                if (parseExpressionUsingTemplate(columns[column_idx], column_idx))
                    continue;

                /// Parse value using fast streaming parser for literals and slow SQL parser for expressions.
                /// If there is SQL expression in the first row, template of this expression will be generated,
                /// so it makes possible to parse next rows much faster if expressions in next rows have the same structure
                readValue(*columns[column_idx], column_idx, shouldGenerateNewTemplate(column_idx));
            }

            /// Rows may be separated by an optional comma.
            skipWhitespaceIfAny(buf);
            if (!buf.eof() && *buf.position() == ',')
                ++buf.position();

            ++total_rows;
        }
        catch (Exception & e)
        {
            if (isParseError(e.code()))
                e.addMessage(" at row " + std::to_string(total_rows));
            throw;
        }
    }

    /// Evaluate expressions, which were parsed using templates, if any
    for (size_t i = 0; i < columns.size(); ++i)
    {
        if (!templates[i] || !templates[i].value().rowsCount())
            continue;

        auto & pending_template = templates[i].value();
        if (columns[i]->empty())
        {
            columns[i] = std::move(*pending_template.evaluateAll()).mutate();
            continue;
        }

        ColumnPtr evaluated = pending_template.evaluateAll();
        columns[i]->insertRangeFrom(*evaluated, 0, evaluated->size());
    }

    const bool nothing_read = columns.empty() || columns[0]->empty();
    if (nothing_read)
    {
        readSuffix();
        return {};
    }

    size_t rows_read = columns[0]->size();
    return Chunk{std::move(columns), rows_read};
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
/// Tries to parse the value of column `column_idx` with the template deduced for that column.
///
/// Returns true when the value (and the delimiter after it) was consumed by the template.
/// Returns false when there is no template, or when the value does not match it. In the latter
/// case the rows accumulated in the template are evaluated and appended to `column`, the template
/// is dropped, the attempt counter is incremented and the buffer is rolled back to the checkpoint.
/// Exceptions other than CANNOT_PARSE_EXPRESSION_USING_TEMPLATE are propagated unchanged.
bool ValuesBlockInputFormat::parseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx)
{
    auto & column_template = templates[column_idx];
    if (!column_template)
        return false;

    /// Try to parse expression using template if one was successfully generated while parsing the first row
    try
    {
        column_template.value().parseExpression(buf, format_settings);
        assertDelimiterAfterValue(column_idx);
        ++rows_parsed_using_template[column_idx];
        return true;
    }
    catch (DB::Exception & e)
    {
        const bool template_mismatch = (e.code() == ErrorCodes::CANNOT_PARSE_EXPRESSION_USING_TEMPLATE);
        if (!template_mismatch)
            throw;

        /// Expression in the current row is not match generated on the first row template.
        /// Evaluate expressions, which were parsed using this template.
        if (column->empty())
        {
            column = std::move(*column_template.value().evaluateAll()).mutate();
        }
        else
        {
            ColumnPtr evaluated = column_template.value().evaluateAll();
            column->insertRangeFrom(*evaluated, 0, evaluated->size());
        }

        /// Do not use the template anymore and fallback to slow SQL parser
        column_template.reset();
        ++attempts_to_generate_template[column_idx];
        buf.rollbackToCheckpoint();
    }

    return false;
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
/// Reads one value of column `column_idx` into `column`.
///
/// The value is first read with the streaming text deserializer of the column type. If that
/// (or the delimiter check after it) fails with one of the known parse errors, and either
/// expression interpretation is enabled or a template may be deduced (`generate_template`
/// together with the corresponding setting), the input is rolled back to the checkpoint and
/// the value is parsed as a SQL expression instead. Any other failure is rethrown.
void ValuesBlockInputFormat::readValue(IColumn & column, size_t column_idx, bool generate_template)
{
    /// Becomes true once the deserializer has appended a value that must be removed on fallback.
    bool value_inserted = false;
    try
    {
        const auto & column_type = getPort().getHeader().getByPosition(column_idx).type;
        column_type->deserializeAsTextQuoted(column, buf, format_settings);
        value_inserted = true;

        skipWhitespaceIfAny(buf);
        assertDelimiterAfterValue(column_idx);
    }
    catch (const Exception & e)
    {
        const bool may_deduce_template = format_settings.values.deduce_templates_of_expressions && generate_template;
        if (!format_settings.values.interpret_expressions && !may_deduce_template)
            throw;

        /** The normal streaming parser could not parse the value.
          * Let's try to parse it with a SQL parser as a constant expression.
          * This is an exceptional case.
          */
        const int code = e.code();
        const bool is_literal_parse_failure =
            code == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED
            || code == ErrorCodes::CANNOT_PARSE_QUOTED_STRING
            || code == ErrorCodes::CANNOT_PARSE_NUMBER
            || code == ErrorCodes::CANNOT_PARSE_DATE
            || code == ErrorCodes::CANNOT_PARSE_DATETIME
            || code == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT;

        if (!is_literal_parse_failure)
            throw;

        /// The literal was read successfully but was not followed by a delimiter: drop it.
        if (value_inserted)
            column.popBack(1);

        buf.rollbackToCheckpoint();
        parseExpression(column, column_idx, generate_template);
    }
}
|
|
|
|
|
|
|
|
void
|
2019-09-02 16:26:22 +00:00
|
|
|
ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx, bool generate_template)
|
2019-05-08 01:41:05 +00:00
|
|
|
{
|
2019-09-02 16:26:22 +00:00
|
|
|
const Block & header = getPort().getHeader();
|
2019-05-08 01:41:05 +00:00
|
|
|
const IDataType & type = *header.getByPosition(column_idx).type;
|
|
|
|
|
|
|
|
Expected expected;
|
|
|
|
|
2019-05-12 03:15:08 +00:00
|
|
|
// TODO make tokenizer to work with buffers, not only with continuous memory
|
2019-09-02 16:26:22 +00:00
|
|
|
Tokens tokens(buf.position(), buf.buffer().end());
|
2019-09-02 12:57:22 +00:00
|
|
|
IParser::Pos token_iterator(tokens);
|
2019-05-08 01:41:05 +00:00
|
|
|
|
|
|
|
ASTPtr ast;
|
|
|
|
if (!parser.parse(token_iterator, ast, expected))
|
2019-05-12 03:15:08 +00:00
|
|
|
{
|
2019-09-02 16:26:22 +00:00
|
|
|
buf.rollbackToCheckpoint();
|
2019-05-08 01:41:05 +00:00
|
|
|
throw Exception("Cannot parse expression of type " + type.getName() + " here: "
|
2019-09-02 16:26:22 +00:00
|
|
|
+ String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())),
|
2019-05-08 01:41:05 +00:00
|
|
|
ErrorCodes::SYNTAX_ERROR);
|
2019-05-12 03:15:08 +00:00
|
|
|
}
|
2019-05-08 01:41:05 +00:00
|
|
|
|
|
|
|
std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
|
|
|
|
Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());
|
|
|
|
|
|
|
|
/// Check that we are indeed allowed to insert a NULL.
|
2019-05-12 03:15:08 +00:00
|
|
|
if (value.isNull() && !type.isNullable())
|
2019-05-08 01:41:05 +00:00
|
|
|
{
|
2019-09-02 16:26:22 +00:00
|
|
|
buf.rollbackToCheckpoint();
|
2019-05-12 03:15:08 +00:00
|
|
|
throw Exception{"Expression returns value " + applyVisitor(FieldVisitorToString(), value)
|
|
|
|
+ ", that is out of range of type " + type.getName()
|
|
|
|
+ ", at: " +
|
2019-09-02 16:26:22 +00:00
|
|
|
String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())),
|
2019-05-12 03:15:08 +00:00
|
|
|
ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE};
|
2019-05-08 01:41:05 +00:00
|
|
|
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
buf.position() = const_cast<char *>(token_iterator->begin);
|
2019-05-12 03:15:08 +00:00
|
|
|
|
2019-05-23 01:30:00 +00:00
|
|
|
if (format_settings.values.deduce_templates_of_expressions && generate_template)
|
2019-05-08 01:41:05 +00:00
|
|
|
{
|
2019-05-16 02:05:44 +00:00
|
|
|
if (templates[column_idx])
|
|
|
|
throw DB::Exception("Template for column " + std::to_string(column_idx) + " already exists and it was not evaluated yet",
|
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
2019-05-08 01:41:05 +00:00
|
|
|
try
|
|
|
|
{
|
2019-09-04 16:54:20 +00:00
|
|
|
templates[column_idx] = ConstantExpressionTemplate(header.getByPosition(column_idx).type, TokenIterator(tokens), token_iterator, ast, *context);
|
2019-09-02 16:26:22 +00:00
|
|
|
buf.rollbackToCheckpoint();
|
|
|
|
templates[column_idx].value().parseExpression(buf, format_settings);
|
|
|
|
assertDelimiterAfterValue(column_idx);
|
2019-05-16 02:05:44 +00:00
|
|
|
return;
|
2019-05-08 01:41:05 +00:00
|
|
|
}
|
2019-05-16 02:05:44 +00:00
|
|
|
catch (...)
|
2019-05-08 01:41:05 +00:00
|
|
|
{
|
2019-05-23 01:30:00 +00:00
|
|
|
if (!format_settings.values.interpret_expressions)
|
|
|
|
throw;
|
2019-05-08 01:41:05 +00:00
|
|
|
/// Continue parsing without template
|
|
|
|
templates[column_idx].reset();
|
2019-09-02 16:26:22 +00:00
|
|
|
buf.position() = const_cast<char *>(token_iterator->begin);
|
2019-05-08 01:41:05 +00:00
|
|
|
}
|
|
|
|
}
|
2019-05-16 02:05:44 +00:00
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
assertDelimiterAfterValue(column_idx);
|
2019-05-16 02:05:44 +00:00
|
|
|
column.insert(value);
|
|
|
|
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx)
|
2019-05-16 02:05:44 +00:00
|
|
|
{
|
2019-09-02 16:26:22 +00:00
|
|
|
skipWhitespaceIfAny(buf);
|
2019-05-16 02:05:44 +00:00
|
|
|
|
|
|
|
if (column_idx + 1 != num_columns)
|
2019-09-02 16:26:22 +00:00
|
|
|
assertChar(',', buf);
|
2019-05-08 01:41:05 +00:00
|
|
|
else
|
2019-09-02 16:26:22 +00:00
|
|
|
assertChar(')', buf);
|
2019-05-08 01:41:05 +00:00
|
|
|
}
|
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
bool ValuesBlockInputFormat::shouldGenerateNewTemplate(size_t column_idx)
|
2019-05-23 03:07:49 +00:00
|
|
|
{
|
|
|
|
// TODO better heuristic
|
|
|
|
constexpr size_t max_attempts = 3;
|
|
|
|
constexpr size_t rows_per_attempt = 10;
|
|
|
|
if (attempts_to_generate_template[column_idx] < max_attempts)
|
|
|
|
return true;
|
2019-05-24 01:31:11 +00:00
|
|
|
if (rows_parsed_using_template[column_idx] / attempts_to_generate_template[column_idx] > rows_per_attempt)
|
2019-05-23 03:07:49 +00:00
|
|
|
{
|
|
|
|
/// Try again
|
|
|
|
attempts_to_generate_template[column_idx] = 0;
|
|
|
|
rows_parsed_using_template[column_idx] = 0;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-05-07 22:39:18 +00:00
|
|
|
|
2019-09-02 16:26:22 +00:00
|
|
|
/// Registers the "Values" input format in `factory`.
/// The registered creator builds a ValuesBlockInputFormat from the arguments supplied by the factory.
void registerInputFormatProcessorValues(FormatFactory & factory)
{
    auto create_values_format = [](
        ReadBuffer & buf,
        const Block & header,
        const Context & context,
        const RowInputFormatParams & params,
        const FormatSettings & settings)
    {
        /// Note the argument order: the constructor takes `params` before `context`.
        return std::make_shared<ValuesBlockInputFormat>(buf, header, params, context, settings);
    };

    factory.registerInputFormatProcessor("Values", std::move(create_values_format));
}
|
|
|
|
|
|
|
|
}
|