ClickHouse/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp

435 lines
16 KiB
C++
Raw Normal View History

#include <IO/ReadHelpers.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/Context.h>
#include <Interpreters/convertFieldToType.h>
#include <Parsers/TokenIterator.h>
#include <Parsers/ExpressionListParsers.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Formats/FormatFactory.h>
#include <Common/FieldVisitors.h>
#include <Core/Block.h>
#include <Common/typeid_cast.h>
#include <common/find_symbols.h>
2019-09-11 19:55:28 +00:00
#include <Parsers/ASTLiteral.h>
2019-10-16 19:52:00 +00:00
#include <DataTypes/DataTypeNullable.h>
namespace DB
{
/// Error codes referenced by the Values input format.
/// These are declarations only; no definitions appear in this file.
namespace ErrorCodes
{
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int LOGICAL_ERROR;
    extern const int SUPPORT_IS_DISABLED;
    extern const int SYNTAX_ERROR;
    extern const int TYPE_MISMATCH;
}
/// Creates a Values input format reading from `in_` and producing blocks shaped like `header_`.
///
/// Every column starts in ParserType::Streaming mode. The per-column bookkeeping
/// (template deduction attempt counters, rows-parsed-by-template counters and the
/// template slots themselves) is sized to the number of columns in `header_`.
ValuesBlockInputFormat::ValuesBlockInputFormat(
    ReadBuffer & in_,
    const Block & header_,
    const RowInputFormatParams & params_,
    const FormatSettings & format_settings_)
    /// NOTE(review): `buf` is handed to the base class before `buf(in_)` has run.
    /// Presumably the base only stores the reference at this point - confirm.
    : IInputFormat(header_, buf)
    , buf(in_)
    , params(params_)
    , format_settings(format_settings_)
    , num_columns(header_.columns())
    , parser_type_for_column(num_columns, ParserType::Streaming)
    , attempts_to_deduce_template(num_columns)
    , attempts_to_deduce_template_cached(num_columns)
    , rows_parsed_using_template(num_columns)
    , templates(num_columns)
    , types(header_.getDataTypes())
{
    /// A BOM at the very beginning of the stream can never be mistaken for a value
    /// in this format, so it is safe to skip it unconditionally.
    skipBOMIfExists(buf);
}
/// Reads up to `params.max_block_size` rows and returns them as one Chunk.
///
/// Rows are read one by one with readRow(); reading stops early at end of input or at ';'.
/// Values that were only parsed via expression templates are evaluated in bulk once the
/// row loop is finished. An empty Chunk is returned (after readSuffix()) when no rows were read.
///
/// Parse errors are rethrown with the current `total_rows` counter appended to the message.
Chunk ValuesBlockInputFormat::generate()
{
    const Block & header = getPort().getHeader();
    MutableColumns columns = header.cloneEmptyColumns();
    block_missing_values.clear();

    size_t row_in_block = 0;
    while (row_in_block < params.max_block_size)
    {
        try
        {
            skipWhitespaceIfAny(buf);

            const bool no_more_rows = buf.eof() || *buf.position() == ';';
            if (no_more_rows)
                break;

            readRow(columns, row_in_block);

            if (params.callback)
                params.callback();
        }
        catch (Exception & e)
        {
            /// Only parse errors get the row number attached; everything else is passed through untouched.
            if (isParseError(e.code()))
                e.addMessage(" at row " + std::to_string(total_rows));
            throw;
        }

        ++row_in_block;
    }

    /// Evaluate expressions which were parsed using templates, if any.
    for (size_t i = 0; i < columns.size(); ++i)
    {
        const bool has_pending_rows = templates[i] && templates[i]->rowsCount();
        if (!has_pending_rows)
            continue;

        if (columns[i]->empty())
        {
            /// Nothing was inserted directly, so the evaluated column can replace the empty one.
            columns[i] = std::move(*templates[i]->evaluateAll(block_missing_values, i)).mutate();
        }
        else
        {
            /// Append the evaluated values after the ones that are already in the column.
            ColumnPtr evaluated = templates[i]->evaluateAll(block_missing_values, i, columns[i]->size());
            columns[i]->insertRangeFrom(*evaluated, 0, evaluated->size());
        }
    }

    const bool nothing_read = columns.empty() || columns[0]->empty();
    if (nothing_read)
    {
        readSuffix();
        return {};
    }

    /// The size has to be taken before `columns` is moved into the chunk.
    size_t num_rows = columns[0]->size();
    return Chunk{std::move(columns), num_rows};
}
2019-10-16 19:52:00 +00:00
/// Reads one parenthesised row and appends one value to every column in `columns`.
///
/// `row_num` is the index of the row inside the current block; it is only used to mark
/// missing values in `block_missing_values`. A trailing ',' after the row is consumed if present.
void ValuesBlockInputFormat::readRow(MutableColumns & columns, size_t row_num)
{
    assertChar('(', buf);

    for (size_t column_idx = 0; column_idx < num_columns; ++column_idx)
    {
        skipWhitespaceIfAny(buf);
        /// The parsers below roll back to this checkpoint when they have to re-read the value.
        PeekableReadBufferCheckpoint checkpoint{buf};

        /// Parse value using fast streaming parser for literals and slow SQL parser for expressions.
        /// If there is an SQL expression in some row, a template of this expression will be deduced,
        /// which makes it possible to parse the following rows much faster
        /// if expressions in the following rows have the same structure.
        const auto parser_type = parser_type_for_column[column_idx];

        bool value_read = false;
        if (parser_type == ParserType::Streaming)
            value_read = tryReadValue(*columns[column_idx], column_idx);
        else if (parser_type == ParserType::BatchTemplate)
            value_read = tryParseExpressionUsingTemplate(columns[column_idx], column_idx);
        else /// ParserType::SingleExpressionEvaluation
            value_read = parseExpression(*columns[column_idx], column_idx);

        /// Even when a value was read it may still be missing: the bit mask for such values
        /// is copied from ConstantExpressionTemplate later.
        if (!value_read)
            block_missing_values.setBit(column_idx, row_num);
    }

    skipWhitespaceIfAny(buf);
    const bool comma_follows = !buf.eof() && *buf.position() == ',';
    if (comma_follows)
        ++buf.position();

    ++total_rows;
}
2019-10-16 19:52:00 +00:00
/// Parses the current value of column `column_idx` with the template previously deduced for that column.
///
/// On success the value is only recorded inside the template and `true` is returned.
/// If the value does not match the template, all rows accumulated in the template are evaluated
/// and appended to `column`, the template is dropped, the buffer is rolled back to the checkpoint
/// and the value is re-parsed with parseExpression(), whose result is returned.
///
/// The function dereferences `templates[column_idx]` without checking it, so a template must be present.
bool ValuesBlockInputFormat::tryParseExpressionUsingTemplate(MutableColumnPtr & column, size_t column_idx)
{
    /// Bind by reference: plain `auto` would deduce a value type and copy the whole
    /// settings object for every single value parsed through a template.
    const auto & settings = context->getSettingsRef();

    /// Try to parse expression using template if one was successfully deduced while parsing the first row
    if (templates[column_idx]->parseExpression(buf, format_settings, settings))
    {
        ++rows_parsed_using_template[column_idx];
        return true;
    }

    /// Expression in the current row does not match the template deduced on the first row.
    /// Evaluate expressions which were parsed using this template.
    if (column->empty())
        column = std::move(*templates[column_idx]->evaluateAll(block_missing_values, column_idx)).mutate();
    else
    {
        ColumnPtr evaluated = templates[column_idx]->evaluateAll(block_missing_values, column_idx, column->size());
        column->insertRangeFrom(*evaluated, 0, evaluated->size());
    }

    /// Do not use this template anymore
    templates[column_idx].reset();
    buf.rollbackToCheckpoint();

    /// It will deduce new template or fallback to slow SQL parser
    return parseExpression(*column, column_idx);
}
2019-10-16 19:52:00 +00:00
/// Reads the value of column `column_idx` with the fast streaming (text-quoted) deserializer.
///
/// Returns the result of the nullable-aware deserializer when `null_as_default` applies to a
/// non-nullable column, `true` otherwise. If deserialization or the delimiter check fails with
/// a parse error, the buffer is rolled back to the checkpoint and the value is parsed as an
/// SQL expression instead. Non-parse errors and ARGUMENT_OUT_OF_BOUND are rethrown.
bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
{
    /// Set once the deserializer has returned, so that the handler below knows
    /// it has to pop the last element of `column` before retrying.
    bool value_inserted = false;

    try
    {
        const auto & type = types[column_idx];
        const bool null_becomes_default = format_settings.null_as_default && !type->isNullable();

        bool read = true;
        if (null_becomes_default)
            read = DataTypeNullable::deserializeTextQuoted(column, buf, format_settings, type);
        else
            type->deserializeAsTextQuoted(column, buf, format_settings);
        value_inserted = true;

        skipWhitespaceIfAny(buf);
        assertDelimiterAfterValue(column_idx);
        return read;
    }
    catch (const Exception & e)
    {
        /// Do not consider decimal overflow as parse error to avoid attempts to parse it as expression with float literal
        const bool is_decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
        const bool can_retry_as_expression = isParseError(e.code()) && !is_decimal_overflow;
        if (!can_retry_as_expression)
            throw;

        if (value_inserted)
            column.popBack(1);

        /// Switch to SQL parser and don't try to use streaming parser for complex expressions
        /// Note: Throwing exceptions for each expression may be very slow because of stacktraces
        buf.rollbackToCheckpoint();
        return parseExpression(column, column_idx);
    }
}
2019-10-16 19:52:00 +00:00
/// Parses the current value of column `column_idx` as an SQL expression and appends the result to `column`.
///
/// Steps, in order:
///  1. Parse the expression (plus the delimiter that follows it) with the SQL parser.
///  2. If the column is not in streaming mode and the expression is a plain literal, try the
///     streaming deserializer again and switch the column back to streaming mode on success.
///  3. If shouldDeduceNewTemplate() allows, deduce an expression template and parse the value with it;
///     on success the column switches to batch-template mode.
///  4. Otherwise evaluate the single expression as a constant and insert the converted value.
///
/// Returns `false` only when a NULL was replaced by the column default (`null_as_default`), `true` otherwise.
///
/// Throws SYNTAX_ERROR if the expression cannot be parsed or (with interpreting disabled) no template
/// can be deduced, LOGICAL_ERROR if a non-evaluated template already exists for the column,
/// SUPPORT_IS_DISABLED if interpreting expressions is disabled, and TYPE_MISMATCH if NULL is
/// inserted into a non-nullable column without `null_as_default`.
bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx)
{
    const Block & header = getPort().getHeader();
    const IDataType & type = *header.getByPosition(column_idx).type;

    /// Bind by reference: plain `auto` would deduce a value type and copy the whole
    /// settings object on every call.
    const auto & settings = context->getSettingsRef();

    /// We need continuous memory containing the expression to use Lexer
    skipToNextRow(0, 1);
    buf.makeContinuousMemoryFromCheckpointToPos();
    buf.rollbackToCheckpoint();

    Expected expected;
    Tokens tokens(buf.position(), buf.buffer().end());
    IParser::Pos token_iterator(tokens, settings.max_parser_depth);
    ASTPtr ast;

    bool parsed = parser.parse(token_iterator, ast, expected);

    /// Consider delimiter after value (',' or ')') as part of expression
    if (column_idx + 1 != num_columns)
        parsed &= token_iterator->type == TokenType::Comma;
    else
        parsed &= token_iterator->type == TokenType::ClosingRoundBracket;

    if (!parsed)
        throw Exception("Cannot parse expression of type " + type.getName() + " here: "
                        + String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())),
                        ErrorCodes::SYNTAX_ERROR);
    /// Step over the delimiter, so that `token_iterator` now points right after the value.
    ++token_iterator;

    if (parser_type_for_column[column_idx] != ParserType::Streaming && dynamic_cast<const ASTLiteral *>(ast.get()))
    {
        /// It's possible that streaming parsing has failed on some row (e.g. because of '+' sign before integer),
        /// but it still can parse the following rows
        /// Check if we can use fast streaming parser instead of using templates
        bool rollback_on_exception = false;
        bool ok = false;
        try
        {
            header.getByPosition(column_idx).type->deserializeAsTextQuoted(column, buf, format_settings);
            rollback_on_exception = true;
            skipWhitespaceIfAny(buf);
            if (checkDelimiterAfterValue(column_idx))
                ok = true;
        }
        catch (const Exception & e)
        {
            /// Parse errors (except decimal overflow) just mean that streaming mode does not fit; fall through.
            bool decimal_overflow = e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND;
            if (!isParseError(e.code()) || decimal_overflow)
                throw;
        }
        if (ok)
        {
            parser_type_for_column[column_idx] = ParserType::Streaming;
            return true;
        }
        else if (rollback_on_exception)
            column.popBack(1);
    }

    parser_type_for_column[column_idx] = ParserType::SingleExpressionEvaluation;

    /// Try to deduce template of expression and use it to parse the following rows
    if (shouldDeduceNewTemplate(column_idx))
    {
        if (templates[column_idx])
            throw DB::Exception("Template for column " + std::to_string(column_idx) + " already exists and it was not evaluated yet",
                                ErrorCodes::LOGICAL_ERROR);

        /// A failure here is remembered rather than thrown immediately: whether it is fatal
        /// depends on `interpret_expressions`, which is checked after the try block.
        std::exception_ptr exception;
        try
        {
            bool found_in_cache = false;
            const auto & result_type = header.getByPosition(column_idx).type;
            const char * delimiter = (column_idx + 1 == num_columns) ? ")" : ",";
            auto structure = templates_cache.getFromCacheOrConstruct(result_type, format_settings.null_as_default,
                                                                     TokenIterator(tokens), token_iterator,
                                                                     ast, *context, &found_in_cache, delimiter);
            templates[column_idx].emplace(structure);
            if (found_in_cache)
                ++attempts_to_deduce_template_cached[column_idx];
            else
                ++attempts_to_deduce_template[column_idx];

            buf.rollbackToCheckpoint();
            if (templates[column_idx]->parseExpression(buf, format_settings, settings))
            {
                ++rows_parsed_using_template[column_idx];
                parser_type_for_column[column_idx] = ParserType::BatchTemplate;
                return true;
            }
        }
        catch (...)
        {
            exception = std::current_exception();
        }

        if (!format_settings.values.interpret_expressions)
        {
            if (exception)
                std::rethrow_exception(exception);
            else
            {
                buf.rollbackToCheckpoint();
                size_t len = const_cast<char *>(token_iterator->begin) - buf.position();
                throw Exception("Cannot deduce template of expression: " + std::string(buf.position(), len), ErrorCodes::SYNTAX_ERROR);
            }
        }

        /// Continue parsing without template
        templates[column_idx].reset();
    }

    if (!format_settings.values.interpret_expressions)
        throw Exception("Interpreting expressions is disabled", ErrorCodes::SUPPORT_IS_DISABLED);

    /// Try to evaluate single expression if other parsers don't work
    /// Move the read position to the token after the value and its delimiter.
    buf.position() = const_cast<char *>(token_iterator->begin);

    std::pair<Field, DataTypePtr> value_raw = evaluateConstantExpression(ast, *context);
    Field value = convertFieldToType(value_raw.first, type, value_raw.second.get());

    /// Check that we are indeed allowed to insert a NULL.
    if (value.isNull() && !type.isNullable())
    {
        if (format_settings.null_as_default)
        {
            type.insertDefaultInto(column);
            return false;
        }
        buf.rollbackToCheckpoint();
        throw Exception{"Cannot insert NULL value into a column of type '" + type.getName() + "'"
                        + " at: " +
                        String(buf.position(), std::min(SHOW_CHARS_ON_SYNTAX_ERROR, buf.buffer().end() - buf.position())),
                        ErrorCodes::TYPE_MISMATCH};
    }

    column.insert(value);
    return true;
}
/// Can be used in fileSegmentationEngine for parallel parsing of Values
2019-11-18 19:25:17 +00:00
bool ValuesBlockInputFormat::skipToNextRow(size_t min_chunk_bytes, int balance)
{
skipWhitespaceIfAny(buf);
if (buf.eof() || *buf.position() == ';')
return false;
bool quoted = false;
size_t chunk_begin_buf_count = buf.count();
2019-11-18 19:25:17 +00:00
while (!buf.eof() && (balance || buf.count() - chunk_begin_buf_count < min_chunk_bytes))
{
buf.position() = find_first_symbols<'\\', '\'', ')', '('>(buf.position(), buf.buffer().end());
if (buf.position() == buf.buffer().end())
continue;
if (*buf.position() == '\\')
{
++buf.position();
if (!buf.eof())
++buf.position();
}
else if (*buf.position() == '\'')
{
quoted ^= true;
++buf.position();
}
else if (*buf.position() == ')')
{
++buf.position();
if (!quoted)
--balance;
}
else if (*buf.position() == '(')
{
++buf.position();
if (!quoted)
++balance;
}
}
if (!buf.eof() && *buf.position() == ',')
++buf.position();
return true;
}
/// Requires the delimiter expected after column `column_idx` (')' after the last column, ',' otherwise)
/// and reports an assertion failure on `buf` if it is not there.
void ValuesBlockInputFormat::assertDelimiterAfterValue(size_t column_idx)
{
    if (likely(checkDelimiterAfterValue(column_idx)))
        return;

    const bool is_last_column = column_idx + 1 == num_columns;
    throwAtAssertionFailed(is_last_column ? ")" : ",", buf);
}
2019-09-04 19:42:01 +00:00
/// Skips whitespace and checks for the delimiter expected after column `column_idx`:
/// ',' after any column but the last one, ')' after the last one.
/// Returns the result of checkChar() for that delimiter.
bool ValuesBlockInputFormat::checkDelimiterAfterValue(size_t column_idx)
{
    skipWhitespaceIfAny(buf);

    const char expected_delimiter = likely(column_idx + 1 != num_columns) ? ',' : ')';
    return checkChar(expected_delimiter, buf);
}
bool ValuesBlockInputFormat::shouldDeduceNewTemplate(size_t column_idx)
{
2019-09-11 19:55:28 +00:00
if (!format_settings.values.deduce_templates_of_expressions)
return false;
2019-09-04 19:42:01 +00:00
/// TODO better heuristic
2019-09-13 17:40:48 +00:00
/// Using template from cache is approx 2x faster, than evaluating single expression
/// Construction of new template is approx 1.5x slower, than evaluating single expression
float attempts_weighted = 1.5 * attempts_to_deduce_template[column_idx] + 0.5 * attempts_to_deduce_template_cached[column_idx];
constexpr size_t max_attempts = 100;
if (attempts_weighted < max_attempts)
return true;
2019-09-13 17:40:48 +00:00
if (rows_parsed_using_template[column_idx] / attempts_weighted > 1)
{
/// Try again
2019-09-04 19:42:01 +00:00
attempts_to_deduce_template[column_idx] = 0;
2019-09-13 17:40:48 +00:00
attempts_to_deduce_template_cached[column_idx] = 0;
rows_parsed_using_template[column_idx] = 0;
return true;
}
return false;
}
2019-09-30 18:21:58 +00:00
/// Checks that nothing is left unread in the peekable buffer once reading is finished;
/// throws LOGICAL_ERROR otherwise.
void ValuesBlockInputFormat::readSuffix()
{
    if (!buf.hasUnreadData())
        return;

    throw Exception("Unread data in PeekableReadBuffer will be lost. Most likely it's a bug.", ErrorCodes::LOGICAL_ERROR);
}
void ValuesBlockInputFormat::resetParser()
{
IInputFormat::resetParser();
2019-12-03 00:50:50 +00:00
// I'm not resetting parser modes here.
2019-12-03 00:51:10 +00:00
// There is a good chance that all messages have the same format.
total_rows = 0;
}
/// Registers the "Values" input format in `factory`.
void registerInputFormatProcessorValues(FormatFactory & factory)
{
    auto create_values_format = [](
        ReadBuffer & in,
        const Block & sample,
        const RowInputFormatParams & row_params,
        const FormatSettings & format_settings)
    {
        return std::make_shared<ValuesBlockInputFormat>(in, sample, row_params, format_settings);
    };

    factory.registerInputFormatProcessor("Values", create_values_format);
}
}