Merge pull request #49960 from Avogar/fix-tsv-nullable-parsing

Fix possible Logical error on bad Nullable parsing for text formats
This commit is contained in:
Kruglov Pavel 2023-05-23 11:42:07 +02:00 committed by GitHub
commit 0e346c78ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 1 deletions

View File

@ -355,6 +355,9 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col
/// It can happen only if there is a string instead of a number
/// or if someone uses tab or LF in TSV null_representation.
/// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
/// We also should delete incorrectly deserialized value from nested column.
nested_column.popBack(1);
if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
"containing '\\t' or '\\n' may not work correctly for large input.");
@ -447,6 +450,8 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re
/// We have some unread data in PeekableReadBuffer own memory.
/// It can happen only if there is an unquoted string instead of a number.
/// We also should delete incorrectly deserialized value from nested column.
nested_column.popBack(1);
throw DB::ParsingException(
ErrorCodes::CANNOT_READ_ALL_DATA,
"Error while parsing Nullable: got an unquoted string {} instead of a number",
@ -579,6 +584,9 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB
/// It can happen only if there is an unquoted string instead of a number
/// or if someone uses csv delimiter, LF or CR in CSV null representation.
/// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
/// We also should delete incorrectly deserialized value from nested column.
nested_column.popBack(1);
if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos
|| null_representation.find('\n') != std::string::npos)
throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing "

View File

@ -44,7 +44,7 @@ CustomSeparatedRowInputFormat::CustomSeparatedRowInputFormat(
format_settings_,
std::make_unique<CustomSeparatedFormatReader>(*buf_, ignore_spaces_, format_settings_),
format_settings_.custom.try_detect_header)
, buf(std::move(buf_))
, buf(std::move(buf_)), ignore_spaces(ignore_spaces_)
{
/// In case of CustomSeparatedWithNames(AndTypes) formats and enabled setting input_format_with_names_use_header we don't know
/// the exact number of columns in data (because it can contain unknown columns). So, if field_delimiter and row_after_delimiter are

View File

@ -0,0 +1,3 @@
1 \N
1 \N
1 \N

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
# NOTE: this sh wrapper is required because of shell_config
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "drop table if exists test"
$CLICKHOUSE_CLIENT -q "create table test (x UInt32, y Nullable(UInt32)) engine=MergeTree order by x"
$CLICKHOUSE_CLIENT -q "select '1\t\\\N\n2\t\\\' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=1 format TSV"
$CLICKHOUSE_CLIENT -q "select '1,\\\N\n2,\\\' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=1 format CSV"
$CLICKHOUSE_CLIENT -q "select '1\tNULL\n2\tN' format RawBLOB" | $CLICKHOUSE_CLIENT -q "insert into test settings input_format_allow_errors_num=2, format_custom_escaping_rule='Quoted' format CustomSeparated"
$CLICKHOUSE_CLIENT -q "select * from test"
$CLICKHOUSE_CLIENT -q "drop table test";