Fix NaN deserialization for Quoted escaping rule

This commit is contained in:
avogar 2021-12-03 15:37:39 +03:00
parent 049b2c0c14
commit 4f136cb30c
4 changed files with 127 additions and 6 deletions

View File

@ -394,14 +394,67 @@ template<typename ReturnType>
ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
                                                            const SerializationPtr & nested)
{
    /// Reads one Nullable value written with the Quoted escaping rule.
    ///   column   - destination column, passed through to safeDeserialize.
    ///   istr     - input buffer positioned at the start of the value.
    ///   settings - format settings forwarded to the nested deserializeTextQuoted call.
    ///   nested   - serialization used to read the value when it is not NULL.
    /// The result is whatever safeDeserialize<ReturnType> returns for the chosen pair of callbacks
    /// (a "is it NULL?" check and a "read the nested value" action).
    ///
    /// A value starting with 'N'/'n' is not necessarily NULL (e.g. NaN for floats), so the whole
    /// word has to be compared, and the read position must be restored if the comparison fails.

    if (istr.eof() || (*istr.position() != 'N' && *istr.position() != 'n'))
    {
        /// This is not null, surely.
        return safeDeserialize<ReturnType>(column, *nested,
            [] { return false; },
            [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); });
    }

    /// Check if we have enough data in buffer to check if it's a null.
    if (istr.available() >= 4)
    {
        auto check_for_null = [&istr]()
        {
            /// Remember the position so that a failed match can be undone by moving the position back.
            auto * pos = istr.position();
            if (checkStringCaseInsensitive("NULL", istr))
                return true;
            istr.position() = pos;
            return false;
        };
        auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column)
        {
            nested->deserializeTextQuoted(nested_column, istr, settings);
        };
        return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
    }

    /// We don't have enough data in buffer to check if it's a NULL
    /// and we cannot check it just by one symbol (otherwise we won't be able
    /// to differentiate for example NULL and NaN for float)
    /// Use PeekableReadBuffer to make a checkpoint before checking
    /// null and rollback if the check was failed.
    PeekableReadBuffer buf(istr, true);
    auto check_for_null = [&buf]()
    {
        buf.setCheckpoint();
        /// The checkpoint is dropped on every exit path, after the optional rollback below.
        SCOPE_EXIT(buf.dropCheckpoint());
        if (checkStringCaseInsensitive("NULL", buf))
            return true;

        buf.rollbackToCheckpoint();
        return false;
    };
    auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column)
    {
        nested->deserializeTextQuoted(nested_column, buf, settings);
        /// Check that we don't have any unread data in PeekableReadBuffer own memory.
        if (likely(!buf.hasUnreadData()))
            return;

        /// We have some unread data in PeekableReadBuffer own memory.
        /// It can happen only if there is an unquoted string instead of a number.
        /// The template argument of std::min is spelled out because `10ul` has type size_t
        /// only on platforms where size_t is unsigned long; without it deduction fails elsewhere.
        throw DB::ParsingException(
            ErrorCodes::CANNOT_READ_ALL_DATA,
            "Error while parsing Nullable: got an unquoted string {} instead of a number",
            String(buf.position(), std::min<size_t>(10, buf.available())));
    };

    return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
}
void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{

View File

@ -205,7 +205,7 @@ void CustomSeparatedRowInputFormat::syncAfterError()
/// Parses the delimiter configured as `format_settings.custom.row_before_delimiter` from `buf`
/// by forwarding to parseDelimiterWithDiagnosticInfo, which receives `out` for its diagnostics
/// and the human-readable description "delimiter before first field".
/// Returns the result of parseDelimiterWithDiagnosticInfo unchanged.
bool CustomSeparatedRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out)
{
    return parseDelimiterWithDiagnosticInfo(out, buf, format_settings.custom.row_before_delimiter, "delimiter before first field", ignore_spaces);
}
bool CustomSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out)

View File

@ -0,0 +1,12 @@
\N 1
nan 2
42.42 3
\N 4
\N 5
\N 6
\N 7
nan 8
nan 9
nan 10
nan 11
OK

View File

@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Tags: no-parallel

# Checks parsing of Nullable(Float64) values "null" / "nan" / plain numbers from a
# CustomSeparated file with format_custom_escaping_rule='Quoted', including runs with
# very small read buffers (--max_read_buffer_size=1..4).

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# The directory is extracted from the exception text produced by reading a nonexistent file
# through file(): field 9 of the "Exception" line, with the "/nonexist.txt" suffix removed.
USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

# Without this guard an empty path would make the script write to /test_02130.data.
if [ -z "$USER_FILES_PATH" ]; then
    echo "Cannot determine the user files path" >&2
    exit 1
fi

DATA_FILE=$USER_FILES_PATH/test_02130.data
SELECT_QUERY="select * from file('test_02130.data', 'CustomSeparated', 'x Nullable(Float64), y Nullable(UInt64)') settings input_format_parallel_parsing=0, format_custom_escaping_rule='Quoted'"

# Overwrites the data file with a single row and runs SELECT_QUERY over it.
#   $1   - row contents; backslash escapes such as \t are interpreted by `echo -e`
#   $2.. - extra arguments appended to the client invocation (may be absent)
# NOTE(review): $CLICKHOUSE_CLIENT is deliberately left unquoted, as in the original;
# presumably shell_config.sh may define it with extra arguments - confirm.
function check_row()
{
    echo -e "$1" > "$DATA_FILE"
    shift
    $CLICKHOUSE_CLIENT -q "$SELECT_QUERY" "$@"
}

# NOTE(review): this table is created, selected from and dropped, but nothing in this
# script inserts into it, so the final select prints no rows.
$CLICKHOUSE_CLIENT -q "drop table if exists test_02130"
$CLICKHOUSE_CLIENT -q "create table test_02130 (x Nullable(Float64), y Nullable(UInt64)) engine=Memory()"

# Default read buffer size.
check_row "null\t1"
check_row "nan\t2"
check_row "42.42\t3"

# "null" with read buffers of 1..4 bytes.
check_row "null\t4" --max_read_buffer_size=1
check_row "null\t5" --max_read_buffer_size=2
check_row "null\t6" --max_read_buffer_size=3
check_row "null\t7" --max_read_buffer_size=4

# "nan" with read buffers of 1..4 bytes.
check_row "nan\t8" --max_read_buffer_size=1
check_row "nan\t9" --max_read_buffer_size=2
check_row "nan\t10" --max_read_buffer_size=3
check_row "nan\t11" --max_read_buffer_size=4

# "nan" in the Nullable(UInt64) column must be rejected with CANNOT_READ_ALL_DATA.
check_row "42\tnan" --max_read_buffer_size=4 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL'

$CLICKHOUSE_CLIENT -q "select * from test_02130 order by y"
$CLICKHOUSE_CLIENT -q "drop table test_02130"

rm "$DATA_FILE"