From d5c5a3213bdc8342b225537b003c3e07678fdaae Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Oct 2021 16:52:27 +0300 Subject: [PATCH 01/11] Add custom null representation support for TSV/CSV input formats, fix bugs in deserializing NULLs in some cases --- src/Core/Settings.h | 4 +- .../Serializations/SerializationNullable.cpp | 200 +++++++++--------- src/Formats/FormatFactory.cpp | 4 +- 3 files changed, 102 insertions(+), 106 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f91bf684c85..402e2b2f6a4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -593,8 +593,8 @@ class IColumn; M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(Bool, output_format_tsv_crlf_end_of_line, false, "If it is set true, end of line in TSV format will be \\r\\n instead of \\n.", 0) \ - M(String, output_format_csv_null_representation, "\\N", "Custom NULL representation in CSV format", 0) \ - M(String, output_format_tsv_null_representation, "\\N", "Custom NULL representation in TSV format", 0) \ + M(String, format_csv_null_representation, "\\N", "Custom NULL representation in CSV format", 0) \ + M(String, format_tsv_null_representation, "\\N", "Custom NULL representation in TSV format", 0) \ M(Bool, output_format_decimal_trailing_zeros, false, "Output trailing zeros when printing Decimal values. E.g. 1.230000 instead of 1.23.", 0) \ \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). 
In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 560a4812123..c3a13cc3a52 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include -#include -#include +#include #include namespace DB @@ -21,6 +19,7 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_READ_ALL_DATA; + extern const int INVALID_SETTING_VALUE; } DataTypePtr SerializationNullable::SubcolumnCreator::create(const DataTypePtr & prev) const @@ -260,56 +259,54 @@ template ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) { - /// Little tricky, because we cannot discriminate null from first character. + const String & null_representation = settings.tsv.null_representation; - if (istr.eof() || *istr.position() != '\\') /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. + if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. return safeDeserialize(column, *nested, [] { return false; }, [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); } - else + + PeekableReadBuffer buf(istr); + auto check_for_null = [&buf, &null_representation]() { - /// Now we know, that data in buffer starts with backslash. 
- ++istr.position(); + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n')) + return true; - if (istr.eof()) - throw ParsingException("Unexpected end of stream, while parsing value of Nullable type, after backslash", ErrorCodes::CANNOT_READ_ALL_DATA); + buf.rollbackToCheckpoint(); + return false; + }; - return safeDeserialize(column, *nested, - [&istr] - { - if (*istr.position() == 'N') - { - ++istr.position(); - return true; - } - return false; - }, - [&nested, &istr, &settings] (IColumn & nested_column) - { - if (istr.position() != istr.buffer().begin()) - { - /// We could step back to consume backslash again. - --istr.position(); - nested->deserializeTextEscaped(nested_column, istr, settings); - } - else - { - /// Otherwise, we need to place backslash back in front of istr. - ReadBufferFromMemory prefix("\\", 1); - ConcatReadBuffer prepended_istr(prefix, istr); + auto deserialize_nested = [&nested, &settings, &buf, &null_representation] (IColumn & nested_column) + { + auto * pos = buf.position(); + nested->deserializeTextEscaped(nested_column, buf, settings); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. + if (likely(!buf.hasUnreadData())) + return; - nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + /// We have some unread data in PeekableReadBuffer own memory. + /// It can happen only if there is a string instead of a number + /// or if someone uses tab or LF in TSV null_representation. + /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. 
+ if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos) + throw DB::ParsingException("TSV custom null representation containing '\\t' or '\\n' may not work correctly " + "for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); - /// Synchronise cursor position in original buffer. + WriteBufferFromOwnString parsed_value; + nested->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); + throw DB::ParsingException("Error while parsing \"" + std::string(pos, std::min(size_t{10}, buf.available())) + "\" as Nullable" + + " at position " + std::to_string(buf.count()) + ": got \"" + std::string(pos, buf.position() - pos) + + "\", which was deserialized as \"" + + parsed_value.str() + "\". It seems that input data is ill-formatted.", + ErrorCodes::CANNOT_READ_ALL_DATA); + }; - if (prepended_istr.count() > 1) - istr.position() = prepended_istr.position(); - } - }); - } + return safeDeserialize(column, *nested, check_for_null, deserialize_nested); } void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -350,13 +347,30 @@ template ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) { - return safeDeserialize(column, *nested, - [&istr] - { - return checkStringByFirstCharacterAndAssertTheRestCaseInsensitive("NULL", istr) - || checkStringByFirstCharacterAndAssertTheRest("ᴺᵁᴸᴸ", istr); - }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeWholeText(nested_column, istr, settings); }); + PeekableReadBuffer buf(istr); + auto check_for_null = [&buf]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + + if (checkStringCaseInsensitive("NULL", buf)) + return true; + + buf.rollbackToCheckpoint(); + if (checkStringCaseInsensitive("ᴺᵁᴸᴸ", buf)) + return true; 
+ + buf.rollbackToCheckpoint(); + return false; + }; + + auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column) + { + nested->deserializeWholeText(nested_column, buf, settings); + assert(!buf.hasUnreadData()); + }; + + return safeDeserialize(column, *nested, check_for_null, deserialize_nested); } @@ -377,71 +391,53 @@ void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & is template ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) + const SerializationPtr & nested) { - constexpr char const * null_literal = "NULL"; - constexpr size_t len = 4; - size_t null_prefix_len = 0; - - auto check_for_null = [&istr, &settings, &null_prefix_len] + const String & null_representation = settings.csv.null_representation; + if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { - if (checkStringByFirstCharacterAndAssertTheRest("\\N", istr)) - return true; - if (!settings.csv.unquoted_null_literal_as_null) - return false; + /// This is not null, surely. + return safeDeserialize(column, *nested, + [] { return false; }, + [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); + } - /// Check for unquoted NULL - while (!istr.eof() && null_prefix_len < len && null_literal[null_prefix_len] == *istr.position()) - { - ++null_prefix_len; - ++istr.position(); - } - if (null_prefix_len == len) + PeekableReadBuffer buf(istr); + auto check_for_null = [&buf, &null_representation, &settings]() + { + buf.setCheckpoint(); + SCOPE_EXIT(buf.dropCheckpoint()); + if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n')) return true; - /// Value and "NULL" have common prefix, but value is not "NULL". 
- /// Restore previous buffer position if possible. - if (null_prefix_len <= istr.offset()) - { - istr.position() -= null_prefix_len; - null_prefix_len = 0; - } + buf.rollbackToCheckpoint(); return false; }; - auto deserialize_nested = [&nested, &settings, &istr, &null_prefix_len] (IColumn & nested_column) + auto deserialize_nested = [&nested, &settings, &buf, &null_representation] (IColumn & nested_column) { - if (likely(!null_prefix_len)) - nested->deserializeTextCSV(nested_column, istr, settings); - else - { - /// Previous buffer position was not restored, - /// so we need to prepend extracted characters (rare case) - ReadBufferFromMemory prepend(null_literal, null_prefix_len); - ConcatReadBuffer buf(prepend, istr); - nested->deserializeTextCSV(nested_column, buf, settings); + auto * pos = buf.position(); + nested->deserializeTextCSV(nested_column, buf, settings); + /// Check that we don't have any unread data in PeekableReadBuffer own memory. + if (likely(!buf.hasUnreadData())) + return; - /// Check if all extracted characters were read by nested parser and update buffer position - if (null_prefix_len < buf.count()) - istr.position() = buf.position(); - else if (null_prefix_len > buf.count()) - { - /// It can happen only if there is an unquoted string instead of a number - /// or if someone uses 'U' or 'L' as delimiter in CSV. - /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. 
- if (settings.csv.delimiter == 'U' || settings.csv.delimiter == 'L') - throw DB::ParsingException("Enabled setting input_format_csv_unquoted_null_literal_as_null may not work correctly " - "with format_csv_delimiter = 'U' or 'L' for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); - WriteBufferFromOwnString parsed_value; - nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); - throw DB::ParsingException("Error while parsing \"" + std::string(null_literal, null_prefix_len) - + std::string(istr.position(), std::min(size_t{10}, istr.available())) + "\" as Nullable" - + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(null_literal, buf.count()) - + "\", which was deserialized as \"" - + parsed_value.str() + "\". It seems that input data is ill-formatted.", - ErrorCodes::CANNOT_READ_ALL_DATA); - } - } + /// We have some unread data in PeekableReadBuffer own memory. + /// It can happen only if there is an unquoted string instead of a number + /// or if someone uses csv delimiter, LF or CR in CSV null representation. + /// In the first case we cannot continue reading anyway. The second case seems to be unlikely. 
+ if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos + || null_representation.find('\n') != std::string::npos) + throw DB::ParsingException("CSV custom null representation containing format_csv_delimiter, '\\r' or '\\n' may not work correctly " + "for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); + + WriteBufferFromOwnString parsed_value; + nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); + throw DB::ParsingException("Error while parsing \"" + std::string(pos, std::min(size_t{10}, buf.available())) + "\" as Nullable" + + " at position " + std::to_string(buf.count()) + ": got \"" + std::string(pos, buf.position() - pos) + + "\", which was deserialized as \"" + + parsed_value.str() + "\". It seems that input data is ill-formatted.", + ErrorCodes::CANNOT_READ_ALL_DATA); }; return safeDeserialize(column, *nested, check_for_null, deserialize_nested); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d2dc18a03fd..152b58f9fa7 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -58,7 +58,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.delimiter = settings.format_csv_delimiter; format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields; format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; - format_settings.csv.null_representation = settings.output_format_csv_null_representation; + format_settings.csv.null_representation = settings.format_csv_null_representation; format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null; format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; format_settings.custom.escaping_rule = settings.format_custom_escaping_rule; @@ -102,7 +102,7 @@ 
FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number; - format_settings.tsv.null_representation = settings.output_format_tsv_null_representation; + format_settings.tsv.null_representation = settings.format_tsv_null_representation; format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals; format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; From 7a37e24b52f415e2590e73b1e2d0614c82720c55 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Oct 2021 16:58:25 +0300 Subject: [PATCH 02/11] Small fix in deserializeWholeText --- src/DataTypes/Serializations/SerializationNullable.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index c3a13cc3a52..cc73bac9a97 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -353,11 +353,11 @@ ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, Rea buf.setCheckpoint(); SCOPE_EXIT(buf.dropCheckpoint()); - if (checkStringCaseInsensitive("NULL", buf)) + if (checkStringCaseInsensitive("NULL", buf) && buf.eof()) return true; buf.rollbackToCheckpoint(); - if (checkStringCaseInsensitive("ᴺᵁᴸᴸ", buf)) + if (checkStringCaseInsensitive("ᴺᵁᴸᴸ", buf) && buf.eof()) return true; buf.rollbackToCheckpoint(); From b50fff4e2bbd5d1777327001e531d22d3ce4ce09 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Oct 2021 17:33:26 +0300 
Subject: [PATCH 03/11] Fix style --- src/DataTypes/Serializations/SerializationNullable.cpp | 1 - src/DataTypes/Serializations/SerializationString.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index cc73bac9a97..a6618128275 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -19,7 +19,6 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_READ_ALL_DATA; - extern const int INVALID_SETTING_VALUE; } DataTypePtr SerializationNullable::SubcolumnCreator::create(const DataTypePtr & prev) const diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index c3c24ed6749..f655a6ca1f9 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include @@ -9,8 +8,6 @@ #include #include -#include -#include #include #include From 8aad00818a5d17e85153faf3e3e0c4849c8d1b35 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Oct 2021 18:01:23 +0300 Subject: [PATCH 04/11] Fix build --- src/DataTypes/Serializations/SerializationNullable.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index a6618128275..75f9d40740b 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB { From a0aa30a4e982367d35a2f8734d919cd8b5f6ef49 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Oct 2021 19:41:43 +0300 Subject: [PATCH 05/11] Fix tests --- tests/queries/0_stateless/00301_csv.sh | 2 +- tests/queries/0_stateless/01474_custom_null_tsv.sh | 2 +- 
.../02029_output_csv_null_representation.reference | 4 ++-- .../0_stateless/02029_output_csv_null_representation.sql | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 0aee9abe25c..333af9dcfe0 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -33,7 +33,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Europe/Moscow echo 'NULL, NULL "2016-01-01 01:02:03",NUL -"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --query="INSERT INTO csv FORMAT CSV"; +"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --format_csv_null_representation='NULL' --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; diff --git a/tests/queries/0_stateless/01474_custom_null_tsv.sh b/tests/queries/0_stateless/01474_custom_null_tsv.sh index 9dc1c4b7777..fb5939faf5e 100755 --- a/tests/queries/0_stateless/01474_custom_null_tsv.sh +++ b/tests/queries/0_stateless/01474_custom_null_tsv.sh @@ -9,7 +9,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE tsv_custom_null (id Nullable(UInt32)) E $CLICKHOUSE_CLIENT --query="INSERT INTO tsv_custom_null VALUES (NULL)"; -$CLICKHOUSE_CLIENT --output_format_tsv_null_representation='MyNull' --query="SELECT * FROM tsv_custom_null FORMAT TSV"; +$CLICKHOUSE_CLIENT --format_tsv_null_representation='MyNull' --query="SELECT * FROM tsv_custom_null FORMAT TSV"; $CLICKHOUSE_CLIENT --query="DROP TABLE tsv_custom_null"; diff --git a/tests/queries/0_stateless/02029_output_csv_null_representation.reference b/tests/queries/0_stateless/02029_output_csv_null_representation.reference index a5174f4424f..eda4b09e312 100644 --- a/tests/queries/0_stateless/02029_output_csv_null_representation.reference +++ 
b/tests/queries/0_stateless/02029_output_csv_null_representation.reference @@ -1,4 +1,4 @@ -# output_format_csv_null_representation should initially be \\N +# format_csv_null_representation should initially be \\N "val1",\N,"val3" -# Changing output_format_csv_null_representation +# Changing format_csv_null_representation "val1",∅,"val3" diff --git a/tests/queries/0_stateless/02029_output_csv_null_representation.sql b/tests/queries/0_stateless/02029_output_csv_null_representation.sql index 772c6c89144..a27c552ee60 100644 --- a/tests/queries/0_stateless/02029_output_csv_null_representation.sql +++ b/tests/queries/0_stateless/02029_output_csv_null_representation.sql @@ -7,10 +7,10 @@ CREATE TABLE test_data ( INSERT INTO test_data VALUES ('val1', NULL, 'val3'); -SELECT '# output_format_csv_null_representation should initially be \\N'; +SELECT '# format_csv_null_representation should initially be \\N'; SELECT * FROM test_data FORMAT CSV; -SELECT '# Changing output_format_csv_null_representation'; -SET output_format_csv_null_representation = '∅'; +SELECT '# Changing format_csv_null_representation'; +SET format_csv_null_representation = '∅'; SELECT * FROM test_data FORMAT CSV; -SET output_format_csv_null_representation = '\\N'; +SET format_csv_null_representation = '\\N'; From d1ef96a5efec518626baae177a8c8185d2b87b29 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Oct 2021 17:29:15 +0300 Subject: [PATCH 06/11] Add test, avoid unnecessary allocations, use PeekableReadBuffer only in corner case --- docs/en/interfaces/formats.md | 5 +- docs/en/operations/settings/settings.md | 16 +-- src/Core/Settings.h | 1 - .../Serializations/SerializationNullable.cpp | 61 +++++++-- src/Formats/FormatFactory.cpp | 1 - src/Formats/FormatSettings.h | 1 - src/IO/PeekableReadBuffer.cpp | 4 +- src/IO/PeekableReadBuffer.h | 7 +- ...v_csv_custom_null_representation.reference | 55 ++++++++ ...2103_tsv_csv_custom_null_representation.sh | 125 ++++++++++++++++++ 
...104_json_strings_nullable_string.reference | 2 + .../02104_json_strings_nullable_string.sh | 17 +++ 12 files changed, 268 insertions(+), 27 deletions(-) create mode 100644 tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference create mode 100755 tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh create mode 100644 tests/queries/0_stateless/02104_json_strings_nullable_string.reference create mode 100755 tests/queries/0_stateless/02104_json_strings_nullable_string.sh diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index d0e5c44b4f7..57459152129 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -124,7 +124,8 @@ Only a small set of symbols are escaped. You can easily stumble onto a string va Arrays are written as a list of comma-separated values in square brackets. Number items in the array are formatted as normally. `Date` and `DateTime` types are written in single quotes. Strings are written in single quotes with the same escaping rules as above. -[NULL](../sql-reference/syntax.md) is formatted as `\N`. +[NULL](../sql-reference/syntax.md) is formatted according to setting [format_tsv_null_representation](../operations/settings/settings.md#format_tsv_null_representation) (default value is `\N`). + Each element of [Nested](../sql-reference/data-types/nested-data-structures/nested.md) structures is represented as array. @@ -380,7 +381,7 @@ Empty unquoted input values are replaced with default values for the respective [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields) is enabled. 
-`NULL` is formatted as `\N` or `NULL` or an empty unquoted string (see settings [input_format_csv_unquoted_null_literal_as_null](../operations/settings/settings.md#settings-input_format_csv_unquoted_null_literal_as_null) and [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#session_settings-input_format_defaults_for_omitted_fields)). +`NULL` is formatted according to setting [format_csv_null_representation](../operations/settings/settings.md#format_csv_null_representation) (default value is `\N`). The CSV format supports the output of totals and extremes the same way as `TabSeparated`. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 2b088bf45bf..b10d649a952 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1512,10 +1512,6 @@ When `output_format_json_quote_denormals = 1`, the query returns: The character is interpreted as a delimiter in the CSV data. By default, the delimiter is `,`. -## input_format_csv_unquoted_null_literal_as_null {#settings-input_format_csv_unquoted_null_literal_as_null} - -For CSV input format enables or disables parsing of unquoted `NULL` as literal (synonym for `\N`). - ## input_format_csv_enum_as_number {#settings-input_format_csv_enum_as_number} Enables or disables parsing enum values as enum ids for CSV input format. @@ -2859,9 +2855,9 @@ Possible values: Default value: `1`. -## output_format_csv_null_representation {#output_format_csv_null_representation} +## format_csv_null_representation {#format_csv_null_representation} -Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output format. User can set any string as a value, for example, `My NULL`. +Defines the representation of `NULL` for [CSV](../../interfaces/formats.md#csv) output and input formats. User can set any string as a value, for example, `My NULL`. Default value: `\N`. 
@@ -2884,7 +2880,7 @@ Result Query ```sql -SET output_format_csv_null_representation = 'My NULL'; +SET format_csv_null_representation = 'My NULL'; SELECT * FROM csv_custom_null FORMAT CSV; ``` @@ -2896,9 +2892,9 @@ My NULL My NULL ``` -## output_format_tsv_null_representation {#output_format_tsv_null_representation} +## format_tsv_null_representation {#format_tsv_null_representation} -Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output format. User can set any string as a value, for example, `My NULL`. +Defines the representation of `NULL` for [TSV](../../interfaces/formats.md#tabseparated) output and input formats. User can set any string as a value, for example, `My NULL`. Default value: `\N`. @@ -2921,7 +2917,7 @@ Result Query ```sql -SET output_format_tsv_null_representation = 'My NULL'; +SET format_tsv_null_representation = 'My NULL'; SELECT * FROM tsv_custom_null FORMAT TSV; ``` diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 402e2b2f6a4..ba0ffdb6f33 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -552,7 +552,6 @@ class IColumn; M(Bool, format_csv_allow_single_quotes, true, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ - M(Bool, input_format_csv_unquoted_null_literal_as_null, false, "Consider unquoted NULL literal as \\N", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices \\N", 0) \ M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \ diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 75f9d40740b..0dbb140af8b 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -261,6 +261,7 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R { const String & null_representation = settings.tsv.null_representation; + /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok. if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. @@ -269,7 +270,28 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); } - PeekableReadBuffer buf(istr); + /// Check if we have enough data in buffer to check if it's a null. + if (istr.available() > null_representation.size()) + { + auto check_for_null = [&istr, &null_representation]() + { + auto * pos = istr.position(); + if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n')) + return true; + istr.position() = pos; + return false; + }; + auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + { + nested->deserializeTextEscaped(nested_column, istr, settings); + }; + return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + } + + /// We don't have enough data in buffer to check if it's a null. + /// Use PeekableReadBuffer to make a checkpoint before checking null + /// representation and rollback if check was failed. 
+ PeekableReadBuffer buf(istr, true); auto check_for_null = [&buf, &null_representation]() { buf.setCheckpoint(); @@ -281,7 +303,7 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation] (IColumn & nested_column) + auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); nested->deserializeTextEscaped(nested_column, buf, settings); @@ -299,8 +321,8 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R WriteBufferFromOwnString parsed_value; nested->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); - throw DB::ParsingException("Error while parsing \"" + std::string(pos, std::min(size_t{10}, buf.available())) + "\" as Nullable" - + " at position " + std::to_string(buf.count()) + ": got \"" + std::string(pos, buf.position() - pos) + throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" + parsed_value.str() + "\". 
It seems that input data is ill-formatted.", ErrorCodes::CANNOT_READ_ALL_DATA); @@ -347,7 +369,7 @@ template ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) { - PeekableReadBuffer buf(istr); + PeekableReadBuffer buf(istr, true); auto check_for_null = [&buf]() { buf.setCheckpoint(); @@ -402,7 +424,28 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); } - PeekableReadBuffer buf(istr); + /// Check if we have enough data in buffer to check if it's a null. + if (istr.available() > null_representation.size()) + { + auto check_for_null = [&istr, &null_representation, &settings]() + { + auto * pos = istr.position(); + if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n')) + return true; + istr.position() = pos; + return false; + }; + auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + { + nested->deserializeTextCSV(nested_column, istr, settings); + }; + return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + } + + /// We don't have enough data in buffer to check if it's a null. + /// Use PeekableReadBuffer to make a checkpoint before checking null + /// representation and rollback if the check was failed. 
+ PeekableReadBuffer buf(istr, true); auto check_for_null = [&buf, &null_representation, &settings]() { buf.setCheckpoint(); @@ -414,7 +457,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation] (IColumn & nested_column) + auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); nested->deserializeTextCSV(nested_column, buf, settings); @@ -433,8 +476,8 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB WriteBufferFromOwnString parsed_value; nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); - throw DB::ParsingException("Error while parsing \"" + std::string(pos, std::min(size_t{10}, buf.available())) + "\" as Nullable" - + " at position " + std::to_string(buf.count()) + ": got \"" + std::string(pos, buf.position() - pos) + throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" + parsed_value.str() + "\". 
It seems that input data is ill-formatted.", ErrorCodes::CANNOT_READ_ALL_DATA); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 152b58f9fa7..a34660a0197 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -59,7 +59,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields; format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.format_csv_null_representation; - format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null; format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv; format_settings.custom.escaping_rule = settings.format_custom_escaping_rule; format_settings.custom.field_delimiter = settings.format_custom_field_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 403ccbc6763..269ce9a8a53 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -71,7 +71,6 @@ struct FormatSettings char delimiter = ','; bool allow_single_quotes = true; bool allow_double_quotes = true; - bool unquoted_null_literal_as_null = false; bool empty_as_default = false; bool crlf_end_of_line = false; bool input_format_enum_as_number = false; diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index c7cef777afc..e2b1873283f 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -9,8 +9,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/) - : BufferWithOwnMemory(start_size_), sub_buf(sub_buf_) +PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, bool use_existing_memory /*= false*/, size_t 
start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/) + : BufferWithOwnMemory(use_existing_memory ? sizeof(existing_memory) : start_size_, use_existing_memory ? existing_memory : nullptr), sub_buf(sub_buf_) { padded &= sub_buf.isPadded(); /// Read from sub-buffer diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index 4515c6f8ce5..a8eff09c4f2 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -20,7 +20,7 @@ class PeekableReadBuffer : public BufferWithOwnMemory { friend class PeekableReadBufferCheckpoint; public: - explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE); + explicit PeekableReadBuffer(ReadBuffer & sub_buf_, bool use_existing_memory = false, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE); ~PeekableReadBuffer() override; @@ -89,6 +89,11 @@ private: size_t peeked_size = 0; std::optional checkpoint = std::nullopt; bool checkpoint_in_own_memory = false; + + /// Small amount of memory on stack to use in BufferWithOwnMemory on + /// it's creation to prevent unnecessary allocation if PeekableReadBuffer + /// is often created. 
+ char existing_memory[16]; }; diff --git a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference new file mode 100644 index 00000000000..06618cc63b1 --- /dev/null +++ b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference @@ -0,0 +1,55 @@ +TSV +\N +\N +Some text +\N +Some text +\N +Some more text +\N +\N +Some more text +1 Some text 1 +1 \N 1 +CustomNullSome text +CustomNullSome text +\N +Some more text +\N +\N +Some more text +1 \N 1 +1 \N 1 +CSV +\N +\N +\\NSome text +\N +\\NSome text +\N +Some more text +\N +\N +Some more text +1 \\NSome text 1 +1 \N 1 +CustomNullSome text +CustomNullSome text +\N +Some more text +\N +\N +Some more text +1 \N 1 +1 \N 1 +Corner cases +TSV +Some text \N +Some text CustomNull Some text +OK +OK +CSV +Some text \N +Some text CustomNull Some text +OK +OK diff --git a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh new file mode 100755 index 00000000000..1d8e080c7b6 --- /dev/null +++ b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +DATA_FILE=$USER_FILES_PATH/test_02103_null.data + +echo "TSV" + +echo 'Custom NULL representation' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='Custom NULL representation'" + +echo -e 'N\tU\tL\tL' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='N\tU\tL\tL'" + +echo -e "\\NSome text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')" + +echo -e "\\N" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')" + +echo -e "\\NSome text\n\\N\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')" + +echo -e "\\N\n\\N\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)')" + +echo -e "1\t\\NSome text\t1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32')" + +echo -e "1\t\\N\t1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32')" + +echo -e "CustomNullSome text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'" + +echo -e "CustomNullSome text\nCustomNull\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'" + +echo -e 
"CustomNull\nCustomNull\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS format_tsv_null_representation='CustomNull'" + +echo -e "1\tCustomNull\t1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_tsv_null_representation='CustomNull'" + +echo -e "1\tCustomNull\t1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_tsv_null_representation='CustomNull'" + + +echo "CSV" + +echo 'Custom NULL representation' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='Custom NULL representation'" + +echo -e 'N,U,L,L' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='N,U,L,L'" + +echo -e "\\NSome text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')" + +echo -e "\\N" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')" + +echo -e "\\NSome text\n\\N\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')" + +echo -e "\\N\n\\N\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)')" + +echo -e "1,\\NSome text,1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32')" + +echo -e "1,\\N,1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32')" + +echo -e "CustomNullSome text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM 
file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'" + +echo -e "CustomNullSome text\nCustomNull\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'" + +echo -e "CustomNull\nCustomNull\nSome more text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Nullable(String)') SETTINGS format_csv_null_representation='CustomNull'" + +echo -e "1,CustomNull,1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_csv_null_representation='CustomNull'" + +echo -e "1,CustomNull,1" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 'x Int32, s Nullable(String), y Int32') SETTINGS format_csv_null_representation='CustomNull'" + + +echo 'Corner cases' +echo 'TSV' + +echo -e "Some text\tCustomNull" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_tsv_null_representation='CustomNull', input_format_parallel_parsing=0" + +echo -e "Some text\tCustomNull Some text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_tsv_null_representation='CustomNull', input_format_parallel_parsing=0" + +echo -e "Some text\t123NNN" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's String, n Nullable(Int32)') settings max_read_buffer_size=14, format_tsv_null_representation='123NN', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL' + +echo -e "Some text\tNU\tLL" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's 
String, n Nullable(String)') settings max_read_buffer_size=13, format_tsv_null_representation='NU\tL', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL' + +echo 'CSV' + +echo -e "Some text,CustomNull" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_csv_null_representation='CustomNull', input_format_parallel_parsing=0" + +echo -e "Some text,CustomNull Some text" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=15, format_csv_null_representation='CustomNull', input_format_parallel_parsing=0" + +echo -e "Some text,123NNN" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(Int32)') settings max_read_buffer_size=14, format_csv_null_representation='123NN', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL' + +echo -e "Some text,NU,LL" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=13, format_csv_null_representation='NU,L', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL' + +rm $DATA_FILE + diff --git a/tests/queries/0_stateless/02104_json_strings_nullable_string.reference b/tests/queries/0_stateless/02104_json_strings_nullable_string.reference new file mode 100644 index 00000000000..a2b5b4ad2ec --- /dev/null +++ b/tests/queries/0_stateless/02104_json_strings_nullable_string.reference @@ -0,0 +1,2 @@ +NULLSome string +NULLSome string diff --git a/tests/queries/0_stateless/02104_json_strings_nullable_string.sh b/tests/queries/0_stateless/02104_json_strings_nullable_string.sh new file mode 100755 index 00000000000..5385c1282b2 --- /dev/null +++ 
b/tests/queries/0_stateless/02104_json_strings_nullable_string.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +DATA_FILE=$USER_FILES_PATH/test_02104_null.data + +echo -e '{"s" : "NULLSome string"}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02104_null.data', 'JSONStringsEachRow', 's Nullable(String)')" + +echo -e '["NULLSome string"]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02104_null.data', 'JSONCompactStringsEachRow', 's Nullable(String)')" + +rm $DATA_FILE + From d30aecbda8d55f26e540d289fe4ae75f58e462e6 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Oct 2021 17:29:36 +0300 Subject: [PATCH 07/11] Add performance test --- tests/performance/tsv_csv_nullable_parsing.xml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/performance/tsv_csv_nullable_parsing.xml diff --git a/tests/performance/tsv_csv_nullable_parsing.xml b/tests/performance/tsv_csv_nullable_parsing.xml new file mode 100644 index 00000000000..2d5c5cec275 --- /dev/null +++ b/tests/performance/tsv_csv_nullable_parsing.xml @@ -0,0 +1,15 @@ + + +CREATE TABLE IF NOT EXISTS table_tsv (s Nullable(String)) ENGINE = File('TSV') +CREATE TABLE IF NOT EXISTS table_csv (s Nullable(String)) ENGINE = File('CSV') + +INSERT INTO table_tsv SELECT number % 2 ? 'Some text' : NULL FROM numbers(1000000) FORMAT TSV +INSERT INTO table_csv SELECT number % 2 ? 
'Some text' : NULL FROM numbers(1000000) FORMAT CSV + +SELECT * FROM table_tsv FORMAT Null +SELECT * FROM table_csv FORMAT Null + +DROP TABLE IF EXISTS table_tsv +DROP TABLE IF EXISTS table_csv + + From 6e8c2ab28f2da979d214caa8df90ed2fc8c5e87a Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 28 Oct 2021 17:02:07 +0300 Subject: [PATCH 08/11] Use small amount of memory on stack in PeekableReadBuffer --- src/IO/PeekableReadBuffer.cpp | 48 ++++++++++++++----- src/IO/PeekableReadBuffer.h | 15 ++++-- ...v_csv_custom_null_representation.reference | 21 ++++++++ ...2103_tsv_csv_custom_null_representation.sh | 7 +++ 4 files changed, 74 insertions(+), 17 deletions(-) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index e2b1873283f..40929acd848 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -9,8 +9,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, bool use_existing_memory /*= false*/, size_t start_size_ /*= DBMS_DEFAULT_BUFFER_SIZE*/) - : BufferWithOwnMemory(use_existing_memory ? sizeof(existing_memory) : start_size_, use_existing_memory ? 
existing_memory : nullptr), sub_buf(sub_buf_) +PeekableReadBuffer::PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ /*= 0*/) + : BufferWithOwnMemory(start_size_), sub_buf(sub_buf_) { padded &= sub_buf.isPadded(); /// Read from sub-buffer @@ -27,6 +27,7 @@ void PeekableReadBuffer::reset() peeked_size = 0; checkpoint = std::nullopt; checkpoint_in_own_memory = false; + use_stack_memory = true; if (!currentlyReadFromOwnMemory()) sub_buf.position() = pos; @@ -72,21 +73,23 @@ bool PeekableReadBuffer::peekNext() sub_buf.position() = copy_from; } + char * memory_data = getMemoryData(); + /// Save unread data from sub-buffer to own memory - memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_copy); + memcpy(memory_data + peeked_size, sub_buf.position(), bytes_to_copy); /// If useSubbufferOnly() is false, then checkpoint is in own memory and it was updated in resizeOwnMemoryIfNecessary /// Otherwise, checkpoint now at the beginning of own memory if (checkpoint && useSubbufferOnly()) { - checkpoint.emplace(memory.data()); + checkpoint.emplace(memory_data); checkpoint_in_own_memory = true; } if (currentlyReadFromOwnMemory()) { /// Update buffer size - BufferBase::set(memory.data(), peeked_size + bytes_to_copy, offset()); + BufferBase::set(memory_data, peeked_size + bytes_to_copy, offset()); } else { @@ -99,7 +102,7 @@ bool PeekableReadBuffer::peekNext() else pos_offset = 0; } - BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); + BufferBase::set(memory_data, peeked_size + bytes_to_copy, pos_offset); } peeked_size += bytes_to_copy; @@ -125,8 +128,9 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop) /// Checkpoint is in own memory and position is not. assert(checkpointInOwnMemory()); + char * memory_data = getMemoryData(); /// Switch to reading from own memory. 
- BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data()); + BufferBase::set(memory_data, peeked_size, *checkpoint - memory_data); } if (drop) @@ -224,12 +228,31 @@ void PeekableReadBuffer::resizeOwnMemoryIfNecessary(size_t bytes_to_append) bool need_update_pos = currentlyReadFromOwnMemory(); size_t offset = 0; if (need_update_checkpoint) - offset = *checkpoint - memory.data(); + { + char * memory_data = getMemoryData(); + offset = *checkpoint - memory_data; + } else if (need_update_pos) offset = this->offset(); size_t new_size = peeked_size + bytes_to_append; - if (memory.size() < new_size) + + if (use_stack_memory) + { + /// If stack memory is still enough, do nothing. + if (sizeof(stack_memory) >= new_size) + return; + + /// Stack memory is not enough, allocate larger buffer. + use_stack_memory = false; + memory.resize(std::max(size_t(DBMS_DEFAULT_BUFFER_SIZE), new_size)); + memcpy(memory.data(), stack_memory, sizeof(stack_memory)); + if (need_update_checkpoint) + checkpoint.emplace(memory.data() + offset); + if (need_update_pos) + BufferBase::set(memory.data(), peeked_size, pos - stack_memory); + } + else if (memory.size() < new_size) { if (bytes_to_append < offset && 2 * (peeked_size - offset) <= memory.size()) { @@ -273,10 +296,11 @@ void PeekableReadBuffer::makeContinuousMemoryFromCheckpointToPos() size_t bytes_to_append = pos - sub_buf.position(); resizeOwnMemoryIfNecessary(bytes_to_append); - memcpy(memory.data() + peeked_size, sub_buf.position(), bytes_to_append); + char * memory_data = getMemoryData(); + memcpy(memory_data + peeked_size, sub_buf.position(), bytes_to_append); sub_buf.position() = pos; peeked_size += bytes_to_append; - BufferBase::set(memory.data(), peeked_size, peeked_size); + BufferBase::set(memory_data, peeked_size, peeked_size); } PeekableReadBuffer::~PeekableReadBuffer() @@ -287,7 +311,7 @@ PeekableReadBuffer::~PeekableReadBuffer() bool PeekableReadBuffer::hasUnreadData() const { - return peeked_size && pos != 
memory.data() + peeked_size; + return peeked_size && pos != getMemoryData() + peeked_size; } } diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index a8eff09c4f2..f22987d9daa 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -20,7 +20,7 @@ class PeekableReadBuffer : public BufferWithOwnMemory { friend class PeekableReadBufferCheckpoint; public: - explicit PeekableReadBuffer(ReadBuffer & sub_buf_, bool use_existing_memory = false, size_t start_size_ = DBMS_DEFAULT_BUFFER_SIZE); + explicit PeekableReadBuffer(ReadBuffer & sub_buf_, size_t start_size_ = 0); ~PeekableReadBuffer() override; @@ -84,16 +84,21 @@ private: /// Updates all invalidated pointers and sizes. void resizeOwnMemoryIfNecessary(size_t bytes_to_append); + char * getMemoryData() { return use_stack_memory ? stack_memory : memory.data(); } + const char * getMemoryData() const { return use_stack_memory ? stack_memory : memory.data(); } + ReadBuffer & sub_buf; size_t peeked_size = 0; std::optional checkpoint = std::nullopt; bool checkpoint_in_own_memory = false; - /// Small amount of memory on stack to use in BufferWithOwnMemory on - /// it's creation to prevent unnecessary allocation if PeekableReadBuffer - /// is often created. - char existing_memory[16]; + /// To prevent expensive and in some cases unnecessary memory allocations on PeekableReadBuffer + /// creation (for example if PeekableReadBuffer is often created or if we need to remember small amount of + /// data after checkpoint), at the beginning we will use small amount of memory on stack and allocate + /// larger buffer only if reserved memory is not enough. 
+ char stack_memory[16]; + bool use_stack_memory = true; }; diff --git a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference index 06618cc63b1..a89bc46acfb 100644 --- a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference +++ b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.reference @@ -53,3 +53,24 @@ Some text \N Some text CustomNull Some text OK OK +Large custom NULL +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 +0000000000Custom NULL representation0000000000 diff --git a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh index 1d8e080c7b6..676e8cb867f 100755 --- a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh +++ b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh @@ -121,5 +121,12 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's Stri echo -e "Some text,NU,LL" > $DATA_FILE $CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'CSV', 's String, n Nullable(String)') settings max_read_buffer_size=13, format_csv_null_representation='NU,L', input_format_parallel_parsing=0" 2>&1 | grep -F -q "CANNOT_READ_ALL_DATA" && echo 'OK' || echo 'FAIL' + +echo 'Large custom NULL' + +$CLICKHOUSE_CLIENT -q "select '0000000000Custom NULL representation0000000000' FROM numbers(10)" > $DATA_FILE +$CLICKHOUSE_CLIENT -q "SELECT * FROM 
file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS max_read_buffer_size=5, input_format_parallel_parsing=0, format_tsv_null_representation='0000000000Custom NULL representation0000000000'" +$CLICKHOUSE_CLIENT -q "SELECT * FROM file('test_02103_null.data', 'TSV', 's Nullable(String)') SETTINGS max_read_buffer_size=5, input_format_parallel_parsing=0, format_tsv_null_representation='0000000000Custom NULL representation000000000'" + rm $DATA_FILE From 821300342268168f7721ecf3a327e91834491852 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 29 Oct 2021 16:51:57 +0300 Subject: [PATCH 09/11] Fix PVS check, mark tests as no-parallel --- .../Serializations/SerializationNullable.cpp | 40 +++++++++---------- ...2103_tsv_csv_custom_null_representation.sh | 1 + .../02104_json_strings_nullable_string.sh | 1 + 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 0dbb140af8b..7bf7b6e55b0 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -257,7 +257,7 @@ void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer template ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) + const SerializationPtr & nested_serialization) { const String & null_representation = settings.tsv.null_representation; @@ -265,9 +265,9 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. 
- return safeDeserialize(column, *nested, + return safeDeserialize(column, *nested_serialization, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextEscaped(nested_column, istr, settings); }); } /// Check if we have enough data in buffer to check if it's a null. @@ -281,11 +281,11 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R istr.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) { - nested->deserializeTextEscaped(nested_column, istr, settings); + nested_serialization->deserializeTextEscaped(nested_column, istr, settings); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } /// We don't have enough data in buffer to check if it's a null. @@ -303,10 +303,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); - nested->deserializeTextEscaped(nested_column, buf, settings); + nested_serialization->deserializeTextEscaped(nested_column, buf, settings); /// Check that we don't have any unread data in PeekableReadBuffer own memory. 
if (likely(!buf.hasUnreadData())) return; @@ -320,7 +320,7 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R "for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); WriteBufferFromOwnString parsed_value; - nested->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); + nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" @@ -328,7 +328,7 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R ErrorCodes::CANNOT_READ_ALL_DATA); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -413,15 +413,15 @@ void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & is template ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) + const SerializationPtr & nested_serialization) { const String & null_representation = settings.csv.null_representation; if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. 
- return safeDeserialize(column, *nested, + return safeDeserialize(column, *nested_serialization, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); }); } /// Check if we have enough data in buffer to check if it's a null. @@ -435,11 +435,11 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB istr.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) { - nested->deserializeTextCSV(nested_column, istr, settings); + nested_serialization->deserializeTextCSV(nested_column, istr, settings); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } /// We don't have enough data in buffer to check if it's a null. @@ -457,10 +457,10 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); - nested->deserializeTextCSV(nested_column, buf, settings); + nested_serialization->deserializeTextCSV(nested_column, buf, settings); /// Check that we don't have any unread data in PeekableReadBuffer own memory. 
if (likely(!buf.hasUnreadData())) return; @@ -475,7 +475,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB "for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); WriteBufferFromOwnString parsed_value; - nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); + nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" @@ -483,7 +483,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB ErrorCodes::CANNOT_READ_ALL_DATA); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh index 676e8cb867f..4162e046ca4 100755 --- a/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh +++ b/tests/queries/0_stateless/02103_tsv_csv_custom_null_representation.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02104_json_strings_nullable_string.sh b/tests/queries/0_stateless/02104_json_strings_nullable_string.sh index 5385c1282b2..6a5d369e7b6 100755 --- a/tests/queries/0_stateless/02104_json_strings_nullable_string.sh +++ 
b/tests/queries/0_stateless/02104_json_strings_nullable_string.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 3b12c54bf96cb2596183f6b4c45e45abafaf2236 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 2 Nov 2021 15:05:59 +0300 Subject: [PATCH 10/11] Fix PVS check --- .../Serializations/SerializationNullable.cpp | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index acec3790196..9ae78403d11 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -285,7 +285,7 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R template ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) + const SerializationPtr & nested_serialization) { const String & null_representation = settings.tsv.null_representation; @@ -293,14 +293,14 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. 
- return safeDeserialize(column, *nested, + return safeDeserialize(column, *nested_serialization, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { if constexpr (escaped) - nested->deserializeTextEscaped(nested_column, istr, settings); + nested_serialization->deserializeTextEscaped(nested_column, istr, settings); else - nested->deserializeTextRaw(nested_column, istr, settings); + nested_serialization->deserializeTextRaw(nested_column, istr, settings); }); } @@ -315,14 +315,14 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col istr.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) { if constexpr (escaped) - nested->deserializeTextEscaped(nested_column, istr, settings); + nested_serialization->deserializeTextEscaped(nested_column, istr, settings); else - nested->deserializeTextRaw(nested_column, istr, settings); + nested_serialization->deserializeTextRaw(nested_column, istr, settings); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } /// We don't have enough data in buffer to check if it's a null. 
@@ -340,13 +340,13 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); if constexpr (escaped) - nested->deserializeTextEscaped(nested_column, buf, settings); + nested_serialization->deserializeTextEscaped(nested_column, buf, settings); else - nested->deserializeTextRaw(nested_column, buf, settings); + nested_serialization->deserializeTextRaw(nested_column, buf, settings); /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) return; @@ -361,9 +361,9 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col WriteBufferFromOwnString parsed_value; if constexpr (escaped) - nested->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); + nested_serialization->serializeTextEscaped(nested_column, nested_column.size() - 1, parsed_value, settings); else - nested->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings); + nested_serialization->serializeTextRaw(nested_column, nested_column.size() - 1, parsed_value, settings); throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" @@ -371,7 +371,7 @@ ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & col ErrorCodes::CANNOT_READ_ALL_DATA); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, 
deserialize_nested); } void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -456,15 +456,15 @@ void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & is template ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, - const SerializationPtr & nested) + const SerializationPtr & nested_serialization) { const String & null_representation = settings.csv.null_representation; if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0])) { /// This is not null, surely. - return safeDeserialize(column, *nested, + return safeDeserialize(column, *nested_serialization, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); } /// Check if we have enough data in buffer to check if it's a null. @@ -478,11 +478,11 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB istr.position() = pos; return false; }; - auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column) { - nested->deserializeTextCSV(nested_column, istr, settings); + nested_serialization->deserializeTextCSV(nested_column, istr, settings); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } /// We don't have enough data in buffer to check if it's a null. 
@@ -500,10 +500,10 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB return false; }; - auto deserialize_nested = [&nested, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) + auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column) { auto * pos = buf.position(); - nested->deserializeTextCSV(nested_column, buf, settings); + nested_serialization->deserializeTextCSV(nested_column, buf, settings); /// Check that we don't have any unread data in PeekableReadBuffer own memory. if (likely(!buf.hasUnreadData())) return; @@ -518,7 +518,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB "for large input.", ErrorCodes::CANNOT_READ_ALL_DATA); WriteBufferFromOwnString parsed_value; - nested->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); + nested_serialization->serializeTextCSV(nested_column, nested_column.size() - 1, parsed_value, settings); throw DB::ParsingException("Error while parsing \"" + std::string(pos, buf.buffer().end()) + std::string(istr.position(), std::min(size_t(10), istr.available())) + "\" as Nullable" + " at position " + std::to_string(istr.count()) + ": got \"" + std::string(pos, buf.position() - pos) + "\", which was deserialized as \"" @@ -526,7 +526,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB ErrorCodes::CANNOT_READ_ALL_DATA); }; - return safeDeserialize(column, *nested, check_for_null, deserialize_nested); + return safeDeserialize(column, *nested_serialization, check_for_null, deserialize_nested); } void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const From 1d2f197e70ea1cb1249926b15ac0fcb0ce478cc3 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 2 Nov 2021 16:09:20 +0300 Subject: [PATCH 
11/11] Fix build --- src/DataTypes/Serializations/SerializationNullable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 9ae78403d11..5e2b31ebb9d 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -464,7 +464,7 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB /// This is not null, surely. return safeDeserialize(column, *nested_serialization, [] { return false; }, - [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextCSV(nested_column, istr, settings); }); + [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); }); } /// Check if we have enough data in buffer to check if it's a null.