From 324dfd4f814e246b544e8d79fe1b79c9cf152446 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 14 Oct 2021 13:32:49 +0300 Subject: [PATCH] Refactor and improve TSV, CSV and JSONCompactEachRow formats, fix some bugs in formats --- src/Core/Block.cpp | 11 + src/Core/Block.h | 1 + src/Core/Settings.h | 4 +- src/DataTypes/DataTypeLowCardinality.h | 1 + src/DataTypes/IDataType.h | 3 + .../Serializations/ISerialization.cpp | 15 + src/DataTypes/Serializations/ISerialization.h | 8 + .../SerializationFixedString.cpp | 2 +- .../Serializations/SerializationNullable.cpp | 48 +- .../Serializations/SerializationNullable.h | 7 + .../Serializations/SerializationString.cpp | 2 +- src/Formats/FormatFactory.cpp | 5 +- src/Formats/FormatSettings.h | 2 + src/Formats/JSONEachRowUtils.cpp | 55 +- src/Formats/JSONEachRowUtils.h | 8 +- src/Formats/registerFormats.cpp | 2 + src/Processors/Formats/IInputFormat.h | 13 +- src/Processors/Formats/IRowOutputFormat.cpp | 7 + src/Processors/Formats/IRowOutputFormat.h | 3 + .../Formats/Impl/CSVRowInputFormat.cpp | 411 ++++---------- .../Formats/Impl/CSVRowInputFormat.h | 36 +- .../Formats/Impl/CSVRowOutputFormat.cpp | 50 +- .../Formats/Impl/CSVRowOutputFormat.h | 6 +- .../Impl/JSONAsStringRowInputFormat.cpp | 2 +- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 386 ++++++-------- .../Impl/JSONCompactEachRowRowInputFormat.h | 48 +- .../JSONCompactEachRowRowOutputFormat.cpp | 105 ++-- .../Impl/JSONCompactEachRowRowOutputFormat.h | 9 +- .../Impl/JSONEachRowRowInputFormat.cpp | 48 +- .../Formats/Impl/RegexpRowInputFormat.cpp | 6 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 2 +- .../Formats/Impl/TSKVRowOutputFormat.cpp | 2 +- .../Impl/TabSeparatedRawRowInputFormat.h | 58 -- .../Impl/TabSeparatedRawRowOutputFormat.h | 35 -- .../Impl/TabSeparatedRowInputFormat.cpp | 501 ++++++------------ .../Formats/Impl/TabSeparatedRowInputFormat.h | 41 +- .../Impl/TabSeparatedRowOutputFormat.cpp | 98 ++-- .../Impl/TabSeparatedRowOutputFormat.h | 8 +- .../Impl/TemplateBlockOutputFormat.cpp | 2 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 16 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 4 +- .../RowInputFormatWithNamesAndTypes.cpp | 265 +++++++++ .../Formats/RowInputFormatWithNamesAndTypes.h | 73 +++ tests/queries/0_stateless/00300_csv.reference | 4 + tests/queries/0_stateless/00300_csv.sql | 3 +- tests/queries/0_stateless/00301_csv.sh | 4 +- .../00938_template_input_format.reference | 8 + .../00938_template_input_format.sh | 24 + .../01034_JSONCompactEachRow.reference | 21 + .../0_stateless/01034_JSONCompactEachRow.sql | 15 +- .../01195_formats_diagnostic_info.reference | 13 +- .../01195_formats_diagnostic_info.sh | 16 + ...output_format_tsv_csv_with_names.reference | 14 + .../01375_output_format_tsv_csv_with_names.sh | 9 + ..._tsv_csv_with_names_write_prefix.reference | 25 + ...ge_file_tsv_csv_with_names_write_prefix.sh | 12 +- ...48_json_compact_strings_each_row.reference | 21 + .../01448_json_compact_strings_each_row.sql | 15 +- ...097_json_strings_deserialization.reference | 4 + .../02097_json_strings_deserialization.sh | 14 + .../02098_with_types_use_header.reference | 16 + .../02098_with_types_use_header.sh | 33 ++ .../02099_tsv_raw_format.reference | 113 ++++ .../0_stateless/02099_tsv_raw_format.sh | 59 +++ ...ardinality_nullable_null_default.reference | 14 + ...0_low_cardinality_nullable_null_default.sh | 21 + ...ty_as_default_and_omitted_fields.reference | 16 + ...101_empty_as_default_and_omitted_fields.sh | 39 ++ ...llel_formatting_json_and_friends.reference | 28 +- 
...59_parallel_formatting_json_and_friends.sh | 6 +- ...lel_parsing_with_names_and_types.reference | 20 + ...7_parallel_parsing_with_names_and_types.sh | 32 ++ 72 files changed, 1743 insertions(+), 1285 deletions(-) delete mode 100644 src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h delete mode 100644 src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h create mode 100644 src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp create mode 100644 src/Processors/Formats/RowInputFormatWithNamesAndTypes.h create mode 100644 tests/queries/0_stateless/02097_json_strings_deserialization.reference create mode 100755 tests/queries/0_stateless/02097_json_strings_deserialization.sh create mode 100644 tests/queries/0_stateless/02098_with_types_use_header.reference create mode 100755 tests/queries/0_stateless/02098_with_types_use_header.sh create mode 100644 tests/queries/0_stateless/02099_tsv_raw_format.reference create mode 100755 tests/queries/0_stateless/02099_tsv_raw_format.sh create mode 100644 tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference create mode 100755 tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh create mode 100644 tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference create mode 100755 tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh create mode 100644 tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference create mode 100755 tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index a59ac60155e..43c3fa9a54a 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -582,6 +582,17 @@ DataTypes Block::getDataTypes() const return res; } +Names Block::getDataTypeNames() const +{ + Names res; + res.reserve(columns()); + + for (const auto & elem : data) + res.push_back(elem.type->getName()); + + return res; +} + bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs) { diff --git a/src/Core/Block.h b/src/Core/Block.h index a7e3cee194b..973b0028219 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -90,6 +90,7 @@ public: NamesAndTypesList getNamesAndTypesList() const; Names getNames() const; DataTypes getDataTypes() const; + Names getDataTypeNames() const; /// Returns number of rows from first column in block, not equal to nullptr. If no columns, returns 0. size_t rows() const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a5767955045..9d1f8163dfe 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -557,8 +557,10 @@ class IColumn; M(Bool, input_format_csv_arrays_as_nested_csv, false, R"(When reading Array from CSV, expect that its elements were serialized in nested CSV and then put into string. Example: "[""Hello"", ""world"", ""42"""" TV""]". 
Braces around array can be omitted.)", 0) \ M(Bool, input_format_skip_unknown_fields, false, "Skip columns with unknown names from input data (it works for JSONEachRow, CSVWithNames, TSVWithNames and TSKV formats).", 0) \ M(Bool, input_format_with_names_use_header, true, "For TSVWithNames and CSVWithNames input formats this controls whether format parser is to assume that column data appear in the input exactly as they are specified in the header.", 0) \ + M(Bool, input_format_with_types_use_header, true, "For TSVWithNamesTypes and CSVWithNamesAndTypes input formats this controls whether format parser should check if data types from the input match data types from the header.", 0) \ M(Bool, input_format_import_nested_json, false, "Map nested JSON data to nested tables (it works for JSONEachRow format).", 0) \ - M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, CSV and TSV formats).", IMPORTANT) \ + M(Bool, input_format_defaults_for_omitted_fields, true, "For input data calculate default expressions for omitted fields (it works for JSONEachRow, JSONCompactEachRow, CSV and TSV formats).", IMPORTANT) \ + M(Bool, input_format_csv_empty_as_default, true, "Treat empty fields in CSV input as default values.", 0) \ M(Bool, input_format_tsv_empty_as_default, false, "Treat empty fields in TSV input as default values.", 0) \ M(Bool, input_format_tsv_enum_as_number, false, "Treat inserted enum values in TSV formats as enum indices \\N", 0) \ M(Bool, input_format_null_as_default, true, "For text input formats initialize null fields with default values if data type of this field is not nullable", 0) \ diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 1266174c6d6..7f4286046d9 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -51,6 +51,7 @@ public: bool isNullable() const override { return false; } bool onlyNull() const override { return false; } bool lowCardinality() const override { return true; } + bool isLowCardinalityNullable() const override { return dictionary_type->isNullable(); } static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 360bf9f16e0..6b156336f99 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -270,6 +270,9 @@ public: virtual bool lowCardinality() const { return false; } + /// Checks if this type is LowCardinality(Nullable(...)) + virtual bool isLowCardinalityNullable() const { return false; } + /// Strings, Numbers, Date, DateTime, Nullable virtual bool canBeInsideLowCardinality() const { return false; } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 7077c5bfa14..e7ee8f56ecb 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -195,4 +196,18 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } +void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + /// Read until \t or \n. 
+ readString(field, istr); + ReadBufferFromString buf(field); + deserializeWholeText(column, buf, settings); +} + +void ISerialization::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + serializeText(column, row_num, ostr, settings); +} + } diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index f1d82a2000a..4fda939aeca 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -246,6 +246,14 @@ public: serializeText(column, row_num, ostr, settings); } + /** Text deserialization without escaping and quoting. Reads all data until first \n or \t + * into a temporary string and then call deserializeWholeText. It was implemented this way + * because this function is rarely used and because proper implementation requires a lot of + * additional code in data types serialization and ReadHelpers. + */ + virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; + static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); static String getFileNameForStream(const String & name_in_storage, const SubstreamPath & path); static String getSubcolumnNameForStream(const SubstreamPath & path); diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index 5c63631e2a3..972313a564f 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -163,7 +163,7 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringInto(data, istr); }); + read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); } diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index b607d5871d6..865e4250bdc 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -218,8 +219,36 @@ void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer deserializeTextEscapedImpl(column, istr, settings, nested); } +void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextRawImpl(column, istr, settings, nested); +} + +void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnNullable & col = assert_cast(column); + + if (col.isNullAt(row_num)) + writeString(settings.tsv.null_representation, ostr); + else + nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings); +} + +template +ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); +} + 
template ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, + const SerializationPtr & nested) +{ + return deserializeTextEscapedAndRawImpl(column, istr, settings, nested); +} + +template +ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested) { /// Little tricky, because we cannot discriminate null from first character. @@ -229,7 +258,13 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R /// This is not null, surely. return safeDeserialize(column, *nested, [] { return false; }, - [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextEscaped(nested_column, istr, settings); }); + [&nested, &istr, &settings] (IColumn & nested_column) + { + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, istr, settings); + else + nested->deserializeTextRaw(nested_column, istr, settings); + }); } else { @@ -255,7 +290,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R { /// We could step back to consume backslash again. --istr.position(); - nested->deserializeTextEscaped(nested_column, istr, settings); + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, istr, settings); + else + nested->deserializeTextRaw(nested_column, istr, settings); } else { @@ -263,7 +301,10 @@ ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, R ReadBufferFromMemory prefix("\\", 1); ConcatReadBuffer prepended_istr(prefix, istr); - nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + if constexpr (escaped) + nested->deserializeTextEscaped(nested_column, prepended_istr, settings); + else + nested->deserializeTextRaw(nested_column, prepended_istr, settings); /// Synchronise cursor position in original buffer. 
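Note on the hunks above: the escaped (TSV) and raw (TSVRaw) code paths for Nullable columns are folded into a single deserializeTextEscapedAndRawImpl template, and the nested call is chosen with `if constexpr` on a bool template parameter, so the `\N`/backslash disambiguation logic is written only once. A minimal standalone sketch of that dispatch pattern, with placeholder readers standing in for the real ISerialization calls (names here are illustrative, not from the patch):

    #include <iostream>
    #include <string>

    // Placeholders standing in for ISerialization::deserializeTextEscaped / deserializeTextRaw.
    static void readEscaped(const std::string & field) { std::cout << "escaped: " << field << '\n'; }
    static void readRaw(const std::string & field)     { std::cout << "raw: "     << field << '\n'; }

    // One template serves both formats; the branch is resolved at compile time,
    // so the surrounding NULL-detection logic only has to be written once.
    template <bool escaped>
    void deserializeField(const std::string & field)
    {
        if constexpr (escaped)
            readEscaped(field);   // TSV: backslash escapes are interpreted
        else
            readRaw(field);       // TSVRaw: the bytes are taken verbatim
    }

    int main()
    {
        deserializeField<true>("a\\tb");   // escaped path
        deserializeField<false>("a\\tb");  // raw path
    }
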
@@ -469,5 +510,6 @@ template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +template bool SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index b0b96c021d3..acc3456e1fd 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -68,6 +68,9 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; + void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; + /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false) /// If ReturnType is void, deserialize Nullable(T) template @@ -80,6 +83,10 @@ public: static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); template static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); + template + static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); + template + static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); }; } diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index c3c24ed6749..89f7fe4ad9d 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -245,7 +245,7 @@ static inline void read(IColumn & column, Reader && reader) void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index d3ff5cbf8a7..4b9c2fbf78d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -15,7 +15,6 @@ #include #include -#include namespace DB { @@ -57,7 +56,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; 
format_settings.csv.delimiter = settings.format_csv_delimiter; - format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields; + format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default; format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number; format_settings.csv.null_representation = settings.output_format_csv_null_representation; format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null; @@ -108,10 +107,12 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions; format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions; format_settings.with_names_use_header = settings.input_format_with_names_use_header; + format_settings.with_types_use_header = settings.input_format_with_types_use_header; format_settings.write_statistics = settings.output_format_write_statistics; format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary; format_settings.arrow.import_nested = settings.input_format_arrow_import_nested; format_settings.orc.import_nested = settings.input_format_orc_import_nested; + format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context if (format_settings.schema.is_server) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 8c894c77e82..0aa34a1aa36 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -25,10 +25,12 @@ struct FormatSettings bool skip_unknown_fields = false; bool with_names_use_header = false; + bool with_types_use_header = false; bool write_statistics = true; bool import_nested_json = false; bool null_as_default = true; bool decimal_trailing_zeros = false; + bool defaults_for_omitted_fields = true; enum class DateTimeInputFormat { diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index b918825df79..b1eca1c6932 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -1,4 +1,7 @@ #include +#include +#include + #include namespace DB @@ -9,7 +12,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +template +static std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { skipWhitespaceIfAny(in); @@ -49,19 +53,19 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D } else { - pos = find_first_symbols<'{', '}', '\\', '"'>(pos, in.buffer().end()); + pos = find_first_symbols(pos, in.buffer().end()); if (pos > in.buffer().end()) throw Exception("Position in buffer is out of bounds. 
There must be a bug.", ErrorCodes::LOGICAL_ERROR); else if (pos == in.buffer().end()) continue; - else if (*pos == '{') + else if (*pos == opening_bracket) { ++balance; ++pos; } - else if (*pos == '}') + else if (*pos == closing_bracket) { --balance; ++pos; @@ -87,6 +91,16 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D return {loadAtPosition(in, memory, pos), number_of_rows}; } +std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +{ + return fileSegmentationEngineJSONEachRowImpl<'{', '}'>(in, memory, min_chunk_size); +} + +std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) +{ + return fileSegmentationEngineJSONEachRowImpl<'[', ']'>(in, memory, min_chunk_size); +} + bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) { /// For JSONEachRow we can safely skip whitespace characters @@ -94,4 +108,37 @@ bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf) return buf.eof() || *buf.position() == '['; } +bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings) +{ + try + { + bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); + + if (yield_strings) + { + String str; + readJSONString(str, in); + + ReadBufferFromString buf(str); + + if (as_nullable) + return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + + serialization->deserializeWholeText(column, buf, format_settings); + return true; + } + + if (as_nullable) + return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + + serialization->deserializeTextJSON(column, in, format_settings); + return true; + } + catch (Exception & e) + { + e.addMessage("(while reading the value of key " + column_name + ")"); + throw; + } +} + } diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 79dd6c6c192..7954da7a6c4 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -1,10 +1,16 @@ #pragma once +#include +#include + namespace DB { -std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); +std::pair fileSegmentationEngineJSONEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); +std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size); bool nonTrivialPrefixAndSuffixCheckerJSONEachRowImpl(ReadBuffer & buf); +bool readFieldImpl(ReadBuffer & in, IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, const String & column_name, const FormatSettings & format_settings, bool yield_strings); + } diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 3e4c0366e8a..6faddd3c63f 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -15,6 +15,7 @@ void registerFileSegmentationEngineCSV(FormatFactory & factory); void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory); void registerFileSegmentationEngineRegexp(FormatFactory & factory); void registerFileSegmentationEngineJSONAsString(FormatFactory & factory); +void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory); /// Formats for both 
input/output. @@ -88,6 +89,7 @@ void registerFormats() registerFileSegmentationEngineJSONEachRow(factory); registerFileSegmentationEngineRegexp(factory); registerFileSegmentationEngineJSONAsString(factory); + registerFileSegmentationEngineJSONCompactEachRow(factory); registerInputFormatNative(factory); registerOutputFormatNative(factory); diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index f133161c3ec..ff58a614966 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -16,16 +16,11 @@ struct ColumnMapping using OptionalIndexes = std::vector>; OptionalIndexes column_indexes_for_input_fields; - /// Tracks which columns we have read in a single read() call. - /// For columns that are never read, it is initialized to false when we - /// read the file header, and never changed afterwards. - /// For other columns, it is updated on each read() call. - std::vector read_columns; + /// The list of column indexes that are not presented in input data. + std::vector not_presented_columns; - - /// Whether we have any columns that are not read from file at all, - /// and must be always initialized with defaults. - bool have_always_default_columns{false}; + /// The list of column names in input data. Needed for better exception messages. + std::vector names_of_columns; }; using ColumnMappingPtr = std::shared_ptr; diff --git a/src/Processors/Formats/IRowOutputFormat.cpp b/src/Processors/Formats/IRowOutputFormat.cpp index 6b7a9a46eaa..ad111bdc66a 100644 --- a/src/Processors/Formats/IRowOutputFormat.cpp +++ b/src/Processors/Formats/IRowOutputFormat.cpp @@ -113,4 +113,11 @@ void IRowOutputFormat::writeTotals(const DB::Columns & columns, size_t row_num) write(columns, row_num); } +void registerOutputFormatWithNamesAndTypes(const String & base_format_name, RegisterOutputFormatWithNamesAndTypes register_func) +{ + register_func(base_format_name, false, false); + register_func(base_format_name + "WithNames", true, false); + register_func(base_format_name + "WithNamesAndTypes", true, true); +} + } diff --git a/src/Processors/Formats/IRowOutputFormat.h b/src/Processors/Formats/IRowOutputFormat.h index c35d93b6133..50c70a527bf 100644 --- a/src/Processors/Formats/IRowOutputFormat.h +++ b/src/Processors/Formats/IRowOutputFormat.h @@ -87,4 +87,7 @@ private: }; +using RegisterOutputFormatWithNamesAndTypes = std::function; +void registerOutputFormatWithNamesAndTypes(const String & base_format_name, RegisterOutputFormatWithNamesAndTypes register_func); + } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 8ccc04faf35..824c33858d6 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -19,63 +19,24 @@ namespace ErrorCodes } -CSVRowInputFormat::CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_) - , with_names(with_names_) - , format_settings(format_settings_) +CSVRowInputFormat::CSVRowInputFormat( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_) { - const String bad_delimiters = " \t\"'.UL"; if (bad_delimiters.find(format_settings.csv.delimiter) != 
String::npos) - throw Exception(String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter + - "'. Try use CustomSeparated format instead.", ErrorCodes::BAD_ARGUMENTS); - - const auto & sample = getPort().getHeader(); - size_t num_columns = sample.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = sample.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } -} - - -/// Map an input file column to a table column, based on its name. -void CSVRowInputFormat::addInputColumn(const String & column_name) -{ - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_mapping->column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - throw Exception( - "Unknown field found in CSV header: '" + column_name + "' " + - "at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (column_mapping->read_columns[column_index]) - throw Exception("Duplicate field found while parsing CSV header: " + column_name, ErrorCodes::INCORRECT_DATA); - - column_mapping->read_columns[column_index] = true; - column_mapping->column_indexes_for_input_fields.emplace_back(column_index); + String("CSV format may not work correctly with delimiter '") + format_settings.csv.delimiter + + "'. Try use CustomSeparated format instead.", + ErrorCodes::BAD_ARGUMENTS); } + static void skipEndOfLine(ReadBuffer & in) { /// \n (Unix) or \r\n (DOS/Windows) or \n\r (Mac OS Classic) @@ -99,29 +60,6 @@ static void skipEndOfLine(ReadBuffer & in) throw Exception("Expected end of line", ErrorCodes::INCORRECT_DATA); } - -static void skipDelimiter(ReadBuffer & in, const char delimiter, bool is_last_column) -{ - if (is_last_column) - { - if (in.eof()) - return; - - /// we support the extra delimiter at the end of the line - if (*in.position() == delimiter) - { - ++in.position(); - if (in.eof()) - return; - } - - skipEndOfLine(in); - } - else - assertChar(delimiter, in); -} - - /// Skip `whitespace` symbols allowed in CSV. 
static inline void skipWhitespacesAndTabs(ReadBuffer & in) { @@ -131,255 +69,138 @@ static inline void skipWhitespacesAndTabs(ReadBuffer & in) ++in.position(); } - -static void skipRow(ReadBuffer & in, const FormatSettings::CSV & settings, size_t num_columns) +void CSVRowInputFormat::skipFieldDelimiter() { - String tmp; - for (size_t i = 0; i < num_columns; ++i) - { - skipWhitespacesAndTabs(in); - readCSVString(tmp, in, settings); - skipWhitespacesAndTabs(in); - - skipDelimiter(in, settings.delimiter, i + 1 == num_columns); - } + skipWhitespacesAndTabs(*in); + assertChar(format_settings.csv.delimiter, *in); } -void CSVRowInputFormat::setupAllColumnsByTableSchema() +String CSVRowInputFormat::readFieldIntoString() { - const auto & header = getPort().getHeader(); - column_mapping->read_columns.assign(header.columns(), true); - column_mapping->column_indexes_for_input_fields.resize(header.columns()); - - for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i) - column_mapping->column_indexes_for_input_fields[i] = i; + skipWhitespacesAndTabs(*in); + String field; + readCSVString(field, *in, format_settings.csv); + return field; } - -void CSVRowInputFormat::readPrefix() +void CSVRowInputFormat::skipField() { - /// In this format, we assume, that if first string field contain BOM as value, it will be written in quotes, - /// so BOM at beginning of stream cannot be confused with BOM in first string value, and it is safe to skip it. - skipBOMIfExists(*in); - - size_t num_columns = data_types.size(); - const auto & header = getPort().getHeader(); - - /// This is a bit of abstraction leakage, but we have almost the same code in other places. - /// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing. - if (with_names && getCurrentUnitNumber() == 0) - { - /// This CSV file has a header row with column names. Depending on the - /// settings, use it or skip it. - if (format_settings.with_names_use_header) - { - /// Look at the file header to see which columns we have there. - /// The missing columns are filled with defaults. - column_mapping->read_columns.assign(header.columns(), false); - do - { - String column_name; - skipWhitespacesAndTabs(*in); - readCSVString(column_name, *in, format_settings.csv); - skipWhitespacesAndTabs(*in); - - addInputColumn(column_name); - } - while (checkChar(format_settings.csv.delimiter, *in)); - - skipDelimiter(*in, format_settings.csv.delimiter, true); - - for (auto read_column : column_mapping->read_columns) - { - if (!read_column) - { - column_mapping->have_always_default_columns = true; - break; - } - } - - return; - } - else - { - skipRow(*in, format_settings.csv, num_columns); - setupAllColumnsByTableSchema(); - } - } - else if (!column_mapping->is_set) - setupAllColumnsByTableSchema(); + readFieldIntoString(); } - -bool CSVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext) +void CSVRowInputFormat::skipRowEndDelimiter() { + skipWhitespacesAndTabs(*in); + if (in->eof()) - return false; + return; - updateDiagnosticInfo(); + /// we support the extra delimiter at the end of the line + if (*in->position() == format_settings.csv.delimiter) + ++in->position(); - /// Track whether we have to fill any columns in this row with default - /// values. If not, we return an empty column mask to the caller, so that - /// it doesn't have to check it. 
- bool have_default_columns = column_mapping->have_always_default_columns; + skipWhitespacesAndTabs(*in); + if (in->eof()) + return; - ext.read_columns.assign(column_mapping->read_columns.size(), true); - const auto delimiter = format_settings.csv.delimiter; - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) - { - const auto & table_column = column_mapping->column_indexes_for_input_fields[file_column]; - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - - if (table_column) - { - skipWhitespacesAndTabs(*in); - ext.read_columns[*table_column] = readField(*columns[*table_column], data_types[*table_column], - serializations[*table_column], is_last_file_column); - - if (!ext.read_columns[*table_column]) - have_default_columns = true; - skipWhitespacesAndTabs(*in); - } - else - { - /// We never read this column from the file, just skip it. - String tmp; - readCSVString(tmp, *in, format_settings.csv); - } - - skipDelimiter(*in, delimiter, is_last_file_column); - } - - if (have_default_columns) - { - for (size_t i = 0; i < column_mapping->read_columns.size(); i++) - { - if (!column_mapping->read_columns[i]) - { - /// The column value for this row is going to be overwritten - /// with default by the caller, but the general assumption is - /// that the column size increases for each row, so we have - /// to insert something. Since we do not care about the exact - /// value, we do not have to use the default value specified by - /// the data type, and can just use IColumn::insertDefault(). - columns[i]->insertDefault(); - ext.read_columns[i] = false; - } - } - } - - return true; + skipEndOfLine(*in); } -bool CSVRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +void CSVRowInputFormat::skipRow() +{ + do + { + skipField(); + skipWhitespacesAndTabs(*in); + } + while (checkChar(format_settings.csv.delimiter, *in)); + + skipRowEndDelimiter(); +} + +std::vector CSVRowInputFormat::readHeaderRow() +{ + std::vector fields; + do + { + fields.push_back(readFieldIntoString()); + skipWhitespacesAndTabs(*in); + } + while (checkChar(format_settings.csv.delimiter, *in)); + + skipRowEndDelimiter(); + return fields; +} + +bool CSVRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { const char delimiter = format_settings.csv.delimiter; - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + try { - if (file_column == 0 && in->eof()) - { - out << "\n"; - return false; - } - skipWhitespacesAndTabs(*in); - if (column_mapping->column_indexes_for_input_fields[file_column].has_value()) + assertChar(delimiter, *in); + } + catch (const DB::Exception &) + { + if (*in->position() == '\n' || *in->position() == '\r') { - const auto & header = getPort().getHeader(); - size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value(); - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], - out, file_column)) - return false; + out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." 
+ " It's like your file has less columns than expected.\n" + "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; } else { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) - return false; - } - skipWhitespacesAndTabs(*in); - - /// Delimiters - if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size()) - { - if (in->eof()) - return false; - - /// we support the extra delimiter at the end of the line - if (*in->position() == delimiter) - { - ++in->position(); - if (in->eof()) - break; - } - - if (!in->eof() && *in->position() != '\n' && *in->position() != '\r') - { - out << "ERROR: There is no line feed. "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n" - " It's like your file has more columns than expected.\n" - "And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n"; - - return false; - } - - skipEndOfLine(*in); - } - else - { - try - { - assertChar(delimiter, *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\n' || *in->position() == '\r') - { - out << "ERROR: Line feed found where delimiter (" << delimiter << ") is expected." - " It's like your file has less columns than expected.\n" - "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; - } - else - { - out << "ERROR: There is no delimiter (" << delimiter << "). "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } + out << "ERROR: There is no delimiter (" << delimiter << "). "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; } + return false; } return true; } +bool CSVRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +{ + skipWhitespacesAndTabs(*in); + + if (in->eof()) + return true; + + /// we support the extra delimiter at the end of the line + if (*in->position() == format_settings.csv.delimiter) + { + ++in->position(); + skipWhitespacesAndTabs(*in); + if (in->eof()) + return true; + } + + if (!in->eof() && *in->position() != '\n' && *in->position() != '\r') + { + out << "ERROR: There is no line feed. 
"; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n" + " It's like your file has more columns than expected.\n" + "And if your file has the right number of columns, maybe it has an unquoted string value with a comma.\n"; + + return false; + } + + skipEndOfLine(*in); + return true; +} void CSVRowInputFormat::syncAfterError() { skipToNextLineOrEOF(*in); } -void CSVRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & /*column_name*/) { - const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; - if (index) - { - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column); - } - else - { - String tmp; - readCSVString(tmp, *in, format_settings.csv); - } -} + skipWhitespacesAndTabs(*in); -bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column) -{ const bool at_delimiter = !in->eof() && *in->position() == format_settings.csv.delimiter; const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n' || *in->position() == '\r'); @@ -398,7 +219,7 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co column.insertDefault(); return false; } - else if (format_settings.null_as_default && !type->isNullable()) + else if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) { /// If value is null but type is not nullable then use default value instead. return SerializationNullable::deserializeTextCSVImpl(column, *in, format_settings, serialization); @@ -411,28 +232,23 @@ bool CSVRowInputFormat::readField(IColumn & column, const DataTypePtr & type, co } } -void CSVRowInputFormat::resetParser() -{ - RowInputFormatWithDiagnosticInfo::resetParser(); - column_mapping->column_indexes_for_input_fields.clear(); - column_mapping->read_columns.clear(); - column_mapping->have_always_default_columns = false; -} void registerInputFormatCSV(FormatFactory & factory) { - for (bool with_names : {false, true}) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerInputFormat(with_names ? 
"CSVWithNames" : "CSV", [=]( + factory.registerInputFormat(format_name, [with_names, with_types]( ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, const FormatSettings & settings) { - return std::make_shared(sample, buf, params, with_names, settings); + return std::make_shared(sample, buf, std::move(params), with_names, with_types, settings); }); - } + }; + + registerInputFormatWithNamesAndTypes("CSV", register_func); } static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) @@ -501,8 +317,7 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB void registerFileSegmentationEngineCSV(FormatFactory & factory) { - factory.registerFileSegmentationEngine("CSV", &fileSegmentationEngineCSVImpl); - factory.registerFileSegmentationEngine("CSVWithNames", &fileSegmentationEngineCSVImpl); + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "CSV", &fileSegmentationEngineCSVImpl); } } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.h b/src/Processors/Formats/Impl/CSVRowInputFormat.h index b6075745b39..2e036fa2318 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include @@ -14,41 +14,39 @@ namespace DB /** A stream for inputting data in csv format. * Does not conform with https://tools.ietf.org/html/rfc4180 because it skips spaces and tabs between values. */ -class CSVRowInputFormat : public RowInputFormatWithDiagnosticInfo +class CSVRowInputFormat : public RowInputFormatWithNamesAndTypes { public: /** with_names - in the first line the header with column names + * with_types - on the next line header with type names */ CSVRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, const FormatSettings & format_settings_); + bool with_names_, bool with_types_, const FormatSettings & format_settings_); String getName() const override { return "CSVRowInputFormat"; } - bool readRow(MutableColumns & columns, RowReadExtension & ext) override; - void readPrefix() override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - void resetParser() override; private: - /// There fields are computed in constructor. 
- bool with_names; - const FormatSettings format_settings; - DataTypes data_types; - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; - - void addInputColumn(const String & column_name); - - void setupAllColumnsByTableSchema(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; + bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; + bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\r' && *pos != format_settings.csv.delimiter && *pos != ' ' && *pos != '\t'; } - bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column); + bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; + + void skipField(const String & /*column_name*/) override { skipField(); } + void skipField(); + + void skipRow() override; + void skipFieldDelimiter() override; + void skipRowEndDelimiter() override; + + Names readHeaderRow() override; + String readFieldIntoString(); }; } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp index 14d0e519c0c..9fba7ba3627 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.cpp @@ -8,8 +8,8 @@ namespace DB { -CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_, params_), with_names(with_names_), format_settings(format_settings_) +CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) + : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) { const auto & sample = getPort(PortKind::Main).getHeader(); size_t columns = sample.columns(); @@ -18,25 +18,27 @@ CSVRowOutputFormat::CSVRowOutputFormat(WriteBuffer & out_, const Block & header_ data_types[i] = sample.safeGetByPosition(i).type; } +void CSVRowOutputFormat::writeLine(const std::vector & values) +{ + for (size_t i = 0; i < values.size(); ++i) + { + writeCSVString(values[i], out); + if (i + 1 == values.size()) + writeRowEndDelimiter(); + else + writeFieldDelimiter(); + } +} void CSVRowOutputFormat::doWritePrefix() { const auto & sample = getPort(PortKind::Main).getHeader(); - size_t columns = sample.columns(); if (with_names) - { - for (size_t i = 0; i < columns; ++i) - { - writeCSVString(sample.safeGetByPosition(i).name, out); + writeLine(sample.getNames()); - char delimiter = format_settings.csv.delimiter; - if (i + 1 == columns) - delimiter = '\n'; - - writeChar(delimiter, out); - } - } + if (with_types) + writeLine(sample.getDataTypeNames()); } @@ -72,18 +74,20 @@ void CSVRowOutputFormat::writeBeforeExtremes() void registerOutputFormatCSV(FormatFactory & factory) { - for (bool with_names : {false, true}) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - factory.registerOutputFormat(with_names ? 
"CSVWithNames" : "CSV", [=]( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & format_settings) + factory.registerOutputFormat(format_name, [=]( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams & params, + const FormatSettings & format_settings) { - return std::make_shared(buf, sample, with_names, params, format_settings); + return std::make_shared(buf, sample, with_names, with_types, params, format_settings); }); - factory.markOutputFormatSupportsParallelFormatting(with_names ? "CSVWithNames" : "CSV"); - } + factory.markOutputFormatSupportsParallelFormatting(format_name); + }; + + registerOutputFormatWithNamesAndTypes("CSV", register_func); } } diff --git a/src/Processors/Formats/Impl/CSVRowOutputFormat.h b/src/Processors/Formats/Impl/CSVRowOutputFormat.h index 780a6c4d3ce..7f5d90203ea 100644 --- a/src/Processors/Formats/Impl/CSVRowOutputFormat.h +++ b/src/Processors/Formats/Impl/CSVRowOutputFormat.h @@ -20,7 +20,7 @@ public: /** with_names - output in the first line a header with column names * with_types - output in the next line header with the names of the types */ - CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_); + CSVRowOutputFormat(WriteBuffer & out_, const Block & header_, bool with_names_, bool with_types, const RowOutputFormatParams & params_, const FormatSettings & format_settings_); String getName() const override { return "CSVRowOutputFormat"; } @@ -38,9 +38,11 @@ public: return String("text/csv; charset=UTF-8; header=") + (with_names ? "present" : "absent"); } -protected: +private: + void writeLine(const std::vector & values); bool with_names; + bool with_types; const FormatSettings format_settings; DataTypes data_types; }; diff --git a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp index 5b32bf94c4d..a5e0ac6862c 100644 --- a/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONAsStringRowInputFormat.cpp @@ -185,7 +185,7 @@ void registerInputFormatJSONAsString(FormatFactory & factory) void registerFileSegmentationEngineJSONAsString(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRowImpl); + factory.registerFileSegmentationEngine("JSONAsString", &fileSegmentationEngineJSONEachRow); } void registerNonTrivialPrefixAndSuffixCheckerJSONAsString(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index 962e9d6e5ac..c551597ca5f 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -1,8 +1,11 @@ +#include + #include #include - -#include +#include #include +#include +#include #include #include @@ -16,179 +19,36 @@ namespace ErrorCodes } -JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat(ReadBuffer & in_, - const Block & header_, - Params params_, - const FormatSettings & format_settings_, - bool with_names_, - bool yield_strings_) - : IRowInputFormat(header_, in_, std::move(params_)), format_settings(format_settings_), with_names(with_names_), yield_strings(yield_strings_) +JSONCompactEachRowRowInputFormat::JSONCompactEachRowRowInputFormat( + const Block & header_, + ReadBuffer & 
in_, + Params params_, + bool with_names_, + bool with_types_, + bool yield_strings_, + const FormatSettings & format_settings_) + : RowInputFormatWithNamesAndTypes(header_, in_, std::move(params_), with_names_, with_types_, format_settings_) + , yield_strings(yield_strings_) { - const auto & sample = getPort().getHeader(); - size_t num_columns = sample.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = sample.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } } -void JSONCompactEachRowRowInputFormat::resetParser() +void JSONCompactEachRowRowInputFormat::skipRowStartDelimiter() { - IRowInputFormat::resetParser(); - column_indexes_for_input_fields.clear(); - not_seen_columns.clear(); -} - -void JSONCompactEachRowRowInputFormat::readPrefix() -{ - /// In this format, BOM at beginning of stream cannot be confused with value, so it is safe to skip it. - skipBOMIfExists(*in); - - if (with_names) - { - size_t num_columns = getPort().getHeader().columns(); - read_columns.assign(num_columns, false); - - assertChar('[', *in); - do - { - skipWhitespaceIfAny(*in); - String column_name; - readJSONString(column_name, *in); - addInputColumn(column_name); - skipWhitespaceIfAny(*in); - } - while (checkChar(',', *in)); - assertChar(']', *in); - skipEndOfLine(); - - /// Type checking - assertChar('[', *in); - for (size_t i = 0; i < column_indexes_for_input_fields.size(); ++i) - { - skipWhitespaceIfAny(*in); - String data_type; - readJSONString(data_type, *in); - - if (column_indexes_for_input_fields[i] && - data_types[*column_indexes_for_input_fields[i]]->getName() != data_type) - { - throw Exception( - "Type of '" + getPort().getHeader().getByPosition(*column_indexes_for_input_fields[i]).name - + "' must be " + data_types[*column_indexes_for_input_fields[i]]->getName() + - ", not " + data_type, - ErrorCodes::INCORRECT_DATA - ); - } - - if (i != column_indexes_for_input_fields.size() - 1) - assertChar(',', *in); - skipWhitespaceIfAny(*in); - } - assertChar(']', *in); - } - else - { - size_t num_columns = getPort().getHeader().columns(); - read_columns.assign(num_columns, true); - column_indexes_for_input_fields.resize(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - column_indexes_for_input_fields[i] = i; - } - } - - for (size_t i = 0; i < read_columns.size(); ++i) - { - if (!read_columns[i]) - { - not_seen_columns.emplace_back(i); - } - } -} - -void JSONCompactEachRowRowInputFormat::addInputColumn(const String & column_name) -{ - names_of_columns.emplace_back(column_name); - - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - - throw Exception( - "Unknown field found in JSONCompactEachRow header: '" + column_name + "' " + - "at position " + std::to_string(column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (read_columns[column_index]) - throw Exception("Duplicate field found while parsing JSONCompactEachRow header: " + column_name, ErrorCodes::INCORRECT_DATA); - - read_columns[column_index] = true; - 
column_indexes_for_input_fields.emplace_back(column_index); -} - -bool JSONCompactEachRowRowInputFormat::readRow(DB::MutableColumns &columns, DB::RowReadExtension &ext) -{ - skipEndOfLine(); - - if (in->eof()) - return false; - - size_t num_columns = columns.size(); - - read_columns.assign(num_columns, false); - + skipWhitespaceIfAny(*in); assertChar('[', *in); - for (size_t file_column = 0; file_column < column_indexes_for_input_fields.size(); ++file_column) - { - const auto & table_column = column_indexes_for_input_fields[file_column]; - if (table_column) - { - readField(*table_column, columns); - } - else - { - skipJSONField(*in, StringRef(names_of_columns[file_column])); - } +} - skipWhitespaceIfAny(*in); - if (in->eof()) - throw ParsingException("Unexpected end of stream while parsing JSONCompactEachRow format", ErrorCodes::CANNOT_READ_ALL_DATA); - if (file_column + 1 != column_indexes_for_input_fields.size()) - { - assertChar(',', *in); - skipWhitespaceIfAny(*in); - } - } +void JSONCompactEachRowRowInputFormat::skipFieldDelimiter() +{ + skipWhitespaceIfAny(*in); + assertChar(',', *in); +} + +void JSONCompactEachRowRowInputFormat::skipRowEndDelimiter() +{ + skipWhitespaceIfAny(*in); assertChar(']', *in); - for (const auto & name : not_seen_columns) - columns[name]->insertDefault(); - - ext.read_columns = read_columns; - return true; -} - -void JSONCompactEachRowRowInputFormat::skipEndOfLine() -{ skipWhitespaceIfAny(*in); if (!in->eof() && (*in->position() == ',' || *in->position() == ';')) ++in->position(); @@ -196,39 +56,55 @@ void JSONCompactEachRowRowInputFormat::skipEndOfLine() skipWhitespaceIfAny(*in); } -void JSONCompactEachRowRowInputFormat::readField(size_t index, MutableColumns & columns) +String JSONCompactEachRowRowInputFormat::readFieldIntoString() { - try + skipWhitespaceIfAny(*in); + String field; + readJSONString(field, *in); + return field; +} + +void JSONCompactEachRowRowInputFormat::skipField(const String & column_name) +{ + skipWhitespaceIfAny(*in); + skipJSONField(*in, column_name); +} + +void JSONCompactEachRowRowInputFormat::skipRow() +{ + skipRowStartDelimiter(); + size_t i = 0; + do { - read_columns[index] = true; - const auto & type = data_types[index]; - const auto & serialization = serializations[index]; - - if (yield_strings) - { - String str; - readJSONString(str, *in); - - ReadBufferFromString buf(str); - - if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization); - else - serialization->deserializeWholeText(*columns[index], buf, format_settings); - } - else - { - if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization); - else - serialization->deserializeTextJSON(*columns[index], *in, format_settings); - } + if (i >= column_mapping->names_of_columns.size()) + throw Exception(ErrorCodes::INCORRECT_DATA, "The number of columns in a row differs from the number of column names"); + skipField(column_mapping->names_of_columns[i++]); + skipWhitespaceIfAny(*in); } - catch (Exception & e) + while (checkChar(',', *in)); + + skipRowEndDelimiter(); +} + +std::vector JSONCompactEachRowRowInputFormat::readHeaderRow() +{ + skipRowStartDelimiter(); + std::vector fields; + do { - e.addMessage("(while reading the value of key " + getPort().getHeader().getByPosition(index).name + ")"); - throw; + 
fields.push_back(readFieldIntoString()); + skipWhitespaceIfAny(*in); } + while (checkChar(',', *in)); + + skipRowEndDelimiter(); + return fields; +} + +bool JSONCompactEachRowRowInputFormat::readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & column_name) +{ + skipWhitespaceIfAny(*in); + return readFieldImpl(*in, column, type, serialization, column_name, format_settings, yield_strings); } void JSONCompactEachRowRowInputFormat::syncAfterError() @@ -236,43 +112,99 @@ void JSONCompactEachRowRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } +bool JSONCompactEachRowRowInputFormat::parseRowStartWithDiagnosticInfo(WriteBuffer & out) +{ + skipWhitespaceIfAny(*in); + if (!checkChar('[', *in)) + { + out << "ERROR: There is no '[' before the row.\n"; + return false; + } + + return true; +} + +bool JSONCompactEachRowRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) +{ + try + { + skipWhitespaceIfAny(*in); + assertChar(',', *in); + } + catch (const DB::Exception &) + { + if (*in->position() == ']') + { + out << "ERROR: Closing parenthesis (']') found where comma is expected." + " It's like your file has less columns than expected.\n" + "And if your file has the right number of columns, maybe it has unescaped quotes in values.\n"; + } + else + { + out << "ERROR: There is no comma. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; + } + return false; + } + + return true; +} + +bool JSONCompactEachRowRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) +{ + skipWhitespaceIfAny(*in); + + if (in->eof()) + { + out << "ERROR: Unexpected end of file. ']' expected at the end of row."; + return false; + } + + if (!checkChar(']', *in)) + { + out << "ERROR: There is no closing parenthesis (']') at the end of the row. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; + return false; + } + + skipWhitespaceIfAny(*in); + + if (in->eof()) + return true; + + if ((*in->position() == ',' || *in->position() == ';')) + ++in->position(); + + skipWhitespaceIfAny(*in); + return true; +} + void registerInputFormatJSONCompactEachRow(FormatFactory & factory) { - factory.registerInputFormat("JSONCompactEachRow", []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) + for (bool yield_strings : {true, false}) { - return std::make_shared(buf, sample, std::move(params), settings, false, false); - }); + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerInputFormat(format_name, [with_names, with_types, yield_strings]( + ReadBuffer & buf, + const Block & sample, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + return std::make_shared(sample, buf, std::move(params), with_names, with_types, yield_strings, settings); + }); + }; - factory.registerInputFormat("JSONCompactEachRowWithNamesAndTypes", []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, std::move(params), settings, true, false); - }); + registerInputFormatWithNamesAndTypes(yield_strings ? 
"JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } +} - factory.registerInputFormat("JSONCompactStringsEachRow", []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, std::move(params), settings, false, true); - }); - - factory.registerInputFormat("JSONCompactStringsEachRowWithNamesAndTypes", []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, std::move(params), settings, true, true); - }); +void registerFileSegmentationEngineJSONCompactEachRow(FormatFactory & factory) +{ + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "JSONCompactEachRow", &fileSegmentationEngineJSONCompactEachRow); + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "JSONCompactStringsEachRow", &fileSegmentationEngineJSONCompactEachRow); } } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h index 4077eb6e008..fe8fc2acda3 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include #include #include @@ -17,49 +17,43 @@ class ReadBuffer; * - JSONCompactStringsEachRowWithNamesAndTypes * */ -class JSONCompactEachRowRowInputFormat : public IRowInputFormat +class JSONCompactEachRowRowInputFormat : public RowInputFormatWithNamesAndTypes { public: JSONCompactEachRowRowInputFormat( - ReadBuffer & in_, const Block & header_, + ReadBuffer & in_, Params params_, - const FormatSettings & format_settings_, bool with_names_, - bool yield_strings_); + bool with_types_, + bool yield_strings_, + const FormatSettings & format_settings_); String getName() const override { return "JSONCompactEachRowRowInputFormat"; } - - void readPrefix() override; - bool readRow(MutableColumns & columns, RowReadExtension & ext) override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - void resetParser() override; private: - void addInputColumn(const String & column_name); - void skipEndOfLine(); - void readField(size_t index, MutableColumns & columns); + bool parseRowStartWithDiagnosticInfo(WriteBuffer & out) override; + bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; + bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; + bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override + { + return *pos != ',' && *pos != ']' && *pos != ' ' && *pos != '\t'; + } - const FormatSettings format_settings; + bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; + void skipField(const String & column_name) override; + void skipRow() override; + void skipRowStartDelimiter() override; + void skipFieldDelimiter() override; + void skipRowEndDelimiter() override; - using OptionalIndexes = std::vector>; - OptionalIndexes column_indexes_for_input_fields; + Names readHeaderRow() override; + String readFieldIntoString(); - DataTypes data_types; - std::vector read_columns; - std::vector not_seen_columns; - - /// This is for the correct exceptions in skipping unknown fields. 
- std::vector names_of_columns; - - /// For *WithNamesAndTypes formats. - bool with_names; - /// For JSONCompactString* formats. bool yield_strings; }; diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp index 1ce4277023d..c7df76e3b83 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.cpp @@ -13,12 +13,10 @@ JSONCompactEachRowRowOutputFormat::JSONCompactEachRowRowOutputFormat(WriteBuffer const RowOutputFormatParams & params_, const FormatSettings & settings_, bool with_names_, + bool with_types_, bool yield_strings_) - : IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), yield_strings(yield_strings_) + : IRowOutputFormat(header_, out_, params_), settings(settings_), with_names(with_names_), with_types(with_types_), yield_strings(yield_strings_) { - const auto & sample = getPort(PortKind::Main).getHeader(); - NamesAndTypesList columns(sample.getNamesAndTypesList()); - fields.assign(columns.begin(), columns.end()); } @@ -57,39 +55,40 @@ void JSONCompactEachRowRowOutputFormat::writeTotals(const Columns & columns, siz { writeChar('\n', out); size_t num_columns = columns.size(); - writeChar('[', out); + writeRowStartDelimiter(); for (size_t i = 0; i < num_columns; ++i) { if (i != 0) - JSONCompactEachRowRowOutputFormat::writeFieldDelimiter(); + writeFieldDelimiter(); - JSONCompactEachRowRowOutputFormat::writeField(*columns[i], *serializations[i], row_num); + writeField(*columns[i], *serializations[i], row_num); } - writeCString("]\n", out); + writeRowEndDelimiter(); +} + +void JSONCompactEachRowRowOutputFormat::writeLine(const std::vector & values) +{ + writeRowStartDelimiter(); + for (size_t i = 0; i < values.size(); ++i) + { + writeChar('\"', out); + writeString(values[i], out); + writeChar('\"', out); + if (i != values.size() - 1) + writeFieldDelimiter(); + } + writeRowEndDelimiter(); } void JSONCompactEachRowRowOutputFormat::doWritePrefix() { + const auto & header = getPort(PortKind::Main).getHeader(); + if (with_names) - { - writeChar('[', out); - for (size_t i = 0; i < fields.size(); ++i) - { - writeChar('\"', out); - writeString(fields[i].name, out); - writeChar('\"', out); - if (i != fields.size() - 1) - writeCString(", ", out); - } - writeCString("]\n[", out); - for (size_t i = 0; i < fields.size(); ++i) - { - writeJSONString(fields[i].type->getName(), out, settings); - if (i != fields.size() - 1) - writeCString(", ", out); - } - writeCString("]\n", out); - } + writeLine(header.getNames()); + + if (with_types) + writeLine(header.getDataTypeNames()); } void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk) @@ -100,45 +99,23 @@ void JSONCompactEachRowRowOutputFormat::consumeTotals(DB::Chunk chunk) void registerOutputFormatJSONCompactEachRow(FormatFactory & factory) { - factory.registerOutputFormat("JSONCompactEachRow", []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & format_settings) + for (bool yield_strings : {false, true}) { - return std::make_shared(buf, sample, params, format_settings, false, false); - }); - factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRow"); + auto register_func = [&](const String & format_name, bool with_names, bool with_types) + { + factory.registerOutputFormat(format_name, [=]( + WriteBuffer & buf, + const Block & sample, + const 
RowOutputFormatParams & params, + const FormatSettings & format_settings) + { + return std::make_shared(buf, sample, params, format_settings, with_names, with_types, yield_strings); + }); + factory.markOutputFormatSupportsParallelFormatting(format_name); + }; - factory.registerOutputFormat("JSONCompactEachRowWithNamesAndTypes", []( - WriteBuffer &buf, - const Block &sample, - const RowOutputFormatParams & params, - const FormatSettings &format_settings) - { - return std::make_shared(buf, sample, params, format_settings, true, false); - }); - factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRowWithNamesAndTypes"); - - factory.registerOutputFormat("JSONCompactStringsEachRow", []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & format_settings) - { - return std::make_shared(buf, sample, params, format_settings, false, true); - }); - factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRow"); - - factory.registerOutputFormat("JSONCompactStringsEachRowWithNamesAndTypes", []( - WriteBuffer &buf, - const Block &sample, - const RowOutputFormatParams & params, - const FormatSettings &format_settings) - { - return std::make_shared(buf, sample, params, format_settings, true, true); - }); - factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRowWithNamesAndTypes"); + registerOutputFormatWithNamesAndTypes(yield_strings ? "JSONCompactStringsEachRow" : "JSONCompactEachRow", register_func); + } } diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h index 792eb906f4b..aa12ba7e809 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowOutputFormat.h @@ -21,15 +21,14 @@ public: const RowOutputFormatParams & params_, const FormatSettings & settings_, bool with_names_, + bool with_types_, bool yield_strings_); String getName() const override { return "JSONCompactEachRowRowOutputFormat"; } void doWritePrefix() override; - void writeBeforeTotals() override {} void writeTotals(const Columns & columns, size_t row_num) override; - void writeAfterTotals() override {} void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override; void writeFieldDelimiter() override; @@ -42,11 +41,11 @@ protected: void consumeExtremes(Chunk) override {} private: + void writeLine(const std::vector & values); + FormatSettings settings; - - NamesAndTypes fields; - bool with_names; + bool with_types; bool yield_strings; }; } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index d04ba2a49e4..28481313974 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -136,37 +136,10 @@ void JSONEachRowRowInputFormat::readField(size_t index, MutableColumns & columns if (seen_columns[index]) throw Exception("Duplicate field found while parsing JSONEachRow format: " + columnName(index), ErrorCodes::INCORRECT_DATA); - try - { - seen_columns[index] = read_columns[index] = true; - const auto & type = getPort().getHeader().getByPosition(index).type; - const auto & serialization = serializations[index]; - - if (yield_strings) - { - String str; - readJSONString(str, *in); - - ReadBufferFromString buf(str); - - if (format_settings.null_as_default && !type->isNullable()) - 
read_columns[index] = SerializationNullable::deserializeWholeTextImpl(*columns[index], buf, format_settings, serialization); - else - serialization->deserializeWholeText(*columns[index], buf, format_settings); - } - else - { - if (format_settings.null_as_default && !type->isNullable()) - read_columns[index] = SerializationNullable::deserializeTextJSONImpl(*columns[index], *in, format_settings, serialization); - else - serialization->deserializeTextJSON(*columns[index], *in, format_settings); - } - } - catch (Exception & e) - { - e.addMessage("(while reading the value of key " + columnName(index) + ")"); - throw; - } + seen_columns[index] = true; + const auto & type = getPort().getHeader().getByPosition(index).type; + const auto & serialization = serializations[index]; + read_columns[index] = readFieldImpl(*in, *columns[index], type, serialization, columnName(index), format_settings, yield_strings); } inline bool JSONEachRowRowInputFormat::advanceToNextKey(size_t key_index) @@ -282,8 +255,13 @@ bool JSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi if (!seen_columns[i]) header.getByPosition(i).type->insertDefaultInto(*columns[i]); - /// return info about defaults set - ext.read_columns = read_columns; + /// Return info about defaults set. + /// If defaults_for_omitted_fields is set to 0, we should just leave already inserted defaults. + if (format_settings.defaults_for_omitted_fields) + ext.read_columns = read_columns; + else + ext.read_columns.assign(read_columns.size(), true); + return true; } @@ -355,8 +333,8 @@ void registerInputFormatJSONEachRow(FormatFactory & factory) void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory) { - factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRowImpl); - factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRowImpl); + factory.registerFileSegmentationEngine("JSONEachRow", &fileSegmentationEngineJSONEachRow); + factory.registerFileSegmentationEngine("JSONStringsEachRow", &fileSegmentationEngineJSONEachRow); } void registerNonTrivialPrefixAndSuffixCheckerJSONEachRow(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 8cb0fce609e..62c0eaa457e 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -60,7 +60,7 @@ RegexpRowInputFormat::ColumnFormat RegexpRowInputFormat::stringToFormat(const St bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) { const auto & type = getPort().getHeader().getByPosition(index).type; - bool parse_as_nullable = format_settings.null_as_default && !type->isNullable(); + bool parse_as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); bool read = true; ReadBuffer field_buf(const_cast(matched_fields[index].data()), matched_fields[index].size(), 0); try @@ -94,9 +94,9 @@ bool RegexpRowInputFormat::readField(size_t index, MutableColumns & columns) break; case ColumnFormat::Raw: if (parse_as_nullable) - read = SerializationNullable::deserializeWholeTextImpl(*columns[index], field_buf, format_settings, serialization); + read = SerializationNullable::deserializeTextRawImpl(*columns[index], field_buf, format_settings, serialization); else - serialization->deserializeWholeText(*columns[index], field_buf, format_settings); + serialization->deserializeTextRaw(*columns[index], 
field_buf, format_settings); break; default: break; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 331d6e435d1..606c67aa0d1 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -143,7 +143,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex seen_columns[index] = read_columns[index] = true; const auto & type = getPort().getHeader().getByPosition(index).type; const auto & serialization = serializations[index]; - if (format_settings.null_as_default && !type->isNullable()) + if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization); else serialization->deserializeTextEscaped(*columns[index], *in, format_settings); diff --git a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp index 6161303d23a..14dec8420a8 100644 --- a/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowOutputFormat.cpp @@ -8,7 +8,7 @@ namespace DB { TSKVRowOutputFormat::TSKVRowOutputFormat(WriteBuffer & out_, const Block & header, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : TabSeparatedRowOutputFormat(out_, header, false, false, params_, format_settings_) + : TabSeparatedRowOutputFormat(out_, header, false, false, false, params_, format_settings_) { const auto & sample = getPort(PortKind::Main).getHeader(); NamesAndTypesList columns(sample.getNamesAndTypesList()); diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h deleted file mode 100644 index 3e12388bede..00000000000 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowInputFormat.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include -#include - - -namespace DB -{ - -/** A stream to input data in tsv format, but without escaping individual values. 
- * It only supports columns without '\n' or '\t' - */ -class TabSeparatedRawRowInputFormat : public TabSeparatedRowInputFormat -{ -public: - /** with_names - the first line is the header with the names of the columns - * with_types - on the next line header with type names - */ - TabSeparatedRawRowInputFormat( - const Block & header_, - ReadBuffer & in_, - const Params & params_, - bool with_names_, - bool with_types_, - const FormatSettings & format_settings_) - : TabSeparatedRowInputFormat(header_, in_, params_, with_names_, with_types_, format_settings_) - { - } - - String getName() const override { return "TabSeparatedRawRowInputFormat"; } - - bool readField(IColumn & column, const DataTypePtr &, const SerializationPtr & serialization, bool) override - { - String tmp; - - while (!in->eof()) - { - char * pos = find_first_symbols<'\n', '\t'>(in->position(), in->buffer().end()); - - tmp.append(in->position(), pos - in->position()); - in->position() = pos; - - if (pos == in->buffer().end()) - in->next(); - else - break; - } - - ReadBufferFromString cell(tmp); - serialization->deserializeWholeText(column, cell, format_settings); - - return true; - } -}; - -} diff --git a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h deleted file mode 100644 index dc9312e53bc..00000000000 --- a/src/Processors/Formats/Impl/TabSeparatedRawRowOutputFormat.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -/** A stream for outputting data in tsv format, but without escaping individual values. - * (That is, the output is irreversible.) - */ -class TabSeparatedRawRowOutputFormat : public TabSeparatedRowOutputFormat -{ -public: - TabSeparatedRawRowOutputFormat( - WriteBuffer & out_, - const Block & header_, - bool with_names_, - bool with_types_, - const RowOutputFormatParams & params_, - const FormatSettings & format_settings_) - : TabSeparatedRowOutputFormat(out_, header_, with_names_, with_types_, params_, format_settings_) - { - } - - String getName() const override { return "TabSeparatedRawRowOutputFormat"; } - - void writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) override - { - serialization.serializeText(column, row_num, out, format_settings); - } -}; - -} diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 1ff52c9f695..ec6dfef4f0c 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -1,9 +1,8 @@ #include -#include +#include #include #include -#include #include #include #include @@ -19,19 +18,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - -static void skipTSVRow(ReadBuffer & in, const size_t num_columns) -{ - NullOutput null_sink; - - for (size_t i = 0; i < num_columns; ++i) - { - readEscapedStringInto(null_sink, in); - assertChar(i == num_columns - 1 ? '\n' : '\t', in); - } -} - - /** Check for a common error case - usage of Windows line feed. 
*/ static void checkForCarriageReturn(ReadBuffer & in) @@ -45,187 +31,74 @@ static void checkForCarriageReturn(ReadBuffer & in) } -TabSeparatedRowInputFormat::TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_) - : RowInputFormatWithDiagnosticInfo(header_, in_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) +TabSeparatedRowInputFormat::TabSeparatedRowInputFormat( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, + bool with_types_, + bool is_raw_, + const FormatSettings & format_settings_) + : RowInputFormatWithNamesAndTypes(header_, in_, params_, with_names_, with_types_, format_settings_), is_raw(is_raw_) { - const auto & sample = getPort().getHeader(); - size_t num_columns = sample.columns(); - - data_types.resize(num_columns); - column_indexes_by_names.reserve(num_columns); - - for (size_t i = 0; i < num_columns; ++i) - { - const auto & column_info = sample.getByPosition(i); - - data_types[i] = column_info.type; - column_indexes_by_names.emplace(column_info.name, i); - } - - column_mapping->column_indexes_for_input_fields.reserve(num_columns); - column_mapping->read_columns.assign(num_columns, false); } - -void TabSeparatedRowInputFormat::setupAllColumnsByTableSchema() +void TabSeparatedRowInputFormat::skipFieldDelimiter() { - const auto & header = getPort().getHeader(); - column_mapping->read_columns.assign(header.columns(), true); - column_mapping->column_indexes_for_input_fields.resize(header.columns()); - - for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i) - column_mapping->column_indexes_for_input_fields[i] = i; + assertChar('\t', *in); } - -void TabSeparatedRowInputFormat::addInputColumn(const String & column_name) -{ - const auto column_it = column_indexes_by_names.find(column_name); - if (column_it == column_indexes_by_names.end()) - { - if (format_settings.skip_unknown_fields) - { - column_mapping->column_indexes_for_input_fields.push_back(std::nullopt); - return; - } - - throw Exception( - "Unknown field found in TSV header: '" + column_name + "' " + - "at position " + std::to_string(column_mapping->column_indexes_for_input_fields.size()) + - "\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", - ErrorCodes::INCORRECT_DATA - ); - } - - const auto column_index = column_it->second; - - if (column_mapping->read_columns[column_index]) - throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA); - - column_mapping->read_columns[column_index] = true; - column_mapping->column_indexes_for_input_fields.emplace_back(column_index); -} - - -void TabSeparatedRowInputFormat::fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension) -{ - /// It is safe to memorize this on the first run - the format guarantees this does not change - if (unlikely(row_num == 1)) - { - columns_to_fill_with_default_values.clear(); - for (size_t index = 0; index < column_mapping->read_columns.size(); ++index) - if (column_mapping->read_columns[index] == 0) - columns_to_fill_with_default_values.push_back(index); - } - - for (const auto column_index : columns_to_fill_with_default_values) - { - data_types[column_index]->insertDefaultInto(*columns[column_index]); - row_read_extension.read_columns[column_index] = false; - } -} - - -void 
TabSeparatedRowInputFormat::readPrefix() -{ - if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()) - { - /// In this format, we assume that column name or type cannot contain BOM, - /// so, if format has header, - /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. - skipBOMIfExists(*in); - } - - /// This is a bit of abstraction leakage, but we have almost the same code in other places. - /// Thus, we check if this InputFormat is working with the "real" beginning of the data in case of parallel parsing. - if (with_names && getCurrentUnitNumber() == 0) - { - if (format_settings.with_names_use_header) - { - String column_name; - for (;;) - { - readEscapedString(column_name, *in); - if (!checkChar('\t', *in)) - { - /// Check last column for \r before adding it, otherwise an error will be: - /// "Unknown field found in TSV header" - checkForCarriageReturn(*in); - addInputColumn(column_name); - break; - } - else - addInputColumn(column_name); - } - - - if (!in->eof()) - { - assertChar('\n', *in); - } - } - else - { - setupAllColumnsByTableSchema(); - skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size()); - } - } - else if (!column_mapping->is_set) - setupAllColumnsByTableSchema(); - - if (with_types) - { - skipTSVRow(*in, column_mapping->column_indexes_for_input_fields.size()); - } -} - - -bool TabSeparatedRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ext) +void TabSeparatedRowInputFormat::skipRowEndDelimiter() { if (in->eof()) - return false; + return; - updateDiagnosticInfo(); + if (unlikely(row_num <= 1)) + checkForCarriageReturn(*in); - ext.read_columns.assign(column_mapping->read_columns.size(), true); - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + assertChar('\n', *in); +} + +String TabSeparatedRowInputFormat::readFieldIntoString() +{ + String field; + readEscapedString(field, *in); + return field; +} + +void TabSeparatedRowInputFormat::skipField() +{ + NullOutput null_sink; + readEscapedStringInto(null_sink, *in); +} + +void TabSeparatedRowInputFormat::skipRow() +{ + do { - const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - if (column_index) - { - const auto & type = data_types[*column_index]; - ext.read_columns[*column_index] = readField(*columns[*column_index], type, serializations[*column_index], is_last_file_column); - } - else - { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); - } - - /// skip separators - if (file_column + 1 < column_mapping->column_indexes_for_input_fields.size()) - { - assertChar('\t', *in); - } - else if (!in->eof()) - { - if (unlikely(row_num == 1)) - checkForCarriageReturn(*in); - - assertChar('\n', *in); - } + skipField(); } + while (checkChar('\t', *in)); - fillUnreadColumnsWithDefaults(columns, ext); + skipRowEndDelimiter(); +} - return true; +std::vector TabSeparatedRowInputFormat::readHeaderRow() +{ + std::vector fields; + do + { + fields.push_back(readFieldIntoString()); + } + while (checkChar('\t', *in)); + + skipRowEndDelimiter(); + return fields; } bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & type, - const SerializationPtr & serialization, bool is_last_file_column) + const SerializationPtr & serialization, bool is_last_file_column, const String & 
/*column_name*/) { const bool at_delimiter = !is_last_file_column && !in->eof() && *in->position() == '\t'; const bool at_last_column_line_end = is_last_file_column && (in->eof() || *in->position() == '\n'); @@ -235,137 +108,112 @@ bool TabSeparatedRowInputFormat::readField(IColumn & column, const DataTypePtr & column.insertDefault(); return false; } - else if (format_settings.null_as_default && !type->isNullable()) + + bool as_nullable = format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); + + if (is_raw) + { + if (as_nullable) + return SerializationNullable::deserializeTextRawImpl(column, *in, format_settings, serialization); + + serialization->deserializeTextRaw(column, *in, format_settings); + return true; + } + + + + if (as_nullable) return SerializationNullable::deserializeTextEscapedImpl(column, *in, format_settings, serialization); serialization->deserializeTextEscaped(column, *in, format_settings); return true; } -bool TabSeparatedRowInputFormat::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +bool TabSeparatedRowInputFormat::parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) { - for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + try { - if (file_column == 0 && in->eof()) + assertChar('\t', *in); + } + catch (const DB::Exception &) + { + if (*in->position() == '\n') { - out << "\n"; - return false; + out << "ERROR: Line feed found where tab is expected." + " It's like your file has less columns than expected.\n" + "And if your file has the right number of columns, " + "maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n"; } - - if (column_mapping->column_indexes_for_input_fields[file_column].has_value()) + else if (*in->position() == '\r') { - const auto & header = getPort().getHeader(); - size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value(); - if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], - out, file_column)) - return false; + out << "ERROR: Carriage return found where tab is expected.\n"; } else { - static const String skipped_column_str = ""; - static const DataTypePtr skipped_column_type = std::make_shared(); - static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); - if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) - return false; - } - - /// Delimiters - if (file_column + 1 == column_mapping->column_indexes_for_input_fields.size()) - { - if (!in->eof()) - { - try - { - assertChar('\n', *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\t') - { - out << "ERROR: Tab found where line feed is expected." - " It's like your file has more columns than expected.\n" - "And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n"; - } - else if (*in->position() == '\r') - { - out << "ERROR: Carriage return found where line feed is expected." - " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; - } - else - { - out << "ERROR: There is no line feed. 
"; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } - } - } - else - { - try - { - assertChar('\t', *in); - } - catch (const DB::Exception &) - { - if (*in->position() == '\n') - { - out << "ERROR: Line feed found where tab is expected." - " It's like your file has less columns than expected.\n" - "And if your file has the right number of columns, " - "maybe it has an unescaped backslash in value before tab, which causes the tab to be escaped.\n"; - } - else if (*in->position() == '\r') - { - out << "ERROR: Carriage return found where tab is expected.\n"; - } - else - { - out << "ERROR: There is no tab. "; - verbosePrintString(in->position(), in->position() + 1, out); - out << " found instead.\n"; - } - return false; - } + out << "ERROR: There is no tab. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; } + return false; } return true; } -void TabSeparatedRowInputFormat::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +bool TabSeparatedRowInputFormat::parseRowEndWithDiagnosticInfo(WriteBuffer & out) { - const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; - if (index) - { - bool can_be_parsed_as_null = removeLowCardinality(type)->isNullable(); + if (in->eof()) + return true; - // check null value for type is not nullable. don't cross buffer bound for simplicity, so maybe missing some case - if (!can_be_parsed_as_null && !in->eof()) + try + { + assertChar('\n', *in); + } + catch (const DB::Exception &) + { + if (*in->position() == '\t') { - if (*in->position() == '\\' && in->available() >= 2) + out << "ERROR: Tab found where line feed is expected." + " It's like your file has more columns than expected.\n" + "And if your file has the right number of columns, maybe it has an unescaped tab in a value.\n"; + } + else if (*in->position() == '\r') + { + out << "ERROR: Carriage return found where line feed is expected." + " It's like your file has DOS/Windows style line separators, that is illegal in TabSeparated format.\n"; + } + else + { + out << "ERROR: There is no line feed. "; + verbosePrintString(in->position(), in->position() + 1, out); + out << " found instead.\n"; + } + return false; + } + + return true; +} + +void TabSeparatedRowInputFormat::checkNullValueForNonNullable(DataTypePtr type) +{ + bool can_be_parsed_as_null = removeLowCardinality(type)->isNullable() || format_settings.null_as_default; + + // check null value for type is not nullable. 
don't cross buffer bound for simplicity, so maybe missing some case + if (!can_be_parsed_as_null && !in->eof()) + { + if (*in->position() == '\\' && in->available() >= 2) + { + ++in->position(); + if (*in->position() == 'N') { ++in->position(); - if (*in->position() == 'N') - { - ++in->position(); - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName()); - } - else - { - --in->position(); - } + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected NULL value of not Nullable type {}", type->getName()); + } + else + { + --in->position(); } } - - const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); - readField(column, type, serializations[*index], is_last_file_column); - } - else - { - NullOutput null_sink; - readEscapedStringInto(null_sink, *in); } } @@ -374,66 +222,28 @@ void TabSeparatedRowInputFormat::syncAfterError() skipToUnescapedNextLineOrEOF(*in); } -void TabSeparatedRowInputFormat::resetParser() -{ - RowInputFormatWithDiagnosticInfo::resetParser(); - const auto & sample = getPort().getHeader(); - column_mapping->read_columns.assign(sample.columns(), false); - column_mapping->column_indexes_for_input_fields.clear(); - columns_to_fill_with_default_values.clear(); -} - void registerInputFormatTabSeparated(FormatFactory & factory) { - for (const auto * name : {"TabSeparated", "TSV"}) + for (bool is_raw : {false, true}) { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - return std::make_shared(sample, buf, params, false, false, settings); - }); - } + factory.registerInputFormat(format_name, [with_names, with_types, is_raw]( + ReadBuffer & buf, + const Block & sample, + IRowInputFormat::Params params, + const FormatSettings & settings) + { + return std::make_shared(sample, buf, std::move(params), with_names, with_types, is_raw, settings); + }); + }; - for (const auto * name : {"TabSeparatedRaw", "TSVRaw"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, false, false, settings); - }); - } - - for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, true, false, settings); - }); - } - - for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"}) - { - factory.registerInputFormat(name, []( - ReadBuffer & buf, - const Block & sample, - IRowInputFormat::Params params, - const FormatSettings & settings) - { - return std::make_shared(sample, buf, params, true, true, settings); - }); + registerInputFormatWithNamesAndTypes(is_raw ? "TabSeparatedRaw" : "TabSeparated", register_func); + registerInputFormatWithNamesAndTypes(is_raw ? 
"TSVRaw" : "TSV", register_func); } } +template static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { bool need_more_data = true; @@ -442,13 +252,18 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer while (loadAtPosition(in, memory, pos) && need_more_data) { - pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end()); + if constexpr (is_raw) + pos = find_first_symbols<'\r', '\n'>(pos, in.buffer().end()); + else + pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end()); if (pos > in.buffer().end()) - throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); - else if (pos == in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + + if (pos == in.buffer().end()) continue; - else if (*pos == '\\') + + if (!is_raw && *pos == '\\') { ++pos; if (loadAtPosition(in, memory, pos)) @@ -472,11 +287,13 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer void registerFileSegmentationEngineTabSeparated(FormatFactory & factory) { + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "TSV", &fileSegmentationEngineTabSeparatedImpl); + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "TabSeparated", &fileSegmentationEngineTabSeparatedImpl); + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "TSVRaw", &fileSegmentationEngineTabSeparatedImpl); + registerFileSegmentationEngineForFormatWithNamesAndTypes(factory, "TabSeparatedRaw", &fileSegmentationEngineTabSeparatedImpl); + // We can use the same segmentation engine for TSKV. - for (const auto & name : {"TabSeparated", "TSV", "TSKV", "TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerFileSegmentationEngine(name, &fileSegmentationEngineTabSeparatedImpl); - } + factory.registerFileSegmentationEngine("TSKV", &fileSegmentationEngineTabSeparatedImpl); } } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h index 8127b5ceba7..31e6e12400a 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.h @@ -2,7 +2,7 @@ #include #include -#include +#include namespace DB @@ -10,46 +10,39 @@ namespace DB /** A stream to input data in tsv format. 
*/ -class TabSeparatedRowInputFormat : public RowInputFormatWithDiagnosticInfo +class TabSeparatedRowInputFormat : public RowInputFormatWithNamesAndTypes { public: /** with_names - the first line is the header with the names of the columns * with_types - on the next line header with type names */ TabSeparatedRowInputFormat(const Block & header_, ReadBuffer & in_, const Params & params_, - bool with_names_, bool with_types_, const FormatSettings & format_settings_); + bool with_names_, bool with_types_, bool is_raw, const FormatSettings & format_settings_); String getName() const override { return "TabSeparatedRowInputFormat"; } - bool readRow(MutableColumns & columns, RowReadExtension &) override; - void readPrefix() override; bool allowSyncAfterError() const override { return true; } void syncAfterError() override; - void resetParser() override; - -protected: - bool with_names; - bool with_types; - const FormatSettings format_settings; - - virtual bool readField(IColumn & column, const DataTypePtr & type, - const SerializationPtr & serialization, bool is_last_file_column); - private: - DataTypes data_types; + bool is_raw; - using IndexesMap = std::unordered_map; - IndexesMap column_indexes_by_names; + bool readField(IColumn & column, const DataTypePtr & type, + const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) override; - std::vector columns_to_fill_with_default_values; + void skipField(const String & /*column_name*/) override { skipField(); } + void skipField(); + void skipRow() override; + void skipFieldDelimiter() override; + void skipRowEndDelimiter() override; - void addInputColumn(const String & column_name); - void setupAllColumnsByTableSchema(); - void fillUnreadColumnsWithDefaults(MutableColumns & columns, RowReadExtension & row_read_extension); + Names readHeaderRow() override; + String readFieldIntoString(); - bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override; - void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override; + void checkNullValueForNonNullable(DataTypePtr type) override; + + bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) override; + bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) override; bool isGarbageAfterField(size_t, ReadBuffer::Position pos) override { return *pos != '\n' && *pos != '\t'; } }; diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp index 71d5bdba355..9a4f079867e 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -11,41 +10,43 @@ TabSeparatedRowOutputFormat::TabSeparatedRowOutputFormat( const Block & header_, bool with_names_, bool with_types_, + bool is_raw_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), format_settings(format_settings_) + : IRowOutputFormat(header_, out_, params_), with_names(with_names_), with_types(with_types_), is_raw(is_raw_), format_settings(format_settings_) { } +void TabSeparatedRowOutputFormat::writeLine(const std::vector & values) +{ + for (size_t i = 0; i < values.size(); ++i) + { + writeEscapedString(values[i], out); + if (i + 1 == values.size()) + writeRowEndDelimiter(); + else + writeFieldDelimiter(); + } +} void 
TabSeparatedRowOutputFormat::doWritePrefix() { const auto & header = getPort(PortKind::Main).getHeader(); - size_t columns = header.columns(); if (with_names) - { - for (size_t i = 0; i < columns; ++i) - { - writeEscapedString(header.safeGetByPosition(i).name, out); - writeChar(i == columns - 1 ? '\n' : '\t', out); - } - } + writeLine(header.getNames()); if (with_types) - { - for (size_t i = 0; i < columns; ++i) - { - writeEscapedString(header.safeGetByPosition(i).type->getName(), out); - writeChar(i == columns - 1 ? '\n' : '\t', out); - } - } + writeLine(header.getDataTypeNames()); } void TabSeparatedRowOutputFormat::writeField(const IColumn & column, const ISerialization & serialization, size_t row_num) { - serialization.serializeTextEscaped(column, row_num, out, format_settings); + if (is_raw) + serialization.serializeTextRaw(column, row_num, out, format_settings); + else + serialization.serializeTextEscaped(column, row_num, out, format_settings); } @@ -75,56 +76,23 @@ void TabSeparatedRowOutputFormat::writeBeforeExtremes() void registerOutputFormatTabSeparated(FormatFactory & factory) { - for (const auto * name : {"TabSeparated", "TSV"}) + for (bool is_raw : {false, true}) { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) + auto register_func = [&](const String & format_name, bool with_names, bool with_types) { - return std::make_shared(buf, sample, false, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } + factory.registerOutputFormat(format_name, [=]( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams & params, + const FormatSettings & settings) + { + return std::make_shared(buf, sample, with_names, with_types, is_raw, params, settings); + }); + factory.markOutputFormatSupportsParallelFormatting(format_name); + }; - for (const auto * name : {"TabSeparatedRaw", "TSVRaw"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, false, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } - - for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, true, false, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); - } - - for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"}) - { - factory.registerOutputFormat(name, []( - WriteBuffer & buf, - const Block & sample, - const RowOutputFormatParams & params, - const FormatSettings & settings) - { - return std::make_shared(buf, sample, true, true, params, settings); - }); - factory.markOutputFormatSupportsParallelFormatting(name); + registerOutputFormatWithNamesAndTypes(is_raw ? "TSVRaw" : "TSV", register_func); + registerOutputFormatWithNamesAndTypes(is_raw ? 
"TabSeparatedRaw" : "TabSeparated", register_func); } } diff --git a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h index e3190be70e8..7dcc6529f1c 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h +++ b/src/Processors/Formats/Impl/TabSeparatedRowOutputFormat.h @@ -23,6 +23,7 @@ public: const Block & header_, bool with_names_, bool with_types_, + bool is_raw_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_); @@ -39,10 +40,13 @@ public: /// https://www.iana.org/assignments/media-types/text/tab-separated-values String getContentType() const override { return "text/tab-separated-values; charset=UTF-8"; } -protected: - +private: + void writeLine(const std::vector & values); bool with_names; bool with_types; + bool is_raw; + +protected: const FormatSettings format_settings; }; diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index ed98ab372b6..db5db4701a9 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -130,7 +130,7 @@ void TemplateBlockOutputFormat::serializeField(const IColumn & column, const ISe serialization.serializeTextXML(column, row_num, out, settings); break; case ColumnFormat::Raw: - serialization.serializeText(column, row_num, out, settings); + serialization.serializeTextRaw(column, row_num, out, settings); break; default: __builtin_unreachable(); diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index 021b2532b39..c096b62e967 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -45,8 +45,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer } else { - if (format.formats[i] == ColumnFormat::Xml || format.formats[i] == ColumnFormat::Raw) - format.throwInvalidFormat("XML and Raw deserialization is not supported", i); + if (format.formats[i] == ColumnFormat::Xml) + format.throwInvalidFormat("XML deserialization is not supported", i); } } @@ -54,8 +54,8 @@ TemplateRowInputFormat::TemplateRowInputFormat(const Block & header_, ReadBuffer std::vector column_in_format(header_.columns(), false); for (size_t i = 0; i < row_format.columnsCount(); ++i) { - if (row_format.formats[i] == ColumnFormat::Xml || row_format.formats[i] == ColumnFormat::Raw) - row_format.throwInvalidFormat("XML and Raw deserialization is not supported", i); + if (row_format.formats[i] == ColumnFormat::Xml) + row_format.throwInvalidFormat("XML deserialization is not supported", i); if (row_format.format_idx_to_column_idx[i]) { @@ -194,7 +194,7 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, { ColumnFormat col_format = row_format.formats[file_column]; bool read = true; - bool parse_as_nullable = settings.null_as_default && !type->isNullable(); + bool parse_as_nullable = settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable(); try { switch (col_format) @@ -226,6 +226,12 @@ bool TemplateRowInputFormat::deserializeField(const DataTypePtr & type, else serialization->deserializeTextJSON(column, buf, settings); break; + case ColumnFormat::Raw: + if (parse_as_nullable) + read = SerializationNullable::deserializeTextRawImpl(column, buf, settings, serialization); + else + 
serialization->deserializeTextRaw(column, buf, settings); + break; default: __builtin_unreachable(); } diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp index 0f6a21055d0..4eb447b82c3 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp @@ -168,7 +168,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx) bool read = true; const auto & type = types[column_idx]; const auto & serialization = serializations[column_idx]; - if (format_settings.null_as_default && !type->isNullable()) + if (format_settings.null_as_default && !type->isNullable() && !type->isLowCardinalityNullable()) read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); @@ -409,7 +409,7 @@ bool ValuesBlockInputFormat::parseExpression(IColumn & column, size_t column_idx Field value = convertFieldToType(expression_value, type, value_raw.second.get()); /// Check that we are indeed allowed to insert a NULL. - if (value.isNull() && !type.isNullable()) + if (value.isNull() && !type.isNullable() && !type.isLowCardinalityNullable()) { if (format_settings.null_as_default) { diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp new file mode 100644 index 00000000000..614ec27c0d5 --- /dev/null +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -0,0 +1,265 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + +RowInputFormatWithNamesAndTypes::RowInputFormatWithNamesAndTypes( + const Block & header_, + ReadBuffer & in_, + const Params & params_, + bool with_names_, + bool with_types_, + const FormatSettings & format_settings_) + : RowInputFormatWithDiagnosticInfo(header_, in_, params_), format_settings(format_settings_), with_names(with_names_), with_types(with_types_) +{ + const auto & sample = getPort().getHeader(); + size_t num_columns = sample.columns(); + + data_types.resize(num_columns); + column_indexes_by_names.reserve(num_columns); + + for (size_t i = 0; i < num_columns; ++i) + { + const auto & column_info = sample.getByPosition(i); + + data_types[i] = column_info.type; + column_indexes_by_names.emplace(column_info.name, i); + } +} + +void RowInputFormatWithNamesAndTypes::setupAllColumnsByTableSchema() +{ + const auto & header = getPort().getHeader(); + column_mapping->column_indexes_for_input_fields.resize(header.columns()); + column_mapping->names_of_columns = header.getNames(); + + for (size_t i = 0; i < column_mapping->column_indexes_for_input_fields.size(); ++i) + column_mapping->column_indexes_for_input_fields[i] = i; +} + +void RowInputFormatWithNamesAndTypes::addInputColumn(const String & column_name, std::vector & read_columns) +{ + column_mapping->names_of_columns.push_back(column_name); + + const auto column_it = column_indexes_by_names.find(column_name); + if (column_it == column_indexes_by_names.end()) + { + if (format_settings.skip_unknown_fields) + { + column_mapping->column_indexes_for_input_fields.push_back(std::nullopt); + return; + } + + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Unknown field found in {} header: '{}' at position {}\nSet the 'input_format_skip_unknown_fields' parameter explicitly to ignore and proceed", + getName(), column_name, 
column_mapping->column_indexes_for_input_fields.size()); + } + + const auto column_index = column_it->second; + + if (read_columns[column_index]) + throw Exception("Duplicate field found while parsing TSV header: " + column_name, ErrorCodes::INCORRECT_DATA); + + read_columns[column_index] = true; + column_mapping->column_indexes_for_input_fields.emplace_back(column_index); +} + +void RowInputFormatWithNamesAndTypes::readPrefix() +{ + if (with_names || with_types || data_types.at(0)->textCanContainOnlyValidUTF8()) + { + /// We assume that column name or type cannot contain BOM, so, if format has header, + /// then BOM at beginning of stream cannot be confused with name or type of field, and it is safe to skip it. + skipBOMIfExists(*in); + } + + /// This is a bit of abstraction leakage, but we need it in parallel parsing: + /// we check if this InputFormat is working with the "real" beginning of the data. + if (with_names && getCurrentUnitNumber() == 0) + { + if (format_settings.with_names_use_header) + { + std::vector read_columns(data_types.size(), false); + auto column_names = readHeaderRow(); + for (const auto & name : column_names) + addInputColumn(name, read_columns); + + for (size_t i = 0; i != read_columns.size(); ++i) + { + if (!read_columns[i]) + column_mapping->not_presented_columns.push_back(i); + } + } + else + { + setupAllColumnsByTableSchema(); + skipRow(); + } + } + else if (!column_mapping->is_set) + setupAllColumnsByTableSchema(); + + if (with_types && getCurrentUnitNumber() == 0) + { + if (format_settings.with_types_use_header) + { + auto types = readHeaderRow(); + if (types.size() != column_mapping->column_indexes_for_input_fields.size()) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "The number of data types differs from the number of column names in input data"); + + /// Check that types from input matches types from header. 
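/// (Only columns that were matched by name in the header above are checked; columns mapped to
/// std::nullopt because of input_format_skip_unknown_fields are skipped.)
/// The comparison is by type name string, so, for example, a types row containing "UInt32" for a
/// table column declared as UInt64 fails with the INCORRECT_DATA exception built below
/// ("Type of '<column>' must be UInt64, not UInt32"; the column name here is illustrative).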
+ for (size_t i = 0; i < types.size(); ++i) + { + if (column_mapping->column_indexes_for_input_fields[i] && + data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName() != types[i]) + { + throw Exception( + ErrorCodes::INCORRECT_DATA, + "Type of '{}' must be {}, not {}", + getPort().getHeader().getByPosition(*column_mapping->column_indexes_for_input_fields[i]).name, + data_types[*column_mapping->column_indexes_for_input_fields[i]]->getName(), types[i]); + } + } + } + else + skipRow(); + } +} + +void RowInputFormatWithNamesAndTypes::insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext) +{ + for (auto index : column_mapping->not_presented_columns) + { + columns[index]->insertDefault(); + ext.read_columns[index] = false; + } +} + +bool RowInputFormatWithNamesAndTypes::readRow(MutableColumns & columns, RowReadExtension & ext) +{ + if (in->eof()) + return false; + + updateDiagnosticInfo(); + skipRowStartDelimiter(); + + ext.read_columns.resize(data_types.size()); + for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + { + const auto & column_index = column_mapping->column_indexes_for_input_fields[file_column]; + const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); + if (column_index) + ext.read_columns[*column_index] = readField( + *columns[*column_index], + data_types[*column_index], + serializations[*column_index], + is_last_file_column, + column_mapping->names_of_columns[file_column]); + else + skipField(column_mapping->names_of_columns[file_column]); + + if (!is_last_file_column) + skipFieldDelimiter(); + } + + skipRowEndDelimiter(); + + insertDefaultsForNotSeenColumns(columns, ext); + + /// If defaults_for_omitted_fields is set to 0, we should leave already inserted defaults. 
+ if (!format_settings.defaults_for_omitted_fields) + ext.read_columns.assign(ext.read_columns.size(), true); + + return true; +} + +void RowInputFormatWithNamesAndTypes::resetParser() +{ + RowInputFormatWithDiagnosticInfo::resetParser(); + column_mapping->column_indexes_for_input_fields.clear(); + column_mapping->not_presented_columns.clear(); +} + +void RowInputFormatWithNamesAndTypes::tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) +{ + const auto & index = column_mapping->column_indexes_for_input_fields[file_column]; + if (index) + { + checkNullValueForNonNullable(type); + const bool is_last_file_column = file_column + 1 == column_mapping->column_indexes_for_input_fields.size(); + readField(column, type, serializations[*index], is_last_file_column, column_mapping->names_of_columns[file_column]); + } + else + { + skipField(column_mapping->names_of_columns[file_column]); + } +} + +bool RowInputFormatWithNamesAndTypes::parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) +{ + if (in->eof()) + { + out << "\n"; + return false; + } + + if (!parseRowStartWithDiagnosticInfo(out)) + return false; + + for (size_t file_column = 0; file_column < column_mapping->column_indexes_for_input_fields.size(); ++file_column) + { + if (column_mapping->column_indexes_for_input_fields[file_column].has_value()) + { + const auto & header = getPort().getHeader(); + size_t col_idx = column_mapping->column_indexes_for_input_fields[file_column].value(); + if (!deserializeFieldAndPrintDiagnosticInfo(header.getByPosition(col_idx).name, data_types[col_idx], *columns[col_idx], out, file_column)) + return false; + } + else + { + static const String skipped_column_str = ""; + static const DataTypePtr skipped_column_type = std::make_shared(); + static const MutableColumnPtr skipped_column = skipped_column_type->createColumn(); + if (!deserializeFieldAndPrintDiagnosticInfo(skipped_column_str, skipped_column_type, *skipped_column, out, file_column)) + return false; + } + + /// Delimiters + if (file_column + 1 != column_mapping->column_indexes_for_input_fields.size()) + { + if (!parseFieldDelimiterWithDiagnosticInfo(out)) + return false; + } + } + + return parseRowEndWithDiagnosticInfo(out); +} + +void registerInputFormatWithNamesAndTypes(const String & base_format_name, RegisterFormatWithNamesAndTypesFunc register_func) +{ + register_func(base_format_name, false, false); + register_func(base_format_name + "WithNames", true, false); + register_func(base_format_name + "WithNamesAndTypes", true, true); +} + +void registerFileSegmentationEngineForFormatWithNamesAndTypes( + FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine) +{ + factory.registerFileSegmentationEngine(base_format_name, segmentation_engine); + factory.registerFileSegmentationEngine(base_format_name + "WithNames", segmentation_engine); + factory.registerFileSegmentationEngine(base_format_name + "WithNamesAndTypes", segmentation_engine); +} + + +} diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h new file mode 100644 index 00000000000..d9413b3a9bc --- /dev/null +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/// Base class for input formats with -WithNames and -WithNamesAndTypes suffixes. 
+class RowInputFormatWithNamesAndTypes : public RowInputFormatWithDiagnosticInfo
+{
+public:
+    /** with_names - in the first line the header with column names
+      * with_types - in the second line the header with column types
+      */
+    RowInputFormatWithNamesAndTypes(
+        const Block & header_,
+        ReadBuffer & in_,
+        const Params & params_,
+        bool with_names_, bool with_types_, const FormatSettings & format_settings_);
+
+    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
+    void readPrefix() override;
+    void resetParser() override;
+
+protected:
+    /// Return false if there was no real value and we inserted default value.
+    virtual bool readField(IColumn & column, const DataTypePtr & type, const SerializationPtr & serialization, bool is_last_file_column, const String & column_name) = 0;
+
+    virtual void skipField(const String & column_name) = 0;
+    virtual void skipRow() = 0;
+    virtual void skipRowStartDelimiter() {}
+    virtual void skipFieldDelimiter() {}
+    virtual void skipRowEndDelimiter() {}
+
+
+    /// Methods for parsing with diagnostic info.
+    virtual void checkNullValueForNonNullable(DataTypePtr /*type*/) {}
+    virtual bool parseRowStartWithDiagnosticInfo(WriteBuffer & /*out*/) { return true; }
+    virtual bool parseFieldDelimiterWithDiagnosticInfo(WriteBuffer & out) = 0;
+    virtual bool parseRowEndWithDiagnosticInfo(WriteBuffer & out) = 0;
+
+    /// Read the list of names or types.
+    virtual std::vector<String> readHeaderRow() = 0;
+
+    const FormatSettings format_settings;
+    DataTypes data_types;
+
+private:
+    bool parseRowAndPrintDiagnosticInfo(MutableColumns & columns, WriteBuffer & out) override;
+    void tryDeserializeField(const DataTypePtr & type, IColumn & column, size_t file_column) override;
+
+    void setupAllColumnsByTableSchema();
+    void addInputColumn(const String & column_name, std::vector<bool> & read_columns);
+    void insertDefaultsForNotSeenColumns(MutableColumns & columns, RowReadExtension & ext);
+
+    bool with_names;
+    bool with_types;
+    std::unordered_map<String, size_t> column_indexes_by_names;
+};
+
+using RegisterFormatWithNamesAndTypesFunc = std::function<void(const String & format_name, bool with_names, bool with_types)>;
+
+void registerInputFormatWithNamesAndTypes(const String & base_format_name, RegisterFormatWithNamesAndTypesFunc register_func);
+
+void registerFileSegmentationEngineForFormatWithNamesAndTypes(
+    FormatFactory & factory, const String & base_format_name, FormatFactory::FileSegmentationEngine segmentation_engine);
+
+}
diff --git a/tests/queries/0_stateless/00300_csv.reference b/tests/queries/0_stateless/00300_csv.reference
index 9d2fe7233d8..42cd22078c4 100644
--- a/tests/queries/0_stateless/00300_csv.reference
+++ b/tests/queries/0_stateless/00300_csv.reference
@@ -1,6 +1,10 @@
+"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
+here"
 "x","y","z","a","b"
 "Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
 here"
+"x","y","z","a","b"
+"String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String"
 "Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline
 here"
 0,"0","[]","2000-01-01","2000-01-01 00:00:00"
diff --git a/tests/queries/0_stateless/00300_csv.sql b/tests/queries/0_stateless/00300_csv.sql
index 0c761ad0af1..76b1b29df06 100644
--- a/tests/queries/0_stateless/00300_csv.sql
+++ b/tests/queries/0_stateless/00300_csv.sql
@@ -1,3 +1,4 @@
-SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames;
 SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSV;
+SELECT 'Hello, 
"World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNames; +SELECT 'Hello, "World"' AS x, 123 AS y, [1, 2, 3] AS z, (456, ['abc', 'def']) AS a, 'Newline\nhere' AS b FORMAT CSVWithNamesAndTypes; SELECT number, toString(number), range(number), toDate('2000-01-01') + number, toDateTime('2000-01-01 00:00:00') + number FROM system.numbers LIMIT 10 FORMAT CSV; diff --git a/tests/queries/0_stateless/00301_csv.sh b/tests/queries/0_stateless/00301_csv.sh index 0aee9abe25c..39721ce1050 100755 --- a/tests/queries/0_stateless/00301_csv.sh +++ b/tests/queries/0_stateless/00301_csv.sh @@ -13,7 +13,7 @@ Hello "world", 789 ,2016-01-03 "Hello world", 100, 2016-01-04, default,, - default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --query="INSERT INTO csv FORMAT CSV"; + default-eof,,' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY d"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; @@ -33,7 +33,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE csv (t Nullable(DateTime('Europe/Moscow echo 'NULL, NULL "2016-01-01 01:02:03",NUL -"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --query="INSERT INTO csv FORMAT CSV"; +"2016-01-02 01:02:03",Nhello' | $CLICKHOUSE_CLIENT --input_format_csv_unquoted_null_literal_as_null=1 --input_format_csv_empty_as_default=1 --query="INSERT INTO csv FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM csv ORDER BY s NULLS LAST"; $CLICKHOUSE_CLIENT --query="DROP TABLE csv"; diff --git a/tests/queries/0_stateless/00938_template_input_format.reference b/tests/queries/0_stateless/00938_template_input_format.reference index ce89532886d..e1f77d9a581 100644 --- a/tests/queries/0_stateless/00938_template_input_format.reference +++ b/tests/queries/0_stateless/00938_template_input_format.reference @@ -23,3 +23,11 @@ cv bn m","","as""df'gh","",456,"2016-01-02" "as""df'gh","","zx cv bn m","",789,"2016-01-04" "qwe,rty","","","",9876543210,"2016-01-03" +==== check raw ==== +"qwe,rty","as""df'gh","","zx +cv bn m",123,"2016-01-01" +"as""df\'gh","","zx +cv bn m","qwe,rty",456,"2016-01-02" +"zx\cv\bn m","qwe,rty","as""df'gh","",789,"2016-01-04" +"","zx +cv bn m","qwe,rty","as""df'gh",9876543210,"2016-01-03" diff --git a/tests/queries/0_stateless/00938_template_input_format.sh b/tests/queries/0_stateless/00938_template_input_format.sh index 75616b35af0..bf7631cf3d5 100755 --- a/tests/queries/0_stateless/00938_template_input_format.sh +++ b/tests/queries/0_stateless/00938_template_input_format.sh @@ -50,6 +50,30 @@ format_template_rows_between_delimiter = ','"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template2 ORDER BY n FORMAT CSV"; +echo "==== check raw ====" + +echo -ne '{prefix} \n${data}\n $$ suffix $$\n' > "$CURDIR"/00938_template_input_format_resultset.tmp +echo -ne 'n:\t${n:Escaped}, s1:\t${0:Raw}\t, s2:\t${1:Quoted}, s3:\t${s3:JSON}, s4:\t${3:CSV}, d:\t${d:Escaped}\t' > "$CURDIR"/00938_template_input_format_row.tmp + + +$CLICKHOUSE_CLIENT --query="TRUNCATE TABLE template1"; + +echo "{prefix}"' '" +n: 123, s1: qwe,rty , s2: 'as\"df\\'gh', s3: \"\", s4: \"zx +cv bn m\", d: 2016-01-01 ; +n: 456, s1: as\"df\\'gh , s2: '', s3: \"zx\\ncv\\tbn m\", s4: \"qwe,rty\", d: 2016-01-02 ; +n: 9876543210, s1: , s2: 'zx\\ncv\\tbn m', s3: \"qwe,rty\", s4: \"as\"\"df'gh\", d: 2016-01-03 ; +n: 789, s1: zx\cv\bn m , s2: 'qwe,rty', s3: 
\"as\\\"df'gh\", s4: \"\", d: 2016-01-04"$'\t'" + $ suffix $" | $CLICKHOUSE_CLIENT --query="INSERT INTO template1 FORMAT Template SETTINGS \ +format_template_resultset = '$CURDIR/00938_template_input_format_resultset.tmp', \ +format_template_row = '$CURDIR/00938_template_input_format_row.tmp', \ +format_template_rows_between_delimiter = ';\n'"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template1 ORDER BY n FORMAT CSV"; + + + $CLICKHOUSE_CLIENT --query="DROP TABLE template1"; $CLICKHOUSE_CLIENT --query="DROP TABLE template2"; rm "$CURDIR"/00938_template_input_format_resultset.tmp "$CURDIR"/00938_template_input_format_row.tmp + diff --git a/tests/queries/0_stateless/01034_JSONCompactEachRow.reference b/tests/queries/0_stateless/01034_JSONCompactEachRow.reference index 6ec53e11fc9..bfc99d688d5 100644 --- a/tests/queries/0_stateless/01034_JSONCompactEachRow.reference +++ b/tests/queries/0_stateless/01034_JSONCompactEachRow.reference @@ -12,6 +12,11 @@ [1, "a"] [2, "b"] [3, "c"] +---------- +["value", "name"] +[1, "a"] +[2, "b"] +[3, "c"] 4 ["name", "c"] ["String", "UInt64"] @@ -31,17 +36,33 @@ 8 ["first", 1, 2, 0] ["second", 2, 0, 6] +["first", 1, 2, 0] +["second", 2, 0, 6] 9 ["first", 1, 2, 8] ["second", 2, 32, 6] +["first", 1, 2, 8] +["second", 2, 32, 6] 10 ["first", 1, 16, 8] ["second", 2, 32, 8] +["first", 1, 16, 8] +["second", 2, 32, 8] 11 ["v1", "v2", "v3", "v4"] ["String", "UInt8", "UInt16", "UInt8"] ["", 2, 3, 1] +["", 2, 3, 1] +---------- +["v1", "v2", "v3", "v4"] +["", 2, 3, 1] +["", 2, 3, 1] 12 ["v1", "n.id", "n.name"] ["UInt8", "Array(UInt8)", "Array(String)"] [16, [15,16,0], ["first","second","third"]] +[16, [15,16,0], ["first","second","third"]] +---------- +["v1", "n.id", "n.name"] +[16, [15,16,0], ["first","second","third"]] +[16, [15,16,0], ["first","second","third"]] diff --git a/tests/queries/0_stateless/01034_JSONCompactEachRow.sql b/tests/queries/0_stateless/01034_JSONCompactEachRow.sql index f5442c90a2a..f71597a60e5 100644 --- a/tests/queries/0_stateless/01034_JSONCompactEachRow.sql +++ b/tests/queries/0_stateless/01034_JSONCompactEachRow.sql @@ -10,8 +10,10 @@ SELECT 2; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRow; SELECT 3; -/* Check JSONCompactEachRowWithNamesAndTypes Output */ +/* Check JSONCompactEachRowWithNames and JSONCompactEachRowWithNamesAndTypes Output */ SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes; +SELECT '----------'; +SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames; SELECT 4; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactEachRowWithNamesAndTypes; @@ -35,30 +37,39 @@ INSERT INTO test_table_2 FORMAT JSONCompactEachRow [16, [15, 16, null], ["first" SELECT * FROM test_table_2 FORMAT JSONCompactEachRow; TRUNCATE TABLE test_table_2; SELECT 8; -/* Check JSONCompactEachRowWithNamesAndTypes Output */ +/* Check JSONCompactEachRowWithNamesAndTypes and JSONCompactEachRowWithNamesAndTypes Input */ SET input_format_null_as_default = 0; INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null]["second", 2, null, 6]; +INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null]["second", 2, null, 6]; SELECT * FROM test_table FORMAT JSONCompactEachRow; TRUNCATE TABLE test_table; SELECT 9; /* Check input_format_null_as_default = 1 */ SET 
input_format_null_as_default = 1; INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", 1, "2", null] ["second", 2, null, 6]; +INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "v3", "v4"]["first", 1, "2", null] ["second", 2, null, 6]; SELECT * FROM test_table FORMAT JSONCompactEachRow; SELECT 10; /* Check Header */ TRUNCATE TABLE test_table; SET input_format_skip_unknown_fields = 1; INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "v2", "invalid_column"]["String", "UInt8", "UInt8"]["first", 1, 32]["second", 2, "64"]; +INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v1", "v2", "invalid_column"]["first", 1, 32]["second", 2, "64"]; SELECT * FROM test_table FORMAT JSONCompactEachRow; SELECT 11; TRUNCATE TABLE test_table; INSERT INTO test_table FORMAT JSONCompactEachRowWithNamesAndTypes ["v4", "v2", "v3"]["UInt8", "UInt8", "UInt16"][1, 2, 3] +INSERT INTO test_table FORMAT JSONCompactEachRowWithNames ["v4", "v2", "v3"][1, 2, 3] SELECT * FROM test_table FORMAT JSONCompactEachRowWithNamesAndTypes; +SELECT '----------'; +SELECT * FROM test_table FORMAT JSONCompactEachRowWithNames; SELECT 12; /* Check Nested */ INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes ["v1", "n.id", "n.name"]["UInt8", "Array(UInt8)", "Array(String)"][16, [15, 16, null], ["first", "second", "third"]]; +INSERT INTO test_table_2 FORMAT JSONCompactEachRowWithNames ["v1", "n.id", "n.name"][16, [15, 16, null], ["first", "second", "third"]]; SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNamesAndTypes; +SELECT '----------'; +SELECT * FROM test_table_2 FORMAT JSONCompactEachRowWithNames; DROP TABLE IF EXISTS test_table; DROP TABLE IF EXISTS test_table_2; diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference index 15fc31538ce..eddbb80198d 100644 --- a/tests/queries/0_stateless/01195_formats_diagnostic_info.reference +++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.reference @@ -1,5 +1,5 @@ CSV -Column 2, name: d, type: Decimal(18, 10), parsed text: "123456789"ERROR +Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR ERROR: garbage after DateTime: "7, Hello" ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. ERROR: There is no line feed. "1" found instead. @@ -28,3 +28,14 @@ ERROR: There is no delimiter before field 1: expected "", got "7Hello< ERROR: There is no delimiter after last field: expected "", got "1" ERROR: There is no delimiter after last field: expected "", got "Hello" Column 0, name: t, type: DateTime, ERROR: text "" is not like DateTime +JSONCompactEachRow +Column 2, name: d, type: Decimal(18, 10), parsed text: " 123456789"ERROR +Column 0, name: t, type: DateTime, parsed text: "2020-04-21 12:34:56"ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: garbage after DateTime: "7, Hello" +ERROR: DateTime must be in YYYY-MM-DD hh:mm:ss or NNNNNNNNNN (unix timestamp, exactly 10 digits) format. +ERROR: There is no closing parenthesis (']') at the end of the row. "," found instead. +Column 1, name: s, type: String, parsed text: ERROR +ERROR: There is no '[' before the row. +ERROR: garbage after Decimal(18, 10): ";" +ERROR: There is no comma. ";" found instead. +ERROR: Closing parenthesis (']') found where comma is expected. 
It's like your file has less columns than expected.
diff --git a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh
index 6c64b17f719..dde410d95c4 100755
--- a/tests/queries/0_stateless/01195_formats_diagnostic_info.sh
+++ b/tests/queries/0_stateless/01195_formats_diagnostic_info.sh
@@ -38,3 +38,19 @@ echo -e '2020-04-21 12:34:567\tHello\t123456789' | "${PARSER[@]}" 2>&1| grep "ER
 echo -e '2020-04-21 12:34:56\tHello\t12345678\t1' | "${PARSER[@]}" 2>&1| grep "ERROR"
 echo -e '2020-04-21 12:34:56\t\t123Hello' | "${PARSER[@]}" 2>&1| grep "ERROR"
 echo -e '2020-04-21 12:34:56\tHello\t12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+
+PARSER=(${CLICKHOUSE_LOCAL} --query 'SELECT t, s, d FROM table' --structure 't DateTime, s String, d Decimal64(10)' --input-format JSONCompactEachRow)
+echo '["2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR" || echo "JSONCompactEachRow"
+echo '["2020-04-21 12:34:56", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '["2020-04-21 12:34:567", "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '["2020-04-21 12:34:56"7, "Hello", 123456789]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '["2020-04-21 12:34:56", "Hello", 12345678,1]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo '["2020-04-21 12:34:56",,123Hello]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '"2020-04-21 12:34:56", "Hello", 12345678]' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello", 12345678;' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello", 12345678' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello", 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello"; 12345678\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello"\n' | "${PARSER[@]}" 2>&1| grep "ERROR"
+echo -e '["2020-04-21 12:34:56", "Hello"]' | "${PARSER[@]}" 2>&1| grep "ERROR"
diff --git a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference
index 6f1974ccd73..ffea4c736dc 100644
--- a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference
+++ b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.reference
@@ -7,7 +7,21 @@ number
 UInt64
 0
 1
+TSVRawWithNames
+number
+0
+1
+TSVRawWithNamesAndTypes
+number
+UInt64
+0
+1
 CSVWithNames
 "number"
 0
 1
+CSVWithNamesAndTypes
+"number"
+"UInt64"
+0
+1
diff --git a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh
index ad9cc2c53a8..69f3ab1c9a8 100755
--- a/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh
+++ b/tests/queries/0_stateless/01375_output_format_tsv_csv_with_names.sh
@@ -15,5 +15,14 @@ ${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNames
 echo 'TSVWithNamesAndTypes'
 ${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVWithNamesAndTypes
+echo 'TSVRawWithNames'
+${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVRawWithNames
+
+echo 'TSVRawWithNamesAndTypes'
+${CLICKHOUSE_LOCAL} "${opts[@]}" --format TSVRawWithNamesAndTypes
+
 echo 'CSVWithNames'
 ${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNames
+
+echo 'CSVWithNamesAndTypes'
+${CLICKHOUSE_LOCAL} "${opts[@]}" --format CSVWithNamesAndTypes
diff --git 
a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference index 4f2a79b9905..78286b89a39 100644 --- a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference +++ b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.reference @@ -2,6 +2,11 @@ zero rows TSVWithNames TSVWithNamesAndTypes CSVWithNames +CSVWithNamesAndTypes +JSONCompactEachRowWithNames +JSONCompactEachRowWithNamesAndTypes +JSONCompactStringsEachRow +JSONCompactStringsEachRowWithNamesAndTypes multi clickhouse-local one file TSVWithNames 0 @@ -15,3 +20,23 @@ CSVWithNames 0 0 0 +CSVWithNamesAndTypes +0 +0 +0 +JSONCompactEachRowWithNames +0 +0 +0 +JSONCompactEachRowWithNamesAndTypes +0 +0 +0 +JSONCompactStringsEachRow +0 +0 +0 +JSONCompactStringsEachRowWithNamesAndTypes +0 +0 +0 diff --git a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh index 469f7e7008b..7731deaa8ff 100755 --- a/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh +++ b/tests/queries/0_stateless/01375_storage_file_tsv_csv_with_names_write_prefix.sh @@ -6,26 +6,26 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # zero rows echo 'zero rows' -for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do +for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do echo $format ${CLICKHOUSE_LOCAL} --query=" - CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1) WHERE number < 0; + CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1) WHERE number < 0; SELECT * FROM ${format}_01375; DROP TABLE ${format}_01375; " - rm 01375_$format.tsv + rm 01375_$format done # run multiple times to the same file echo 'multi clickhouse-local one file' -for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames; do +for format in TSVWithNames TSVWithNamesAndTypes CSVWithNames CSVWithNamesAndTypes JSONCompactEachRowWithNames JSONCompactEachRowWithNamesAndTypes JSONCompactStringsEachRow JSONCompactStringsEachRowWithNamesAndTypes; do echo $format for _ in {1..2}; do ${CLICKHOUSE_LOCAL} --query=" - CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format.tsv') AS SELECT * FROM numbers(1); + CREATE TABLE ${format}_01375 ENGINE File($format, '01375_$format') AS SELECT * FROM numbers(1); SELECT * FROM ${format}_01375; DROP TABLE ${format}_01375; " done - rm 01375_$format.tsv + rm 01375_$format done diff --git a/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference b/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference index fb1a066f272..8a69cf26ffd 100644 --- a/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference +++ b/tests/queries/0_stateless/01448_json_compact_strings_each_row.reference @@ -12,6 +12,11 @@ ["1", "a"] ["2", "b"] ["3", "c"] +---------- +["value", "name"] +["1", "a"] +["2", "b"] +["3", "c"] 4 ["name", "c"] ["String", "UInt64"] @@ -31,17 +36,33 @@ 8 ["first", "1", "2", "0"] ["second", "2", "0", "6"] +["first", "1", "2", "0"] +["second", "2", "0", "6"] 9 ["first", "1", "2", "8"] ["second", "2", "32", "6"] +["first", "1", "2", "8"] +["second", "2", 
"32", "6"] 10 ["first", "1", "16", "8"] ["second", "2", "32", "8"] +["first", "1", "16", "8"] +["second", "2", "32", "8"] 11 ["v1", "v2", "v3", "v4"] ["String", "UInt8", "UInt16", "UInt8"] ["", "2", "3", "1"] +["", "2", "3", "1"] +--------- +["v1", "v2", "v3", "v4"] +["", "2", "3", "1"] +["", "2", "3", "1"] 12 ["v1", "n.id", "n.name"] ["UInt8", "Array(UInt8)", "Array(String)"] ["16", "[15,16,17]", "['first','second','third']"] +["16", "[15,16,17]", "['first','second','third']"] +--------- +["v1", "n.id", "n.name"] +["16", "[15,16,17]", "['first','second','third']"] +["16", "[15,16,17]", "['first','second','third']"] diff --git a/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql b/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql index 925faa3a17f..869041193cf 100644 --- a/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql +++ b/tests/queries/0_stateless/01448_json_compact_strings_each_row.sql @@ -12,8 +12,10 @@ SELECT 2; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactStringsEachRow; SELECT 3; -/* Check JSONCompactStringsEachRowWithNamesAndTypes Output */ +/* Check JSONCompactStringsEachRowWithNames and JSONCompactStringsEachRowWithNamesAndTypes Output */ SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes; +SELECT '----------'; +SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNames; SELECT 4; /* Check Totals */ SELECT name, count() AS c FROM test_table GROUP BY name WITH TOTALS ORDER BY name FORMAT JSONCompactStringsEachRowWithNamesAndTypes; @@ -37,30 +39,39 @@ INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRow ["16", "[15, 16, 17]", SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRow; TRUNCATE TABLE test_table_2; SELECT 8; -/* Check JSONCompactStringsEachRowWithNamesAndTypes Output */ +/* Check JSONCompactStringsEachRowWithNames and JSONCompactStringsEachRowWithNamesAndTypes Input */ SET input_format_null_as_default = 0; INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", "1", "2", "null"]["second", "2", "null", "6"]; +INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "v3", "v4"]["first", "1", "2", "null"]["second", "2", "null", "6"]; SELECT * FROM test_table FORMAT JSONCompactStringsEachRow; TRUNCATE TABLE test_table; SELECT 9; /* Check input_format_null_as_default = 1 */ SET input_format_null_as_default = 1; INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "v3", "v4"]["String","UInt8","UInt16","UInt8"]["first", "1", "2", "null"] ["second", "2", "null", "6"]; +INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "v3", "v4"]["first", "1", "2", "null"] ["second", "2", "null", "6"]; SELECT * FROM test_table FORMAT JSONCompactStringsEachRow; SELECT 10; /* Check Header */ TRUNCATE TABLE test_table; SET input_format_skip_unknown_fields = 1; INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "v2", "invalid_column"]["String", "UInt8", "UInt8"]["first", "1", "32"]["second", "2", "64"]; +INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v1", "v2", "invalid_column"]["first", "1", "32"]["second", "2", "64"]; SELECT * FROM test_table FORMAT JSONCompactStringsEachRow; SELECT 11; TRUNCATE TABLE test_table; INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v4", "v2", "v3"]["UInt8", "UInt8", 
"UInt16"]["1", "2", "3"] +INSERT INTO test_table FORMAT JSONCompactStringsEachRowWithNames ["v4", "v2", "v3"]["1", "2", "3"] SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNamesAndTypes; +SELECT '---------'; +SELECT * FROM test_table FORMAT JSONCompactStringsEachRowWithNames; SELECT 12; /* Check Nested */ INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRowWithNamesAndTypes ["v1", "n.id", "n.name"]["UInt8", "Array(UInt8)", "Array(String)"]["16", "[15, 16, 17]", "['first', 'second', 'third']"]; +INSERT INTO test_table_2 FORMAT JSONCompactStringsEachRowWithNames ["v1", "n.id", "n.name"]["16", "[15, 16, 17]", "['first', 'second', 'third']"]; SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRowWithNamesAndTypes; +SELECT '---------'; +SELECT * FROM test_table_2 FORMAT JSONCompactStringsEachRowWithNames; DROP TABLE IF EXISTS test_table; DROP TABLE IF EXISTS test_table_2; diff --git a/tests/queries/0_stateless/02097_json_strings_deserialization.reference b/tests/queries/0_stateless/02097_json_strings_deserialization.reference new file mode 100644 index 00000000000..8d7ffe54606 --- /dev/null +++ b/tests/queries/0_stateless/02097_json_strings_deserialization.reference @@ -0,0 +1,4 @@ +test\n\t\0\n test\n\t\0\n +test\n\t\0\n test\n\t\0\n +test\n\t\0\n test\n\t\0\n +test\n\t\0\n test\n\t\0\n diff --git a/tests/queries/0_stateless/02097_json_strings_deserialization.sh b/tests/queries/0_stateless/02097_json_strings_deserialization.sh new file mode 100755 index 00000000000..ae9e1ea7645 --- /dev/null +++ b/tests/queries/0_stateless/02097_json_strings_deserialization.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_02097" +$CLICKHOUSE_CLIENT -q "create table test_02097 (s String, f FixedString(8)) engine=Memory()" +echo -e "('test\n\t\0\n', 'test\n\t\0\n')" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format Values" +$CLICKHOUSE_CLIENT -q "select * from test_02097 format JSONStringsEachRow" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format JSONStringsEachRow" +$CLICKHOUSE_CLIENT -q "select * from test_02097 format JSONCompactStringsEachRow" | $CLICKHOUSE_CLIENT -q "insert into test_02097 format JSONCompactStringsEachRow" +$CLICKHOUSE_CLIENT -q "select * from test_02097" +$CLICKHOUSE_CLIENT -q "drop table test_02097" + diff --git a/tests/queries/0_stateless/02098_with_types_use_header.reference b/tests/queries/0_stateless/02098_with_types_use_header.reference new file mode 100644 index 00000000000..c1d70452d1d --- /dev/null +++ b/tests/queries/0_stateless/02098_with_types_use_header.reference @@ -0,0 +1,16 @@ +TSVWithNamesAndTypes +OK +OK +OK +CSVWithNamesAndTypes +OK +OK +OK +JSONCompactEachRowWithNamesAndTypes +OK +OK +OK +JSONCompactStringsEachRowWithNamesAndTypes +OK +OK +OK diff --git a/tests/queries/0_stateless/02098_with_types_use_header.sh b/tests/queries/0_stateless/02098_with_types_use_header.sh new file mode 100755 index 00000000000..cbeb783aed0 --- /dev/null +++ b/tests/queries/0_stateless/02098_with_types_use_header.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02098" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02098 (x UInt32, y String, z Date) engine=Memory()" + +echo "TSVWithNamesAndTypes" +echo -e "x\ty\tz\nString\tDate\tUInt32\ntext\t2020-01-01\t1" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' +echo -e "y\tz\tx\nString\tDate\tUInt32\ntext\t2020-01-01\t1" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" && echo 'OK' || echo 'FAIL' +echo -e "x\tz\ty\nUInt32\tString\tDate\n1\ttext\t2020-01-01" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT TSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' + + +echo "CSVWithNamesAndTypes" +echo -e "'x','y','z'\n'String','Date','UInt32'\n'text','2020-01-01',1" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' +echo -e "'y','z','x'\n'String','Date','UInt32'\n'text','2020-01-01',1" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" && echo 'OK' || echo 'FAIL' +echo -e "'x','z','y'\n'UInt32','String',Date'\n1,'text','2020-01-01'" | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT CSVWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' + + +echo "JSONCompactEachRowWithNamesAndTypes" +echo -e '["x","y","z"]\n["String","Date","UInt32"]\n["text","2020-01-01",1]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' +echo -e '["y","z","x"]\n["String","Date","UInt32"]\n["text","2020-01-01",1]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" && echo 'OK' || echo 'FAIL' +echo -e '["x","z","y"]\n["UInt32", "String", "Date"]\n[1, "text","2020-01-01"]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' + +echo "JSONCompactStringsEachRowWithNamesAndTypes" +echo -e '["x","y","z"]\n["String","Date","UInt32"]\n["text","2020-01-01","1"]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' +echo -e '["y","z","x"]\n["String","Date","UInt32"]\n["text","2020-01-01","1"]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" && echo 'OK' || echo 'FAIL' +echo -e '["x","z","y"]\n["UInt32", "String", "Date"]\n["1", "text","2020-01-01"]' | $CLICKHOUSE_CLIENT --input_format_with_types_use_header=1 -q "INSERT INTO test_02098 FORMAT JSONCompactStringsEachRowWithNamesAndTypes" 2>&1 | grep -F -q "INCORRECT_DATA" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT -q "DROP TABLE test_02098" diff --git a/tests/queries/0_stateless/02099_tsv_raw_format.reference b/tests/queries/0_stateless/02099_tsv_raw_format.reference new file mode 100644 index 
00000000000..de46cf8dff7 --- /dev/null +++ b/tests/queries/0_stateless/02099_tsv_raw_format.reference @@ -0,0 +1,113 @@ +TSVRaw +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +TSVRawWithNames +number string date +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +TSVRawWithNamesAndTypes +number string date +UInt64 String Date +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +TabSeparatedRaw +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +TabSeparatedRawWithNames +number string date +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +TabSeparatedRawWithNamesAndTypes +number string date +UInt64 String Date +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 0 1970-01-01 +1 1 1970-01-02 +2 2 1970-01-03 +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +0 +\N +2 +\N +nSome text +b1cad4eb4be08a40387c9de70d02fcc2 - +b1cad4eb4be08a40387c9de70d02fcc2 - diff --git a/tests/queries/0_stateless/02099_tsv_raw_format.sh b/tests/queries/0_stateless/02099_tsv_raw_format.sh new file mode 100755 index 00000000000..ef59e399bdf --- /dev/null +++ b/tests/queries/0_stateless/02099_tsv_raw_format.sh @@ -0,0 +1,59 @@ +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02099" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02099 (number UInt64, string String, date Date) ENGINE=Memory()" + +FORMATS=('TSVRaw' 'TSVRawWithNames' 'TSVRawWithNamesAndTypes' 'TabSeparatedRaw' 'TabSeparatedRawWithNames' 'TabSeparatedRawWithNamesAndTypes') + +for format in "${FORMATS[@]}" +do + echo $format + $CLICKHOUSE_CLIENT -q "INSERT INTO test_02099 SELECT number, toString(number), toDate(number) FROM numbers(3)" + $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099 FORMAT $format" + + $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099 FORMAT $format" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_02099 FORMAT $format" + $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02099" + + $CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02099" +done + +$CLICKHOUSE_CLIENT -q "DROP TABLE test_02099" + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_nullable_02099" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_nullable_02099 ENGINE=Memory() AS SELECT number % 2 ? 
NULL : number from numbers(4)"; + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSVRaw" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099" + + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSV" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSVRaw" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099 FORMAT TSVRaw" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_02099 FORMAT TSV" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_02099" + +$CLICKHOUSE_CLIENT -q "DROP TABLE test_nullable_02099" + + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_nullable_string_02099" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_nullable_string_02099 (s Nullable(String)) ENGINE=Memory()"; + +echo 'nSome text' | $CLICKHOUSE_CLIENT -q "INSERT INTO test_nullable_string_02099 FORMAT TSVRaw" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_nullable_string_02099" +$CLICKHOUSE_CLIENT -q "DROP TABLE test_nullable_string_02099" + + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_parallel_parsing_02099" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_parallel_parsing_02099 (x UInt64, a Array(UInt64), s String) ENGINE=Memory()"; +$CLICKHOUSE_CLIENT -q "SELECT number AS x, range(number % 50) AS a, toString(a) AS s FROM numbers(1000000) FORMAT TSVRaw" | $CLICKHOUSE_CLIENT --input_format_parallel_parsing=0 -q "INSERT INTO test_parallel_parsing_02099 FORMAT TSVRaw" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_parallel_parsing_02099 ORDER BY x" | md5sum + +$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_parallel_parsing_02099" + +$CLICKHOUSE_CLIENT -q "SELECT number AS x, range(number % 50) AS a, toString(a) AS s FROM numbers(1000000) FORMAT TSVRaw" | $CLICKHOUSE_CLIENT --input_format_parallel_parsing=1 -q "INSERT INTO test_parallel_parsing_02099 FORMAT TSVRaw" +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_parallel_parsing_02099 ORDER BY x" | md5sum + +$CLICKHOUSE_CLIENT -q "DROP TABLE test_parallel_parsing_02099" + diff --git a/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference new file mode 100644 index 00000000000..12b4d6ad854 --- /dev/null +++ b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.reference @@ -0,0 +1,14 @@ +CSV +\N +TSV +\N +TSVRaw +\N +TSKV +\N +JSONCompactEachRow +\N +JSONEachRow +\N +Values +\N diff --git a/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh new file mode 100755 index 00000000000..d380e784229 --- /dev/null +++ b/tests/queries/0_stateless/02100_low_cardinality_nullable_null_default.sh @@ -0,0 +1,21 @@ +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02100" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02100 (x LowCardinality(Nullable(String)) DEFAULT 'default') ENGINE=Memory()" + +FORMATS=('CSV' 'TSV' 'TSVRaw' 'TSKV' 'JSONCompactEachRow' 'JSONEachRow' 'Values') + +for format in "${FORMATS[@]}" +do + echo $format + $CLICKHOUSE_CLIENT -q "SELECT NULL as x FORMAT $format" | $CLICKHOUSE_CLIENT -q "INSERT INTO test_02100 FORMAT $format" + + $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02100" + + $CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02100" +done + +$CLICKHOUSE_CLIENT -q "DROP TABLE test_02100" + diff --git a/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference new file mode 100644 index 00000000000..61444c7a238 --- /dev/null +++ b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.reference @@ -0,0 +1,16 @@ +TSV +1 42 +2 0 +3 42 +4 0 +CSV +1 42 +2 0 +3 42 +4 0 +JSONEachRow +1 42 +2 0 +JSONCompactEachRow +1 42 +2 0 diff --git a/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh new file mode 100755 index 00000000000..344982bcd84 --- /dev/null +++ b/tests/queries/0_stateless/02101_empty_as_default_and_omitted_fields.sh @@ -0,0 +1,39 @@ +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test_02101" +$CLICKHOUSE_CLIENT -q "CREATE TABLE test_02101 (x UInt64, y UInt64 DEFAULT 42) ENGINE=Memory()" + +echo 'TSV' +echo -e 'x\ty\n1\t' | $CLICKHOUSE_CLIENT --input_format_tsv_empty_as_default=1 --input_format_defaults_for_omitted_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames" +echo -e 'x\ty\n2\t' | $CLICKHOUSE_CLIENT --input_format_tsv_empty_as_default=1 --input_format_defaults_for_omitted_fields=0 -q "INSERT INTO test_02101 FORMAT TSVWithNames" +echo -e 'x\tz\n3\t123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames" +echo -e 'x\tz\n4\t123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT TSVWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x" +$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101" + +echo 'CSV' +echo -e '"x","y"\n1,' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_defaults_for_omitted_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames" +echo -e '"x","y"\n2,' | $CLICKHOUSE_CLIENT --input_format_csv_empty_as_default=1 --input_format_defaults_for_omitted_fields=0 -q "INSERT INTO test_02101 FORMAT CSVWithNames" +echo -e '"x","z"\n3,123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames" +echo -e '"x","z"\n4,123' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT CSVWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x" +$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101" + +echo 'JSONEachRow' +echo -e '{"x" : 1, "z" : 123}' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONEachRow" +echo -e '{"x" : 
2, "z" : 123}' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONEachRow" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x" +$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02101" + +echo 'JSONCompactEachRow' +echo -e '["x", "z"], [1, 123]' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=1 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONCompactEachRowWithNames" +echo -e '["x", "z"], [2, 123]' | $CLICKHOUSE_CLIENT --input_format_defaults_for_omitted_fields=0 --input_format_skip_unknown_fields=1 -q "INSERT INTO test_02101 FORMAT JSONCompactEachRowWithNames" + +$CLICKHOUSE_CLIENT -q "SELECT * FROM test_02101 ORDER BY x" +$CLICKHOUSE_CLIENT -q "DROP TABLE test_02101" + diff --git a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference index 6d663c33057..7ad5359a30e 100644 --- a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference +++ b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.reference @@ -1,12 +1,28 @@ JSONEachRow, false -7251839681e559f5a92db107571bb357 - +e0a3c9978a92a277f2fff4664f3c1749 - JSONEachRow, true -7251839681e559f5a92db107571bb357 - +e0a3c9978a92a277f2fff4664f3c1749 - JSONCompactEachRow, false -ba1081a754a06ef6563840b2d8d4d327 - +0c1efbbc25a5bd90a2ecea559d283667 - JSONCompactEachRow, true -ba1081a754a06ef6563840b2d8d4d327 - +0c1efbbc25a5bd90a2ecea559d283667 - +JSONCompactStringsEachRow, false +0c1efbbc25a5bd90a2ecea559d283667 - +JSONCompactStringsEachRow, true +0c1efbbc25a5bd90a2ecea559d283667 - +JSONCompactEachRowWithNames, false +b9e4f8ecadbb650245d1762f4187ee0a - +JSONCompactEachRowWithNames, true +b9e4f8ecadbb650245d1762f4187ee0a - +JSONCompactStringsEachRowWithNames, false +b9e4f8ecadbb650245d1762f4187ee0a - +JSONCompactStringsEachRowWithNames, true +b9e4f8ecadbb650245d1762f4187ee0a - +JSONCompactEachRowWithNamesAndTypes, false +8b41f7375999b53d4c9607398456fe5b - +JSONCompactEachRowWithNamesAndTypes, true +8b41f7375999b53d4c9607398456fe5b - JSONCompactStringsEachRowWithNamesAndTypes, false -31ded3cd9971b124450fb5a44a8bce63 - +8b41f7375999b53d4c9607398456fe5b - JSONCompactStringsEachRowWithNamesAndTypes, true -31ded3cd9971b124450fb5a44a8bce63 - +8b41f7375999b53d4c9607398456fe5b - diff --git a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh index 5d54328e45d..f6c87eabfde 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_json_and_friends.sh @@ -6,15 +6,15 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CURDIR"/../shell_config.sh -FORMATS=('JSONEachRow' 'JSONCompactEachRow' 'JSONCompactStringsEachRowWithNamesAndTypes') +FORMATS=('JSONEachRow' 'JSONCompactEachRow' 'JSONCompactStringsEachRow' 'JSONCompactEachRowWithNames' 'JSONCompactStringsEachRowWithNames' 'JSONCompactEachRowWithNamesAndTypes' 'JSONCompactStringsEachRowWithNamesAndTypes') for format in "${FORMATS[@]}" do echo "$format, false"; $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \ - "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum + "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c LIMIT 3000000 Format $format" | md5sum echo "$format, true"; $CLICKHOUSE_CLIENT --output_format_parallel_formatting=true -q \ - "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum + "SELECT ClientEventTime::DateTime('Europe/Moscow') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c LIMIT 3000000 Format $format" | md5sum done diff --git a/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference new file mode 100644 index 00000000000..0c0367694b2 --- /dev/null +++ b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.reference @@ -0,0 +1,20 @@ +TSVWithNamesAndTypes, false +7c1feeaae418e502d66fcc8e31946f2e - +TSVWithNamesAndTypes, true +7c1feeaae418e502d66fcc8e31946f2e - +CSVWithNamesAndTypes, false +7c1feeaae418e502d66fcc8e31946f2e - +CSVWithNamesAndTypes, true +7c1feeaae418e502d66fcc8e31946f2e - +JSONStringsEachRow, false +7c1feeaae418e502d66fcc8e31946f2e - +JSONStringsEachRow, true +7c1feeaae418e502d66fcc8e31946f2e - +JSONCompactEachRowWithNamesAndTypes, false +7c1feeaae418e502d66fcc8e31946f2e - +JSONCompactEachRowWithNamesAndTypes, true +7c1feeaae418e502d66fcc8e31946f2e - +JSONCompactStringsEachRowWithNamesAndTypes, false +7c1feeaae418e502d66fcc8e31946f2e - +JSONCompactStringsEachRowWithNamesAndTypes, true +7c1feeaae418e502d66fcc8e31946f2e - diff --git a/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh new file mode 100755 index 00000000000..9fdca20d097 --- /dev/null +++ b/tests/queries/1_stateful/00167_parallel_parsing_with_names_and_types.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +FORMATS=('TSVWithNamesAndTypes' 'CSVWithNamesAndTypes' 'JSONStringsEachRow' 'JSONCompactEachRowWithNamesAndTypes' 'JSONCompactStringsEachRowWithNamesAndTypes') +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names" + +for format in "${FORMATS[@]}" +do + # Columns are permuted + $CLICKHOUSE_CLIENT -q "CREATE TABLE parsing_with_names(c FixedString(16), a DateTime('Europe/Moscow'), b String) ENGINE=Memory()" + + echo "$format, false"; + $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \ + "SELECT URLRegions as d, toTimeZone(ClientEventTime, 'Europe/Moscow') as a, MobilePhoneModel as b, ParamPrice as e, ClientIP6 as c FROM test.hits LIMIT 5000 Format $format" | \ + $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_parallel_parsing=false -q "INSERT INTO parsing_with_names FORMAT $format SETTINGS input_format_null_as_default=0" + + $CLICKHOUSE_CLIENT -q "SELECT * FROM parsing_with_names;" | md5sum + $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names" + + + $CLICKHOUSE_CLIENT -q "CREATE TABLE parsing_with_names(c FixedString(16), a DateTime('Europe/Moscow'), b String) ENGINE=Memory()" + echo "$format, true"; + $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \ + "SELECT URLRegions as d, toTimeZone(ClientEventTime, 'Europe/Moscow') as a, MobilePhoneModel as b, ParamPrice as e, ClientIP6 as c FROM test.hits LIMIT 5000 Format $format" | \ + $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_parallel_parsing=true -q "INSERT INTO parsing_with_names FORMAT $format SETTINGS input_format_null_as_default=0" + + $CLICKHOUSE_CLIENT -q "SELECT * FROM parsing_with_names;" | md5sum + $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS parsing_with_names" +done