diff --git a/src/Core/FormatFactorySettings.h b/src/Core/FormatFactorySettings.h
index a095bffc4c9..45f8d9bb9b6 100644
--- a/src/Core/FormatFactorySettings.h
+++ b/src/Core/FormatFactorySettings.h
@@ -1242,6 +1242,9 @@ Set the quoting rule for identifiers in SHOW CREATE query
 )", 0) \
     DECLARE(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, R"(
 Set the quoting style for identifiers in SHOW CREATE query
+)", 0) \
+    DECLARE(String, composed_data_type_output_format_mode, "default", R"(
+Set composed data type output format mode, default or spark.
 )", 0) \
 
 // End of FORMAT_FACTORY_SETTINGS
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index f0d3e001362..364d7c2cb4a 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -138,6 +138,8 @@ static std::initializer_list
-static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested)
+static void serializeTextImpl(const IColumn & column, size_t row_num, bool spark_mode, WriteBuffer & ostr, Writer && write_nested)
 {
     const ColumnArray & column_array = assert_cast(column);
     const ColumnArray::Offsets & offsets = column_array.getOffsets();
@@ -412,10 +412,14 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe
     const IColumn & nested_column = column_array.getData();
 
     writeChar('[', ostr);
-    for (size_t i = offset; i < next_offset; ++i)
+
+    if (next_offset != offset)
+        write_nested(nested_column, offset);
+    for (size_t i = offset + 1; i < next_offset; ++i)
     {
-        if (i != offset)
-            writeChar(',', ostr);
+        writeChar(',', ostr);
+        if (spark_mode)
+            writeChar(' ', ostr);
         write_nested(nested_column, i);
     }
     writeChar(']', ostr);
@@ -520,10 +524,15 @@ static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reade
 
 void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
-    serializeTextImpl(column, row_num, ostr,
+    /// The mode is the same for the whole call, so compare the string once here instead of once per element.
+    const bool spark_mode = settings.composed_data_type_output_format_mode == "spark";
+    serializeTextImpl(column, row_num, spark_mode, ostr,
         [&](const IColumn & nested_column, size_t i)
         {
-            nested->serializeTextQuoted(nested_column, i, ostr, settings);
+            if (spark_mode)
+                nested->serializeText(nested_column, i, ostr, settings);
+            else
+                nested->serializeTextQuoted(nested_column, i, ostr, settings);
         });
 }
 
diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp
index 6538589e4f8..c51255d1dce 100644
--- a/src/DataTypes/Serializations/SerializationMap.cpp
+++ b/src/DataTypes/Serializations/SerializationMap.cpp
@@ -90,6 +90,7 @@ template
 void SerializationMap::serializeTextImpl(
     const IColumn & column,
     size_t row_num,
+    bool spark_mode,
     WriteBuffer & ostr,
     KeyWriter && key_writer,
     ValueWriter && value_writer) const
@@ -104,15 +105,31 @@ void SerializationMap::serializeTextImpl(
     size_t next_offset = offsets[row_num];
 
     writeChar('{', ostr);
-    for (size_t i = offset; i < next_offset; ++i)
+    if (offset != next_offset)
     {
-        if (i != offset)
-            writeChar(',', ostr);
-
-        key_writer(ostr, key, nested_tuple.getColumn(0), i);
-        writeChar(':', ostr);
-        value_writer(ostr, value, nested_tuple.getColumn(1), i);
+        key_writer(ostr, key, nested_tuple.getColumn(0), offset);
+        if (spark_mode)
+            writeString(std::string_view(" -> "), ostr);
+        else
+            writeChar(':', ostr);
+        value_writer(ostr, value, nested_tuple.getColumn(1), offset);
     }
+    if (spark_mode)
+        for (size_t i = offset + 1; i < next_offset; ++i)
+        {
+            writeString(std::string_view(", "), ostr);
+            key_writer(ostr, key, nested_tuple.getColumn(0), i);
+            writeString(std::string_view(" -> "), ostr);
+            value_writer(ostr, value, nested_tuple.getColumn(1), i);
+        }
+    else
+        for (size_t i = offset + 1; i < next_offset; ++i)
+        {
+            writeChar(',', ostr);
+            key_writer(ostr, key, nested_tuple.getColumn(0), i);
+            writeChar(':', ostr);
+            value_writer(ostr, value, nested_tuple.getColumn(1), i);
+        }
     writeChar('}', ostr);
 }
 
@@ -221,10 +238,15 @@ void SerializationMap::serializeText(const IColumn & column, size_t row_num, Wri
 {
-    auto writer = [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
+    /// The mode is the same for the whole call, so compare the string once here instead of once per key and value.
+    const bool spark_mode = settings.composed_data_type_output_format_mode == "spark";
+    auto writer = [&settings, spark_mode](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
     {
-        subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings);
+        if (spark_mode)
+            subcolumn_serialization->serializeText(subcolumn, pos, buf, settings);
+        else
+            subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings);
     };
 
-    serializeTextImpl(column, row_num, ostr, writer, writer);
+    serializeTextImpl(column, row_num, spark_mode, ostr, writer, writer);
 }
 
 void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const
@@ -266,7 +288,8 @@ bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, c
 
 void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
-    serializeTextImpl(column, row_num, ostr,
+    /// Always pass spark_mode = false here: the " -> " and ", " separators would not be valid JSON.
+    serializeTextImpl(column, row_num, false, ostr,
         [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
         {
             /// We need to double-quote all keys (including integers) to produce valid JSON.
diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h
index 007d153ec7e..74229bcfb15 100644
--- a/src/DataTypes/Serializations/SerializationMap.h
+++ b/src/DataTypes/Serializations/SerializationMap.h
@@ -70,7 +70,8 @@ public:
 
 private:
+    /// Writes one row as '{' ... '}'. With spark_mode the separators are " -> " and ", ", otherwise ':' and ','.
     template
-    void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
+    void serializeTextImpl(const IColumn & column, size_t row_num, bool spark_mode, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
 
     template
     ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;
diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp
index 366949e7ac0..ec92ceb9110 100644
--- a/src/DataTypes/Serializations/SerializationTuple.cpp
+++ b/src/DataTypes/Serializations/SerializationTuple.cpp
@@ -137,12 +137,27 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr,
 void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
+    /// Compare the mode string once; both the first element and the loop below depend on it.
+    const bool spark_mode = settings.composed_data_type_output_format_mode == "spark";
     writeChar('(', ostr);
-    for (size_t i = 0; i < elems.size(); ++i)
+    if (!elems.empty())
     {
-        if (i != 0)
-            writeChar(',', ostr);
-        elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
+        if (spark_mode)
+            elems[0]->serializeText(extractElementColumn(column, 0), row_num, ostr, settings);
+        else
+            elems[0]->serializeTextQuoted(extractElementColumn(column, 0), row_num, ostr, settings);
     }
+    if (spark_mode)
+        for (size_t i = 1; i < elems.size(); ++i)
+        {
+            writeString(std::string_view(", "), ostr);
+            elems[i]->serializeText(extractElementColumn(column, i), row_num, ostr, settings);
+        }
+    else
+        for (size_t i = 1; i < elems.size(); ++i)
+        {
+            writeChar(',', ostr);
+            elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
+        }
     writeChar(')', ostr);
 }
 
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
index 7239229d417..fa464b9fb7c 100644
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@@ -251,6 +251,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
     format_settings.values.deduce_templates_of_expressions = settings[Setting::input_format_values_deduce_templates_of_expressions];
     format_settings.values.interpret_expressions = settings[Setting::input_format_values_interpret_expressions];
     format_settings.values.escape_quote_with_quote = settings[Setting::output_format_values_escape_quote_with_quote];
+    format_settings.composed_data_type_output_format_mode = settings[Setting::composed_data_type_output_format_mode];
     format_settings.with_names_use_header = settings[Setting::input_format_with_names_use_header];
     format_settings.with_types_use_header = settings[Setting::input_format_with_types_use_header];
     format_settings.write_statistics = settings[Setting::output_format_write_statistics];
diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h
index 315c2d1bc32..b81616bf017 100644
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@@ -38,6 +38,8 @@ struct FormatSettings
     bool try_infer_variant = false;
 
     bool seekable_read = true;
+    /// "spark" makes Array, Map and Tuple text output use ", " / " -> " separators and serializeText for elements; any other value keeps the default output.
+    String composed_data_type_output_format_mode = "default";
     UInt64 max_rows_to_read_for_schema_inference = 25000;
     UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024;
 
diff --git a/tests/queries/0_stateless/03259_to_string_spark_format.reference b/tests/queries/0_stateless/03259_to_string_spark_format.reference
new file mode 100644
index 00000000000..9e2c518dac4
--- /dev/null
+++ b/tests/queries/0_stateless/03259_to_string_spark_format.reference
@@ -0,0 +1,18 @@
+-- array format --
+[\'1\']
+[1, 2, abc, \'1\']
+[1, 2, abc, \'1\']
+[1, 2, abc, \'1\']
+[1, 2, abc, \'1\']
+-- map format --
+{1343 -> fe, afe -> fefe}
+{1343 -> fe, afe -> fefe}
+{1343 -> fe, afe -> fefe}
+{1343 -> fe, afe -> fefe}
+-- tuple format --
+(1, 3, abc)
+(1, 3, abc)
+(1, 3, abc)
+(1, 3, abc)
+-- json output is not affected --
+{"m":{"k":"v"}}
diff --git a/tests/queries/0_stateless/03259_to_string_spark_format.sql b/tests/queries/0_stateless/03259_to_string_spark_format.sql
new file mode 100644
index 00000000000..7a657f803bd
--- /dev/null
+++ b/tests/queries/0_stateless/03259_to_string_spark_format.sql
@@ -0,0 +1,21 @@
+SELECT '-- array format --';
+SELECT CAST(array('\'1\'') , 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT CAST([materialize('1'), '2', 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT CAST([materialize('1'), materialize('2'), 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), materialize('\'1\'')], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
+
+SELECT '-- map format --';
+SELECT toString(map('1343', 'fe', 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString(map(materialize('1343'), materialize('fe'), 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), materialize('fefe'))) SETTINGS composed_data_type_output_format_mode = 'spark';
+
+SELECT '-- tuple format --';
+SELECT toString(('1', '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString((materialize('1'), '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString((materialize('1'), materialize('3'), 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
+SELECT toString((materialize('1'), materialize('3'), materialize('abc'))) SETTINGS composed_data_type_output_format_mode = 'spark';
+
+SELECT '-- json output is not affected --';
+SELECT map('k', 'v') AS m SETTINGS composed_data_type_output_format_mode = 'spark' FORMAT JSONEachRow;