Merge pull request #70957 from bigo-sg/tostring-fix

Function cast support Spark text output
This commit is contained in:
Vladimir Cherkasov 2024-12-02 10:28:35 +00:00 committed by GitHub
commit e8dab58d2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 101 additions and 21 deletions

View File

@ -1242,6 +1242,9 @@ Set the quoting rule for identifiers in SHOW CREATE query
)", 0) \ )", 0) \
DECLARE(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, R"( DECLARE(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, R"(
Set the quoting style for identifiers in SHOW CREATE query Set the quoting style for identifiers in SHOW CREATE query
)", 0) \
DECLARE(String, composed_data_type_output_format_mode, "default", R"(
Set composed data type output format mode, default or spark.
)", 0) \ )", 0) \
// End of FORMAT_FACTORY_SETTINGS // End of FORMAT_FACTORY_SETTINGS

View File

@ -67,6 +67,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"max_bytes_ratio_before_external_group_by", 0., 0., "New setting."}, {"max_bytes_ratio_before_external_group_by", 0., 0., "New setting."},
{"max_bytes_ratio_before_external_sort", 0., 0., "New setting."}, {"max_bytes_ratio_before_external_sort", 0., 0., "New setting."},
{"use_async_executor_for_materialized_views", false, false, "New setting."}, {"use_async_executor_for_materialized_views", false, false, "New setting."},
{"composed_data_type_output_format_mode", "default", "default", "New setting"},
{"http_response_headers", "", "", "New setting."}, {"http_response_headers", "", "", "New setting."},
} }
}, },

View File

@ -401,7 +401,7 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams(
template <typename Writer> template <typename Writer>
static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested) static void serializeTextImpl(const IColumn & column, size_t row_num, const FormatSettings & settings, WriteBuffer & ostr, Writer && write_nested)
{ {
const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
const ColumnArray::Offsets & offsets = column_array.getOffsets(); const ColumnArray::Offsets & offsets = column_array.getOffsets();
@ -412,10 +412,14 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe
const IColumn & nested_column = column_array.getData(); const IColumn & nested_column = column_array.getData();
writeChar('[', ostr); writeChar('[', ostr);
for (size_t i = offset; i < next_offset; ++i)
if (next_offset != offset)
write_nested(nested_column, offset);
for (size_t i = offset + 1; i < next_offset; ++i)
{ {
if (i != offset) writeChar(',', ostr);
writeChar(',', ostr); if (settings.composed_data_type_output_format_mode == "spark")
writeChar(' ', ostr);
write_nested(nested_column, i); write_nested(nested_column, i);
} }
writeChar(']', ostr); writeChar(']', ostr);
@ -520,10 +524,13 @@ static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reade
void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{ {
serializeTextImpl(column, row_num, ostr, serializeTextImpl(column, row_num, settings, ostr,
[&](const IColumn & nested_column, size_t i) [&](const IColumn & nested_column, size_t i)
{ {
nested->serializeTextQuoted(nested_column, i, ostr, settings); if (settings.composed_data_type_output_format_mode == "spark")
nested->serializeText(nested_column, i, ostr, settings);
else
nested->serializeTextQuoted(nested_column, i, ostr, settings);
}); });
} }

View File

@ -90,6 +90,7 @@ template <typename KeyWriter, typename ValueWriter>
void SerializationMap::serializeTextImpl( void SerializationMap::serializeTextImpl(
const IColumn & column, const IColumn & column,
size_t row_num, size_t row_num,
const FormatSettings & settings,
WriteBuffer & ostr, WriteBuffer & ostr,
KeyWriter && key_writer, KeyWriter && key_writer,
ValueWriter && value_writer) const ValueWriter && value_writer) const
@ -104,15 +105,31 @@ void SerializationMap::serializeTextImpl(
size_t next_offset = offsets[row_num]; size_t next_offset = offsets[row_num];
writeChar('{', ostr); writeChar('{', ostr);
for (size_t i = offset; i < next_offset; ++i) if (offset != next_offset)
{ {
if (i != offset) key_writer(ostr, key, nested_tuple.getColumn(0), offset);
writeChar(',', ostr); if (settings.composed_data_type_output_format_mode == "spark")
writeString(std::string_view(" -> "), ostr);
key_writer(ostr, key, nested_tuple.getColumn(0), i); else
writeChar(':', ostr); writeChar(':', ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), i); value_writer(ostr, value, nested_tuple.getColumn(1), offset);
} }
if (settings.composed_data_type_output_format_mode == "spark")
for (size_t i = offset + 1; i < next_offset; ++i)
{
writeString(std::string_view(", "), ostr);
key_writer(ostr, key, nested_tuple.getColumn(0), i);
writeString(std::string_view(" -> "), ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), i);
}
else
for (size_t i = offset + 1; i < next_offset; ++i)
{
writeChar(',', ostr);
key_writer(ostr, key, nested_tuple.getColumn(0), i);
writeChar(':', ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), i);
}
writeChar('}', ostr); writeChar('}', ostr);
} }
@ -221,10 +238,13 @@ void SerializationMap::serializeText(const IColumn & column, size_t row_num, Wri
{ {
auto writer = [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) auto writer = [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
{ {
subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings); if (settings.composed_data_type_output_format_mode == "spark")
subcolumn_serialization->serializeText(subcolumn, pos, buf, settings);
else
subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings);
}; };
serializeTextImpl(column, row_num, ostr, writer, writer); serializeTextImpl(column, row_num, settings, ostr, writer, writer);
} }
void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const
@ -266,7 +286,7 @@ bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, c
void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{ {
serializeTextImpl(column, row_num, ostr, serializeTextImpl(column, row_num, settings, ostr,
[&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos) [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
{ {
/// We need to double-quote all keys (including integers) to produce valid JSON. /// We need to double-quote all keys (including integers) to produce valid JSON.

View File

@ -70,7 +70,7 @@ public:
private: private:
template <typename KeyWriter, typename ValueWriter> template <typename KeyWriter, typename ValueWriter>
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const; void serializeTextImpl(const IColumn & column, size_t row_num, const FormatSettings & settings, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
template <typename ReturnType = void, typename Reader> template <typename ReturnType = void, typename Reader>
ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const; ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;

View File

@ -137,12 +137,25 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr,
void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{ {
writeChar('(', ostr); writeChar('(', ostr);
for (size_t i = 0; i < elems.size(); ++i) if (!elems.empty())
{ {
if (i != 0) if (settings.composed_data_type_output_format_mode == "spark")
writeChar(',', ostr); elems[0]->serializeText(extractElementColumn(column, 0), row_num, ostr, settings);
elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings); else
elems[0]->serializeTextQuoted(extractElementColumn(column, 0), row_num, ostr, settings);
} }
if (settings.composed_data_type_output_format_mode == "spark")
for (size_t i = 1; i < elems.size(); ++i)
{
writeString(std::string_view(", "), ostr);
elems[i]->serializeText(extractElementColumn(column, i), row_num, ostr, settings);
}
else
for (size_t i = 1; i < elems.size(); ++i)
{
writeChar(',', ostr);
elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
}
writeChar(')', ostr); writeChar(')', ostr);
} }

View File

@ -251,6 +251,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.values.deduce_templates_of_expressions = settings[Setting::input_format_values_deduce_templates_of_expressions]; format_settings.values.deduce_templates_of_expressions = settings[Setting::input_format_values_deduce_templates_of_expressions];
format_settings.values.interpret_expressions = settings[Setting::input_format_values_interpret_expressions]; format_settings.values.interpret_expressions = settings[Setting::input_format_values_interpret_expressions];
format_settings.values.escape_quote_with_quote = settings[Setting::output_format_values_escape_quote_with_quote]; format_settings.values.escape_quote_with_quote = settings[Setting::output_format_values_escape_quote_with_quote];
format_settings.composed_data_type_output_format_mode = settings[Setting::composed_data_type_output_format_mode];
format_settings.with_names_use_header = settings[Setting::input_format_with_names_use_header]; format_settings.with_names_use_header = settings[Setting::input_format_with_names_use_header];
format_settings.with_types_use_header = settings[Setting::input_format_with_types_use_header]; format_settings.with_types_use_header = settings[Setting::input_format_with_types_use_header];
format_settings.write_statistics = settings[Setting::output_format_write_statistics]; format_settings.write_statistics = settings[Setting::output_format_write_statistics];

View File

@ -38,6 +38,7 @@ struct FormatSettings
bool try_infer_variant = false; bool try_infer_variant = false;
bool seekable_read = true; bool seekable_read = true;
String composed_data_type_output_format_mode = "default";
UInt64 max_rows_to_read_for_schema_inference = 25000; UInt64 max_rows_to_read_for_schema_inference = 25000;
UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024; UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024;

View File

@ -0,0 +1,16 @@
-- array format --
[\'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
-- map format --
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
-- tuple format --
(1, 3, abc)
(1, 3, abc)
(1, 3, abc)
(1, 3, abc)

View File

@ -0,0 +1,18 @@
SELECT '-- array format --';
SELECT CAST(array('\'1\'') , 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), '2', 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), materialize('\'1\'')], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT '-- map format --';
SELECT toString(map('1343', 'fe', 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), materialize('fefe'))) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT '-- tuple format --';
SELECT toString(('1', '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), materialize('3'), 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), materialize('3'), materialize('abc'))) SETTINGS composed_data_type_output_format_mode = 'spark';