This commit is contained in:
zhanglistar 2024-11-21 00:51:26 +00:00 committed by GitHub
commit 453486b8fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 102 additions and 21 deletions

View File

@ -1242,6 +1242,9 @@ Set the quoting rule for identifiers in SHOW CREATE query
)", 0) \
DECLARE(IdentifierQuotingStyle, show_create_query_identifier_quoting_style, IdentifierQuotingStyle::Backticks, R"(
Set the quoting style for identifiers in SHOW CREATE query
)", 0) \
DECLARE(String, composed_data_type_output_format_mode, "default", R"(
Set composed data type output format mode, default or spark.
)", 0) \
// End of FORMAT_FACTORY_SETTINGS

View File

@ -138,6 +138,8 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_reorder_prewhere_conditions", true, true, "New setting"},
{"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
{"date_time_64_output_format_cut_trailing_zeros_align_to_groups_of_thousands", false, false, "Dynamically trim the trailing zeros of datetime64 values to adjust the output scale to (0, 3, 6), corresponding to 'seconds', 'milliseconds', and 'microseconds'."},
{"composed_data_type_output_format_mode", "default", "default", "New setting"},
{"spark_text_output_format", "default", "default", "New setting"},
}
},
{"24.9",

View File

@ -401,7 +401,7 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams(
template <typename Writer>
static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && write_nested)
static void serializeTextImpl(const IColumn & column, size_t row_num, const FormatSettings & settings, WriteBuffer & ostr, Writer && write_nested)
{
const ColumnArray & column_array = assert_cast<const ColumnArray &>(column);
const ColumnArray::Offsets & offsets = column_array.getOffsets();
@ -412,10 +412,14 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe
const IColumn & nested_column = column_array.getData();
writeChar('[', ostr);
for (size_t i = offset; i < next_offset; ++i)
if (next_offset != offset)
write_nested(nested_column, offset);
for (size_t i = offset + 1; i < next_offset; ++i)
{
if (i != offset)
writeChar(',', ostr);
if (settings.composed_data_type_output_format_mode == "spark")
writeChar(' ', ostr);
write_nested(nested_column, i);
}
writeChar(']', ostr);
@ -520,9 +524,12 @@ static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reade
void SerializationArray::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeTextImpl(column, row_num, ostr,
serializeTextImpl(column, row_num, settings, ostr,
[&](const IColumn & nested_column, size_t i)
{
if (settings.composed_data_type_output_format_mode == "spark")
nested->serializeText(nested_column, i, ostr, settings);
else
nested->serializeTextQuoted(nested_column, i, ostr, settings);
});
}

View File

@ -90,6 +90,7 @@ template <typename KeyWriter, typename ValueWriter>
void SerializationMap::serializeTextImpl(
const IColumn & column,
size_t row_num,
const FormatSettings & settings,
WriteBuffer & ostr,
KeyWriter && key_writer,
ValueWriter && value_writer) const
@ -104,11 +105,27 @@ void SerializationMap::serializeTextImpl(
size_t next_offset = offsets[row_num];
writeChar('{', ostr);
for (size_t i = offset; i < next_offset; ++i)
if (offset != next_offset)
{
key_writer(ostr, key, nested_tuple.getColumn(0), offset);
if (settings.composed_data_type_output_format_mode == "spark")
writeString(std::string_view(" -> "), ostr);
else
writeChar(':', ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), offset);
}
if (settings.composed_data_type_output_format_mode == "spark")
for (size_t i = offset + 1; i < next_offset; ++i)
{
writeString(std::string_view(", "), ostr);
key_writer(ostr, key, nested_tuple.getColumn(0), i);
writeString(std::string_view(" -> "), ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), i);
}
else
for (size_t i = offset + 1; i < next_offset; ++i)
{
if (i != offset)
writeChar(',', ostr);
key_writer(ostr, key, nested_tuple.getColumn(0), i);
writeChar(':', ostr);
value_writer(ostr, value, nested_tuple.getColumn(1), i);
@ -221,10 +238,13 @@ void SerializationMap::serializeText(const IColumn & column, size_t row_num, Wri
{
auto writer = [&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
{
if (settings.composed_data_type_output_format_mode == "spark")
subcolumn_serialization->serializeText(subcolumn, pos, buf, settings);
else
subcolumn_serialization->serializeTextQuoted(subcolumn, pos, buf, settings);
};
serializeTextImpl(column, row_num, ostr, writer, writer);
serializeTextImpl(column, row_num, settings, ostr, writer, writer);
}
void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const
@ -266,7 +286,7 @@ bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, c
void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
serializeTextImpl(column, row_num, ostr,
serializeTextImpl(column, row_num, settings, ostr,
[&settings](WriteBuffer & buf, const SerializationPtr & subcolumn_serialization, const IColumn & subcolumn, size_t pos)
{
/// We need to double-quote all keys (including integers) to produce valid JSON.

View File

@ -70,7 +70,7 @@ public:
private:
template <typename KeyWriter, typename ValueWriter>
void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
void serializeTextImpl(const IColumn & column, size_t row_num, const FormatSettings & settings, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
template <typename ReturnType = void, typename Reader>
ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;

View File

@ -137,9 +137,22 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr,
void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
writeChar('(', ostr);
for (size_t i = 0; i < elems.size(); ++i)
if (!elems.empty())
{
if (settings.composed_data_type_output_format_mode == "spark")
elems[0]->serializeText(extractElementColumn(column, 0), row_num, ostr, settings);
else
elems[0]->serializeTextQuoted(extractElementColumn(column, 0), row_num, ostr, settings);
}
if (settings.composed_data_type_output_format_mode == "spark")
for (size_t i = 1; i < elems.size(); ++i)
{
writeString(std::string_view(", "), ostr);
elems[i]->serializeText(extractElementColumn(column, i), row_num, ostr, settings);
}
else
for (size_t i = 1; i < elems.size(); ++i)
{
if (i != 0)
writeChar(',', ostr);
elems[i]->serializeTextQuoted(extractElementColumn(column, i), row_num, ostr, settings);
}

View File

@ -251,6 +251,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.values.deduce_templates_of_expressions = settings[Setting::input_format_values_deduce_templates_of_expressions];
format_settings.values.interpret_expressions = settings[Setting::input_format_values_interpret_expressions];
format_settings.values.escape_quote_with_quote = settings[Setting::output_format_values_escape_quote_with_quote];
format_settings.composed_data_type_output_format_mode = settings[Setting::composed_data_type_output_format_mode];
format_settings.with_names_use_header = settings[Setting::input_format_with_names_use_header];
format_settings.with_types_use_header = settings[Setting::input_format_with_types_use_header];
format_settings.write_statistics = settings[Setting::output_format_write_statistics];

View File

@ -38,6 +38,7 @@ struct FormatSettings
bool try_infer_variant = false;
bool seekable_read = true;
String composed_data_type_output_format_mode = "default";
UInt64 max_rows_to_read_for_schema_inference = 25000;
UInt64 max_bytes_to_read_for_schema_inference = 32 * 1024 * 1024;

View File

@ -0,0 +1,16 @@
-- array format --
[\'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
[1, 2, abc, \'1\']
-- map format --
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
{1343 -> fe, afe -> fefe}
-- tuple format --
(1, 3, abc)
(1, 3, abc)
(1, 3, abc)
(1, 3, abc)

View File

@ -0,0 +1,18 @@
SELECT '-- array format --';
SELECT CAST(array('\'1\'') , 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), '2', 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), 'abc', '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), '\'1\''], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT CAST([materialize('1'), materialize('2'), materialize('abc'), materialize('\'1\'')], 'String') SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT '-- map format --';
SELECT toString(map('1343', 'fe', 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), 'afe', 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), 'fefe')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString(map(materialize('1343'), materialize('fe'), materialize('afe'), materialize('fefe'))) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT '-- tuple format --';
SELECT toString(('1', '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), '3', 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), materialize('3'), 'abc')) SETTINGS composed_data_type_output_format_mode = 'spark';
SELECT toString((materialize('1'), materialize('3'), materialize('abc'))) SETTINGS composed_data_type_output_format_mode = 'spark';