diff --git a/.clang-tidy b/.clang-tidy index ecb8ac6dcbf..ddd0ee6d911 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -142,6 +142,7 @@ Checks: '-*, clang-analyzer-cplusplus.PlacementNewChecker, clang-analyzer-cplusplus.SelfAssignment, clang-analyzer-deadcode.DeadStores, + clang-analyzer-cplusplus.Move, clang-analyzer-optin.cplusplus.VirtualCall, clang-analyzer-security.insecureAPI.UncheckedReturn, clang-analyzer-security.insecureAPI.bcmp, diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 430c7a194eb..c4b4ab77867 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -610,6 +610,7 @@ class IColumn; M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \ M(Char, input_format_hive_text_map_keys_delimiter, '\x03', "Delimiter between a pair of map key/values in Hive Text File", 0) \ M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ + M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \ M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. 
Possible values: 'basic' and 'best_effort'.", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index b62575c9730..5d16f0a5c85 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -130,4 +130,10 @@ IMPLEMENT_SETTING_ENUM(EscapingRule, ErrorCodes::BAD_ARGUMENTS, {"JSON", FormatSettings::EscapingRule::JSON}, {"XML", FormatSettings::EscapingRule::XML}, {"Raw", FormatSettings::EscapingRule::Raw}}) + +IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation , ErrorCodes::BAD_ARGUMENTS, + {{"bin", FormatSettings::MsgPackUUIDRepresentation::BIN}, + {"str", FormatSettings::MsgPackUUIDRepresentation::STR}, + {"ext", FormatSettings::MsgPackUUIDRepresentation::EXT}}) + } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 5d2640da319..068d8c21ec3 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -171,4 +171,6 @@ DECLARE_SETTING_ENUM(ShortCircuitFunctionEvaluation) DECLARE_SETTING_ENUM_WITH_RENAME(EnumComparingMode, FormatSettings::EnumComparingMode) DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) + +DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation) } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index f19d03dc8d0..be565a532bb 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -129,6 +129,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; format_settings.seekable_read = settings.input_format_allow_seeks; format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; + format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference; /// Validate 
avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index b484d623944..265c879e768 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -231,9 +231,17 @@ struct FormatSettings EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; } capn_proto; + enum class MsgPackUUIDRepresentation + { + STR, // Output UUID as a string of 36 characters. + BIN, // Output UUID as 16-bytes binary. + EXT, // Output UUID as ExtType = 2 + }; + struct { UInt64 number_of_columns = 0; + MsgPackUUIDRepresentation output_uuid_representation = MsgPackUUIDRepresentation::EXT; } msgpack; }; diff --git a/src/Formats/MsgPackExtensionTypes.h b/src/Formats/MsgPackExtensionTypes.h new file mode 100644 index 00000000000..139d2f9047b --- /dev/null +++ b/src/Formats/MsgPackExtensionTypes.h @@ -0,0 +1,11 @@ +#pragma once + +namespace DB +{ + +enum class MsgPackExtensionTypes +{ + UUID = 0x02, +}; + +} diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 2471a98f83d..56fc5d7857b 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include @@ -20,6 +22,8 @@ #include #include +#include + namespace DB { @@ -153,16 +157,29 @@ static void insertInteger(IColumn & column, DataTypePtr type, UInt64 value) } } -static void insertString(IColumn & column, DataTypePtr type, const char * value, size_t size) +static void insertString(IColumn & column, DataTypePtr type, const char * value, size_t size, bool bin) { auto insert_func = [&](IColumn & column_, DataTypePtr type_) { - insertString(column_, type_, value, size); + insertString(column_, type_, value, size, bin); }; if 
(checkAndInsertNullable(column, type, insert_func) || checkAndInsertLowCardinality(column, type, insert_func))
         return;
 
+    if (isUUID(type))
+    {
+        ReadBufferFromMemory buf(value, size);
+        UUID uuid;
+        if (bin)
+            readBinary(uuid, buf);
+        else
+            readUUIDText(uuid, buf);
+
+        assert_cast(column).insertValue(uuid);
+        return;
+    }
+
     if (!isStringOrFixedString(type))
         throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack string into column with type {}.", type->getName());
 
@@ -218,6 +235,33 @@ static void insertNull(IColumn & column, DataTypePtr type)
     assert_cast(column).insertDefault();
 }
 
+/// Insert a UUID that arrived as a MsgPack extension payload (extension type MsgPackExtensionTypes::UUID).
+/// The payload is read as two 64-bit halves, each in big-endian byte order.
+static void insertUUID(IColumn & column, DataTypePtr type, const char * value, size_t size)
+{
+    auto insert_func = [&](IColumn & column_, DataTypePtr type_)
+    {
+        insertUUID(column_, type_, value, size);
+    };
+
+    /// Unwrap Nullable / LowCardinality destinations the same way insertString does.
+    if (checkAndInsertNullable(column, type, insert_func) || checkAndInsertLowCardinality(column, type, insert_func))
+        return;
+
+    /// Reject non-UUID destinations explicitly instead of relying on the column cast below.
+    if (!isUUID(type))
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack UUID into column with type {}.", type->getName());
+
+    if (size != sizeof(UUID))
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected size of MessagePack UUID extension payload: {} bytes, expected {}.", size, sizeof(UUID));
+
+    ReadBufferFromMemory buf(value, size);
+    UUID uuid;
+    readBinaryBigEndian(uuid.toUnderType().items[0], buf);
+    readBinaryBigEndian(uuid.toUnderType().items[1], buf);
+    assert_cast(column).insertValue(uuid);
+}
+
 bool MsgPackVisitor::visit_positive_integer(UInt64 value) // NOLINT
 {
     insertInteger(info_stack.top().column, info_stack.top().type, value);
@@ -232,13 +276,13 @@ bool MsgPackVisitor::visit_negative_integer(Int64 value) // NOLINT
 
 bool MsgPackVisitor::visit_str(const char * value, size_t size) // NOLINT
 {
-    insertString(info_stack.top().column, info_stack.top().type, value, size);
+    insertString(info_stack.top().column, info_stack.top().type, value, size, false);
     return true;
 }
 
 bool MsgPackVisitor::visit_bin(const char * value, size_t size) // NOLINT
 {
-    insertString(info_stack.top().column, info_stack.top().type, value, size);
+    insertString(info_stack.top().column, info_stack.top().type, value, size, true);
     return true;
 }
 
@@ -324,6 +368,20 @@ bool MsgPackVisitor::visit_nil()
     return true;
 }
 
+/// Visitor callback for MsgPack extension values: the first byte of `value` is the extension type,
+/// the remaining `size - 1` bytes are the payload. Only the UUID extension is handled; anything else throws.
+bool MsgPackVisitor::visit_ext(const char * value, uint32_t size)
+{
+    int8_t type = *value;
+    if (type == int8_t(MsgPackExtensionTypes::UUID))
+    {
+        insertUUID(info_stack.top().column, info_stack.top().type, value + 1, size - 1);
+        return true;
+    }
+
+    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {:x}", type);
+}
+
 void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT
 {
     throw Exception("Error occurred while parsing msgpack data.", ErrorCodes::INCORRECT_DATA);
@@ -455,8 +513,13 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object)
         }
         case msgpack::type::object_type::NIL:
             return nullptr;
-        default:
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack type is not supported");
+        case msgpack::type::object_type::EXT:
+        {
+            msgpack::object_ext object_ext = object.via.ext;
+            if (object_ext.type() == int8_t(MsgPackExtensionTypes::UUID))
+                return std::make_shared();
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type());
+        }
     }
 }
 
diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h
index c2ad31c7c4c..2298e35fed5 100644
--- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.h
+++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.h
@@ -42,6 +42,7 @@ public:
     bool end_map_key();
     bool start_map_value();
     bool end_map_value();
+    bool visit_ext(const char * value, uint32_t size);
 
     /// This function will be called if error occurs in parsing
     [[noreturn]] void parse_error(size_t parsed_offset, size_t error_offset);
@@ -55,7 +56,7 @@ private:
     std::stack info_stack;
 };
 
-class MsgPackRowInputFormat final : public IRowInputFormat
+class MsgPackRowInputFormat : public IRowInputFormat
 {
 public:
     MsgPackRowInputFormat(const Block & header_, ReadBuffer & in_, Params params_);
diff --git a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.cpp
index 36a8a62b39e..edec9774b5f 100644
--- a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.cpp
+++ b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.cpp
@@ -5,6 +5,9 @@
 #include
 #include
 
+#include
+#include
+
 #include
 #include
 #include
@@ -19,6 +22,8 @@
#include
 #include
 
+#include
+
 namespace DB
 {
 
@@ -27,8 +32,8 @@ namespace ErrorCodes
     extern const int ILLEGAL_COLUMN;
 }
 
-MsgPackRowOutputFormat::MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_)
-    : IRowOutputFormat(header_, out_, params_), packer(out_) {}
+MsgPackRowOutputFormat::MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_)
+    : IRowOutputFormat(header_, out_, params_), packer(out_), format_settings(format_settings_) {}
 
 void MsgPackRowOutputFormat::serializeField(const IColumn & column, DataTypePtr data_type, size_t row_num)
 {
@@ -164,6 +169,46 @@ void MsgPackRowOutputFormat::serializeField(const IColumn & column, DataTypePtr
             serializeField(*dict_column, dict_type, index);
             return;
         }
+        /// UUID is written in the representation selected by format_settings.msgpack.output_uuid_representation.
+        case TypeIndex::UUID:
+        {
+            const auto & uuid_column = assert_cast(column);
+            switch (format_settings.msgpack.output_uuid_representation)
+            {
+                case FormatSettings::MsgPackUUIDRepresentation::BIN:
+                {
+                    /// Bytes produced by writeBinary, wrapped in a MsgPack bin value.
+                    WriteBufferFromOwnString buf;
+                    writeBinary(uuid_column.getElement(row_num), buf);
+                    StringRef uuid_bin = buf.stringRef();
+                    packer.pack_bin(uuid_bin.size);
+                    packer.pack_bin_body(uuid_bin.data, uuid_bin.size);
+                    return;
+                }
+                case FormatSettings::MsgPackUUIDRepresentation::STR:
+                {
+                    /// Text produced by writeText, wrapped in a MsgPack str value (str header + str body).
+                    WriteBufferFromOwnString buf;
+                    writeText(uuid_column.getElement(row_num), buf);
+                    StringRef uuid_text = buf.stringRef();
+                    packer.pack_str(uuid_text.size);
+                    packer.pack_str_body(uuid_text.data, uuid_text.size);
+                    return;
+                }
+                case FormatSettings::MsgPackUUIDRepresentation::EXT:
+                {
+                    /// Extension type MsgPackExtensionTypes::UUID: both 64-bit halves, each written big-endian.
+                    WriteBufferFromOwnString buf;
+                    UUID value = uuid_column.getElement(row_num);
+                    writeBinaryBigEndian(value.toUnderType().items[0], buf);
+                    writeBinaryBigEndian(value.toUnderType().items[1], buf);
+                    StringRef uuid_ext = buf.stringRef();
+                    packer.pack_ext(sizeof(UUID), int8_t(MsgPackExtensionTypes::UUID));
+                    packer.pack_ext_body(uuid_ext.data, uuid_ext.size);
+                    return;
+                }
+            }
+        }
         default:
             break;
     }
@@ -186,9 +231,9 @@ void registerOutputFormatMsgPack(FormatFactory & factory)
             WriteBuffer & buf,
             const Block & sample,
             const RowOutputFormatParams & params,
-            const FormatSettings &)
+            const FormatSettings & settings)
     {
-        return std::make_shared(buf, sample, params);
+        return std::make_shared(buf, sample, params, settings);
     });
     factory.markOutputFormatSupportsParallelFormatting("MsgPack");
 }
diff --git a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h
index 19b37afed90..e2abbd588c4 100644
--- a/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h
+++ b/src/Processors/Formats/Impl/MsgPackRowOutputFormat.h
@@ -18,7 +18,7 @@ namespace DB
 class MsgPackRowOutputFormat final : public IRowOutputFormat
 {
 public:
-    MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_);
+    MsgPackRowOutputFormat(WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_);
 
     String getName() const override { return "MsgPackRowOutputFormat"; }
 
@@ -28,6 +28,7 @@ private:
     void serializeField(const IColumn & column, DataTypePtr data_type, size_t row_num);
 
     msgpack::packer packer;
+    const FormatSettings format_settings;
 };
 
 }
diff --git a/tests/queries/0_stateless/02187_msg_pack_uuid.reference b/tests/queries/0_stateless/02187_msg_pack_uuid.reference
new file mode 100644
index 00000000000..c567cc14ad2
--- /dev/null
+++ b/tests/queries/0_stateless/02187_msg_pack_uuid.reference
@@ -0,0 +1,4 @@
+5e7084e0-019f-461f-9e70-84e0019f561f
+5e7084e0-019f-461f-9e70-84e0019f561f
+5e7084e0-019f-461f-9e70-84e0019f561f
+5e7084e0-019f-461f-9e70-84e0019f561f	UUID
diff --git a/tests/queries/0_stateless/02187_msg_pack_uuid.sh b/tests/queries/0_stateless/02187_msg_pack_uuid.sh
new file mode 100755
index 00000000000..9be92d66790
--- /dev/null
+++ b/tests/queries/0_stateless/02187_msg_pack_uuid.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_str.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='str'" +$CLICKHOUSE_CLIENT -q "select * from file('uuid_str.msgpack', 'MsgPack', 'uuid UUID')" + +$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_bin.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='bin'" +$CLICKHOUSE_CLIENT -q "select * from file('uuid_bin.msgpack', 'MsgPack', 'uuid UUID')" + +$CLICKHOUSE_CLIENT -q "insert into table function file('uuid_ext.msgpack', 'MsgPack', 'uuid UUID') select toUUID('5e7084e0-019f-461f-9e70-84e0019f561f') settings output_format_msgpack_uuid_representation='ext'" +$CLICKHOUSE_CLIENT -q "select * from file('uuid_ext.msgpack', 'MsgPack', 'uuid UUID')" +$CLICKHOUSE_CLIENT -q "select c1, toTypeName(c1) from file('uuid_ext.msgpack') settings input_format_msgpack_number_of_columns=1" +