diff --git a/dbms/src/Formats/ProtobufReader.cpp b/dbms/src/Formats/ProtobufReader.cpp index 5dfda83c37d..e99bbd49f41 100644 --- a/dbms/src/Formats/ProtobufReader.cpp +++ b/dbms/src/Formats/ProtobufReader.cpp @@ -48,7 +48,8 @@ namespace } - +// SimpleReader is an utility class to deserialize protobufs. +// Knows nothing about protobuf schemas, just provides useful functions to deserialize data. ProtobufReader::SimpleReader::SimpleReader(ReadBuffer & in_) : in(in_) , cursor(1 /* We starts at cursor == 1 to keep any cursor value > REACHED_END, this allows to simplify conditions */) @@ -374,6 +375,7 @@ void ProtobufReader::SimpleReader::ignoreGroup() } +// Implementation for a converter from any protobuf field type to any DB data type. class ProtobufReader::ConverterBaseImpl : public ProtobufReader::IConverter { public: @@ -688,19 +690,19 @@ private: std::optional> enum_name_to_value_map; }; -#define PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_STRINGS(field_type_id) \ - template<> \ - class ProtobufReader::ConverterImpl : public ConverterFromString \ +#define PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(field_type_id) \ + template <> \ + std::unique_ptr ProtobufReader::createConverter( \ + const google::protobuf::FieldDescriptor * field) \ { \ - using ConverterFromString::ConverterFromString; \ + return std::make_unique(simple_reader, field); \ } -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_STRING); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_BYTES); -#undef PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_STRINGS +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_STRING) +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_BYTES) +#undef PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS - -template +template class ProtobufReader::ConverterFromNumber : public ConverterBaseImpl { public: @@ -708,7 +710,7 @@ public: bool readStringInto(PaddedPODArray & str) override { - T number; + FromType number; if (!readField(number)) return false; WriteBufferFromVector> buf(str); @@ -765,7 +767,7 @@ private: template bool readNumeric(To & value) { - T number; + FromType number; if (!readField(number)) return false; value = numericCast(number); @@ -775,9 +777,9 @@ private: template bool readEnum(EnumType & value) { - if constexpr (!std::is_integral_v) + if constexpr (!std::is_integral_v) cannotConvertType("Enum"); // It's not correct to convert floating point to enum. - T number; + FromType number; if (!readField(number)) return false; value = numericCast(number); @@ -799,40 +801,39 @@ private: template bool readDecimal(Decimal & decimal, UInt32 scale) { - T number; + FromType number; if (!readField(number)) return false; - decimal.value = convertToDecimal, DataTypeDecimal>>(number, scale); + decimal.value = convertToDecimal, DataTypeDecimal>>(number, scale); return true; } - bool readField(T & value) + bool readField(FromType & value) { - if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT32) && std::is_same_v) + if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT32) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT64) && std::is_same_v)) + { return simple_reader.readInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT32) && std::is_same_v) - return simple_reader.readSInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT32) && std::is_same_v) + } + else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT32) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT64) && std::is_same_v)) + { return simple_reader.readUInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT64) && std::is_same_v) - return simple_reader.readInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT64) && std::is_same_v) + } + + else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT32) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT64) && std::is_same_v)) + { return simple_reader.readSInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT64) && std::is_same_v) - return simple_reader.readUInt(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED32) && std::is_same_v) - return simple_reader.readFixed(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED32) && std::is_same_v) - return simple_reader.readFixed(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED64) && std::is_same_v) - return simple_reader.readFixed(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED64) && std::is_same_v) - return simple_reader.readFixed(value); - else if constexpr ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FLOAT) && std::is_same_v) - return simple_reader.readFixed(value); + } else { - static_assert((field_type_id == google::protobuf::FieldDescriptor::TYPE_DOUBLE) && std::is_same_v); + static_assert(((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED32) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED32) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED64) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED64) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FLOAT) && std::is_same_v) + || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_DOUBLE) && std::is_same_v)); return simple_reader.readFixed(value); } } @@ -840,30 +841,30 @@ private: std::optional> set_of_enum_values; }; -#define PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(field_type_id, field_type) \ - template<> \ - class ProtobufReader::ConverterImpl : public ConverterFromNumber \ +#define PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(field_type_id, field_type) \ + template <> \ + std::unique_ptr ProtobufReader::createConverter( \ + const google::protobuf::FieldDescriptor * field) \ { \ - using ConverterFromNumber::ConverterFromNumber; \ + return std::make_unique>(simple_reader, field); \ } -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT32, Int64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT32, Int64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT32, UInt64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT64, Int64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT64, Int64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT64, UInt64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED32, UInt32); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED32, Int32); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED64, UInt64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED64, Int64); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FLOAT, float); -PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_DOUBLE, double); -#undef PROTOBUF_READER_CONVERTER_IMPL_SPECIALIZATION_FOR_NUMBERS +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT32, Int64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT32, Int64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT32, UInt64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT64, Int64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT64, Int64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT64, UInt64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED32, UInt32); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED32, Int32); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED64, UInt64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED64, Int64); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FLOAT, float); +PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_DOUBLE, double); +#undef PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS -template<> -class ProtobufReader::ConverterImpl : public ConverterBaseImpl +class ProtobufReader::ConverterFromBool : public ConverterBaseImpl { public: using ConverterBaseImpl::ConverterBaseImpl; @@ -913,10 +914,15 @@ private: } }; +template <> +std::unique_ptr ProtobufReader::createConverter( + const google::protobuf::FieldDescriptor * field) +{ + return std::make_unique(simple_reader, field); +} -template<> -class ProtobufReader::ConverterImpl : public ConverterBaseImpl +class ProtobufReader::ConverterFromEnum : public ConverterBaseImpl { public: using ConverterBaseImpl::ConverterBaseImpl; @@ -973,10 +979,15 @@ private: Int64 pbnumber; if (!readField(pbnumber)) return false; - auto it = enum_pbnumber_to_value_map->find(pbnumber); - if (it == enum_pbnumber_to_value_map->end()) - cannotConvertValue(toString(pbnumber), "Enum"); - value = static_cast(it->second); + if (enum_pbnumber_always_equals_value) + value = static_cast(pbnumber); + else + { + auto it = enum_pbnumber_to_value_map->find(pbnumber); + if (it == enum_pbnumber_to_value_map->end()) + cannotConvertValue(toString(pbnumber), "Enum"); + value = static_cast(it->second); + } return true; } @@ -999,12 +1010,19 @@ private: if (likely(enum_pbnumber_to_value_map.has_value())) return; enum_pbnumber_to_value_map.emplace(); + enum_pbnumber_always_equals_value = true; for (const auto & name_value_pair : name_value_pairs) { Int16 value = name_value_pair.second; const auto * enum_descriptor = field->enum_type()->FindValueByName(name_value_pair.first); if (enum_descriptor) + { enum_pbnumber_to_value_map->emplace(enum_descriptor->number(), value); + if (enum_descriptor->number() != value) + enum_pbnumber_always_equals_value = false; + } + else + enum_pbnumber_always_equals_value = false; } } @@ -1015,8 +1033,16 @@ private: std::optional> enum_pbnumber_to_name_map; std::optional> enum_pbnumber_to_value_map; + bool enum_pbnumber_always_equals_value; }; +template <> +std::unique_ptr ProtobufReader::createConverter( + const google::protobuf::FieldDescriptor * field) +{ + return std::make_unique(simple_reader, field); +} + ProtobufReader::ProtobufReader( ReadBuffer & in_, const google::protobuf::Descriptor * message_type, const std::vector & column_names) @@ -1041,7 +1067,7 @@ void ProtobufReader::setTraitsDataAfterMatchingColumns(Message * message) { #define PROTOBUF_READER_CONVERTER_CREATING_CASE(field_type_id) \ case field_type_id: \ - field.data.converter = std::make_unique>(simple_reader, field.field_descriptor); \ + field.data.converter = createConverter(field.field_descriptor); \ break PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_STRING); PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_BYTES); diff --git a/dbms/src/Formats/ProtobufReader.h b/dbms/src/Formats/ProtobufReader.h index 52e82894211..2917899139d 100644 --- a/dbms/src/Formats/ProtobufReader.h +++ b/dbms/src/Formats/ProtobufReader.h @@ -158,9 +158,10 @@ private: }; class ConverterBaseImpl; - template class ConverterImpl; class ConverterFromString; template class ConverterFromNumber; + class ConverterFromBool; + class ConverterFromEnum; struct ColumnMatcherTraits { @@ -178,6 +179,9 @@ private: void setTraitsDataAfterMatchingColumns(Message * message); + template + std::unique_ptr createConverter(const google::protobuf::FieldDescriptor * field); + SimpleReader simple_reader; std::unique_ptr root_message; Message* current_message = nullptr; diff --git a/dbms/src/Formats/ProtobufRowInputStream.h b/dbms/src/Formats/ProtobufRowInputStream.h index 535ac408b01..624ef4b8cfb 100644 --- a/dbms/src/Formats/ProtobufRowInputStream.h +++ b/dbms/src/Formats/ProtobufRowInputStream.h @@ -13,7 +13,13 @@ class Block; class FormatSchemaInfo; -/** Interface of stream, that allows to read data by rows. +/** Stream designed to deserialize data from the google protobuf format. + * Each row is read as a separated message. + * These messages are delimited according to documentation + * https://github.com/protocolbuffers/protobuf/blob/master/src/google/protobuf/util/delimited_message_util.h + * Serializing in the protobuf format requires the 'format_schema' setting to be set, e.g. + * INSERT INTO table FORMAT Protobuf SETTINGS format_schema = 'schema:Message' + * where schema is the name of "schema.proto" file specifying protobuf schema. */ class ProtobufRowInputStream : public IRowInputStream {