diff --git a/contrib/avro b/contrib/avro index 6cfcf6c2429..92caca2d42f 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit 6cfcf6c24293af100d523b89b61d1ab216fa4735 +Subproject commit 92caca2d42fc9a97e34e95f963593539d32ed331 diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 364e3282f00..19c449b8e26 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -70,6 +71,7 @@ namespace ErrorCodes extern const int INCORRECT_DATA; extern const int ILLEGAL_COLUMN; extern const int TYPE_MISMATCH; + extern const int CANNOT_PARSE_UUID; } class InputStreamReadBufferAdapter : public avro::InputStream @@ -176,6 +178,19 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node { case avro::AVRO_STRING: [[fallthrough]]; case avro::AVRO_BYTES: + if (target.isUUID()) + { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable + { + decoder.decodeString(tmp); + if (tmp.length() != 36) + throw Exception(std::string("Cannot parse uuid ") + tmp, ErrorCodes::CANNOT_PARSE_UUID); + + UUID uuid; + parseUUID(reinterpret_cast(tmp.data()), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); + assert_cast(column).insertValue(uuid); + }; + } if (target.isString() || target.isFixedString()) { return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 620eafa2fd7..82688fe407c 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -207,6 +208,18 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF encoder.encodeEnum(enum_mapping.at(enum_value)); }}; } + case TypeIndex::UUID: + { + auto schema = avro::StringSchema(); + schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::UUID)); + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const auto & uuid = assert_cast(column).getElement(row_num); + std::array s; + formatUUID(std::reverse_iterator(reinterpret_cast(&uuid) + 16), s.data()); + encoder.encodeBytes(reinterpret_cast(s.data()), s.size()); + }}; + } case TypeIndex::Array: { const auto & array_type = assert_cast(*data_type); diff --git a/tests/queries/0_stateless/01060_avro.reference b/tests/queries/0_stateless/01060_avro.reference index 192a86ca9bb..0cf84dab914 100644 --- a/tests/queries/0_stateless/01060_avro.reference +++ b/tests/queries/0_stateless/01060_avro.reference @@ -14,8 +14,8 @@ "79cd909892d7e7ade1987cc7422628ba" "79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" -18250,1578641516227,1578641516227000 +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659" +18250,1578641516227,1578641516227000,"7c856fd6-005f-46c7-a7b5-3a082ef6c659" = references "a1","c1" "a2","c2" @@ -52,7 +52,7 @@ not found = complex "A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659" = other 0 1000 diff --git a/tests/queries/0_stateless/01060_avro.sh b/tests/queries/0_stateless/01060_avro.sh index 71f27c8491a..fe5d91c75c0 100755 --- a/tests/queries/0_stateless/01060_avro.sh +++ b/tests/queries/0_stateless/01060_avro.sh @@ -21,8 +21,8 @@ cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-fo cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table' echo = logical_types -cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -q 'select * from table' -cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64' -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64, d_uuid UUID' -q 'select * from table' echo = references cat $DATA_DIR/references.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String, c String" -q 'select * from table' @@ -76,8 +76,8 @@ S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_stri echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table' echo = logical_types -S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' +S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" +echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' echo = other S4="a Int64" diff --git a/tests/queries/0_stateless/data_avro/generate_avro.sh b/tests/queries/0_stateless/data_avro/generate_avro.sh index 0bd8dad773b..5cdebc266cc 100755 --- a/tests/queries/0_stateless/data_avro/generate_avro.sh +++ b/tests/queries/0_stateless/data_avro/generate_avro.sh @@ -14,4 +14,4 @@ avro-tools fromjson --schema-file nested_complex.avsc nested_complex.json > nes #compression avro-tools fromjson --codec null --schema-file simple.avsc simple.json > simple.null.avro avro-tools fromjson --codec deflate --schema-file simple.avsc simple.json > simple.deflate.avro -avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro \ No newline at end of file +avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro diff --git a/tests/queries/0_stateless/data_avro/logical_types.avro b/tests/queries/0_stateless/data_avro/logical_types.avro index 7b8a3f60b7a..e176b62d822 100644 Binary files a/tests/queries/0_stateless/data_avro/logical_types.avro and b/tests/queries/0_stateless/data_avro/logical_types.avro differ diff --git a/tests/queries/0_stateless/data_avro/logical_types.avsc b/tests/queries/0_stateless/data_avro/logical_types.avsc index 5d9fd96821f..e6961baba4e 100644 --- a/tests/queries/0_stateless/data_avro/logical_types.avsc +++ b/tests/queries/0_stateless/data_avro/logical_types.avsc @@ -4,6 +4,7 @@ "fields": [ {"name": "a_date", "type": { "type": "int", "logicalType": "date"}}, {"name": "b_timestamp_millis", "type": { "type": "long", "logicalType": "timestamp-millis"}}, - {"name": "c_timestamp_micros", "type": { "type": "long", "logicalType": "timestamp-micros"}} + {"name": "c_timestamp_micros", "type": { "type": "long", "logicalType": "timestamp-micros"}}, + {"name": "d_uuid", "type": { "type": "string", "logicalType": "uuid"}} ] } \ No newline at end of file diff --git a/tests/queries/0_stateless/data_avro/logical_types.json b/tests/queries/0_stateless/data_avro/logical_types.json index 652b85246e7..976d7710642 100644 --- a/tests/queries/0_stateless/data_avro/logical_types.json +++ b/tests/queries/0_stateless/data_avro/logical_types.json @@ -1 +1 @@ -{"a_date":18250,"b_timestamp_millis":1578641516227,"c_timestamp_micros":1578641516227000} +{"a_date":18250,"b_timestamp_millis":1578641516227,"c_timestamp_micros":1578641516227000, "d_uuid":"7c856fd6-005f-46c7-a7b5-3a082ef6c659"}