From fc1ae85600c5258dc994fc45b35d01a364e2e8d1 Mon Sep 17 00:00:00 2001 From: Andrew Onyshchuk Date: Thu, 25 Jun 2020 14:25:45 -0500 Subject: [PATCH] Avro UUID support --- contrib/avro | 2 +- .../Formats/Impl/AvroRowInputFormat.cpp | 14 ++++++++++++++ .../Formats/Impl/AvroRowOutputFormat.cpp | 13 +++++++++++++ tests/queries/0_stateless/01060_avro.reference | 6 +++--- tests/queries/0_stateless/01060_avro.sh | 8 ++++---- .../0_stateless/data_avro/generate_avro.sh | 2 +- .../0_stateless/data_avro/logical_types.avro | Bin 361 -> 462 bytes .../0_stateless/data_avro/logical_types.avsc | 3 ++- .../0_stateless/data_avro/logical_types.json | 2 +- 9 files changed, 39 insertions(+), 11 deletions(-) diff --git a/contrib/avro b/contrib/avro index 6cfcf6c2429..92caca2d42f 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit 6cfcf6c24293af100d523b89b61d1ab216fa4735 +Subproject commit 92caca2d42fc9a97e34e95f963593539d32ed331 diff --git a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp index 364e3282f00..60fffd65f88 100644 --- a/src/Processors/Formats/Impl/AvroRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowInputFormat.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -176,6 +177,19 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node { case avro::AVRO_STRING: [[fallthrough]]; case avro::AVRO_BYTES: + if (target.isUUID()) + { + return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable + { + decoder.decodeString(tmp); + if (tmp.length() != 36) + throw Exception(std::string("Cannot parse uuid ") + tmp, ErrorCodes::CANNOT_PARSE_UUID); + + UUID uuid; + parseUUID(reinterpret_cast(tmp.data()), std::reverse_iterator(reinterpret_cast(&uuid) + 16)); + assert_cast(column).insertValue(uuid); + }; + } if (target.isString() || target.isFixedString()) { return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable diff --git a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp index 620eafa2fd7..82688fe407c 100644 --- a/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/AvroRowOutputFormat.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -207,6 +208,18 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF encoder.encodeEnum(enum_mapping.at(enum_value)); }}; } + case TypeIndex::UUID: + { + auto schema = avro::StringSchema(); + schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::UUID)); + return {schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder) + { + const auto & uuid = assert_cast(column).getElement(row_num); + std::array s; + formatUUID(std::reverse_iterator(reinterpret_cast(&uuid) + 16), s.data()); + encoder.encodeBytes(reinterpret_cast(s.data()), s.size()); + }}; + } case TypeIndex::Array: { const auto & array_type = assert_cast(*data_type); diff --git a/tests/queries/0_stateless/01060_avro.reference b/tests/queries/0_stateless/01060_avro.reference index 192a86ca9bb..0cf84dab914 100644 --- a/tests/queries/0_stateless/01060_avro.reference +++ b/tests/queries/0_stateless/01060_avro.reference @@ -14,8 +14,8 @@ "79cd909892d7e7ade1987cc7422628ba" "79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" -18250,1578641516227,1578641516227000 +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659" +18250,1578641516227,1578641516227000,"7c856fd6-005f-46c7-a7b5-3a082ef6c659" = references "a1","c1" "a2","c2" @@ -52,7 +52,7 @@ not found = complex "A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" = logical_types -"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" +"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659" = other 0 1000 diff --git a/tests/queries/0_stateless/01060_avro.sh b/tests/queries/0_stateless/01060_avro.sh index 71f27c8491a..fe5d91c75c0 100755 --- a/tests/queries/0_stateless/01060_avro.sh +++ b/tests/queries/0_stateless/01060_avro.sh @@ -21,8 +21,8 @@ cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-fo cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table' echo = logical_types -cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -q 'select * from table' -cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64' -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" -q 'select * from table' +cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64, d_uuid UUID' -q 'select * from table' echo = references cat $DATA_DIR/references.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String, c String" -q 'select * from table' @@ -76,8 +76,8 @@ S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_stri echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table' echo = logical_types -S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' +S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" +echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' echo = other S4="a Int64" diff --git a/tests/queries/0_stateless/data_avro/generate_avro.sh b/tests/queries/0_stateless/data_avro/generate_avro.sh index 0bd8dad773b..5cdebc266cc 100755 --- a/tests/queries/0_stateless/data_avro/generate_avro.sh +++ b/tests/queries/0_stateless/data_avro/generate_avro.sh @@ -14,4 +14,4 @@ avro-tools fromjson --schema-file nested_complex.avsc nested_complex.json > nes #compression avro-tools fromjson --codec null --schema-file simple.avsc simple.json > simple.null.avro avro-tools fromjson --codec deflate --schema-file simple.avsc simple.json > simple.deflate.avro -avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro \ No newline at end of file +avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro diff --git a/tests/queries/0_stateless/data_avro/logical_types.avro b/tests/queries/0_stateless/data_avro/logical_types.avro index 7b8a3f60b7a078a39b5bf6b2f7d841ccde80c056..e176b62d8225df985f1fcea205144cce3efd194b 100644 GIT binary patch delta 183 zcmaFKbdFirKPiimMJ%zbC||EQIU_YUaS7{2VF5;4ooc1L#N1RRE2Wh9($dTnB^{-b z$^sz28cHh_mlS2@r31xt^3yYu6LUhq(n=u3lNA`nBK&iIR`Rjwu{y~;-~axSwmwtN zl(|f8v)@1c82w?{>o?6G&$4=$CtH}BrKOnZ8W@6(}&o9iZ;Cz