Merge pull request #11954 from oandrew/avro-uuid

Avro UUID support
This commit is contained in:
alexey-milovidov 2020-06-26 14:09:54 +03:00 committed by GitHub
commit 751d36f7c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 40 additions and 11 deletions

2
contrib/avro vendored

@ -1 +1 @@
Subproject commit 6cfcf6c24293af100d523b89b61d1ab216fa4735 Subproject commit 92caca2d42fc9a97e34e95f963593539d32ed331

View File

@ -23,6 +23,7 @@
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeTuple.h> #include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/IDataType.h> #include <DataTypes/IDataType.h>
#include <DataTypes/getLeastSupertype.h> #include <DataTypes/getLeastSupertype.h>
@ -70,6 +71,7 @@ namespace ErrorCodes
extern const int INCORRECT_DATA; extern const int INCORRECT_DATA;
extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_COLUMN;
extern const int TYPE_MISMATCH; extern const int TYPE_MISMATCH;
extern const int CANNOT_PARSE_UUID;
} }
class InputStreamReadBufferAdapter : public avro::InputStream class InputStreamReadBufferAdapter : public avro::InputStream
@ -176,6 +178,19 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node
{ {
case avro::AVRO_STRING: [[fallthrough]]; case avro::AVRO_STRING: [[fallthrough]];
case avro::AVRO_BYTES: case avro::AVRO_BYTES:
if (target.isUUID())
{
    /// Avro string/bytes value -> UUID column.
    /// The captured string is a scratch buffer that is reused between calls of the returned functor.
    return [text = std::string()](IColumn & column, avro::Decoder & decoder) mutable
    {
        decoder.decodeString(text);

        /// Only the 36-character textual form (e.g. "7c856fd6-005f-46c7-a7b5-3a082ef6c659") is accepted.
        if (text.size() != 36)
            throw Exception("Cannot parse uuid " + text, ErrorCodes::CANNOT_PARSE_UUID);

        UUID parsed;
        const auto * src = reinterpret_cast<const UInt8 *>(text.data());
        /// The destination is written through a reverse iterator that starts one past the 16th byte of the value,
        /// i.e. the parsed bytes are stored back to front.
        /// NOTE(review): presumably this matches the in-memory layout expected by the UUID column - confirm.
        auto * dst_end = reinterpret_cast<UInt8 *>(&parsed) + 16;
        parseUUID(src, std::reverse_iterator<UInt8 *>(dst_end));

        assert_cast<DataTypeUUID::ColumnType &>(column).insertValue(parsed);
    };
}
if (target.isString() || target.isFixedString()) if (target.isString() || target.isFixedString())
{ {
return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable return [tmp = std::string()](IColumn & column, avro::Decoder & decoder) mutable

View File

@ -17,6 +17,7 @@
#include <DataTypes/DataTypeEnum.h> #include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeLowCardinality.h> #include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeNullable.h> #include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/ColumnArray.h> #include <Columns/ColumnArray.h>
#include <Columns/ColumnFixedString.h> #include <Columns/ColumnFixedString.h>
@ -207,6 +208,18 @@ AvroSerializer::SchemaWithSerializeFn AvroSerializer::createSchemaWithSerializeF
encoder.encodeEnum(enum_mapping.at(enum_value)); encoder.encodeEnum(enum_mapping.at(enum_value));
}}; }};
} }
case TypeIndex::UUID:
{
    /// UUID column -> Avro string annotated with the "uuid" logical type.
    auto uuid_schema = avro::StringSchema();
    uuid_schema.root()->setLogicalType(avro::LogicalType(avro::LogicalType::UUID));
    return {uuid_schema, [](const IColumn & column, size_t row_num, avro::Encoder & encoder)
    {
        const auto & value = assert_cast<const DataTypeUUID::ColumnType &>(column).getElement(row_num);

        /// Fixed-size buffer for the 36-character textual form; no terminating zero is stored.
        std::array<UInt8, 36> formatted;

        /// The source is read through a reverse iterator that starts one past the 16th byte of the value,
        /// i.e. the bytes of the value are consumed back to front.
        const auto * value_end = reinterpret_cast<const UInt8 *>(&value) + 16;
        formatUUID(std::reverse_iterator<const UInt8 *>(value_end), formatted.data());

        /// NOTE(review): the schema is a string, but the value is written with encodeBytes;
        /// presumably this relies on string and bytes sharing the same wire representation - confirm.
        encoder.encodeBytes(reinterpret_cast<const uint8_t *>(formatted.data()), formatted.size());
    }};
}
case TypeIndex::Array: case TypeIndex::Array:
{ {
const auto & array_type = assert_cast<const DataTypeArray &>(*data_type); const auto & array_type = assert_cast<const DataTypeArray &>(*data_type);

View File

@ -14,8 +14,8 @@
"79cd909892d7e7ade1987cc7422628ba" "79cd909892d7e7ade1987cc7422628ba"
"79cd909892d7e7ade1987cc7422628ba" "79cd909892d7e7ade1987cc7422628ba"
= logical_types = logical_types
"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" "2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"
18250,1578641516227,1578641516227000 18250,1578641516227,1578641516227000,"7c856fd6-005f-46c7-a7b5-3a082ef6c659"
= references = references
"a1","c1" "a1","c1"
"a2","c2" "a2","c2"
@ -52,7 +52,7 @@ not found
= complex = complex
"A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba" "A","t","['s1','s2']","[['a1'],['a2']]","s1",\N,"79cd909892d7e7ade1987cc7422628ba"
= logical_types = logical_types
"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000" "2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"
= other = other
0 0
1000 1000

View File

@ -21,8 +21,8 @@ cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-fo
cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table' cat $DATA_DIR/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table'
echo = logical_types echo = logical_types
cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" -q 'select * from table' cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" -q 'select * from table'
cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64' -q 'select * from table' cat $DATA_DIR/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64, d_uuid UUID' -q 'select * from table'
echo = references echo = references
cat $DATA_DIR/references.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String, c String" -q 'select * from table' cat $DATA_DIR/references.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String, c String" -q 'select * from table'
@ -76,8 +76,8 @@ S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_stri
echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table' echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table'
echo = logical_types echo = logical_types
S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC')" S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID"
echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table' echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table'
echo = other echo = other
S4="a Int64" S4="a Int64"

View File

@ -14,4 +14,4 @@ avro-tools fromjson --schema-file nested_complex.avsc nested_complex.json > nes
#compression #compression
avro-tools fromjson --codec null --schema-file simple.avsc simple.json > simple.null.avro avro-tools fromjson --codec null --schema-file simple.avsc simple.json > simple.null.avro
avro-tools fromjson --codec deflate --schema-file simple.avsc simple.json > simple.deflate.avro avro-tools fromjson --codec deflate --schema-file simple.avsc simple.json > simple.deflate.avro
avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro avro-tools fromjson --codec snappy --schema-file simple.avsc simple.json > simple.snappy.avro

View File

@ -4,6 +4,7 @@
"fields": [ "fields": [
{"name": "a_date", "type": { "type": "int", "logicalType": "date"}}, {"name": "a_date", "type": { "type": "int", "logicalType": "date"}},
{"name": "b_timestamp_millis", "type": { "type": "long", "logicalType": "timestamp-millis"}}, {"name": "b_timestamp_millis", "type": { "type": "long", "logicalType": "timestamp-millis"}},
{"name": "c_timestamp_micros", "type": { "type": "long", "logicalType": "timestamp-micros"}} {"name": "c_timestamp_micros", "type": { "type": "long", "logicalType": "timestamp-micros"}},
{"name": "d_uuid", "type": { "type": "string", "logicalType": "uuid"}}
] ]
} }

View File

@ -1 +1 @@
{"a_date":18250,"b_timestamp_millis":1578641516227,"c_timestamp_micros":1578641516227000} {"a_date":18250,"b_timestamp_millis":1578641516227,"c_timestamp_micros":1578641516227000, "d_uuid":"7c856fd6-005f-46c7-a7b5-3a082ef6c659"}