#!/usr/bin/env bash
# Avro format test: decodes Avro fixture files and round-trips generated data
# through ${CLICKHOUSE_LOCAL}, printing results as CSV on stdout.

# Abort on the first failing command.
set -e

# Absolute directory containing this script, independent of the caller's cwd.
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# NOTE(review): CLICKHOUSE_LOCAL is presumably defined by shell_config.sh — confirm.
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# Directory holding the *.avro fixture files read by the input tests.
DATA_DIR=$CUR_DIR/data_avro
# input
#
# Each case pipes one Avro fixture from $DATA_DIR into ${CLICKHOUSE_LOCAL},
# declares the expected table structure with -S and prints the rows as CSV.
# NOTE(review): ${CLICKHOUSE_LOCAL} is left unquoted as in the original;
# presumably it may hold a command plus arguments — confirm.
echo '===' input
echo '=' primitive

# All columns, a subset, a reordered subset, and a single column.
cat "$DATA_DIR"/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_bool UInt8, b_int Int32, c_long Int64, d_float Float32, e_double Float64, f_bytes String, g_string String' -q 'select * from table'
cat "$DATA_DIR"/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_bool UInt8, c_long Int64, g_string String' -q 'select * from table'
cat "$DATA_DIR"/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'g_string String, c_long Int64, a_bool UInt8' -q 'select * from table'
cat "$DATA_DIR"/primitive.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'g_string String' -q 'select * from table'

echo '=' complex
cat "$DATA_DIR"/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_string Array(String), d_array_array_string Array(Array(String)), e_union_null_string Nullable(String), f_union_long_null Nullable(Int64), g_fixed FixedString(32)" -q 'select * from table'
cat "$DATA_DIR"/complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "g_fixed FixedString(32)" -q 'select * from table'

echo '=' logical_types
# Same file read twice: once with date/time column types, once with plain integers.
cat "$DATA_DIR"/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID" -q 'select * from table'
cat "$DATA_DIR"/logical_types.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a_date Int32, b_timestamp_millis Int64, c_timestamp_micros Int64, d_uuid UUID' -q 'select * from table'

echo '=' references
cat "$DATA_DIR"/references.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String, c String" -q 'select * from table'

echo '=' nested
cat "$DATA_DIR"/nested.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64, "b.a" String, "b.b" Double, "b.c" Double, c String' -q 'select * from table'
cat "$DATA_DIR"/nested.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S '"b.c" Double, "b.a" String, a Int64, c String' -q 'select * from table'
# Error case: stderr is merged into stdout and only the matched phrase is printed.
cat "$DATA_DIR"/nested.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S '"b" Double' -q 'select * from table' 2>&1 | grep -i 'not compatible' -o

echo '=' nested_complex

# special case union(null, T)
cat "$DATA_DIR"/nested_complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S '"b.b2_null_str" Nullable(String)' -q 'select * from table'

# union branch to non-null with default
cat "$DATA_DIR"/nested_complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "\"b.b2_null_str.string\" String default 'default'" -q 'select * from table'

# union branch to nullable
cat "$DATA_DIR"/nested_complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "\"b.b2_null_str.string\" Nullable(String)" -q 'select * from table'

# multiple union branches simultaneously
cat "$DATA_DIR"/nested_complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "\"b.b3_null_str_double.string\" Nullable(String), \"b.b3_null_str_double.double\" Nullable(Double)" -q 'select * from table'

# and even nested recursive structures!
cat "$DATA_DIR"/nested_complex.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "\"b.b4_null_sub1.sub1.b2_null_str\" Nullable(String)" -q 'select * from table'

echo '=' compression
cat "$DATA_DIR"/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table'
cat "$DATA_DIR"/simple.deflate.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table'

#snappy is optional
#cat $DATA_DIR/simple.snappy.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table'

echo '=' other

#no data
cat "$DATA_DIR"/empty.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int64' -q 'select count() from table'

# type mismatch
cat "$DATA_DIR"/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'a Int32' -q 'select count() from table'

# field not found
# Error case: stderr is merged into stdout and only the matched phrase is printed.
cat "$DATA_DIR"/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S 'b Int64' -q 'select count() from table' 2>&1 | grep -i 'not found' -o

# allow_missing_fields
cat "$DATA_DIR"/simple.null.avro | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV --input_format_avro_allow_missing_fields 1 -S 'b Int64' -q 'select count() from table'

# output
#
# Round trips: a CSV row (or a generated query result) is written as Avro by one
# ${CLICKHOUSE_LOCAL} invocation and read back as CSV by a second one, using the
# same structure string (S1..S4) on both sides.
# NOTE(review): ${CLICKHOUSE_LOCAL} is left unquoted as in the original;
# presumably it may hold a command plus arguments — confirm.
echo '===' output

echo '=' primitive

S1="a_bool UInt8, b_int Int32, c_long Int64, d_float Float32, e_double Float64, f_bytes String, g_string String"
echo '1,1,2,3.4,5.6,"b1","s1"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S1" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S1" -q 'select * from table'

echo '=' complex

S2="a_enum_to_string String, b_enum_to_enum Enum('t' = 1, 'f' = 0), c_array_string Array(String), d_array_array_string Array(Array(String)), e_union_null_string Nullable(String), f_union_long_null Nullable(Int64), g_fixed FixedString(32)"
echo "\"A\",\"t\",\"['s1','s2']\",\"[['a1'],['a2']]\",\"s1\",\N,\"79cd909892d7e7ade1987cc7422628ba\"" | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S2" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S2" -q 'select * from table'

echo '=' logical_types

S3="a_date Date, b_timestamp_millis DateTime64(3, 'UTC'), c_timestamp_micros DateTime64(6, 'UTC'), d_uuid UUID"
echo '"2019-12-20","2020-01-10 07:31:56.227","2020-01-10 07:31:56.227000","7c856fd6-005f-46c7-a7b5-3a082ef6c659"' | ${CLICKHOUSE_LOCAL} --input-format CSV -S "$S3" -q "select * from table format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S3" -q 'select * from table'

echo '=' other

S4="a Int64"
# Row counts after a round trip of an empty result and of 1000 rows.
${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(0) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table'
${CLICKHOUSE_LOCAL} -q "select toInt64(number) as a from numbers(1000) format Avro" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "$S4" -q 'select count() from table'

# type supported via conversion
# Prints the size in bytes of the Avro output; tr removes any spaces wc emits.
${CLICKHOUSE_LOCAL} -q "select toInt16(123) as a format Avro" | wc -c | tr -d ' '
# String column pattern: checks of the output_format_avro_string_column_pattern
# setting.
# NOTE(review): ${CLICKHOUSE_LOCAL} is left unquoted as in the original;
# presumably it may hold a command plus arguments — confirm.
echo '=' string column pattern
${CLICKHOUSE_LOCAL} -q "select 'русская строка' as a format Avro SETTINGS output_format_avro_string_column_pattern = 'a'" | ${CLICKHOUSE_LOCAL} --input-format Avro --output-format CSV -S "a String" -q 'select * from table'

# it is expected that invalid UTF-8 can be created
${CLICKHOUSE_LOCAL} -q "select '\x61\xF0\x80\x80\x80b' as a format Avro" > /dev/null && echo Ok

# SQL string literals (single quotes included) holding the schema fragment that
# is counted for each column name in the raw Avro output.
A_NEEDLE="'\"name\":\"a\",\"type\":\"string\"'"
AAA_NEEDLE="'\"name\":\"aaa\",\"type\":\"string\"'"
B_NEEDLE="'\"name\":\"b\",\"type\":\"string\"'"

# Query prefix; each case below appends its own quoted pattern after the '='.
PATTERNQUERY="select 'русская строка' as a, 'русская строка' as aaa, 'русская строка' as b format Avro SETTINGS output_format_avro_string_column_pattern ="

# For every pattern: write Avro, strip newlines so the raw bytes arrive as one
# LineAsString row, then count occurrences of each needle in that row.
PATTERNPATTERN="'a'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"

PATTERNPATTERN="'^a$'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"

PATTERNPATTERN="'aaa'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"

PATTERNPATTERN="'a|b'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"

PATTERNPATTERN="'.*'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"