SELECT String from ClickHouse as Avro string - use RE2::PartialMatch instead of FullMatch for the column name pattern

This commit is contained in:
Ilya Golshtein 2021-07-19 13:45:24 +03:00
parent 1fc822b19d
commit 8204516882
4 changed files with 18 additions and 11 deletions

View File

@ -67,11 +67,9 @@ public:
bool isStringAsString(const String & column_name)
{
return RE2::FullMatch(column_name, string_to_string_regexp);
return RE2::PartialMatch(column_name, string_to_string_regexp);
}
~AvroSerializerTraits() = default;
private:
const RE2 string_to_string_regexp;
};

View File

@ -36,7 +36,7 @@ private:
};
/// Type names for different complex types (e.g. enums, fixed strings) must be unique. We use a simple incrementing number to give them different names.
/*static*/ SchemaWithSerializeFn createSchemaWithSerializeFn(DataTypePtr data_type, size_t & type_name_increment, const String & column_name);
SchemaWithSerializeFn createSchemaWithSerializeFn(DataTypePtr data_type, size_t & type_name_increment, const String & column_name);
std::vector<SerializeFn> serialize_fns;
avro::ValidSchema valid_schema;

View File

@ -61,6 +61,8 @@ not found
= string column pattern
"русская строка"
Ok
1 0
1 1
1 1
1 1 0
1 0 0
0 1 0
1 1 1
1 1 1

View File

@ -97,14 +97,21 @@ ${CLICKHOUSE_LOCAL} -q "select 'русская строка' as a format Avro S
${CLICKHOUSE_LOCAL} -q "select '\x61\xF0\x80\x80\x80b' as a format Avro" > /dev/null && echo Ok
A_NEEDLE="'\"name\":\"a\",\"type\":\"string\"'"
AAA_NEEDLE="'\"name\":\"aaa\",\"type\":\"string\"'"
B_NEEDLE="'\"name\":\"b\",\"type\":\"string\"'"
PATTERNQUERY="select 'русская строка' as a, 'русская строка' as b format Avro SETTINGS output_format_avro_string_column_pattern ="
PATTERNQUERY="select 'русская строка' as a, 'русская строка' as aaa, 'русская строка' as b format Avro SETTINGS output_format_avro_string_column_pattern ="
PATTERNPATTERN="'a'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
PATTERNPATTERN="'^a$'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
PATTERNPATTERN="'aaa'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
PATTERNPATTERN="'a|b'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
PATTERNPATTERN="'.*'"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"
${CLICKHOUSE_LOCAL} -q "$PATTERNQUERY $PATTERNPATTERN" | tr -d '\n' | ${CLICKHOUSE_LOCAL} --structure "avro_raw String" --input-format LineAsString -q "select countSubstrings(avro_raw, $A_NEEDLE), countSubstrings(avro_raw, $AAA_NEEDLE), countSubstrings(avro_raw, $B_NEEDLE) from table"