Merge pull request #55974 from Avogar/fix-protobuf-auto-schema

Fix autogenerated Protobuf schema with fields with underscore
This commit is contained in:
Kruglov Pavel 2023-11-01 18:17:09 +01:00 committed by GitHub
commit 754ab9fa6c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 49 additions and 13 deletions

View File

@ -126,7 +126,7 @@ String prepareNullableAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypeP
String prepareTupleAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) String prepareTupleAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent)
{ {
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type); const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type);
auto nested_names_and_types = getCollectedTupleElements(tuple_type); auto nested_names_and_types = getCollectedTupleElements(tuple_type, false, "CapnProto");
String struct_name = getSchemaMessageName(column_name); String struct_name = getSchemaMessageName(column_name);
startStruct(buf, struct_name, indent); startStruct(buf, struct_name, indent);
@ -222,7 +222,7 @@ String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & dat
void StructureToCapnProtoSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) void StructureToCapnProtoSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_)
{ {
auto names_and_types = collectNested(names_and_types_); auto names_and_types = collectNested(names_and_types_, true, "CapnProto");
writeCapnProtoHeader(buf); writeCapnProtoHeader(buf);
startStruct(buf, getSchemaMessageName(message_name), 0); startStruct(buf, getSchemaMessageName(message_name), 0);

View File

@ -4,6 +4,11 @@
namespace DB namespace DB
{ {
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace StructureToFormatSchemaUtils namespace StructureToFormatSchemaUtils
{ {
@ -57,27 +62,34 @@ String getSchemaMessageName(const String & column_name)
namespace namespace
{ {
std::pair<String, String> splitName(const String & name) std::pair<String, String> splitName(const String & name, bool allow_split_by_underscore)
{ {
const auto * begin = name.data(); const auto * begin = name.data();
const auto * end = name.data() + name.size(); const auto * end = name.data() + name.size();
const auto * it = find_first_symbols<'_', '.'>(begin, end); const char * it = nullptr;
if (allow_split_by_underscore)
it = find_first_symbols<'_', '.'>(begin, end);
else
it = find_first_symbols<'.'>(begin, end);
String first = String(begin, it); String first = String(begin, it);
String second = it == end ? "" : String(it + 1, end); String second = it == end ? "" : String(it + 1, end);
return {std::move(first), std::move(second)}; return {std::move(first), std::move(second)};
} }
} }
NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types) NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types, bool allow_split_by_underscore, const String & format_name)
{ {
/// Find all columns with dots '.' or underscores '_' and move them into a tuple. /// Find all columns with dots '.' or underscores '_' (if allowed) and move them into a tuple.
/// For example if we have columns 'a.b UInt32, a.c UInt32, x_y String' we will /// For example if we have columns 'a.b UInt32, a.c UInt32, x_y String' we will
/// change it to 'a Tuple(b UInt32, c UInt32), x Tuple(y String)' /// change it to 'a Tuple(b UInt32, c UInt32), x Tuple(y String)'
NamesAndTypesList result; NamesAndTypesList result;
std::unordered_map<String, NamesAndTypesList> nested; std::unordered_map<String, NamesAndTypesList> nested;
for (const auto & [name, type] : names_and_types) for (const auto & [name, type] : names_and_types)
{ {
auto [field_name, nested_name] = splitName(name); auto [field_name, nested_name] = splitName(name, allow_split_by_underscore);
if (isdigit(field_name[0]))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format {} doesn't support field names that starts with a digit: '{}'", format_name, field_name);
if (nested_name.empty()) if (nested_name.empty())
result.emplace_back(name, type); result.emplace_back(name, type);
else else
@ -90,7 +102,7 @@ NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types)
return result; return result;
} }
NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type) NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type, bool allow_split_by_underscore, const String & format_name)
{ {
const auto & nested_types = tuple_type.getElements(); const auto & nested_types = tuple_type.getElements();
Names nested_names; Names nested_names;
@ -109,7 +121,7 @@ NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type)
for (size_t i = 0; i != nested_names.size(); ++i) for (size_t i = 0; i != nested_names.size(); ++i)
result.emplace_back(nested_names[i], nested_types[i]); result.emplace_back(nested_names[i], nested_types[i]);
return collectNested(result); return collectNested(result, allow_split_by_underscore, format_name);
} }
} }

View File

@ -19,9 +19,9 @@ namespace StructureToFormatSchemaUtils
String getSchemaMessageName(const String & column_name); String getSchemaMessageName(const String & column_name);
NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types); NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types, bool allow_split_by_underscore, const String & format_name);
NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type); NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type, bool allow_split_by_underscore, const String & format_name);
} }
} }

View File

@ -105,7 +105,7 @@ String prepareArrayAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr &
String prepareTupleAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent) String prepareTupleAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent)
{ {
const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type); const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type);
auto nested_names_and_types = getCollectedTupleElements(tuple_type); auto nested_names_and_types = getCollectedTupleElements(tuple_type, false, "Protobuf");
String message_name = getSchemaMessageName(column_name); String message_name = getSchemaMessageName(column_name);
startMessage(buf, message_name, indent); startMessage(buf, message_name, indent);
@ -202,7 +202,7 @@ String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data
void StructureToProtobufSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_) void StructureToProtobufSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_)
{ {
auto names_and_types = collectNested(names_and_types_); auto names_and_types = collectNested(names_and_types_, false, "Protobuf");
writeProtobufHeader(buf); writeProtobufHeader(buf);
startMessage(buf, getSchemaMessageName(message_name), 0); startMessage(buf, getSchemaMessageName(message_name), 0);
size_t field_index = 1; size_t field_index = 1;

View File

@ -0,0 +1,7 @@
message Message
{
uint32 col_1 = 1;
}
1
1

View File

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Tags: no-fasttest
# Checks the autogenerated Protobuf schema for a column whose name contains an
# underscore, and checks that a nested field name starting with a digit is
# reported as BAD_ARGUMENTS by both the Protobuf and CapnProto output formats.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Base name for the generated schema file, built from the per-test unique name.
SCHEMA_FILE="${CLICKHOUSE_TEST_UNIQUE_NAME}-schema"
# NOTE: $CLICKHOUSE_LOCAL is intentionally left unquoted -- presumably it may expand
# to a command plus arguments (it is defined in shell_config.sh; confirm there).
$CLICKHOUSE_LOCAL -q "select 42 as col_1 format Protobuf settings output_format_schema='$SCHEMA_FILE.proto'" > /dev/null
# Print the generated schema starting from its second line (the first line is skipped).
tail -n +2 "$SCHEMA_FILE.proto"
# Each grep -c prints the number of output lines (stdout + stderr) mentioning BAD_ARGUMENTS.
$CLICKHOUSE_LOCAL -q "select 42 as \`col.1\` format Protobuf" 2>&1 | grep -c -F "BAD_ARGUMENTS"
$CLICKHOUSE_LOCAL -q "select 42 as \`col.1\` format CapnProto" 2>&1 | grep -c -F "BAD_ARGUMENTS"
# Remove every file whose name starts with the schema base name; the variable is quoted
# so it is not word-split, while the trailing '*' stays outside the quotes to keep globbing.
rm "$SCHEMA_FILE"*