Reject field names that start with a digit when generating Protobuf/CapnProto
schemas, and make splitting column names by '_' opt-in per call site.

Review notes: the first-character check no longer passes a possibly negative
char to isdigit(), splitName() is documented, and the shell expansions in the
new test are quoted. The index lines of the two files touched by the review
were dropped because their post-image blob ids changed.

diff --git a/src/Formats/StructureToCapnProtoSchema.cpp b/src/Formats/StructureToCapnProtoSchema.cpp
index 9f4d96b7c8a..99298fadee1 100644
--- a/src/Formats/StructureToCapnProtoSchema.cpp
+++ b/src/Formats/StructureToCapnProtoSchema.cpp
@@ -126,7 +126,7 @@ String prepareNullableAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypeP
 String prepareTupleAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent)
 {
     const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type);
-    auto nested_names_and_types = getCollectedTupleElements(tuple_type);
+    auto nested_names_and_types = getCollectedTupleElements(tuple_type, false, "CapnProto");
 
     String struct_name = getSchemaMessageName(column_name);
     startStruct(buf, struct_name, indent);
@@ -222,7 +222,7 @@ String prepareAndGetCapnProtoTypeName(WriteBuffer & buf, const DataTypePtr & dat
 
 void StructureToCapnProtoSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_)
 {
-    auto names_and_types = collectNested(names_and_types_);
+    auto names_and_types = collectNested(names_and_types_, true, "CapnProto");
     writeCapnProtoHeader(buf);
     startStruct(buf, getSchemaMessageName(message_name), 0);
 
diff --git a/src/Formats/StructureToFormatSchemaUtils.cpp b/src/Formats/StructureToFormatSchemaUtils.cpp
--- a/src/Formats/StructureToFormatSchemaUtils.cpp
+++ b/src/Formats/StructureToFormatSchemaUtils.cpp
@@ -4,6 +4,11 @@
 namespace DB
 {
 
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
 namespace StructureToFormatSchemaUtils
 {
 
@@ -57,27 +62,39 @@ String getSchemaMessageName(const String & column_name)
 
 namespace
 {
-    std::pair<String, String> splitName(const String & name)
+    /// Splits `name` at the first separator: '.' always, and also '_' when
+    /// `allow_split_by_underscore` is set. Returns {prefix, rest}; `rest` is
+    /// empty when there is no separator (or nothing follows it).
+    std::pair<String, String> splitName(const String & name, bool allow_split_by_underscore)
     {
         const auto * begin = name.data();
         const auto * end = name.data() + name.size();
-        const auto * it = find_first_symbols<'_', '.'>(begin, end);
+        const char * it = nullptr;
+        if (allow_split_by_underscore)
+            it = find_first_symbols<'_', '.'>(begin, end);
+        else
+            it = find_first_symbols<'.'>(begin, end);
         String first = String(begin, it);
         String second = it == end ? "" : String(it + 1, end);
         return {std::move(first), std::move(second)};
     }
 }
 
-NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types)
+NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types, bool allow_split_by_underscore, const String & format_name)
 {
-    /// Find all columns with dots '.' or underscores '_' and move them into a tuple.
+    /// Find all columns with dots '.' or underscores '_' (if allowed) and move them into a tuple.
     /// For example if we have columns 'a.b UInt32, a.c UInt32, x_y String' we will
     /// change it to 'a Tuple(b UInt32, c UInt32), x Tuple(y String)'
     NamesAndTypesList result;
     std::unordered_map<String, NamesAndTypesList> nested;
     for (const auto & [name, type] : names_and_types)
     {
-        auto [field_name, nested_name] = splitName(name);
+        auto [field_name, nested_name] = splitName(name, allow_split_by_underscore);
+        /// An empty prefix (name starts with a separator) has no first character to test;
+        /// the cast keeps isdigit() well-defined for bytes >= 0x80 (e.g. UTF-8 names).
+        if (!field_name.empty() && isdigit(static_cast<unsigned char>(field_name[0])))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format {} doesn't support field names that starts with a digit: '{}'", format_name, field_name);
+
         if (nested_name.empty())
             result.emplace_back(name, type);
         else
@@ -90,7 +107,7 @@ NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types)
     return result;
 }
 
-NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type)
+NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type, bool allow_split_by_underscore, const String & format_name)
 {
     const auto & nested_types = tuple_type.getElements();
     Names nested_names;
@@ -109,7 +126,7 @@ NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type)
     for (size_t i = 0; i != nested_names.size(); ++i)
         result.emplace_back(nested_names[i], nested_types[i]);
 
-    return collectNested(result);
+    return collectNested(result, allow_split_by_underscore, format_name);
 }
 
 }
diff --git a/src/Formats/StructureToFormatSchemaUtils.h b/src/Formats/StructureToFormatSchemaUtils.h
index c6b86501ac8..f5bd38a40a6 100644
--- a/src/Formats/StructureToFormatSchemaUtils.h
+++ b/src/Formats/StructureToFormatSchemaUtils.h
@@ -19,9 +19,9 @@ namespace StructureToFormatSchemaUtils
 
     String getSchemaMessageName(const String & column_name);
 
-    NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types);
+    NamesAndTypesList collectNested(const NamesAndTypesList & names_and_types, bool allow_split_by_underscore, const String & format_name);
 
-    NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type);
+    NamesAndTypesList getCollectedTupleElements(const DataTypeTuple & tuple_type, bool allow_split_by_underscore, const String & format_name);
 }
 
 }
diff --git a/src/Formats/StructureToProtobufSchema.cpp b/src/Formats/StructureToProtobufSchema.cpp
index 4a704e8d428..178c0ae3cc2 100644
--- a/src/Formats/StructureToProtobufSchema.cpp
+++ b/src/Formats/StructureToProtobufSchema.cpp
@@ -105,7 +105,7 @@ String prepareArrayAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr &
 String prepareTupleAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data_type, const String & column_name, size_t indent)
 {
     const auto & tuple_type = assert_cast<const DataTypeTuple &>(*data_type);
-    auto nested_names_and_types = getCollectedTupleElements(tuple_type);
+    auto nested_names_and_types = getCollectedTupleElements(tuple_type, false, "Protobuf");
 
     String message_name = getSchemaMessageName(column_name);
     startMessage(buf, message_name, indent);
@@ -202,7 +202,7 @@ String prepareAndGetProtobufTypeName(WriteBuffer & buf, const DataTypePtr & data
 
 void StructureToProtobufSchema::writeSchema(WriteBuffer & buf, const String & message_name, const NamesAndTypesList & names_and_types_)
 {
-    auto names_and_types = collectNested(names_and_types_);
+    auto names_and_types = collectNested(names_and_types_, false, "Protobuf");
     writeProtobufHeader(buf);
     startMessage(buf, getSchemaMessageName(message_name), 0);
     size_t field_index = 1;
diff --git a/tests/queries/0_stateless/02905_structure_to_schema_bad_names.reference b/tests/queries/0_stateless/02905_structure_to_schema_bad_names.reference
new file mode 100644
index 00000000000..2759a2351a0
--- /dev/null
+++ b/tests/queries/0_stateless/02905_structure_to_schema_bad_names.reference
@@ -0,0 +1,7 @@
+
+message Message
+{
+    uint32 col_1 = 1;
+}
+1
+1
diff --git a/tests/queries/0_stateless/02905_structure_to_schema_bad_names.sh b/tests/queries/0_stateless/02905_structure_to_schema_bad_names.sh
new file mode 100755
--- /dev/null
+++ b/tests/queries/0_stateless/02905_structure_to_schema_bad_names.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+SCHEMA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME-schema
+
+$CLICKHOUSE_LOCAL -q "select 42 as col_1 format Protobuf settings output_format_schema='$SCHEMA_FILE.proto'" > /dev/null
+tail -n +2 "$SCHEMA_FILE.proto"
+
+$CLICKHOUSE_LOCAL -q "select 42 as \`col.1\` format Protobuf" 2>&1 | grep -c -F "BAD_ARGUMENTS"
+$CLICKHOUSE_LOCAL -q "select 42 as \`col.1\` format CapnProto" 2>&1 | grep -c -F "BAD_ARGUMENTS"
+
+rm "$SCHEMA_FILE"*
+