Backport #70697 to 24.8: Fix infinite recursion when inferring a proto schema with skip unsupported fields enabled

This commit is contained in:
robot-clickhouse 2024-10-16 17:06:33 +00:00
parent d2acaba718
commit f4d546fd2e
4 changed files with 126 additions and 17 deletions

View File

@ -3725,20 +3725,14 @@ namespace
const google::protobuf::FieldDescriptor * field_descriptor,
bool skip_unsupported_fields,
bool allow_repeat,
std::unordered_set<const google::protobuf::FieldDescriptor *> & pending_resolution)
std::unordered_set<const google::protobuf::FieldDescriptor *> & unresolved_descriptors)
{
if (pending_resolution.contains(field_descriptor))
{
if (skip_unsupported_fields)
return std::nullopt;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "ClickHouse doesn't support type recursion ({})", field_descriptor->full_name());
}
pending_resolution.emplace(field_descriptor);
SCOPE_EXIT({ pending_resolution.erase(field_descriptor); });
chassert(unresolved_descriptors.contains(field_descriptor));
if (allow_repeat && field_descriptor->is_map())
{
auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false);
/// We don't add the same unresolved descriptor again since we are trying to re-resolve and put it under a Tuple
auto name_and_type
= getNameAndDataTypeFromFieldRecursive(field_descriptor, skip_unsupported_fields, false, unresolved_descriptors);
if (!name_and_type)
return std::nullopt;
const auto * tuple_type = assert_cast<const DataTypeTuple *>(name_and_type->type.get());
@ -3747,7 +3741,9 @@ namespace
if (allow_repeat && field_descriptor->is_repeated())
{
auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false);
/// We don't add the same unresolved descriptor again since we are trying to re-resolve and put it under an Array
auto name_and_type
= getNameAndDataTypeFromFieldRecursive(field_descriptor, skip_unsupported_fields, false, unresolved_descriptors);
if (!name_and_type)
return std::nullopt;
return NameAndTypePair{name_and_type->name, std::make_shared<DataTypeArray>(name_and_type->type)};
@ -3814,10 +3810,21 @@ namespace
else if (message_descriptor->field_count() == 1)
{
const auto * nested_field_descriptor = message_descriptor->field(0);
auto nested_name_and_type
= getNameAndDataTypeFromFieldRecursive(nested_field_descriptor, skip_unsupported_fields, true, pending_resolution);
if (auto p = unresolved_descriptors.emplace(nested_field_descriptor); !p.second)
{
if (skip_unsupported_fields)
return std::nullopt;
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"ClickHouse doesn't support type recursion ({})",
nested_field_descriptor->full_name());
}
auto nested_name_and_type = getNameAndDataTypeFromFieldRecursive(
nested_field_descriptor, skip_unsupported_fields, true, unresolved_descriptors);
if (!nested_name_and_type)
return std::nullopt;
unresolved_descriptors.erase(nested_field_descriptor);
return NameAndTypePair{field_descriptor->name() + "_" + nested_name_and_type->name, nested_name_and_type->type};
}
else
@ -3826,10 +3833,20 @@ namespace
Strings nested_names;
for (int i = 0; i != message_descriptor->field_count(); ++i)
{
if (auto p = unresolved_descriptors.emplace(message_descriptor->field(i)); !p.second)
{
if (skip_unsupported_fields)
continue;
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"ClickHouse doesn't support type recursion ({})",
message_descriptor->field(i)->full_name());
}
auto nested_name_and_type = getNameAndDataTypeFromFieldRecursive(
message_descriptor->field(i), skip_unsupported_fields, true, pending_resolution);
message_descriptor->field(i), skip_unsupported_fields, true, unresolved_descriptors);
if (!nested_name_and_type)
continue;
unresolved_descriptors.erase(message_descriptor->field(i));
nested_types.push_back(nested_name_and_type->type);
nested_names.push_back(nested_name_and_type->name);
}
@ -3848,8 +3865,9 @@ namespace
const google::protobuf::FieldDescriptor * field_descriptor, bool skip_unsupported_fields, bool allow_repeat = true)
{
/// Keep track of the fields that are pending resolution to avoid recursive types, which are unsupported
std::unordered_set<const google::protobuf::FieldDescriptor *> pending_resolution{};
return getNameAndDataTypeFromFieldRecursive(field_descriptor, skip_unsupported_fields, allow_repeat, pending_resolution);
std::unordered_set<const google::protobuf::FieldDescriptor *> unresolved_descriptors{};
unresolved_descriptors.emplace(field_descriptor);
return getNameAndDataTypeFromFieldRecursive(field_descriptor, skip_unsupported_fields, allow_repeat, unresolved_descriptors);
}
}

View File

@ -0,0 +1,19 @@
1
Row 1:
──────
name: fields
type: Map(String, Tuple(
null_value Enum8('NULL_VALUE' = 0),
number_value Float64,
string_value String,
bool_value UInt8,
list_value_values Array(Tuple(
null_value Enum8('NULL_VALUE' = 0),
number_value Float64,
string_value String,
bool_value UInt8))))
default_type:
default_expression:
comment:
codec_expression:
ttl_expression:

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Tags: no-fasttest

# Runs Protobuf schema inference (DESCRIBE TABLE) against the `Struct` message of
# 03252_recursive_type.proto twice, toggling
# input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference:
#   * with the setting at 0, prints how many output lines mention BAD_ARGUMENTS;
#   * with the setting at 1, prints the inferred schema itself.

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

SCHEMADIR="$CUR_DIR/format_schemas"

# describe_struct <0|1>: run the DESCRIBE query with the skip-unsupported-fields setting set to $1.
describe_struct()
{
    $CLICKHOUSE_LOCAL -q "DESCRIBE TABLE file('nonexist', 'Protobuf') FORMAT Vertical SETTINGS format_schema='$SCHEMADIR/03252_recursive_type.proto:Struct', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=$1"
}

# stderr is merged into stdout so that the error text reaches grep.
describe_struct 0 2>&1 | grep -c BAD_ARGUMENTS
describe_struct 1

View File

@ -0,0 +1,62 @@
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Based on Google's struct.proto (see above license)
syntax = "proto3";
// A map from string keys to Value messages.
// Value may in turn contain a Struct (Value.struct_value), so this type is recursive.
message Struct {
map<string, Value> fields = 1;
}
// A single value. All alternatives live in the `kind` oneof, so at most one of them is set.
message Value {
oneof kind {
NullValue null_value = 1;
double number_value = 2;
string string_value = 3;
bool bool_value = 4;
// Refers back to Struct, closing the Struct -> Value -> Struct cycle.
Struct struct_value = 5;
// ListValue holds repeated Value, giving a second cycle: Value -> ListValue -> Value.
ListValue list_value = 6;
}
}
// Enum with a single member; used as the type of Value.null_value.
enum NullValue {
NULL_VALUE = 0;
}
// A repeated sequence of Value messages. It is referenced from Value.list_value,
// so it takes part in a recursive cycle with Value.
message ListValue {
repeated Value values = 1;
}
// Pairs a string `event` with a Struct `payload`.
message Message {
string event = 1;
Struct payload = 2;
}