ClickHouse/dbms/src/Formats/ProtobufColumnMatcher.h

161 lines
5.3 KiB
C++
Raw Normal View History

#pragma once
#include <Common/config.h>
#if USE_PROTOBUF
#include <memory>
#include <unordered_map>
#include <vector>
#include <Core/Types.h>
#include <boost/blank.hpp>
#include <google/protobuf/descriptor.h>
#include <google/protobuf/descriptor.pb.h>
namespace google
{
namespace protobuf
{
class Descriptor;
class FieldDescriptor;
}
}
namespace DB
{
namespace ProtobufColumnMatcher
{
struct DefaultTraits
{
using MessageData = boost::blank;
using FieldData = boost::blank;
};
template <typename Traits = DefaultTraits>
struct Message;
/// Represents a field in a protobuf message.
template <typename Traits = DefaultTraits>
struct Field
{
const google::protobuf::FieldDescriptor * field_descriptor = nullptr;
/// Same as field_descriptor->number().
UInt32 field_number = 0;
/// Index of a column; either 'column_index' or 'nested_message' is set.
size_t column_index = -1;
std::unique_ptr<Message<Traits>> nested_message;
typename Traits::FieldData data;
};
/// Represents a protobuf message.
template <typename Traits>
struct Message
{
std::vector<Field<Traits>> fields;
/// Points to the parent message if this is a nested message.
Message * parent = nullptr;
size_t index_in_parent = -1;
typename Traits::MessageData data;
};
/// Utility function finding matching columns for each protobuf field.
template <typename Traits = DefaultTraits>
static std::unique_ptr<Message<Traits>> matchColumns(
const std::vector<String> & column_names,
const google::protobuf::Descriptor * message_type);
namespace details
{
void throwNoCommonColumns();
class ColumnNameMatcher
{
public:
ColumnNameMatcher(const std::vector<String> & column_names);
size_t findColumn(const String & field_name);
private:
std::unordered_map<String, size_t> column_name_to_index_map;
std::vector<bool> column_usage;
};
template <typename Traits>
std::unique_ptr<Message<Traits>> matchColumnsRecursive(
ColumnNameMatcher & name_matcher,
const google::protobuf::Descriptor * message_type,
const String & field_name_prefix)
{
auto message = std::make_unique<Message<Traits>>();
for (int i = 0; i != message_type->field_count(); ++i)
{
const google::protobuf::FieldDescriptor * field_descriptor = message_type->field(i);
if ((field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_MESSAGE)
|| (field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP))
{
auto nested_message = matchColumnsRecursive<Traits>(
name_matcher, field_descriptor->message_type(), field_name_prefix + field_descriptor->name() + ".");
if (nested_message)
{
message->fields.emplace_back();
auto & current_field = message->fields.back();
current_field.field_number = field_descriptor->number();
current_field.field_descriptor = field_descriptor;
current_field.nested_message = std::move(nested_message);
current_field.nested_message->parent = message.get();
}
}
else
{
size_t column_index = name_matcher.findColumn(field_name_prefix + field_descriptor->name());
if (column_index != static_cast<size_t>(-1))
{
message->fields.emplace_back();
auto & current_field = message->fields.back();
current_field.field_number = field_descriptor->number();
current_field.field_descriptor = field_descriptor;
current_field.column_index = column_index;
}
}
}
if (message->fields.empty())
return nullptr;
// Columns should be sorted by field_number, it's necessary for writing protobufs and useful reading protobufs.
std::sort(message->fields.begin(), message->fields.end(), [](const Field<Traits> & left, const Field<Traits> & right)
{
return left.field_number < right.field_number;
});
for (size_t i = 0; i != message->fields.size(); ++i)
{
auto & field = message->fields[i];
if (field.nested_message)
field.nested_message->index_in_parent = i;
}
return message;
}
}
template <typename Data>
static std::unique_ptr<Message<Data>> matchColumns(
const std::vector<String> & column_names,
const google::protobuf::Descriptor * message_type)
{
details::ColumnNameMatcher name_matcher(column_names);
auto message = details::matchColumnsRecursive<Data>(name_matcher, message_type, "");
if (!message)
details::throwNoCommonColumns();
return message;
}
}
}
#endif