AvroConfluent bugfixes

This commit is contained in:
Andrew Onyshchuk 2020-02-01 18:53:11 -06:00
parent 16d4990088
commit 927e572d39
6 changed files with 133 additions and 43 deletions

View File

@ -189,7 +189,7 @@ struct Settings : public SettingsCollection<Settings>
M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \ M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \
M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \ M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \
M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \ M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \
M(SettingString, input_format_avro_schema_registry_url, "", "For AvroConfluent format: Confluent Schema Registry URL.", 0) \ M(SettingURI, format_avro_schema_registry_url, {}, "For AvroConfluent format: Confluent Schema Registry URL.", 0) \
\ \
M(SettingBool, output_format_json_quote_64bit_integers, true, "Controls quoting of 64-bit integers in JSON output format.", 0) \ M(SettingBool, output_format_json_quote_64bit_integers, true, "Controls quoting of 64-bit integers in JSON output format.", 0) \
\ \

View File

@ -396,6 +396,54 @@ void SettingEnum<EnumType, Tag>::set(const Field & x)
} }
String SettingURI::toString() const
{
return value.toString();
}
Field SettingURI::toField() const
{
return value.toString();
}
void SettingURI::set(const Poco::URI & x)
{
value = x;
changed = true;
}
void SettingURI::set(const Field & x)
{
const String & s = safeGet<const String &>(x);
set(s);
}
void SettingURI::set(const String & x)
{
try {
Poco::URI uri(x);
set(uri);
}
catch (const Poco::Exception& e)
{
throw Exception{Exception::CreateFromPoco, e};
}
}
void SettingURI::serialize(WriteBuffer & buf, SettingsBinaryFormat) const
{
writeStringBinary(toString(), buf);
}
void SettingURI::deserialize(ReadBuffer & buf, SettingsBinaryFormat)
{
String s;
readStringBinary(s, buf);
set(s);
}
#define IMPLEMENT_SETTING_ENUM(ENUM_NAME, LIST_OF_NAMES_MACRO, ERROR_CODE_FOR_UNEXPECTED_NAME) \ #define IMPLEMENT_SETTING_ENUM(ENUM_NAME, LIST_OF_NAMES_MACRO, ERROR_CODE_FOR_UNEXPECTED_NAME) \
IMPLEMENT_SETTING_ENUM_WITH_TAG(ENUM_NAME, void, LIST_OF_NAMES_MACRO, ERROR_CODE_FOR_UNEXPECTED_NAME) IMPLEMENT_SETTING_ENUM_WITH_TAG(ENUM_NAME, void, LIST_OF_NAMES_MACRO, ERROR_CODE_FOR_UNEXPECTED_NAME)

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include <Poco/Timespan.h> #include <Poco/Timespan.h>
#include <Poco/URI.h>
#include <DataStreams/SizeLimits.h> #include <DataStreams/SizeLimits.h>
#include <Formats/FormatSettings.h> #include <Formats/FormatSettings.h>
#include <common/StringRef.h> #include <common/StringRef.h>
@ -196,6 +197,26 @@ struct SettingEnum
void deserialize(ReadBuffer & buf, SettingsBinaryFormat format); void deserialize(ReadBuffer & buf, SettingsBinaryFormat format);
}; };
struct SettingURI
{
Poco::URI value;
bool changed = false;
SettingURI(const Poco::URI & x = Poco::URI{}) : value(x) {}
operator Poco::URI() const { return value; }
SettingURI & operator= (const Poco::URI & x) { set(x); return *this; }
String toString() const;
Field toField() const;
void set(const Poco::URI & x);
void set(const Field & x);
void set(const String & x);
void serialize(WriteBuffer & buf, SettingsBinaryFormat format) const;
void deserialize(ReadBuffer & buf, SettingsBinaryFormat format);
};
enum class LoadBalancing enum class LoadBalancing
{ {

View File

@ -14,7 +14,7 @@
#include <DataStreams/NativeBlockInputStream.h> #include <DataStreams/NativeBlockInputStream.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h> #include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Processors/Formats/Impl/MySQLOutputFormat.h> #include <Processors/Formats/Impl/MySQLOutputFormat.h>
#include <Poco/URI.h>
namespace DB namespace DB
{ {
@ -68,7 +68,15 @@ static FormatSettings getInputFormatSetting(const Settings & settings, const Con
format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter; format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter; format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter;
format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
format_settings.avro.schema_registry_url = settings.input_format_avro_schema_registry_url;
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
if (context.hasGlobalContext() && (context.getGlobalContext().getApplicationType() == Context::ApplicationType::SERVER))
{
const Poco::URI & avro_schema_registry_url = settings.format_avro_schema_registry_url;
if (!avro_schema_registry_url.empty())
context.getRemoteHostFilter().checkURL(avro_schema_registry_url);
}
format_settings.avro.schema_registry_url = settings.format_avro_schema_registry_url.toString();
return format_settings; return format_settings;
} }

View File

@ -480,7 +480,7 @@ AvroDeserializer::AvroDeserializer(const ColumnsWithTypeAndName & columns, avro:
} }
} }
void AvroDeserializer::deserializeRow(MutableColumns & columns, avro::Decoder & decoder) void AvroDeserializer::deserializeRow(MutableColumns & columns, avro::Decoder & decoder) const
{ {
for (size_t i = 0; i < field_mapping.size(); i++) for (size_t i = 0; i < field_mapping.size(); i++)
{ {
@ -519,51 +519,49 @@ bool AvroRowInputFormat::readRow(MutableColumns & columns, RowReadExtension &)
class AvroConfluentRowInputFormat::SchemaRegistry class AvroConfluentRowInputFormat::SchemaRegistry
{ {
public: public:
SchemaRegistry(const std::string & base_url_) SchemaRegistry(const std::string & base_url_, size_t schema_cache_max_size = 1000)
: base_url(base_url_), schema_cache(schema_cache_max_size)
{ {
if (base_url_.empty()) if (base_url.empty())
{
throw Exception("Empty Schema Registry URL", ErrorCodes::BAD_ARGUMENTS); throw Exception("Empty Schema Registry URL", ErrorCodes::BAD_ARGUMENTS);
}
try
{
base_url = base_url_;
}
catch (const Poco::SyntaxException & e)
{
throw Exception("Invalid Schema Registry URL: " + e.displayText(), ErrorCodes::BAD_ARGUMENTS);
}
} }
avro::ValidSchema getSchema(uint32_t id) const avro::ValidSchema getSchema(uint32_t id)
{
auto [schema, loaded] = schema_cache.getOrSet(
id,
[this, id](){ return std::make_shared<avro::ValidSchema>(fetchSchema(id)); }
);
return *schema;
}
private:
avro::ValidSchema fetchSchema(uint32_t id)
{ {
try try
{ {
try try
{ {
/// TODO Host checking to prevent SSRF
Poco::URI url(base_url, "/schemas/ids/" + std::to_string(id)); Poco::URI url(base_url, "/schemas/ids/" + std::to_string(id));
LOG_TRACE((&Logger::get("AvroConfluentRowInputFormat")), "Fetching schema id = " << id);
/// One second for connect/send/receive. Just in case. /// One second for connect/send/receive. Just in case.
ConnectionTimeouts timeouts({1, 0}, {1, 0}, {1, 0}); ConnectionTimeouts timeouts({1, 0}, {1, 0}, {1, 0});
Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery()); Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1);
request.setHost(url.getHost());
auto session = makePooledHTTPSession(url, timeouts, 1); auto session = makePooledHTTPSession(url, timeouts, 1);
session->sendRequest(request); session->sendRequest(request);
Poco::Net::HTTPResponse response; Poco::Net::HTTPResponse response;
auto & response_body = session->receiveResponse(response); auto response_body = receiveResponse(*session, request, response, false);
if (response.getStatus() != Poco::Net::HTTPResponse::HTTP_OK)
{
throw Exception("HTTP code " + std::to_string(response.getStatus()), ErrorCodes::INCORRECT_DATA);
}
Poco::JSON::Parser parser; Poco::JSON::Parser parser;
auto json_body = parser.parse(response_body).extract<Poco::JSON::Object::Ptr>(); auto json_body = parser.parse(*response_body).extract<Poco::JSON::Object::Ptr>();
auto schema = json_body->getValue<std::string>("schema"); auto schema = json_body->getValue<std::string>("schema");
LOG_TRACE((&Logger::get("AvroConfluentRowInputFormat")),
"Succesfully fetched schema id = " << id << "\n" << schema);
return avro::compileJsonSchemaFromString(schema); return avro::compileJsonSchemaFromString(schema);
} }
catch (const Exception &) catch (const Exception &)
@ -588,8 +586,27 @@ public:
private: private:
Poco::URI base_url; Poco::URI base_url;
LRUCache<uint32_t, avro::ValidSchema> schema_cache;
}; };
using ConfluentSchemaRegistry = AvroConfluentRowInputFormat::SchemaRegistry;
#define SCHEMA_REGISTRY_CACHE_MAX_SIZE 1000
/// Cache of Schema Registry URL -> SchemaRegistry
static LRUCache<std::string, ConfluentSchemaRegistry> schema_registry_cache(SCHEMA_REGISTRY_CACHE_MAX_SIZE);
static std::shared_ptr<ConfluentSchemaRegistry> getConfluentSchemaRegistry(const FormatSettings & format_settings)
{
const auto & base_url = format_settings.avro.schema_registry_url;
auto [schema_registry, loaded] = schema_registry_cache.getOrSet(
base_url,
[base_url]()
{
return std::make_shared<ConfluentSchemaRegistry>(base_url);
}
);
return schema_registry;
}
static uint32_t readConfluentSchemaId(ReadBuffer & in) static uint32_t readConfluentSchemaId(ReadBuffer & in)
{ {
uint8_t magic; uint8_t magic;
@ -611,7 +628,7 @@ AvroConfluentRowInputFormat::AvroConfluentRowInputFormat(
const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_) const Block & header_, ReadBuffer & in_, Params params_, const FormatSettings & format_settings_)
: IRowInputFormat(header_.cloneEmpty(), in_, params_) : IRowInputFormat(header_.cloneEmpty(), in_, params_)
, header_columns(header_.getColumnsWithTypeAndName()) , header_columns(header_.getColumnsWithTypeAndName())
, schema_registry(std::make_unique<SchemaRegistry>(format_settings_.avro.schema_registry_url)) , schema_registry(getConfluentSchemaRegistry(format_settings_))
, input_stream(std::make_unique<InputStreamReadBufferAdapter>(in)) , input_stream(std::make_unique<InputStreamReadBufferAdapter>(in))
, decoder(avro::binaryDecoder()) , decoder(avro::binaryDecoder())
@ -632,7 +649,7 @@ bool AvroConfluentRowInputFormat::readRow(MutableColumns & columns, RowReadExten
return true; return true;
} }
AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId schema_id) const AvroDeserializer & AvroConfluentRowInputFormat::getOrCreateDeserializer(SchemaId schema_id)
{ {
auto it = deserializer_cache.find(schema_id); auto it = deserializer_cache.find(schema_id);
if (it == deserializer_cache.end()) if (it == deserializer_cache.end())
@ -657,12 +674,6 @@ void registerInputFormatProcessorAvro(FormatFactory & factory)
}); });
#if USE_POCO_JSON #if USE_POCO_JSON
/// AvroConfluent format is disabled for the following reasons:
/// 1. There is no test for it.
/// 2. RemoteHostFilter is not used to prevent CSRF attacks.
#if 0
factory.registerInputFormatProcessor("AvroConfluent",[]( factory.registerInputFormatProcessor("AvroConfluent",[](
ReadBuffer & buf, ReadBuffer & buf,
const Block & sample, const Block & sample,
@ -673,8 +684,6 @@ void registerInputFormatProcessorAvro(FormatFactory & factory)
}); });
#endif #endif
#endif
} }
} }

View File

@ -23,7 +23,7 @@ class AvroDeserializer
{ {
public: public:
AvroDeserializer(const ColumnsWithTypeAndName & columns, avro::ValidSchema schema); AvroDeserializer(const ColumnsWithTypeAndName & columns, avro::ValidSchema schema);
void deserializeRow(MutableColumns & columns, avro::Decoder & decoder); void deserializeRow(MutableColumns & columns, avro::Decoder & decoder) const;
private: private:
using DeserializeFn = std::function<void(IColumn & column, avro::Decoder & decoder)>; using DeserializeFn = std::function<void(IColumn & column, avro::Decoder & decoder)>;
@ -58,6 +58,12 @@ private:
}; };
#if USE_POCO_JSON #if USE_POCO_JSON
/// Confluent framing + Avro binary datum encoding. Mainly used for Kafka.
/// Uses 3 caches:
/// 1. global: schema registry cache (base_url -> SchemaRegistry)
/// 2. SchemaRegistry: schema cache (schema_id -> schema)
/// 3. AvroConfluentRowInputFormat: deserializer cache (schema_id -> AvroDeserializer)
/// This is needed because KafkaStorage creates a new instance of InputFormat per a batch of messages
class AvroConfluentRowInputFormat : public IRowInputFormat class AvroConfluentRowInputFormat : public IRowInputFormat
{ {
public: public:
@ -65,15 +71,13 @@ public:
virtual bool readRow(MutableColumns & columns, RowReadExtension & ext) override; virtual bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
String getName() const override { return "AvroConfluentRowInputFormat"; } String getName() const override { return "AvroConfluentRowInputFormat"; }
class SchemaRegistry;
private: private:
const ColumnsWithTypeAndName header_columns; const ColumnsWithTypeAndName header_columns;
std::shared_ptr<SchemaRegistry> schema_registry;
class SchemaRegistry;
std::unique_ptr<SchemaRegistry> schema_registry;
using SchemaId = uint32_t; using SchemaId = uint32_t;
std::unordered_map<SchemaId, AvroDeserializer> deserializer_cache; std::unordered_map<SchemaId, AvroDeserializer> deserializer_cache;
AvroDeserializer & getOrCreateDeserializer(SchemaId schema_id); const AvroDeserializer & getOrCreateDeserializer(SchemaId schema_id);
avro::InputStreamPtr input_stream; avro::InputStreamPtr input_stream;
avro::DecoderPtr decoder; avro::DecoderPtr decoder;