ClickHouse/src/Formats/FormatFactory.cpp

374 lines
17 KiB
C++
Raw Normal View History

#include <Formats/FormatFactory.h>
2019-10-28 23:43:22 +00:00
#include <algorithm>
#include <Common/Exception.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#include <DataStreams/MaterializingBlockOutputStream.h>
2020-05-18 10:00:22 +00:00
#include <DataStreams/SquashingBlockOutputStream.h>
#include <DataStreams/NativeBlockInputStream.h>
#include <Formats/FormatSettings.h>
2019-02-19 18:41:18 +00:00
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/IRowOutputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <Processors/Formats/OutputStreamToOutputFormat.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Processors/Formats/Impl/MySQLOutputFormat.h>
2020-05-18 10:00:22 +00:00
#include <Processors/Formats/Impl/NativeFormat.cpp>
2020-06-11 00:36:57 +00:00
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
2020-06-11 23:00:49 +00:00
#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
2020-02-02 00:53:11 +00:00
#include <Poco/URI.h>
2020-10-07 18:16:58 +00:00
#include <IO/ReadHelpers.h>
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
namespace DB
{
/// Error codes used by the exceptions thrown in this file.
/// They are only declared here (`extern`); their definitions are not in this file.
namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT;
extern const int LOGICAL_ERROR;
extern const int FORMAT_IS_NOT_SUITABLE_FOR_INPUT;
extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
}
/// Returns the creators stored in `dict` under `name`.
/// Throws UNKNOWN_FORMAT when `dict` has no entry for that name.
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
{
    const auto found = dict.find(name);
    if (found == dict.end())
        throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);

    return found->second;
}
/// Convenience overload: builds FormatSettings from the settings held by `context`
/// by delegating to the two-argument overload.
FormatSettings getFormatSettings(const Context & context)
{
    return getFormatSettings(context, context.getSettingsRef());
}
/** Fills a FormatSettings structure from `settings`.
  *
  * The function is a template so that it can read the same setting names from different
  * settings types (it is explicitly instantiated for `Settings` and `FormatFactorySettings`).
  *
  * `context` is used for things that are not plain settings: the format schema path,
  * detection of the server application type and the remote host filter.
  *
  * When running as a server and `format_avro_schema_registry_url` is not empty,
  * the URL is passed to `RemoteHostFilter::checkURL` before the settings are returned.
  */
template <typename Settings>
FormatSettings getFormatSettings(const Context & context,
    const Settings & settings)
{
    FormatSettings format_settings;

    format_settings.avro.allow_missing_fields = settings.input_format_avro_allow_missing_fields;
    format_settings.avro.output_codec = settings.output_format_avro_codec;
    format_settings.avro.output_sync_interval = settings.output_format_avro_sync_interval;
    format_settings.avro.schema_registry_url = settings.format_avro_schema_registry_url.toString();
    format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
    format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
    format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
    format_settings.csv.delimiter = settings.format_csv_delimiter;
    /// Note: CSV takes this flag from the generic `input_format_defaults_for_omitted_fields` setting.
    format_settings.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;
    format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
    format_settings.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
    format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
    format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
    format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;
    format_settings.custom.result_before_delimiter = settings.format_custom_result_before_delimiter;
    format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter;
    format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
    format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
    format_settings.date_time_input_format = settings.date_time_input_format;
    format_settings.date_time_output_format = settings.date_time_output_format;
    format_settings.enable_streaming = settings.output_format_enable_streaming;
    format_settings.import_nested_json = settings.input_format_import_nested_json;
    format_settings.input_allow_errors_num = settings.input_format_allow_errors_num;
    format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
    format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
    format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
    format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
    format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
    format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
    format_settings.null_as_default = settings.input_format_null_as_default;
    format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
    /// Anything other than the exact string "ASCII" selects the UTF-8 grid charset.
    format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
    format_settings.pretty.color = settings.output_format_pretty_color;
    format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
    format_settings.pretty.max_rows = settings.output_format_pretty_max_rows;
    format_settings.pretty.max_value_width = settings.output_format_pretty_max_value_width;
    format_settings.pretty.output_format_pretty_row_numbers = settings.output_format_pretty_row_numbers;
    format_settings.regexp.escaping_rule = settings.format_regexp_escaping_rule;
    format_settings.regexp.regexp = settings.format_regexp;
    format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched;
    format_settings.schema.format_schema = settings.format_schema;
    format_settings.schema.format_schema_path = context.getFormatSchemaPath();
    format_settings.schema.is_server = context.hasGlobalContext() && (context.getGlobalContext().getApplicationType() == Context::ApplicationType::SERVER);
    format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields;
    format_settings.template_settings.resultset_format = settings.format_template_resultset;
    format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;
    format_settings.template_settings.row_format = settings.format_template_row;
    format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line;
    format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
    format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
    format_settings.tsv.null_representation = settings.output_format_tsv_null_representation;
    format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
    format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
    format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
    format_settings.with_names_use_header = settings.input_format_with_names_use_header;
    format_settings.write_statistics = settings.output_format_write_statistics;

    /// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
    if (format_settings.schema.is_server)
    {
        const Poco::URI & avro_schema_registry_url = settings.format_avro_schema_registry_url;
        if (!avro_schema_registry_url.empty())
            context.getRemoteHostFilter().checkURL(avro_schema_registry_url);
    }

    return format_settings;
}
/// Explicit instantiations of the two-argument getFormatSettings template for the two
/// settings types it is used with: FormatFactorySettings and Settings.
/// The template body is defined in this file, so these make both specializations
/// available to code that only sees a declaration.
template
FormatSettings getFormatSettings<FormatFactorySettings>(const Context & context,
const FormatFactorySettings & settings);
template
FormatSettings getFormatSettings<Settings>(const Context & context,
const Settings & settings);
2019-02-19 18:41:18 +00:00
2020-05-18 10:00:22 +00:00
/** Creates an input format for `name` reading from `buf`.
  *
  * - "Native" is special-cased and wrapped into NativeInputFormatFromNativeBlockInputStream.
  * - If `_format_settings` is set it is used as is, otherwise settings are taken from `context`.
  * - Throws FORMAT_IS_NOT_SUITABLE_FOR_INPUT if no input processor creator is registered for `name`
  *   (getCreators throws UNKNOWN_FORMAT for a name that is not registered at all).
  * - When parallel parsing is enabled and applicable, returns a ParallelParsingInputFormat,
  *   otherwise falls back to getInputFormat() with the resolved format settings.
  */
InputFormatPtr FormatFactory::getInput(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    const Context & context,
    UInt64 max_block_size,
    const std::optional<FormatSettings> & _format_settings) const
{
    if (name == "Native")
        return std::make_shared<NativeInputFormatFromNativeBlockInputStream>(sample, buf);

    auto format_settings = _format_settings
        ? *_format_settings : getFormatSettings(context);

    if (!getCreators(name).input_processor_creator)
    {
        throw Exception("Format " + name + " is not suitable for input (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
    }

    const Settings & settings = context.getSettingsRef();
    const auto & file_segmentation_engine = getCreators(name).file_segmentation_engine;

    // Doesn't make sense to use parallel parsing with less than four threads
    // (segmentator + two parsers + reader).
    bool parallel_parsing = settings.input_format_parallel_parsing && file_segmentation_engine && settings.max_threads >= 4;

    /// Do not use parallel parsing if the chunks alone could exceed the memory limit.
    /// max_memory_usage == 0 means "no limit", so the check is skipped in that case
    /// (otherwise parallel parsing would always be switched off for unlimited queries).
    if (settings.max_memory_usage
        && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage)
        parallel_parsing = false;

    if (parallel_parsing && name == "JSONEachRow")
    {
        /// FIXME ParallelParsingBlockInputStream doesn't support formats with non-trivial readPrefix() and readSuffix()

        /// For JSONEachRow we can safely skip whitespace characters
        skipWhitespaceIfAny(buf);
        if (buf.eof() || *buf.position() == '[')
            parallel_parsing = false; /// Disable it for JSONEachRow if data is in square brackets (see JSONEachRowRowInputFormat)
    }

    if (parallel_parsing)
    {
        const auto & input_getter = getCreators(name).input_processor_creator;

        RowInputFormatParams row_input_format_params;
        row_input_format_params.max_block_size = max_block_size;
        row_input_format_params.allow_errors_num = format_settings.input_allow_errors_num;
        row_input_format_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
        row_input_format_params.max_execution_time = settings.max_execution_time;
        row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;

        /// Const reference is copied to lambda.
        auto parser_creator = [input_getter, sample, row_input_format_params, format_settings]
            (ReadBuffer & input) -> InputFormatPtr
            { return input_getter(input, sample, row_input_format_params, format_settings); };

        ParallelParsingInputFormat::Params params{
            buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing};
        return std::make_shared<ParallelParsingInputFormat>(params);
    }

    /// Forward the already resolved settings so that a caller-supplied `_format_settings` is honoured.
    return getInputFormat(name, buf, sample, context, max_block_size, format_settings);
}
/** Creates a block output stream that writes data in format `name` to `buf`.
  *
  * - If `_format_settings` is set it is used as is, otherwise settings are taken from `context`.
  * - Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output processor creator is registered for `name`
  *   (getCreators throws UNKNOWN_FORMAT for a name that is not registered at all).
  * - `callback` is handed to the created format through RowOutputFormatParams.
  * - When parallel formatting is enabled, supported by the format and live view is not enabled,
  *   a ParallelFormattingOutputFormat is used; otherwise the format comes from getOutputFormat().
  *
  * In both cases the format is wrapped into OutputStreamToOutputFormat and MaterializingBlockOutputStream.
  */
BlockOutputStreamPtr FormatFactory::getOutput(const String & name,
    WriteBuffer & buf, const Block & sample, const Context & context,
    WriteCallback callback, const std::optional<FormatSettings> & _format_settings) const
{
    auto format_settings = _format_settings
        ? *_format_settings : getFormatSettings(context);

    if (!getCreators(name).output_processor_creator)
    {
        throw Exception("Format " + name + " is not suitable for output (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
    }

    const Settings & settings = context.getSettingsRef();
    bool parallel_formatting = settings.output_format_parallel_formatting;

    if (parallel_formatting && getCreators(name).supports_parallel_formatting && !settings.allow_experimental_live_view)
    {
        const auto & output_getter = getCreators(name).output_processor_creator;

        /// Same way of passing the callback as in getOutputFormat().
        /// It is safe to move from `callback` here: this branch always returns.
        RowOutputFormatParams row_output_format_params;
        row_output_format_params.callback = std::move(callback);

        /// Everything is captured by value: the creator is invoked later, possibly several times.
        /// `format_settings` is the one resolved above, so a caller-supplied `_format_settings` is honoured.
        auto formatter_creator = [output_getter, sample, row_output_format_params, format_settings]
            (WriteBuffer & output) -> OutputFormatPtr
            { return output_getter(output, sample, row_output_format_params, format_settings); };

        ParallelFormattingOutputFormat::Params params{buf, sample, formatter_creator, settings.max_threads};
        auto format = std::make_shared<ParallelFormattingOutputFormat>(params);

        /// Enable auto-flush for streaming mode. Currently it is needed by INSERT WATCH query.
        if (format_settings.enable_streaming)
            format->setAutoFlush();

        /** TODO: Materialization is needed, because formats can use the functions `IDataType`,
          *  which only work with full columns.
          */
        return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
    }

    /// Forward the already resolved settings so that a caller-supplied `_format_settings` is honoured.
    auto format = getOutputFormat(name, buf, sample, context, std::move(callback), format_settings);
    return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
}
/** Creates a (non-parallel) input format for `name` reading from `buf`.
  *
  * Throws FORMAT_IS_NOT_SUITABLE_FOR_INPUT if no input processor creator is registered for `name`.
  * Format settings come from `_format_settings` when it is set, otherwise from `context`.
  * Row-level limits (block size, allowed errors, execution time) are passed via RowInputFormatParams.
  */
InputFormatPtr FormatFactory::getInputFormat(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    const Context & context,
    UInt64 max_block_size,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & creator = getCreators(name).input_processor_creator;
    if (!creator)
        throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);

    const Settings & query_settings = context.getSettingsRef();
    auto resolved_settings = _format_settings.has_value()
        ? _format_settings.value()
        : getFormatSettings(context);

    RowInputFormatParams row_params;
    row_params.max_block_size = max_block_size;
    row_params.allow_errors_num = resolved_settings.input_allow_errors_num;
    row_params.allow_errors_ratio = resolved_settings.input_allow_errors_ratio;
    row_params.max_execution_time = query_settings.max_execution_time;
    row_params.timeout_overflow_mode = query_settings.timeout_overflow_mode;

    auto result = creator(buf, sample, row_params, resolved_settings);

    /// It's a kludge. Because I cannot remove context from values format.
    auto * values_format = typeid_cast<ValuesBlockInputFormat *>(result.get());
    if (values_format)
        values_format->setContext(context);

    return result;
}
/** Creates a (non-parallel) output format for `name` writing to `buf`.
  *
  * Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output processor creator is registered for `name`.
  * `callback` is passed to the format through RowOutputFormatParams.
  * Format settings come from `_format_settings` when it is set, otherwise from `context`.
  */
OutputFormatPtr FormatFactory::getOutputFormat(
    const String & name, WriteBuffer & buf, const Block & sample,
    const Context & context, WriteCallback callback,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & creator = getCreators(name).output_processor_creator;
    if (!creator)
        throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);

    RowOutputFormatParams row_params;
    row_params.callback = std::move(callback);

    auto resolved_settings = _format_settings.has_value()
        ? _format_settings.value()
        : getFormatSettings(context);

    /** TODO: Materialization is needed, because formats can use the functions `IDataType`,
      *  which only work with full columns.
      */
    auto result = creator(buf, sample, row_params, resolved_settings);

    /// Enable auto-flush for streaming mode. Currently it is needed by INSERT WATCH query.
    if (resolved_settings.enable_streaming)
        result->setAutoFlush();

    /// It's a kludge. Because I cannot remove context from MySQL format.
    auto * mysql_format = typeid_cast<MySQLOutputFormat *>(result.get());
    if (mysql_format)
        mysql_format->setContext(context);

    return result;
}
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
{
2019-10-01 10:48:46 +00:00
auto & target = dict[name].input_creator;
if (target)
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
2019-02-19 18:41:18 +00:00
target = std::move(input_creator);
}
void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].output_creator;
if (target)
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
2019-02-19 18:41:18 +00:00
target = std::move(output_creator);
}
void FormatFactory::registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].input_processor_creator;
2019-02-19 18:41:18 +00:00
if (target)
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
target = std::move(input_creator);
}
void FormatFactory::registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].output_processor_creator;
2019-02-19 18:41:18 +00:00
if (target)
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
target = std::move(output_creator);
}
2019-10-01 10:48:46 +00:00
/// Stores `file_segmentation_engine` as the segmentation engine of format `name`
/// (getInput() requires one to be present before it uses parallel parsing).
/// Throws LOGICAL_ERROR if a segmentation engine is already set for that name.
void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
{
    /// operator[] creates the entry for `name` if it does not exist yet.
    auto & creators = dict[name];
    if (creators.file_segmentation_engine)
        throw Exception("FormatFactory: File segmentation engine " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);

    creators.file_segmentation_engine = std::move(file_segmentation_engine);
}
2020-10-06 14:02:01 +00:00
/// Sets the `supports_parallel_formatting` flag of format `name`
/// (getOutput() checks it before using parallel formatting).
/// Throws LOGICAL_ERROR if the flag is already set.
void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
{
    /// operator[] creates the entry for `name` if it does not exist yet.
    auto & creators = dict[name];
    if (creators.supports_parallel_formatting)
        throw Exception("FormatFactory: Output format " + name + " is already marked as supporting parallel formatting.", ErrorCodes::LOGICAL_ERROR);

    creators.supports_parallel_formatting = true;
}
/// Returns the shared FormatFactory object: a function-local static that is
/// constructed the first time this function is called.
FormatFactory & FormatFactory::instance()
{
    static FormatFactory factory;
    return factory;
}
}