2020-04-16 12:31:57 +00:00
|
|
|
#include <Formats/FormatFactory.h>
|
|
|
|
|
2019-10-28 23:43:22 +00:00
|
|
|
#include <algorithm>
|
2019-03-22 12:08:30 +00:00
|
|
|
#include <Core/Settings.h>
|
2018-06-10 19:22:49 +00:00
|
|
|
#include <Formats/FormatSettings.h>
|
2021-12-22 22:14:23 +00:00
|
|
|
#include <Interpreters/Context.h>
|
|
|
|
#include <Interpreters/ProcessList.h>
|
2019-02-19 18:41:18 +00:00
|
|
|
#include <Processors/Formats/IRowInputFormat.h>
|
2020-10-06 12:47:52 +00:00
|
|
|
#include <Processors/Formats/IRowOutputFormat.h>
|
2019-12-25 19:17:41 +00:00
|
|
|
#include <Processors/Formats/Impl/MySQLOutputFormat.h>
|
2020-06-11 23:00:49 +00:00
|
|
|
#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
|
2021-12-22 22:14:23 +00:00
|
|
|
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
|
|
|
|
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
|
2020-02-02 00:53:11 +00:00
|
|
|
#include <Poco/URI.h>
|
2021-12-22 22:14:23 +00:00
|
|
|
#include <Common/Exception.h>
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2022-01-11 13:26:14 +00:00
|
|
|
#include <boost/algorithm/string/case_conv.hpp>
|
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes used by the functions in this file. They are only declared here.
namespace ErrorCodes
{
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_INPUT;
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
    extern const int LOGICAL_ERROR;
    extern const int UNKNOWN_FORMAT;
}
|
|
|
|
|
|
|
|
/// Returns the creators registered under `name`.
/// Throws UNKNOWN_FORMAT when `dict` has no entry for that name.
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
{
    const auto found = dict.find(name);
    if (found == dict.end())
        throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);

    return found->second;
}
|
|
|
|
|
2021-06-01 12:20:52 +00:00
|
|
|
/// Builds FormatSettings from the settings stored in `context`.
FormatSettings getFormatSettings(ContextPtr context)
{
    return getFormatSettings(context, context->getSettingsRef());
}
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2020-11-07 08:53:39 +00:00
|
|
|
template <typename Settings>
|
2021-06-01 12:20:52 +00:00
|
|
|
FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
|
2019-02-19 18:41:18 +00:00
|
|
|
{
|
2018-06-10 19:22:49 +00:00
|
|
|
FormatSettings format_settings;
|
2020-11-02 07:50:38 +00:00
|
|
|
|
|
|
|
format_settings.avro.allow_missing_fields = settings.input_format_avro_allow_missing_fields;
|
|
|
|
format_settings.avro.output_codec = settings.output_format_avro_codec;
|
|
|
|
format_settings.avro.output_sync_interval = settings.output_format_avro_sync_interval;
|
|
|
|
format_settings.avro.schema_registry_url = settings.format_avro_schema_registry_url.toString();
|
2021-07-09 16:18:22 +00:00
|
|
|
format_settings.avro.string_column_pattern = settings.output_format_avro_string_column_pattern.toString();
|
2021-10-13 08:19:37 +00:00
|
|
|
format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file;
|
2018-07-04 21:00:50 +00:00
|
|
|
format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
|
|
|
|
format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line;
|
|
|
|
format_settings.csv.delimiter = settings.format_csv_delimiter;
|
2021-12-20 16:25:54 +00:00
|
|
|
format_settings.csv.tuple_delimiter = settings.format_csv_delimiter;
|
2021-10-14 10:32:49 +00:00
|
|
|
format_settings.csv.empty_as_default = settings.input_format_csv_empty_as_default;
|
2020-10-06 15:37:54 +00:00
|
|
|
format_settings.csv.input_format_enum_as_number = settings.input_format_csv_enum_as_number;
|
2021-10-21 13:52:27 +00:00
|
|
|
format_settings.csv.null_representation = settings.format_csv_null_representation;
|
2020-12-20 10:26:08 +00:00
|
|
|
format_settings.csv.input_format_arrays_as_nested_csv = settings.input_format_csv_arrays_as_nested_csv;
|
2021-12-02 08:14:25 +00:00
|
|
|
format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter;
|
|
|
|
format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter;
|
|
|
|
format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter;
|
2019-12-25 19:17:41 +00:00
|
|
|
format_settings.custom.escaping_rule = settings.format_custom_escaping_rule;
|
|
|
|
format_settings.custom.field_delimiter = settings.format_custom_field_delimiter;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.custom.result_after_delimiter = settings.format_custom_result_after_delimiter;
|
|
|
|
format_settings.custom.result_before_delimiter = settings.format_custom_result_before_delimiter;
|
2019-12-25 19:17:41 +00:00
|
|
|
format_settings.custom.row_after_delimiter = settings.format_custom_row_after_delimiter;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.custom.row_before_delimiter = settings.format_custom_row_before_delimiter;
|
2019-12-25 19:17:41 +00:00
|
|
|
format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter;
|
2018-06-10 19:22:49 +00:00
|
|
|
format_settings.date_time_input_format = settings.date_time_input_format;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.date_time_output_format = settings.date_time_output_format;
|
2021-11-19 05:22:44 +00:00
|
|
|
format_settings.bool_true_representation = settings.bool_true_representation;
|
|
|
|
format_settings.bool_false_representation = settings.bool_false_representation;
|
2020-04-26 13:44:11 +00:00
|
|
|
format_settings.enable_streaming = settings.output_format_enable_streaming;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.import_nested_json = settings.input_format_import_nested_json;
|
2018-06-10 19:22:49 +00:00
|
|
|
format_settings.input_allow_errors_num = settings.input_format_allow_errors_num;
|
|
|
|
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
|
2020-11-17 19:50:47 +00:00
|
|
|
format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
|
2020-11-18 10:38:30 +00:00
|
|
|
format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
|
2018-06-10 19:22:49 +00:00
|
|
|
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
|
|
|
|
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.null_as_default = settings.input_format_null_as_default;
|
2021-08-16 08:03:23 +00:00
|
|
|
format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size;
|
2021-07-01 17:59:28 +00:00
|
|
|
format_settings.parquet.import_nested = settings.input_format_parquet_import_nested;
|
2021-12-02 08:14:25 +00:00
|
|
|
format_settings.parquet.allow_missing_columns = settings.input_format_parquet_allow_missing_columns;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
|
|
|
|
format_settings.pretty.color = settings.output_format_pretty_color;
|
2018-08-30 23:34:12 +00:00
|
|
|
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.pretty.max_rows = settings.output_format_pretty_max_rows;
|
2020-05-31 22:12:13 +00:00
|
|
|
format_settings.pretty.max_value_width = settings.output_format_pretty_max_value_width;
|
2020-09-29 12:30:36 +00:00
|
|
|
format_settings.pretty.output_format_pretty_row_numbers = settings.output_format_pretty_row_numbers;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.regexp.escaping_rule = settings.format_regexp_escaping_rule;
|
|
|
|
format_settings.regexp.regexp = settings.format_regexp;
|
|
|
|
format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched;
|
|
|
|
format_settings.schema.format_schema = settings.format_schema;
|
2021-04-10 23:33:54 +00:00
|
|
|
format_settings.schema.format_schema_path = context->getFormatSchemaPath();
|
|
|
|
format_settings.schema.is_server = context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER);
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.skip_unknown_fields = settings.input_format_skip_unknown_fields;
|
2019-09-24 14:25:22 +00:00
|
|
|
format_settings.template_settings.resultset_format = settings.format_template_resultset;
|
|
|
|
format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.template_settings.row_format = settings.format_template_row;
|
2020-02-03 07:40:12 +00:00
|
|
|
format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line;
|
2019-10-16 14:22:22 +00:00
|
|
|
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;
|
2020-10-06 15:37:54 +00:00
|
|
|
format_settings.tsv.input_format_enum_as_number = settings.input_format_tsv_enum_as_number;
|
2021-10-21 13:52:27 +00:00
|
|
|
format_settings.tsv.null_representation = settings.format_tsv_null_representation;
|
2020-11-02 07:50:38 +00:00
|
|
|
format_settings.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;
|
|
|
|
format_settings.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
|
|
|
|
format_settings.values.interpret_expressions = settings.input_format_values_interpret_expressions;
|
|
|
|
format_settings.with_names_use_header = settings.input_format_with_names_use_header;
|
2021-10-14 10:32:49 +00:00
|
|
|
format_settings.with_types_use_header = settings.input_format_with_types_use_header;
|
2018-06-10 19:22:49 +00:00
|
|
|
format_settings.write_statistics = settings.output_format_write_statistics;
|
2021-05-25 12:01:28 +00:00
|
|
|
format_settings.arrow.low_cardinality_as_dictionary = settings.output_format_arrow_low_cardinality_as_dictionary;
|
2021-07-01 17:59:28 +00:00
|
|
|
format_settings.arrow.import_nested = settings.input_format_arrow_import_nested;
|
2021-12-02 08:14:25 +00:00
|
|
|
format_settings.arrow.allow_missing_columns = settings.input_format_arrow_allow_missing_columns;
|
2021-07-01 17:59:28 +00:00
|
|
|
format_settings.orc.import_nested = settings.input_format_orc_import_nested;
|
2021-12-02 08:14:25 +00:00
|
|
|
format_settings.orc.allow_missing_columns = settings.input_format_orc_allow_missing_columns;
|
2021-12-18 09:25:25 +00:00
|
|
|
format_settings.orc.row_batch_size = settings.input_format_orc_row_batch_size;
|
2021-10-14 10:32:49 +00:00
|
|
|
format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields;
|
2021-09-28 12:59:22 +00:00
|
|
|
format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode;
|
2021-10-31 19:53:24 +00:00
|
|
|
format_settings.seekable_read = settings.input_format_allow_seeks;
|
2021-12-15 11:30:57 +00:00
|
|
|
format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns;
|
|
|
|
format_settings.max_rows_to_read_for_schema_inference = settings.input_format_max_rows_to_read_for_schema_inference;
|
2020-02-02 00:53:11 +00:00
|
|
|
|
|
|
|
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
|
2020-11-02 07:50:38 +00:00
|
|
|
if (format_settings.schema.is_server)
|
2020-02-02 00:53:11 +00:00
|
|
|
{
|
|
|
|
const Poco::URI & avro_schema_registry_url = settings.format_avro_schema_registry_url;
|
|
|
|
if (!avro_schema_registry_url.empty())
|
2021-04-10 23:33:54 +00:00
|
|
|
context->getRemoteHostFilter().checkURL(avro_schema_registry_url);
|
2020-02-02 00:53:11 +00:00
|
|
|
}
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2019-02-19 18:41:18 +00:00
|
|
|
return format_settings;
|
2018-06-10 19:22:49 +00:00
|
|
|
}
|
|
|
|
|
2021-06-01 12:20:52 +00:00
|
|
|
template FormatSettings getFormatSettings<FormatFactorySettings>(ContextPtr context, const FormatFactorySettings & settings);
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2021-06-01 12:20:52 +00:00
|
|
|
template FormatSettings getFormatSettings<Settings>(ContextPtr context, const Settings & settings);
|
2019-02-19 18:41:18 +00:00
|
|
|
|
|
|
|
|
2020-05-18 10:00:22 +00:00
|
|
|
/// Creates an input format for `name`, wrapping it into ParallelParsingInputFormat when
/// parallel parsing is enabled and applicable; otherwise falls back to getInputFormat().
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_INPUT if the format has no input creator.
InputFormatPtr FormatFactory::getInput(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    ContextPtr context,
    UInt64 max_block_size,
    const std::optional<FormatSettings> & _format_settings) const
{
    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);

    const auto & creators = getCreators(name);
    if (!creators.input_creator)
        throw Exception("Format " + name + " is not suitable for input (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);

    const Settings & settings = context->getSettingsRef();
    const auto & file_segmentation_engine = creators.file_segmentation_engine;

    // Doesn't make sense to use parallel parsing with less than four threads
    // (segmentator + two parsers + reader).
    bool parallel_parsing = settings.input_format_parallel_parsing && file_segmentation_engine && settings.max_threads >= 4;

    /// A non-zero memory limit (per query or per user) smaller than
    /// min_chunk_bytes_for_parallel_parsing * max_threads * 2 turns parallel parsing off.
    if ((settings.max_memory_usage && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage)
        || (settings.max_memory_usage_for_user && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage_for_user))
    {
        parallel_parsing = false;
    }

    if (parallel_parsing)
    {
        /// Disable parallel parsing for input formats with non-trivial readPrefix() and readSuffix().
        const auto & prefix_and_suffix_checker = creators.non_trivial_prefix_and_suffix_checker;
        if (prefix_and_suffix_checker && prefix_and_suffix_checker(buf))
            parallel_parsing = false;
    }

    if (!parallel_parsing)
        return getInputFormat(name, buf, sample, context, max_block_size, format_settings);

    RowInputFormatParams row_input_format_params;
    row_input_format_params.max_block_size = max_block_size;
    row_input_format_params.allow_errors_num = format_settings.input_allow_errors_num;
    row_input_format_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
    row_input_format_params.max_execution_time = settings.max_execution_time;
    row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;

    /// Const reference is copied to lambda.
    const auto & input_getter = creators.input_creator;
    auto parser_creator = [input_getter, sample, row_input_format_params, format_settings]
        (ReadBuffer & input) -> InputFormatPtr
    {
        return input_getter(input, sample, row_input_format_params, format_settings);
    };

    ParallelParsingInputFormat::Params params{
        buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing,
        context->getApplicationType() == Context::ApplicationType::SERVER};

    return std::make_shared<ParallelParsingInputFormat>(params);
}
|
|
|
|
|
2019-07-24 18:00:09 +00:00
|
|
|
/// Creates a plain (non-parallel) input format for `name`.
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_INPUT if the format has no input creator.
InputFormatPtr FormatFactory::getInputFormat(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    ContextPtr context,
    UInt64 max_block_size,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & create_input = getCreators(name).input_creator;
    if (!create_input)
        throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);

    const Settings & settings = context->getSettingsRef();

    /// With log_queries enabled, register the format name in the query context.
    if (context->hasQueryContext() && settings.log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);

    RowInputFormatParams row_params;
    row_params.max_block_size = max_block_size;
    row_params.allow_errors_num = format_settings.input_allow_errors_num;
    row_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
    row_params.max_execution_time = settings.max_execution_time;
    row_params.timeout_overflow_mode = settings.timeout_overflow_mode;

    auto input_format = create_input(buf, sample, row_params, format_settings);

    /// It's a kludge. Because I cannot remove context from values format.
    if (auto * values_format = typeid_cast<ValuesBlockInputFormat *>(input_format.get()))
        values_format->setContext(context);

    return input_format;
}
|
|
|
|
|
2021-12-22 22:14:23 +00:00
|
|
|
/// Passes the read progress already accumulated by the context's process list element
/// (if there is one) to `format`.
static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context)
{
    auto * process_list_element = context->getProcessListElement();
    if (!process_list_element)
        return;

    /// While preparing the query there might have been progress (for example in subscalar subqueries) so add it here
    auto progress_in = process_list_element->getProgressIn();
    Progress read_progress{progress_in.read_rows, progress_in.read_bytes, progress_in.total_rows_to_read};
    format->onProgress(read_progress);
}
|
|
|
|
|
2020-12-30 03:07:30 +00:00
|
|
|
/// Creates an output format for `name`, using ParallelFormattingOutputFormat when
/// output_format_parallel_formatting is set, the format is marked as supporting it and
/// output_format_json_array_of_rows is off; otherwise delegates to getOutputFormat().
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if the format has no output creator.
OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible(
    const String & name,
    WriteBuffer & buf,
    const Block & sample,
    ContextPtr context,
    WriteCallback callback,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & creators = getCreators(name);
    const auto & output_getter = creators.output_creator;
    if (!output_getter)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output (with processors)", name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);

    const Settings & settings = context->getSettingsRef();

    const bool use_parallel_formatting = settings.output_format_parallel_formatting
        && creators.supports_parallel_formatting
        && !settings.output_format_json_array_of_rows;

    if (!use_parallel_formatting)
        return getOutputFormat(name, buf, sample, context, callback, _format_settings);

    auto formatter_creator = [output_getter, sample, callback, format_settings]
        (WriteBuffer & output) -> OutputFormatPtr
    {
        return output_getter(output, sample, {std::move(callback)}, format_settings);
    };

    ParallelFormattingOutputFormat::Params builder{buf, sample, formatter_creator, settings.max_threads};

    /// With log_queries enabled, register the format name in the query context.
    if (context->hasQueryContext() && settings.log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    auto parallel_format = std::make_shared<ParallelFormattingOutputFormat>(builder);
    addExistingProgressToOutputFormat(parallel_format, context);
    return parallel_format;
}
|
|
|
|
|
|
|
|
|
2019-08-20 11:17:57 +00:00
|
|
|
/// Creates a plain (non-parallel) output format for `name`.
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if the format has no output creator.
OutputFormatPtr FormatFactory::getOutputFormat(
    const String & name,
    WriteBuffer & buf,
    const Block & sample,
    ContextPtr context,
    WriteCallback callback,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & create_output = getCreators(name).output_creator;
    if (!create_output)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output (with processors)", name);

    /// With log_queries enabled, register the format name in the query context.
    if (context->hasQueryContext() && context->getSettingsRef().log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    RowOutputFormatParams row_params;
    row_params.callback = std::move(callback);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);

    /** TODO: Materialization is needed, because formats can use the functions `IDataType`,
      *  which only work with full columns.
      */
    auto output_format = create_output(buf, sample, row_params, format_settings);

    /// Enable auto-flush for streaming mode. Currently it is needed by INSERT WATCH query.
    if (format_settings.enable_streaming)
        output_format->setAutoFlush();

    /// It's a kludge. Because I cannot remove context from MySQL format.
    if (auto * mysql_format = typeid_cast<MySQLOutputFormat *>(output_format.get()))
        mysql_format->setContext(context);

    addExistingProgressToOutputFormat(output_format, context);

    return output_format;
}
|
|
|
|
|
2021-12-03 11:42:46 +00:00
|
|
|
/// Returns the content type reported by the output format `name`.
/// The format is instantiated over an empty block, default params and an in-memory buffer
/// solely to call getContentType() on it.
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if the format has no output creator.
String FormatFactory::getContentType(
    const String & name,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & create_output = getCreators(name).output_creator;
    if (!create_output)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output (with processors)", name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);

    Block header;
    RowOutputFormatParams row_params;
    WriteBufferFromOwnString sink;

    return create_output(sink, header, row_params, format_settings)->getContentType();
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
/// Creates a schema reader for the format `name` reading from `buf`.
/// Throws UNKNOWN_FORMAT if no format with this name is registered (the lookup goes through
/// getCreators() like the other getters, instead of letting std::out_of_range escape from dict.at()),
/// and LOGICAL_ERROR if the format has no schema reader.
SchemaReaderPtr FormatFactory::getSchemaReader(
    const String & name,
    ReadBuffer & buf,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & schema_reader_creator = getCreators(name).schema_reader_creator;
    if (!schema_reader_creator)
        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    return schema_reader_creator(buf, format_settings, context);
}
|
|
|
|
|
|
|
|
/// Creates an external schema reader for the format `name`.
/// Throws UNKNOWN_FORMAT if no format with this name is registered (the lookup goes through
/// getCreators() like the other getters, instead of letting std::out_of_range escape from dict.at()),
/// and LOGICAL_ERROR if the format has no external schema reader.
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
    const String & name,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & external_schema_reader_creator = getCreators(name).external_schema_reader_creator;
    if (!external_schema_reader_creator)
        throw Exception("FormatFactory: Format " + name + " doesn't support schema inference.", ErrorCodes::LOGICAL_ERROR);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    return external_schema_reader_creator(format_settings);
}
|
2021-12-03 11:42:46 +00:00
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
|
|
|
|
{
|
2019-10-01 10:48:46 +00:00
|
|
|
auto & target = dict[name].input_creator;
|
2018-06-10 19:22:49 +00:00
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
2019-02-19 18:41:18 +00:00
|
|
|
target = std::move(input_creator);
|
2018-06-10 19:22:49 +00:00
|
|
|
}
|
|
|
|
|
2021-06-23 13:17:34 +00:00
|
|
|
void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].non_trivial_prefix_and_suffix_checker;
|
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Non trivial prefix and suffix checker " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
target = std::move(non_trivial_prefix_and_suffix_checker);
|
|
|
|
}
|
|
|
|
|
2022-01-14 15:16:18 +00:00
|
|
|
void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker)
|
2021-12-29 18:03:15 +00:00
|
|
|
{
|
2022-01-14 15:16:18 +00:00
|
|
|
auto & target = dict[name].append_support_checker;
|
2021-12-29 18:03:15 +00:00
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Suffix checker " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
2022-01-14 15:16:18 +00:00
|
|
|
target = std::move(append_support_checker);
|
2021-12-29 18:03:15 +00:00
|
|
|
}
|
|
|
|
|
2022-01-14 15:16:18 +00:00
|
|
|
void FormatFactory::markFormatDoesntSupportAppend(const String & name)
|
2021-12-29 18:03:15 +00:00
|
|
|
{
|
2022-01-14 15:16:18 +00:00
|
|
|
registerAppendSupportChecker(name, [](const FormatSettings &){ return true; });
|
2021-12-29 18:03:15 +00:00
|
|
|
}
|
|
|
|
|
2022-01-14 15:16:18 +00:00
|
|
|
bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_)
|
2021-12-29 18:03:15 +00:00
|
|
|
{
|
|
|
|
auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context);
|
2022-01-14 15:16:18 +00:00
|
|
|
auto & append_support_checker = dict[name].append_support_checker;
|
|
|
|
/// By default we consider that format supports append
|
|
|
|
return !append_support_checker || append_support_checker(format_settings);
|
2021-12-29 18:03:15 +00:00
|
|
|
}
|
|
|
|
|
2021-10-11 16:11:50 +00:00
|
|
|
void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
|
2019-02-19 18:41:18 +00:00
|
|
|
{
|
2021-10-11 16:11:50 +00:00
|
|
|
auto & target = dict[name].output_creator;
|
2019-02-19 18:41:18 +00:00
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
target = std::move(output_creator);
|
2018-06-10 19:22:49 +00:00
|
|
|
}
|
|
|
|
|
2022-01-07 05:16:41 +00:00
|
|
|
void FormatFactory::registerFileExtension(const String & extension, const String & format_name)
|
|
|
|
{
|
|
|
|
file_extension_formats[extension] = format_name;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns the format name registered for the extension of `file_name`, or an empty string
/// if the name has no extension or the extension is not registered.
/// The extension is matched in lower case. The lookup uses find(), so an unknown extension
/// does not leave an empty entry behind in `file_extension_formats`.
String FormatFactory::getFormatFromFileName(String file_name)
{
    /// When a compression method is detected for the name, drop the last extension first.
    if (CompressionMethod::None != chooseCompressionMethod(file_name, ""))
    {
        const auto compression_dot = file_name.find_last_of('.');
        if (compression_dot != String::npos)
            file_name.resize(compression_dot);
    }

    const auto extension_dot = file_name.find_last_of('.');
    if (extension_dot == String::npos)
        return "";

    String file_extension = file_name.substr(extension_dot + 1);
    boost::algorithm::to_lower(file_extension);

    const auto it = file_extension_formats.find(file_extension);
    if (it == file_extension_formats.end())
        return "";

    return it->second;
}
|
|
|
|
|
2019-10-01 10:48:46 +00:00
|
|
|
void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].file_segmentation_engine;
|
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: File segmentation engine " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
2019-12-25 19:17:41 +00:00
|
|
|
target = std::move(file_segmentation_engine);
|
2019-10-01 10:48:46 +00:00
|
|
|
}
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].schema_reader_creator;
|
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
target = std::move(schema_reader_creator);
|
|
|
|
}
|
|
|
|
|
|
|
|
void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].external_schema_reader_creator;
|
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Schema reader " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
target = std::move(external_schema_reader_creator);
|
|
|
|
}
|
2020-10-06 14:02:01 +00:00
|
|
|
|
|
|
|
void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].supports_parallel_formatting;
|
|
|
|
if (target)
|
2021-03-30 21:25:37 +00:00
|
|
|
throw Exception("FormatFactory: Output format " + name + " is already marked as supporting parallel formatting", ErrorCodes::LOGICAL_ERROR);
|
2020-10-06 14:02:01 +00:00
|
|
|
target = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-03-30 21:25:37 +00:00
|
|
|
void FormatFactory::markFormatAsColumnOriented(const String & name)
|
|
|
|
{
|
|
|
|
auto & target = dict[name].is_column_oriented;
|
|
|
|
if (target)
|
|
|
|
throw Exception("FormatFactory: Format " + name + " is already marked as column oriented", ErrorCodes::LOGICAL_ERROR);
|
2021-04-01 00:08:02 +00:00
|
|
|
target = true;
|
2021-03-30 21:25:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool FormatFactory::checkIfFormatIsColumnOriented(const String & name)
|
|
|
|
{
|
|
|
|
const auto & target = getCreators(name);
|
|
|
|
return target.is_column_oriented;
|
|
|
|
}
|
|
|
|
|
2021-09-16 17:18:34 +00:00
|
|
|
bool FormatFactory::isInputFormat(const String & name) const
|
|
|
|
{
|
|
|
|
auto it = dict.find(name);
|
2021-10-11 16:11:50 +00:00
|
|
|
return it != dict.end() && it->second.input_creator;
|
2021-09-16 17:18:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool FormatFactory::isOutputFormat(const String & name) const
|
|
|
|
{
|
|
|
|
auto it = dict.find(name);
|
2021-10-11 16:11:50 +00:00
|
|
|
return it != dict.end() && it->second.output_creator;
|
2021-09-16 17:18:34 +00:00
|
|
|
}
|
|
|
|
|
2021-12-15 11:30:57 +00:00
|
|
|
bool FormatFactory::checkIfFormatHasSchemaReader(const String & name)
|
|
|
|
{
|
|
|
|
const auto & target = getCreators(name);
|
|
|
|
return bool(target.schema_reader_creator);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool FormatFactory::checkIfFormatHasExternalSchemaReader(const String & name)
|
|
|
|
{
|
|
|
|
const auto & target = getCreators(name);
|
|
|
|
return bool(target.external_schema_reader_creator);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool FormatFactory::checkIfFormatHasAnySchemaReader(const String & name)
|
|
|
|
{
|
|
|
|
return checkIfFormatHasSchemaReader(name) || checkIfFormatHasExternalSchemaReader(name);
|
|
|
|
}
|
|
|
|
|
2019-08-22 03:24:05 +00:00
|
|
|
/// Returns the single FormatFactory object, held in a function-local static.
FormatFactory & FormatFactory::instance()
{
    static FormatFactory factory;
    return factory;
}
|
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
}
|