ClickHouse/dbms/src/Formats/FormatFactory.cpp

378 lines
17 KiB
C++
Raw Normal View History

2019-09-11 11:21:54 +00:00
#include <Common/config.h>
#include <Common/Exception.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#include <DataStreams/MaterializingBlockOutputStream.h>
2019-10-01 10:48:46 +00:00
#include <DataStreams/ParallelParsingBlockInputStream.h>
#include <Formats/FormatSettings.h>
#include <Formats/FormatFactory.h>
2019-10-01 10:48:46 +00:00
#include <DataStreams/UnionBlockInputStream.h>
2019-02-19 18:41:18 +00:00
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <Processors/Formats/OutputStreamToOutputFormat.h>
2019-07-31 10:28:54 +00:00
#include <DataStreams/SquashingBlockOutputStream.h>
2019-07-31 13:26:08 +00:00
#include <DataStreams/NativeBlockInputStream.h>
namespace DB
{
/// Error codes thrown from this file. They are only declared here (`extern`).
namespace ErrorCodes
{
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_INPUT;
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
    extern const int LOGICAL_ERROR;
    extern const int UNKNOWN_FORMAT;
}
/// Returns the creators registered under `name`.
/// Throws UNKNOWN_FORMAT if nothing is registered under that name.
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
{
    const auto found = dict.find(name);

    if (found == dict.end())
        throw Exception("Unknown format " + name, ErrorCodes::UNKNOWN_FORMAT);

    return found->second;
}
2019-02-19 18:41:18 +00:00
/// Fills a FormatSettings for reading data by copying the relevant fields from `settings`.
static FormatSettings getInputFormatSetting(const Settings & settings)
{
    FormatSettings result;

    /// Options not tied to one particular format.
    result.null_as_default = settings.input_format_null_as_default;
    result.with_names_use_header = settings.input_format_with_names_use_header;
    result.skip_unknown_fields = settings.input_format_skip_unknown_fields;
    result.import_nested_json = settings.input_format_import_nested_json;
    result.date_time_input_format = settings.date_time_input_format;
    result.input_allow_errors_num = settings.input_format_allow_errors_num;
    result.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;

    /// CSV.
    result.csv.delimiter = settings.format_csv_delimiter;
    result.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
    result.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;
    result.csv.unquoted_null_literal_as_null = settings.input_format_csv_unquoted_null_literal_as_null;
    /// Note: this one is taken from the generic "defaults for omitted fields" setting, not a CSV-specific one.
    result.csv.empty_as_default = settings.input_format_defaults_for_omitted_fields;

    /// TSV.
    result.tsv.empty_as_default = settings.input_format_tsv_empty_as_default;

    /// Values.
    result.values.interpret_expressions = settings.input_format_values_interpret_expressions;
    result.values.deduce_templates_of_expressions = settings.input_format_values_deduce_templates_of_expressions;
    result.values.accurate_types_of_literals = settings.input_format_values_accurate_types_of_literals;

    /// Template.
    result.template_settings.resultset_format = settings.format_template_resultset;
    result.template_settings.row_format = settings.format_template_row;
    result.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;

    return result;
}
2019-02-19 18:41:18 +00:00
/// Fills a FormatSettings for writing data by copying the relevant fields from `settings`.
static FormatSettings getOutputFormatSetting(const Settings & settings)
{
    FormatSettings result;

    /// Options not tied to one particular format.
    result.write_statistics = settings.output_format_write_statistics;

    /// JSON.
    result.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
    result.json.quote_denormals = settings.output_format_json_quote_denormals;
    result.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;

    /// CSV.
    result.csv.delimiter = settings.format_csv_delimiter;
    result.csv.allow_single_quotes = settings.format_csv_allow_single_quotes;
    result.csv.allow_double_quotes = settings.format_csv_allow_double_quotes;

    /// Pretty.
    result.pretty.max_rows = settings.output_format_pretty_max_rows;
    result.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;
    result.pretty.color = settings.output_format_pretty_color;

    /// Template.
    result.template_settings.resultset_format = settings.format_template_resultset;
    result.template_settings.row_format = settings.format_template_row;
    result.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter;

    /// Parquet.
    result.parquet.row_group_size = settings.output_format_parquet_row_group_size;

    return result;
}
2019-07-08 13:00:54 +00:00
/** Creates a block input stream reading data in format `name` from `buf`.
  *
  * The stream is chosen in this order:
  *  - "Native" is served directly by NativeBlockInputStream;
  *  - a format with no processor-based input creator falls back to its `input_creator`;
  *  - if `input_format_parallel_parsing` is set, the format has a file segmentation engine
  *    and the format is not "Values", a ParallelParsingBlockInputStream is built;
  *  - otherwise the result of getInputFormat() is wrapped into InputStreamFromInputFormat.
  *
  * Throws UNKNOWN_FORMAT (via getCreators) if `name` is not registered, and
  * FORMAT_IS_NOT_SUITABLE_FOR_INPUT if the format has neither kind of input creator.
  */
BlockInputStreamPtr FormatFactory::getInput(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    const Context & context,
    UInt64 max_block_size,
    ReadCallback callback) const
{
    if (name == "Native")
        return std::make_shared<NativeBlockInputStream>(buf, sample, 0);

    /// Single lookup; getCreators returns a reference to the entry stored in `dict`.
    const auto & creators = getCreators(name);
    const Settings & settings = context.getSettingsRef();

    if (!creators.input_processor_creator)
    {
        const auto & input_getter = creators.input_creator;
        if (!input_getter)
            throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);

        FormatSettings format_settings = getInputFormatSetting(settings);

        return input_getter(buf, sample, context, max_block_size, callback ? callback : ReadCallback(), format_settings);
    }

    const auto & file_segmentation_engine = creators.file_segmentation_engine;
    if (name != "Values" && settings.input_format_parallel_parsing && file_segmentation_engine)
    {
        /// input_processor_creator is known to be set here (checked above),
        /// so no additional "not suitable for input" check is needed.
        const auto & input_getter = creators.input_processor_creator;

        FormatSettings format_settings = getInputFormatSetting(settings);

        RowInputFormatParams row_input_format_params;
        row_input_format_params.max_block_size = max_block_size;
        row_input_format_params.allow_errors_num = format_settings.input_allow_errors_num;
        row_input_format_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
        row_input_format_params.callback = std::move(callback);
        row_input_format_params.max_execution_time = settings.max_execution_time;
        row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;

        /// A zero max_threads_for_parallel_parsing means "use max_threads".
        size_t max_threads_to_use = settings.max_threads_for_parallel_parsing;
        if (!max_threads_to_use)
            max_threads_to_use = settings.max_threads;

        auto params = ParallelParsingBlockInputStream::InputCreatorParams{sample, context, row_input_format_params, format_settings};
        ParallelParsingBlockInputStream::Builder builder{buf, input_getter, params, file_segmentation_engine, max_threads_to_use, settings.min_chunk_size_for_parallel_parsing};
        return std::make_shared<ParallelParsingBlockInputStream>(builder);
    }

    auto format = getInputFormat(name, buf, sample, context, max_block_size, std::move(callback));
    return std::make_shared<InputStreamFromInputFormat>(std::move(format));
}
/** Creates a block output stream writing data in format `name` to `buf`.
  *
  *  - "PrettyCompactMonoBlock" is built from the "PrettyCompact" output format wrapped into a
  *    SquashingBlockOutputStream with flushing disabled;
  *  - a format with no processor-based output creator falls back to its `output_creator`;
  *  - otherwise the result of getOutputFormat() is wrapped into OutputStreamToOutputFormat.
  *
  * In every case the result is wrapped into MaterializingBlockOutputStream.
  *
  * Throws UNKNOWN_FORMAT (via getCreators) if `name` is not registered, and
  * FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if the format has neither kind of output creator.
  */
BlockOutputStreamPtr FormatFactory::getOutput(
    const String & name, WriteBuffer & buf, const Block & sample, const Context & context, WriteCallback callback) const
{
    if (name == "PrettyCompactMonoBlock")
    {
        /// TODO: rewrite
        auto format = getOutputFormat("PrettyCompact", buf, sample, context);
        auto res = std::make_shared<SquashingBlockOutputStream>(
            std::make_shared<OutputStreamToOutputFormat>(format),
            sample, context.getSettingsRef().output_format_pretty_max_rows, 0);

        res->disableFlush();

        return std::make_shared<MaterializingBlockOutputStream>(res, sample);
    }

    /// Single lookup; getCreators returns a reference to the entry stored in `dict`.
    const auto & creators = getCreators(name);

    if (!creators.output_processor_creator)
    {
        const auto & output_getter = creators.output_creator;
        if (!output_getter)
            throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);

        const Settings & settings = context.getSettingsRef();
        FormatSettings format_settings = getOutputFormatSetting(settings);

        /** Materialization is needed, because formats can use the functions `IDataType`,
          *  which only work with full columns.
          */
        return std::make_shared<MaterializingBlockOutputStream>(
            output_getter(buf, sample, context, callback, format_settings), sample);
    }

    auto format = getOutputFormat(name, buf, sample, context, callback);
    return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
}
/** Creates the processor-based input format registered under `name`.
  *
  * Throws UNKNOWN_FORMAT (via getCreators) if `name` is not registered, and
  * FORMAT_IS_NOT_SUITABLE_FOR_INPUT if no input processor creator is registered for it.
  */
InputFormatPtr FormatFactory::getInputFormat(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    const Context & context,
    UInt64 max_block_size,
    ReadCallback callback) const
{
    const auto & creator = getCreators(name).input_processor_creator;
    if (!creator)
        throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);

    const Settings & settings = context.getSettingsRef();
    FormatSettings format_settings = getInputFormatSetting(settings);

    /// Row-level parameters: part comes from the arguments, part from the settings.
    RowInputFormatParams row_params;
    row_params.max_block_size = max_block_size;
    row_params.callback = std::move(callback);
    row_params.allow_errors_num = format_settings.input_allow_errors_num;
    row_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
    row_params.max_execution_time = settings.max_execution_time;
    row_params.timeout_overflow_mode = settings.timeout_overflow_mode;

    return creator(buf, sample, context, row_params, format_settings);
}
/** Creates the processor-based output format registered under `name`.
  *
  * Throws UNKNOWN_FORMAT (via getCreators) if `name` is not registered, and
  * FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output processor creator is registered for it.
  */
OutputFormatPtr FormatFactory::getOutputFormat(
    const String & name, WriteBuffer & buf, const Block & sample, const Context & context, WriteCallback callback) const
{
    const auto & creator = getCreators(name).output_processor_creator;
    if (!creator)
        throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);

    const Settings & settings = context.getSettingsRef();
    FormatSettings format_settings = getOutputFormatSetting(settings);

    /** TODO: Materialization is needed, because formats can use the functions `IDataType`,
      *  which only work with full columns.
      */
    return creator(buf, sample, context, callback, format_settings);
}
void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator)
{
2019-10-01 10:48:46 +00:00
auto & target = dict[name].input_creator;
if (target)
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
2019-02-19 18:41:18 +00:00
target = std::move(input_creator);
}
void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].output_creator;
if (target)
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
2019-02-19 18:41:18 +00:00
target = std::move(output_creator);
}
void FormatFactory::registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].input_processor_creator;
2019-02-19 18:41:18 +00:00
if (target)
throw Exception("FormatFactory: Input format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
target = std::move(input_creator);
}
void FormatFactory::registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator)
{
2019-08-02 14:41:19 +00:00
auto & target = dict[name].output_processor_creator;
2019-02-19 18:41:18 +00:00
if (target)
throw Exception("FormatFactory: Output format " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);
target = std::move(output_creator);
}
2019-10-01 10:48:46 +00:00
/// Stores `file_segmentation_engine` as the file segmentation engine of format `name`.
/// Throws LOGICAL_ERROR if a file segmentation engine is already set for that name.
void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine)
{
    /// operator[] creates the entry for `name` if it does not exist yet.
    auto & target = dict[name].file_segmentation_engine;
    if (target)
        throw Exception("FormatFactory: File segmentation engine " + name + " is already registered", ErrorCodes::LOGICAL_ERROR);

    /// The argument is taken by value, so move it into place like the other register* methods do.
    target = std::move(file_segmentation_engine);
}
/// Formats for both input/output.
void registerInputFormatNative(FormatFactory & factory);
void registerOutputFormatNative(FormatFactory & factory);
2019-02-19 18:41:18 +00:00
void registerInputFormatProcessorNative(FormatFactory & factory);
void registerOutputFormatProcessorNative(FormatFactory & factory);
void registerInputFormatProcessorRowBinary(FormatFactory & factory);
void registerOutputFormatProcessorRowBinary(FormatFactory & factory);
void registerInputFormatProcessorTabSeparated(FormatFactory & factory);
void registerOutputFormatProcessorTabSeparated(FormatFactory & factory);
void registerInputFormatProcessorValues(FormatFactory & factory);
void registerOutputFormatProcessorValues(FormatFactory & factory);
void registerInputFormatProcessorCSV(FormatFactory & factory);
void registerOutputFormatProcessorCSV(FormatFactory & factory);
void registerInputFormatProcessorTSKV(FormatFactory & factory);
void registerOutputFormatProcessorTSKV(FormatFactory & factory);
void registerInputFormatProcessorJSONEachRow(FormatFactory & factory);
void registerOutputFormatProcessorJSONEachRow(FormatFactory & factory);
void registerInputFormatProcessorParquet(FormatFactory & factory);
2019-08-21 14:19:47 +00:00
void registerInputFormatProcessorORC(FormatFactory & factory);
void registerOutputFormatProcessorParquet(FormatFactory & factory);
void registerInputFormatProcessorProtobuf(FormatFactory & factory);
2019-02-19 18:41:18 +00:00
void registerOutputFormatProcessorProtobuf(FormatFactory & factory);
2019-08-23 19:47:22 +00:00
void registerInputFormatProcessorTemplate(FormatFactory & factory);
void registerOutputFormatProcessorTemplate(FormatFactory &factory);
2019-02-19 18:41:18 +00:00
2019-10-01 10:48:46 +00:00
/// File Segmentation Engines for parallel reading
void registerFileSegmentationEngineJSONEachRow(FormatFactory & factory);
void registerFileSegmentationEngineTabSeparated(FormatFactory & factory);
2019-10-22 18:01:44 +00:00
//void registerFileSegmentationEngineValues(FormatFactory & factory);
2019-10-01 10:48:46 +00:00
void registerFileSegmentationEngineCSV(FormatFactory & factory);
void registerFileSegmentationEngineTSKV(FormatFactory & factory);
/// Output only (presentational) formats.
void registerOutputFormatNull(FormatFactory & factory);
2019-02-19 18:41:18 +00:00
void registerOutputFormatProcessorPretty(FormatFactory & factory);
void registerOutputFormatProcessorPrettyCompact(FormatFactory & factory);
void registerOutputFormatProcessorPrettySpace(FormatFactory & factory);
void registerOutputFormatProcessorVertical(FormatFactory & factory);
void registerOutputFormatProcessorJSON(FormatFactory & factory);
void registerOutputFormatProcessorJSONCompact(FormatFactory & factory);
void registerOutputFormatProcessorJSONEachRowWithProgress(FormatFactory & factory);
2019-02-19 18:41:18 +00:00
void registerOutputFormatProcessorXML(FormatFactory & factory);
void registerOutputFormatProcessorODBCDriver(FormatFactory & factory);
void registerOutputFormatProcessorODBCDriver2(FormatFactory & factory);
void registerOutputFormatProcessorNull(FormatFactory & factory);
#if USE_SSL
2019-07-04 18:55:20 +00:00
void registerOutputFormatProcessorMySQLWrite(FormatFactory & factory);
#endif
2019-02-19 18:41:18 +00:00
/// Input only formats.
2019-02-19 18:41:18 +00:00
void registerInputFormatProcessorCapnProto(FormatFactory & factory);
/// Registers every format known to this file by calling each register* function on `*this`.
/// The call order is kept as is: the registration functions are defined elsewhere.
FormatFactory::FormatFactory()
{
    registerInputFormatNative(*this);
    registerOutputFormatNative(*this);

    registerOutputFormatProcessorJSONEachRowWithProgress(*this);

    /// Input and output format processors.
    registerInputFormatProcessorNative(*this);
    registerOutputFormatProcessorNative(*this);
    registerInputFormatProcessorRowBinary(*this);
    registerOutputFormatProcessorRowBinary(*this);
    registerInputFormatProcessorTabSeparated(*this);
    registerOutputFormatProcessorTabSeparated(*this);
    registerInputFormatProcessorValues(*this);
    registerOutputFormatProcessorValues(*this);
    registerInputFormatProcessorCSV(*this);
    registerOutputFormatProcessorCSV(*this);
    registerInputFormatProcessorTSKV(*this);
    registerOutputFormatProcessorTSKV(*this);
    registerInputFormatProcessorJSONEachRow(*this);
    registerOutputFormatProcessorJSONEachRow(*this);
    registerInputFormatProcessorProtobuf(*this);
    registerOutputFormatProcessorProtobuf(*this);
    registerInputFormatProcessorCapnProto(*this);
    registerInputFormatProcessorORC(*this);
    registerInputFormatProcessorParquet(*this);
    registerOutputFormatProcessorParquet(*this);
    registerInputFormatProcessorTemplate(*this);
    registerOutputFormatProcessorTemplate(*this);

    /// File segmentation engines.
    registerFileSegmentationEngineJSONEachRow(*this);
    registerFileSegmentationEngineTabSeparated(*this);
    registerFileSegmentationEngineCSV(*this);
    registerFileSegmentationEngineTSKV(*this);

    /// Output-only formats.
    registerOutputFormatNull(*this);

    registerOutputFormatProcessorPretty(*this);
    registerOutputFormatProcessorPrettyCompact(*this);
    registerOutputFormatProcessorPrettySpace(*this);
    registerOutputFormatProcessorVertical(*this);
    registerOutputFormatProcessorJSON(*this);
    registerOutputFormatProcessorJSONCompact(*this);
    registerOutputFormatProcessorXML(*this);
    registerOutputFormatProcessorODBCDriver(*this);
    registerOutputFormatProcessorODBCDriver2(*this);
    registerOutputFormatProcessorNull(*this);
#if USE_SSL
    registerOutputFormatProcessorMySQLWrite(*this);
#endif
}
/// Returns a reference to the single factory object, which is a function-local static
/// and is therefore constructed the first time this function is called.
FormatFactory & FormatFactory::instance()
{
    static FormatFactory factory;
    return factory;
}
}