2020-04-16 12:31:57 +00:00
# include <Formats/FormatFactory.h>
2019-10-28 23:43:22 +00:00
# include <algorithm>
2019-03-22 12:08:30 +00:00
# include <Core/Settings.h>
2018-06-10 19:22:49 +00:00
# include <Formats/FormatSettings.h>
2021-12-22 22:14:23 +00:00
# include <Interpreters/Context.h>
# include <Interpreters/ProcessList.h>
2019-02-19 18:41:18 +00:00
# include <Processors/Formats/IRowInputFormat.h>
2020-10-06 12:47:52 +00:00
# include <Processors/Formats/IRowOutputFormat.h>
2019-12-25 19:17:41 +00:00
# include <Processors/Formats/Impl/MySQLOutputFormat.h>
2020-06-11 23:00:49 +00:00
# include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
2021-12-22 22:14:23 +00:00
# include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
# include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
2020-02-02 00:53:11 +00:00
# include <Poco/URI.h>
2021-12-22 22:14:23 +00:00
# include <Common/Exception.h>
2022-11-05 14:53:08 +00:00
# include <Common/KnownObjectNames.h>
2022-01-24 18:41:44 +00:00
# include <unistd.h>
2018-06-10 19:22:49 +00:00
2022-01-11 13:26:14 +00:00
# include <boost/algorithm/string/case_conv.hpp>
2018-06-10 19:22:49 +00:00
namespace DB
{
/// Externally defined error codes made visible to this translation unit.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_INPUT;
    extern const int FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT;
    extern const int LOGICAL_ERROR;
    extern const int UNKNOWN_FORMAT;
}
/// Returns the creators stored in `dict` under `name`.
/// Throws UNKNOWN_FORMAT when `dict` has no entry for `name`.
const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const
{
    const auto found = dict.find(name);
    if (found == dict.end())
        throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name);

    return found->second;
}
2021-06-01 12:20:52 +00:00
/// Builds FormatSettings from the settings attached to `context`
/// by delegating to the two-argument overload.
FormatSettings getFormatSettings(ContextPtr context)
{
    return getFormatSettings(context, context->getSettingsRef());
}
2018-06-10 19:22:49 +00:00
2020-11-07 08:53:39 +00:00
template < typename Settings >
2021-06-01 12:20:52 +00:00
FormatSettings getFormatSettings ( ContextPtr context , const Settings & settings )
2019-02-19 18:41:18 +00:00
{
2018-06-10 19:22:49 +00:00
FormatSettings format_settings ;
2020-11-02 07:50:38 +00:00
format_settings . avro . allow_missing_fields = settings . input_format_avro_allow_missing_fields ;
format_settings . avro . output_codec = settings . output_format_avro_codec ;
format_settings . avro . output_sync_interval = settings . output_format_avro_sync_interval ;
format_settings . avro . schema_registry_url = settings . format_avro_schema_registry_url . toString ( ) ;
2021-07-09 16:18:22 +00:00
format_settings . avro . string_column_pattern = settings . output_format_avro_string_column_pattern . toString ( ) ;
2021-10-13 08:19:37 +00:00
format_settings . avro . output_rows_in_file = settings . output_format_avro_rows_in_file ;
2018-07-04 21:00:50 +00:00
format_settings . csv . allow_double_quotes = settings . format_csv_allow_double_quotes ;
2020-11-02 07:50:38 +00:00
format_settings . csv . allow_single_quotes = settings . format_csv_allow_single_quotes ;
format_settings . csv . crlf_end_of_line = settings . output_format_csv_crlf_end_of_line ;
format_settings . csv . delimiter = settings . format_csv_delimiter ;
2021-12-20 16:25:54 +00:00
format_settings . csv . tuple_delimiter = settings . format_csv_delimiter ;
2021-10-14 10:32:49 +00:00
format_settings . csv . empty_as_default = settings . input_format_csv_empty_as_default ;
2022-08-19 16:39:13 +00:00
format_settings . csv . enum_as_number = settings . input_format_csv_enum_as_number ;
2021-10-21 13:52:27 +00:00
format_settings . csv . null_representation = settings . format_csv_null_representation ;
2022-08-19 16:39:13 +00:00
format_settings . csv . arrays_as_nested_csv = settings . input_format_csv_arrays_as_nested_csv ;
format_settings . csv . use_best_effort_in_schema_inference = settings . input_format_csv_use_best_effort_in_schema_inference ;
2022-05-25 15:00:11 +00:00
format_settings . csv . skip_first_lines = settings . input_format_csv_skip_first_lines ;
2023-01-05 22:57:25 +00:00
format_settings . csv . try_detect_header = settings . input_format_csv_detect_header ;
2021-12-02 08:14:25 +00:00
format_settings . hive_text . fields_delimiter = settings . input_format_hive_text_fields_delimiter ;
format_settings . hive_text . collection_items_delimiter = settings . input_format_hive_text_collection_items_delimiter ;
format_settings . hive_text . map_keys_delimiter = settings . input_format_hive_text_map_keys_delimiter ;
2019-12-25 19:17:41 +00:00
format_settings . custom . escaping_rule = settings . format_custom_escaping_rule ;
format_settings . custom . field_delimiter = settings . format_custom_field_delimiter ;
2020-11-02 07:50:38 +00:00
format_settings . custom . result_after_delimiter = settings . format_custom_result_after_delimiter ;
format_settings . custom . result_before_delimiter = settings . format_custom_result_before_delimiter ;
2019-12-25 19:17:41 +00:00
format_settings . custom . row_after_delimiter = settings . format_custom_row_after_delimiter ;
2020-11-02 07:50:38 +00:00
format_settings . custom . row_before_delimiter = settings . format_custom_row_before_delimiter ;
2019-12-25 19:17:41 +00:00
format_settings . custom . row_between_delimiter = settings . format_custom_row_between_delimiter ;
2023-01-05 22:57:25 +00:00
format_settings . custom . try_detect_header = settings . input_format_custom_detect_header ;
2018-06-10 19:22:49 +00:00
format_settings . date_time_input_format = settings . date_time_input_format ;
2020-11-02 07:50:38 +00:00
format_settings . date_time_output_format = settings . date_time_output_format ;
2022-03-30 10:54:19 +00:00
format_settings . input_format_ipv4_default_on_conversion_error = settings . input_format_ipv4_default_on_conversion_error ;
format_settings . input_format_ipv6_default_on_conversion_error = settings . input_format_ipv6_default_on_conversion_error ;
2021-11-19 05:22:44 +00:00
format_settings . bool_true_representation = settings . bool_true_representation ;
format_settings . bool_false_representation = settings . bool_false_representation ;
2020-04-26 13:44:11 +00:00
format_settings . enable_streaming = settings . output_format_enable_streaming ;
2020-11-02 07:50:38 +00:00
format_settings . import_nested_json = settings . input_format_import_nested_json ;
2018-06-10 19:22:49 +00:00
format_settings . input_allow_errors_num = settings . input_format_allow_errors_num ;
format_settings . input_allow_errors_ratio = settings . input_format_allow_errors_ratio ;
2020-11-17 19:50:47 +00:00
format_settings . json . array_of_rows = settings . output_format_json_array_of_rows ;
2020-11-02 07:50:38 +00:00
format_settings . json . escape_forward_slashes = settings . output_format_json_escape_forward_slashes ;
2023-01-12 16:36:44 +00:00
format_settings . json . write_named_tuples_as_objects = settings . output_format_json_named_tuples_as_objects ;
format_settings . json . read_named_tuples_as_objects = settings . input_format_json_named_tuples_as_objects ;
format_settings . json . defaults_for_missing_elements_in_named_tuple = settings . input_format_json_defaults_for_missing_elements_in_named_tuple ;
2023-01-26 15:47:56 +00:00
format_settings . json . ignore_unknown_keys_in_named_tuple = settings . input_format_json_ignore_unknown_keys_in_named_tuple ;
2018-06-10 19:22:49 +00:00
format_settings . json . quote_64bit_integers = settings . output_format_json_quote_64bit_integers ;
2022-09-20 13:49:17 +00:00
format_settings . json . quote_64bit_floats = settings . output_format_json_quote_64bit_floats ;
2018-06-10 19:22:49 +00:00
format_settings . json . quote_denormals = settings . output_format_json_quote_denormals ;
2022-09-08 16:07:20 +00:00
format_settings . json . quote_decimals = settings . output_format_json_quote_decimals ;
2022-03-29 17:37:31 +00:00
format_settings . json . read_bools_as_numbers = settings . input_format_json_read_bools_as_numbers ;
2022-09-08 16:07:20 +00:00
format_settings . json . read_numbers_as_strings = settings . input_format_json_read_numbers_as_strings ;
2022-12-08 18:58:18 +00:00
format_settings . json . read_objects_as_strings = settings . input_format_json_read_objects_as_strings ;
2022-07-13 15:57:55 +00:00
format_settings . json . try_infer_numbers_from_strings = settings . input_format_json_try_infer_numbers_from_strings ;
2022-09-20 13:49:17 +00:00
format_settings . json . validate_types_from_metadata = settings . input_format_json_validate_types_from_metadata ;
2022-09-01 19:00:24 +00:00
format_settings . json . validate_utf8 = settings . output_format_json_validate_utf8 ;
2022-09-22 16:48:54 +00:00
format_settings . json_object_each_row . column_for_object_name = settings . format_json_object_each_row_column_for_object_name ;
2022-12-21 21:21:30 +00:00
format_settings . json . allow_object_type = context - > getSettingsRef ( ) . allow_experimental_object_type ;
2020-11-02 07:50:38 +00:00
format_settings . null_as_default = settings . input_format_null_as_default ;
2021-08-16 08:03:23 +00:00
format_settings . decimal_trailing_zeros = settings . output_format_decimal_trailing_zeros ;
2020-11-02 07:50:38 +00:00
format_settings . parquet . row_group_size = settings . output_format_parquet_row_group_size ;
2023-02-23 16:14:10 +00:00
format_settings . parquet . output_version = settings . output_format_parquet_version ;
2021-07-01 17:59:28 +00:00
format_settings . parquet . import_nested = settings . input_format_parquet_import_nested ;
2022-03-21 07:47:37 +00:00
format_settings . parquet . case_insensitive_column_matching = settings . input_format_parquet_case_insensitive_column_matching ;
2021-12-02 08:14:25 +00:00
format_settings . parquet . allow_missing_columns = settings . input_format_parquet_allow_missing_columns ;
2022-03-24 12:54:12 +00:00
format_settings . parquet . skip_columns_with_unsupported_types_in_schema_inference = settings . input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference ;
2022-05-18 14:51:21 +00:00
format_settings . parquet . output_string_as_string = settings . output_format_parquet_string_as_string ;
2023-01-16 21:01:31 +00:00
format_settings . parquet . output_fixed_string_as_fixed_byte_array = settings . output_format_parquet_fixed_string_as_fixed_byte_array ;
2023-03-13 18:22:09 +00:00
format_settings . parquet . max_block_size = settings . input_format_parquet_max_block_size ;
2023-03-01 21:27:46 +00:00
format_settings . parquet . output_compression_method = settings . output_format_parquet_compression_method ;
2020-11-02 07:50:38 +00:00
format_settings . pretty . charset = settings . output_format_pretty_grid_charset . toString ( ) = = " ASCII " ? FormatSettings : : Pretty : : Charset : : ASCII : FormatSettings : : Pretty : : Charset : : UTF8 ;
format_settings . pretty . color = settings . output_format_pretty_color ;
2018-08-30 23:34:12 +00:00
format_settings . pretty . max_column_pad_width = settings . output_format_pretty_max_column_pad_width ;
2020-11-02 07:50:38 +00:00
format_settings . pretty . max_rows = settings . output_format_pretty_max_rows ;
2020-05-31 22:12:13 +00:00
format_settings . pretty . max_value_width = settings . output_format_pretty_max_value_width ;
2020-09-29 12:30:36 +00:00
format_settings . pretty . output_format_pretty_row_numbers = settings . output_format_pretty_row_numbers ;
2022-04-27 11:47:28 +00:00
format_settings . protobuf . input_flatten_google_wrappers = settings . input_format_protobuf_flatten_google_wrappers ;
format_settings . protobuf . output_nullables_with_google_wrappers = settings . output_format_protobuf_nullables_with_google_wrappers ;
2022-07-20 11:16:25 +00:00
format_settings . protobuf . skip_fields_with_unsupported_types_in_schema_inference = settings . input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference ;
2020-11-02 07:50:38 +00:00
format_settings . regexp . escaping_rule = settings . format_regexp_escaping_rule ;
format_settings . regexp . regexp = settings . format_regexp ;
format_settings . regexp . skip_unmatched = settings . format_regexp_skip_unmatched ;
format_settings . schema . format_schema = settings . format_schema ;
2021-04-10 23:33:54 +00:00
format_settings . schema . format_schema_path = context - > getFormatSchemaPath ( ) ;
format_settings . schema . is_server = context - > hasGlobalContext ( ) & & ( context - > getGlobalContext ( ) - > getApplicationType ( ) = = Context : : ApplicationType : : SERVER ) ;
2020-11-02 07:50:38 +00:00
format_settings . skip_unknown_fields = settings . input_format_skip_unknown_fields ;
2019-09-24 14:25:22 +00:00
format_settings . template_settings . resultset_format = settings . format_template_resultset ;
format_settings . template_settings . row_between_delimiter = settings . format_template_rows_between_delimiter ;
2020-11-02 07:50:38 +00:00
format_settings . template_settings . row_format = settings . format_template_row ;
2020-02-03 07:40:12 +00:00
format_settings . tsv . crlf_end_of_line = settings . output_format_tsv_crlf_end_of_line ;
2019-10-16 14:22:22 +00:00
format_settings . tsv . empty_as_default = settings . input_format_tsv_empty_as_default ;
2022-08-19 16:39:13 +00:00
format_settings . tsv . enum_as_number = settings . input_format_tsv_enum_as_number ;
2021-10-21 13:52:27 +00:00
format_settings . tsv . null_representation = settings . format_tsv_null_representation ;
2022-08-19 16:39:13 +00:00
format_settings . tsv . use_best_effort_in_schema_inference = settings . input_format_tsv_use_best_effort_in_schema_inference ;
2022-05-25 15:00:11 +00:00
format_settings . tsv . skip_first_lines = settings . input_format_tsv_skip_first_lines ;
2023-01-05 22:57:25 +00:00
format_settings . tsv . try_detect_header = settings . input_format_tsv_detect_header ;
2020-11-02 07:50:38 +00:00
format_settings . values . accurate_types_of_literals = settings . input_format_values_accurate_types_of_literals ;
format_settings . values . deduce_templates_of_expressions = settings . input_format_values_deduce_templates_of_expressions ;
format_settings . values . interpret_expressions = settings . input_format_values_interpret_expressions ;
format_settings . with_names_use_header = settings . input_format_with_names_use_header ;
2021-10-14 10:32:49 +00:00
format_settings . with_types_use_header = settings . input_format_with_types_use_header ;
2018-06-10 19:22:49 +00:00
format_settings . write_statistics = settings . output_format_write_statistics ;
2021-05-25 12:01:28 +00:00
format_settings . arrow . low_cardinality_as_dictionary = settings . output_format_arrow_low_cardinality_as_dictionary ;
2021-07-01 17:59:28 +00:00
format_settings . arrow . import_nested = settings . input_format_arrow_import_nested ;
2021-12-02 08:14:25 +00:00
format_settings . arrow . allow_missing_columns = settings . input_format_arrow_allow_missing_columns ;
2022-03-24 12:54:12 +00:00
format_settings . arrow . skip_columns_with_unsupported_types_in_schema_inference = settings . input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference ;
2022-05-18 14:51:21 +00:00
format_settings . arrow . skip_columns_with_unsupported_types_in_schema_inference = settings . input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference ;
format_settings . arrow . case_insensitive_column_matching = settings . input_format_arrow_case_insensitive_column_matching ;
format_settings . arrow . output_string_as_string = settings . output_format_arrow_string_as_string ;
2023-01-16 21:01:31 +00:00
format_settings . arrow . output_fixed_string_as_fixed_byte_array = settings . output_format_arrow_fixed_string_as_fixed_byte_array ;
2023-03-01 21:27:46 +00:00
format_settings . arrow . output_compression_method = settings . output_format_arrow_compression_method ;
2021-07-01 17:59:28 +00:00
format_settings . orc . import_nested = settings . input_format_orc_import_nested ;
2021-12-02 08:14:25 +00:00
format_settings . orc . allow_missing_columns = settings . input_format_orc_allow_missing_columns ;
2021-12-18 09:25:25 +00:00
format_settings . orc . row_batch_size = settings . input_format_orc_row_batch_size ;
2022-03-24 12:54:12 +00:00
format_settings . orc . skip_columns_with_unsupported_types_in_schema_inference = settings . input_format_orc_skip_columns_with_unsupported_types_in_schema_inference ;
2021-07-01 17:59:28 +00:00
format_settings . orc . import_nested = settings . input_format_orc_import_nested ;
2021-12-02 08:14:25 +00:00
format_settings . orc . allow_missing_columns = settings . input_format_orc_allow_missing_columns ;
2021-12-18 09:25:25 +00:00
format_settings . orc . row_batch_size = settings . input_format_orc_row_batch_size ;
2022-03-25 11:05:40 +00:00
format_settings . orc . skip_columns_with_unsupported_types_in_schema_inference = settings . input_format_orc_skip_columns_with_unsupported_types_in_schema_inference ;
2022-03-21 07:47:37 +00:00
format_settings . orc . case_insensitive_column_matching = settings . input_format_orc_case_insensitive_column_matching ;
2022-05-18 14:51:21 +00:00
format_settings . orc . output_string_as_string = settings . output_format_orc_string_as_string ;
2023-03-01 21:27:46 +00:00
format_settings . orc . output_compression_method = settings . output_format_orc_compression_method ;
2021-10-14 10:32:49 +00:00
format_settings . defaults_for_omitted_fields = settings . input_format_defaults_for_omitted_fields ;
2021-09-28 12:59:22 +00:00
format_settings . capn_proto . enum_comparing_mode = settings . format_capn_proto_enum_comparising_mode ;
2022-07-20 11:16:25 +00:00
format_settings . capn_proto . skip_fields_with_unsupported_types_in_schema_inference = settings . input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference ;
2021-10-31 19:53:24 +00:00
format_settings . seekable_read = settings . input_format_allow_seeks ;
2021-12-15 11:30:57 +00:00
format_settings . msgpack . number_of_columns = settings . input_format_msgpack_number_of_columns ;
2022-01-27 16:54:15 +00:00
format_settings . msgpack . output_uuid_representation = settings . output_format_msgpack_uuid_representation ;
2021-12-15 11:30:57 +00:00
format_settings . max_rows_to_read_for_schema_inference = settings . input_format_max_rows_to_read_for_schema_inference ;
2022-03-24 12:54:12 +00:00
format_settings . column_names_for_schema_inference = settings . column_names_for_schema_inference ;
2022-08-16 09:41:32 +00:00
format_settings . schema_inference_hints = settings . schema_inference_hints ;
2022-12-07 21:19:27 +00:00
format_settings . schema_inference_make_columns_nullable = settings . schema_inference_make_columns_nullable ;
2022-04-26 10:42:56 +00:00
format_settings . mysql_dump . table_name = settings . input_format_mysql_dump_table_name ;
format_settings . mysql_dump . map_column_names = settings . input_format_mysql_dump_map_column_names ;
2022-06-27 18:31:57 +00:00
format_settings . sql_insert . max_batch_size = settings . output_format_sql_insert_max_batch_size ;
format_settings . sql_insert . include_column_names = settings . output_format_sql_insert_include_column_names ;
format_settings . sql_insert . table_name = settings . output_format_sql_insert_table_name ;
format_settings . sql_insert . use_replace = settings . output_format_sql_insert_use_replace ;
2022-06-30 16:14:30 +00:00
format_settings . sql_insert . quote_names = settings . output_format_sql_insert_quote_names ;
2022-07-13 15:57:55 +00:00
format_settings . try_infer_integers = settings . input_format_try_infer_integers ;
format_settings . try_infer_dates = settings . input_format_try_infer_dates ;
format_settings . try_infer_datetimes = settings . input_format_try_infer_datetimes ;
2022-11-10 20:15:14 +00:00
format_settings . bson . output_string_as_string = settings . output_format_bson_string_as_string ;
format_settings . bson . skip_fields_with_unsupported_types_in_schema_inference = settings . input_format_bson_skip_fields_with_unsupported_types_in_schema_inference ;
2022-12-02 12:57:11 +00:00
format_settings . max_binary_string_size = settings . format_binary_max_string_size ;
2023-03-13 22:49:28 +00:00
format_settings . max_binary_array_size = settings . format_binary_max_array_size ;
2023-02-27 19:28:19 +00:00
format_settings . native . allow_types_conversion = settings . input_format_native_allow_types_conversion ;
2022-12-12 22:00:45 +00:00
format_settings . max_parser_depth = context - > getSettingsRef ( ) . max_parser_depth ;
2023-02-10 10:47:06 +00:00
format_settings . client_protocol_version = context - > getClientProtocolVersion ( ) ;
2020-02-02 00:53:11 +00:00
/// Validate avro_schema_registry_url with RemoteHostFilter when non-empty and in Server context
2020-11-02 07:50:38 +00:00
if ( format_settings . schema . is_server )
2020-02-02 00:53:11 +00:00
{
const Poco : : URI & avro_schema_registry_url = settings . format_avro_schema_registry_url ;
if ( ! avro_schema_registry_url . empty ( ) )
2021-04-10 23:33:54 +00:00
context - > getRemoteHostFilter ( ) . checkURL ( avro_schema_registry_url ) ;
2020-02-02 00:53:11 +00:00
}
2018-06-10 19:22:49 +00:00
2019-02-19 18:41:18 +00:00
return format_settings ;
2018-06-10 19:22:49 +00:00
}
2021-06-01 12:20:52 +00:00
/// Explicit instantiations of the two-argument getFormatSettings() for the settings types used with it.
template FormatSettings getFormatSettings<FormatFactorySettings>(ContextPtr, const FormatFactorySettings &);
template FormatSettings getFormatSettings<Settings>(ContextPtr, const Settings &);
2019-02-19 18:41:18 +00:00
2020-05-18 10:00:22 +00:00
/// Creates an input format that reads from the given ReadBuffer.
/// Delegates to getInputImpl() with no buffer factory, no compression, no I/O scheduler
/// and a single download thread; `buf` is handed over as a raw pointer.
InputFormatPtr FormatFactory::getInput(
    const String & name,
    ReadBuffer & buf,
    const Block & sample,
    ContextPtr context,
    UInt64 max_block_size,
    const std::optional<FormatSettings> & format_settings,
    std::optional<size_t> max_parsing_threads) const
{
    constexpr bool is_remote_fs = false;
    constexpr size_t max_download_threads = 1;

    return getInputImpl(
        name,
        /* buf_factory */ nullptr,
        &buf,
        sample,
        context,
        max_block_size,
        is_remote_fs,
        CompressionMethod::None,
        /* io_schedule */ nullptr,
        format_settings,
        max_download_threads,
        max_parsing_threads);
}
/// Creates an input format that reads through a SeekableReadBufferFactory instead of a single buffer.
/// Delegates to getInputImpl() with a null ReadBuffer pointer; all other arguments are passed through.
/// Both by-value sink arguments (`buf_factory`, `io_schedule`) are moved into the callee
/// rather than copied.
InputFormatPtr FormatFactory::getInputMultistream(
    const String & name,
    SeekableReadBufferFactoryPtr buf_factory,
    const Block & sample,
    ContextPtr context,
    UInt64 max_block_size,
    bool is_remote_fs,
    CompressionMethod compression,
    ThreadPoolCallbackRunner<void> io_schedule,
    const std::optional<FormatSettings> & format_settings,
    std::optional<size_t> max_download_threads,
    std::optional<size_t> max_parsing_threads) const
{
    return getInputImpl(
        name,
        std::move(buf_factory),
        /* _buf */ nullptr,
        sample,
        context,
        max_block_size,
        is_remote_fs,
        compression,
        std::move(io_schedule),
        format_settings,
        max_download_threads,
        max_parsing_threads);
}
2020-11-07 08:53:39 +00:00
2023-03-20 07:55:44 +00:00
/// Shared implementation behind getInput() and getInputMultistream().
///
/// Exactly one of `buf_factory` and `_buf` must be non-null (checked with chassert).
/// The function:
///   1. resolves the format's creators and the effective FormatSettings;
///   2. prepares a read buffer (optionally a ParallelReadBuffer and/or a decompressing wrapper);
///   3. returns, in order of preference, a ParallelParsingInputFormat, a multistream format
///      built from the factory, or the basic single-buffer format.
///
/// Omitted `_format_settings` / thread counts fall back to values from `context`'s settings.
///
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_INPUT if the format has no input creator;
/// getCreators() throws UNKNOWN_FORMAT for an unregistered name.
InputFormatPtr FormatFactory::getInputImpl(
    const String & name,
    // exactly one of the following two is nullptr
    SeekableReadBufferFactoryPtr buf_factory,
    ReadBuffer * _buf,
    const Block & sample,
    ContextPtr context,
    UInt64 max_block_size,
    bool is_remote_fs,
    CompressionMethod compression,
    ThreadPoolCallbackRunner<void> io_schedule,
    const std::optional<FormatSettings> & _format_settings,
    std::optional<size_t> _max_download_threads,
    std::optional<size_t> _max_parsing_threads) const
{
    chassert((!_buf) != (!buf_factory));

    const auto & creators = getCreators(name);
    if (!creators.input_creator)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT, "Format {} is not suitable for input", name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    const Settings & settings = context->getSettingsRef();
    const auto & file_segmentation_engine = creators.file_segmentation_engine;
    size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_threads);
    size_t max_download_threads = _max_download_threads.value_or(settings.max_download_threads);
    std::unique_ptr<ReadBuffer> owned_buf;

    if (context->hasQueryContext() && settings.log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    // Prepare a read buffer.

    /// Formats with a multistream creator read from the factory themselves, so a parallel read buffer
    /// is only built for them when the data has to be decompressed first.
    bool parallel_read = max_download_threads > 1 && buf_factory && format_settings.seekable_read
        && (!creators.multistream_input_creator || compression != CompressionMethod::None);

    if (parallel_read)
        owned_buf = std::make_unique<ParallelReadBuffer>(
            std::move(buf_factory),
            std::move(io_schedule),
            max_download_threads,
            settings.max_download_buffer_size);

    if (compression != CompressionMethod::None)
    {
        /// `buf_factory` is moved from when the parallel read buffer was created above,
        /// so at this point either that buffer or the factory must be available.
        chassert(owned_buf || buf_factory);
        if (!owned_buf)
            owned_buf = buf_factory->getReader();
        owned_buf = wrapReadBufferWithCompressionMethod(std::move(owned_buf), compression, static_cast<int>(settings.zstd_window_log_max));
    }

    if (!creators.multistream_input_creator && buf_factory && !owned_buf)
        owned_buf = buf_factory->getReader();

    /// May be null: only when a factory was given, the format has a multistream creator
    /// and no wrapping buffer was needed.
    auto * buf = owned_buf ? owned_buf.get() : _buf;

    // Return parallel parser if needed.

    /// Parallel parsing reads from `*buf`, so it is only possible when a buffer exists.
    bool parallel_parsing = buf && max_parsing_threads > 1 && settings.input_format_parallel_parsing && file_segmentation_engine;

    /// Do not use parallel parsing when its chunk buffers alone could exceed a memory limit.
    if (settings.max_memory_usage && settings.min_chunk_bytes_for_parallel_parsing * max_parsing_threads * 2 > settings.max_memory_usage)
        parallel_parsing = false;
    if (settings.max_memory_usage_for_user && settings.min_chunk_bytes_for_parallel_parsing * max_parsing_threads * 2 > settings.max_memory_usage_for_user)
        parallel_parsing = false;

    if (parallel_parsing)
    {
        const auto & non_trivial_prefix_and_suffix_checker = creators.non_trivial_prefix_and_suffix_checker;
        /// Disable parallel parsing for input formats with non-trivial readPrefix() and readSuffix().
        if (non_trivial_prefix_and_suffix_checker && non_trivial_prefix_and_suffix_checker(*buf))
            parallel_parsing = false;
    }

    RowInputFormatParams row_input_format_params;
    row_input_format_params.max_block_size = max_block_size;
    row_input_format_params.allow_errors_num = format_settings.input_allow_errors_num;
    row_input_format_params.allow_errors_ratio = format_settings.input_allow_errors_ratio;
    row_input_format_params.max_execution_time = settings.max_execution_time;
    row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;

    if (parallel_parsing)
    {
        const auto & input_getter = creators.input_creator;

        /// Const reference is copied to lambda.
        auto parser_creator = [input_getter, sample, row_input_format_params, format_settings]
            (ReadBuffer & input) -> InputFormatPtr
            { return input_getter(input, sample, row_input_format_params, format_settings); };

        ParallelParsingInputFormat::Params params{
            *buf, sample, parser_creator, file_segmentation_engine, name, max_parsing_threads,
            settings.min_chunk_bytes_for_parallel_parsing, max_block_size, context->getApplicationType() == Context::ApplicationType::SERVER};

        auto format = std::make_shared<ParallelParsingInputFormat>(params);
        /// The format takes over the buffer created here.
        if (owned_buf)
            format->addBuffer(std::move(owned_buf));
        if (!settings.input_format_record_errors_file_path.toString().empty())
            format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context));

        return format;
    }

    // Return multistream parser if needed.
    if (creators.multistream_input_creator && !buf)
    {
        auto format = creators.multistream_input_creator(
            std::move(buf_factory),
            sample,
            format_settings,
            context->getReadSettings(),
            is_remote_fs,
            io_schedule,
            max_download_threads,
            max_parsing_threads);

        if (!settings.input_format_record_errors_file_path.toString().empty())
            format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context));

        return format;
    }

    // TODO: What about the case `creators.multistream_input_creator && buf`? Currently it'll be parsed in one thread. Lame. Need to change the interface to allow making it parallel.
    //       Maybe the read parallelization should be taken out of the format and into a new interface RangeSetReader (or whatever), which would support both a single buffer and a factory. The logic from asArrowFile() would move there. Check if this works for ORC and Arrow formats.

    // Basic parser from one ReadBuffer.

    auto format = creators.input_creator(*buf, sample, row_input_format_params, format_settings);

    if (owned_buf)
        format->addBuffer(std::move(owned_buf));
    if (!settings.input_format_record_errors_file_path.toString().empty())
        format->setErrorsLogger(std::make_shared<ParallelInputFormatErrorsLogger>(context));

    /// It's a kludge. Because I cannot remove context from values format.
    /// (This is not needed in the parallel_parsing and multistream cases above because VALUES format doesn't support them.)
    if (auto * values = typeid_cast<ValuesBlockInputFormat *>(format.get()))
        values->setContext(context);

    return format;
}
2021-12-22 22:14:23 +00:00
/// Seeds `format` with the read progress already recorded on the context's process list element
/// and sets the format's start time from that element. Does nothing if the context has no such element.
static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context)
{
    auto process_list_element = context->getProcessListElement();
    if (!process_list_element)
        return;

    /// Progress may already have been made while the query was being prepared
    /// (for example by scalar subqueries), so report it to the format now.
    auto progress_in = process_list_element->getProgressIn();
    Progress already_read{progress_in.read_rows, progress_in.read_bytes, progress_in.total_rows_to_read};
    format->onProgress(already_read);

    /// Statistics should be counted from the start of the query, not from the creation of the format object.
    format->setStartTime(process_list_element->getQueryCPUStartTime(), true);
}
2020-12-30 03:07:30 +00:00
/// Creates an output format `name` writing to `buf`.
/// When `output_format_parallel_formatting` is enabled, the format is marked as supporting parallel formatting
/// and `output_format_json_array_of_rows` is off, the result is a ParallelFormattingOutputFormat that builds
/// the underlying format through the registered output creator. Otherwise falls back to getOutputFormat().
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output creator is registered for `name`.
OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible(
    const String & name,
    WriteBuffer & buf,
    const Block & sample,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & creators = getCreators(name);
    const auto & output_getter = creators.output_creator;
    if (!output_getter)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name);

    auto format_settings = _format_settings.has_value() ? *_format_settings : getFormatSettings(context);

    const Settings & settings = context->getSettingsRef();

    const bool use_parallel_formatting = settings.output_format_parallel_formatting
        && creators.supports_parallel_formatting
        && !settings.output_format_json_array_of_rows;

    if (!use_parallel_formatting)
        return getOutputFormat(name, buf, sample, context, _format_settings);

    /// The creator, header and settings are captured by value, so the callback does not refer to locals of this function.
    auto make_formatter = [output_getter, sample, format_settings](WriteBuffer & output) -> OutputFormatPtr
    {
        return output_getter(output, sample, format_settings);
    };

    ParallelFormattingOutputFormat::Params params{buf, sample, make_formatter, settings.max_threads};

    if (context->hasQueryContext() && settings.log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    auto parallel_format = std::make_shared<ParallelFormattingOutputFormat>(params);
    addExistingProgressToOutputFormat(parallel_format, context);
    return parallel_format;
}
2019-08-20 11:17:57 +00:00
/// Creates an output format `name` writing to `buf` with header `sample`.
/// Uses `_format_settings` when provided, otherwise derives the settings from `context`.
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output creator is registered for `name`.
OutputFormatPtr FormatFactory::getOutputFormat(
    const String & name,
    WriteBuffer & buf,
    const Block & sample,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & create_output = getCreators(name).output_creator;
    if (!create_output)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name);

    /// Record the used format in the query context when query logging is on.
    if (context->hasQueryContext() && context->getSettingsRef().log_queries)
        context->getQueryContext()->addQueryFactoriesInfo(Context::QueryLogFactories::Format, name);

    auto effective_settings = _format_settings.has_value() ? *_format_settings : getFormatSettings(context);

    /** TODO: Materialization is needed, because formats can use the functions `IDataType`,
      *  which only work with full columns.
      */
    auto result = create_output(buf, sample, effective_settings);

    /// Streaming mode needs auto-flush. Currently this is required by the INSERT WATCH query.
    if (effective_settings.enable_streaming)
        result->setAutoFlush();

    /// A kludge: the MySQL format still needs the context, so it is injected after construction.
    if (auto * mysql_format = typeid_cast<MySQLOutputFormat *>(result.get()))
        mysql_format->setContext(context);

    addExistingProgressToOutputFormat(result, context);

    return result;
}
2021-12-03 11:42:46 +00:00
/// Returns the content type reported by the output format `name`.
/// The format object is built over an empty block and a throwaway in-memory buffer
/// only in order to call getContentType() on it.
/// Throws FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT if no output creator is registered for `name`.
String FormatFactory::getContentType(
    const String & name,
    ContextPtr context,
    const std::optional<FormatSettings> & _format_settings) const
{
    const auto & create_output = getCreators(name).output_creator;
    if (!create_output)
        throw Exception(ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT, "Format {} is not suitable for output", name);

    auto effective_settings = _format_settings.has_value() ? *_format_settings : getFormatSettings(context);

    Block header_without_columns;
    WriteBufferFromOwnString scratch_buffer;
    auto probe_format = create_output(scratch_buffer, header_without_columns, effective_settings);

    return probe_format->getContentType();
}
2021-12-15 11:30:57 +00:00
/// Creates a schema reader for format `name` reading from `buf`.
/// Uses `_format_settings` when provided, otherwise derives the settings from `context`.
/// The context is handed to the reader only if the reader reports that it needs one.
/// Throws UNKNOWN_FORMAT (via getCreators) for an unregistered name and LOGICAL_ERROR
/// if the format has no schema reader creator.
SchemaReaderPtr FormatFactory::getSchemaReader(
    const String & name,
    ReadBuffer & buf,
    ContextPtr & context,
    const std::optional<FormatSettings> & _format_settings) const
{
    /// Look the format up through getCreators() rather than dict.at(): an unregistered name then produces
    /// the regular UNKNOWN_FORMAT exception instead of whatever the container's at() throws
    /// (std::out_of_range for the standard maps).
    const auto & schema_reader_creator = getCreators(name).schema_reader_creator;
    if (!schema_reader_creator)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    auto schema_reader = schema_reader_creator(buf, format_settings);
    if (schema_reader->needContext())
        schema_reader->setContext(context);
    return schema_reader;
}
/// Creates an external schema reader for format `name`.
/// Uses `_format_settings` when provided, otherwise derives the settings from `context`.
/// Throws UNKNOWN_FORMAT (via getCreators) for an unregistered name and LOGICAL_ERROR
/// if the format has no external schema reader creator.
ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader(
    const String & name,
    ContextPtr & context,
    const std::optional<FormatSettings> & _format_settings) const
{
    /// Look the format up through getCreators() rather than dict.at(): an unregistered name then produces
    /// the regular UNKNOWN_FORMAT exception instead of whatever the container's at() throws
    /// (std::out_of_range for the standard maps).
    const auto & external_schema_reader_creator = getCreators(name).external_schema_reader_creator;
    if (!external_schema_reader_creator)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name);

    auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context);
    return external_schema_reader_creator(format_settings);
}
2021-12-03 11:42:46 +00:00
2023-03-13 19:29:59 +00:00
void FormatFactory : : registerInputFormat ( const String & name , InputCreator input_creator , MultistreamInputCreator multistream_input_creator )
2018-06-10 19:22:49 +00:00
{
2023-03-13 19:29:59 +00:00
chassert ( input_creator ) ;
auto & creators = dict [ name ] ;
if ( creators . input_creator )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Input format {} is already registered " , name ) ;
2023-03-13 19:29:59 +00:00
creators . input_creator = std : : move ( input_creator ) ;
creators . multistream_input_creator = std : : move ( multistream_input_creator ) ;
2022-01-12 15:28:13 +00:00
registerFileExtension ( name , name ) ;
2022-11-14 18:28:19 +00:00
KnownFormatNames : : instance ( ) . add ( name ) ;
2018-06-10 19:22:49 +00:00
}
2021-06-23 13:17:34 +00:00
void FormatFactory : : registerNonTrivialPrefixAndSuffixChecker ( const String & name , NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker )
{
auto & target = dict [ name ] . non_trivial_prefix_and_suffix_checker ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Non trivial prefix and suffix checker {} is already registered " , name ) ;
2021-06-23 13:17:34 +00:00
target = std : : move ( non_trivial_prefix_and_suffix_checker ) ;
}
2022-01-14 15:16:18 +00:00
void FormatFactory : : registerAppendSupportChecker ( const String & name , AppendSupportChecker append_support_checker )
2021-12-29 18:03:15 +00:00
{
2022-01-14 15:16:18 +00:00
auto & target = dict [ name ] . append_support_checker ;
2021-12-29 18:03:15 +00:00
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Suffix checker {} is already registered " , name ) ;
2022-01-14 15:16:18 +00:00
target = std : : move ( append_support_checker ) ;
2021-12-29 18:03:15 +00:00
}
2022-01-24 13:27:04 +00:00
void FormatFactory : : markFormatHasNoAppendSupport ( const String & name )
2021-12-29 18:03:15 +00:00
{
2022-01-14 16:17:06 +00:00
registerAppendSupportChecker ( name , [ ] ( const FormatSettings & ) { return false ; } ) ;
2021-12-29 18:03:15 +00:00
}
2022-01-14 15:16:18 +00:00
bool FormatFactory : : checkIfFormatSupportAppend ( const String & name , ContextPtr context , const std : : optional < FormatSettings > & format_settings_ )
2021-12-29 18:03:15 +00:00
{
auto format_settings = format_settings_ ? * format_settings_ : getFormatSettings ( context ) ;
2022-01-14 15:16:18 +00:00
auto & append_support_checker = dict [ name ] . append_support_checker ;
/// By default we consider that format supports append
return ! append_support_checker | | append_support_checker ( format_settings ) ;
2021-12-29 18:03:15 +00:00
}
2021-10-11 16:11:50 +00:00
void FormatFactory : : registerOutputFormat ( const String & name , OutputCreator output_creator )
2019-02-19 18:41:18 +00:00
{
2021-10-11 16:11:50 +00:00
auto & target = dict [ name ] . output_creator ;
2019-02-19 18:41:18 +00:00
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Output format {} is already registered " , name ) ;
2019-02-19 18:41:18 +00:00
target = std : : move ( output_creator ) ;
2022-01-12 15:28:13 +00:00
registerFileExtension ( name , name ) ;
2022-11-14 18:28:19 +00:00
KnownFormatNames : : instance ( ) . add ( name ) ;
2018-06-10 19:22:49 +00:00
}
2022-01-07 05:16:41 +00:00
void FormatFactory : : registerFileExtension ( const String & extension , const String & format_name )
{
2022-01-12 15:28:13 +00:00
file_extension_formats [ boost : : to_lower_copy ( extension ) ] = format_name ;
2022-01-07 05:16:41 +00:00
}
2022-01-13 13:14:18 +00:00
/// Determines the format name from the extension of `file_name` using the registered extensions.
/// The name "stdin" is delegated to getFormatFromFileDescriptor(STDIN_FILENO).
/// If a compression method is detected for the name, the last extension is dropped before the lookup.
/// When nothing is found, throws BAD_ARGUMENTS if `throw_if_not_found` is set and returns an empty string otherwise.
String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found)
{
    if (file_name == "stdin")
        return getFormatFromFileDescriptor(STDIN_FILENO);

    /// Shared handling for "no extension at all" and "extension is not registered".
    auto not_found = [throw_if_not_found]() -> String
    {
        if (throw_if_not_found)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension");
        return "";
    };

    if (chooseCompressionMethod(file_name, "") != CompressionMethod::None)
    {
        auto compression_dot = file_name.find_last_of('.');
        if (compression_dot != String::npos)
            file_name.erase(compression_dot);
    }

    auto extension_dot = file_name.find_last_of('.');
    if (extension_dot == String::npos)
        return not_found();

    /// Extensions are registered in lower case, so look up the lowercased one.
    String file_extension = file_name.substr(extension_dot + 1);
    boost::algorithm::to_lower(file_extension);

    auto found = file_extension_formats.find(file_extension);
    if (found != file_extension_formats.end())
        return found->second;

    return not_found();
}
2022-01-24 18:41:44 +00:00
/// Tries to determine the format from the path of the file behind descriptor `fd`.
/// Returns an empty string if the path cannot be obtained, if no format matches it,
/// or on platforms other than Linux and Darwin.
String FormatFactory::getFormatFromFileDescriptor(int fd)
{
#ifdef OS_LINUX
    const std::string fd_link = fmt::format("/proc/self/fd/{}", fd);
    char target_path[PATH_MAX] = {'\0'};
    /// readlink() does not append a terminator; the buffer is zero-initialized and one byte is held back,
    /// so the result is always NUL-terminated.
    const auto link_length = readlink(fd_link.c_str(), target_path, sizeof(target_path) - 1);
    if (link_length == -1)
        return "";
    return getFormatFromFileName(target_path, false);
#elif defined(OS_DARWIN)
    char target_path[PATH_MAX] = {'\0'};
    const auto rc = fcntl(fd, F_GETPATH, target_path);
    if (rc == -1)
        return "";
    return getFormatFromFileName(target_path, false);
#else
    (void)fd;
    return "";
#endif
}
2019-10-01 10:48:46 +00:00
void FormatFactory : : registerFileSegmentationEngine ( const String & name , FileSegmentationEngine file_segmentation_engine )
{
auto & target = dict [ name ] . file_segmentation_engine ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: File segmentation engine {} is already registered " , name ) ;
2019-12-25 19:17:41 +00:00
target = std : : move ( file_segmentation_engine ) ;
2019-10-01 10:48:46 +00:00
}
2018-06-10 19:22:49 +00:00
2021-12-15 11:30:57 +00:00
void FormatFactory : : registerSchemaReader ( const String & name , SchemaReaderCreator schema_reader_creator )
{
auto & target = dict [ name ] . schema_reader_creator ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Schema reader {} is already registered " , name ) ;
2021-12-15 11:30:57 +00:00
target = std : : move ( schema_reader_creator ) ;
}
void FormatFactory : : registerExternalSchemaReader ( const String & name , ExternalSchemaReaderCreator external_schema_reader_creator )
{
auto & target = dict [ name ] . external_schema_reader_creator ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Schema reader {} is already registered " , name ) ;
2021-12-15 11:30:57 +00:00
target = std : : move ( external_schema_reader_creator ) ;
}
2020-10-06 14:02:01 +00:00
void FormatFactory : : markOutputFormatSupportsParallelFormatting ( const String & name )
{
auto & target = dict [ name ] . supports_parallel_formatting ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Output format {} is already marked as supporting parallel formatting " , name ) ;
2020-10-06 14:02:01 +00:00
target = true ;
}
2022-05-13 18:39:19 +00:00
void FormatFactory : : markFormatSupportsSubsetOfColumns ( const String & name )
2021-03-30 21:25:37 +00:00
{
2022-05-20 14:57:27 +00:00
auto & target = dict [ name ] . supports_subset_of_columns ;
2021-03-30 21:25:37 +00:00
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Format {} is already marked as supporting subset of columns " , name ) ;
2021-04-01 00:08:02 +00:00
target = true ;
2021-03-30 21:25:37 +00:00
}
2022-11-23 15:36:12 +00:00
void FormatFactory : : markFormatSupportsSubcolumns ( const String & name )
{
auto & target = dict [ name ] . supports_subcolumns ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: Format {} is already marked as supporting subcolumns " , name ) ;
2022-11-23 15:36:12 +00:00
target = true ;
}
bool FormatFactory : : checkIfFormatSupportsSubcolumns ( const String & name ) const
{
const auto & target = getCreators ( name ) ;
return target . supports_subcolumns ;
}
2021-03-30 21:25:37 +00:00
2022-05-20 14:57:27 +00:00
bool FormatFactory : : checkIfFormatSupportsSubsetOfColumns ( const String & name ) const
2021-03-30 21:25:37 +00:00
{
const auto & target = getCreators ( name ) ;
2022-05-20 14:57:27 +00:00
return target . supports_subset_of_columns ;
2021-03-30 21:25:37 +00:00
}
2022-06-27 12:43:24 +00:00
void FormatFactory : : registerAdditionalInfoForSchemaCacheGetter (
const String & name , AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter )
{
auto & target = dict [ name ] . additional_info_for_schema_cache_getter ;
if ( target )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " FormatFactory: additional info for schema cache getter {} is already registered " , name ) ;
2022-06-27 12:43:24 +00:00
target = std : : move ( additional_info_for_schema_cache_getter ) ;
}
/// Returns the additional info for the schema cache produced by the getter registered for format `name`,
/// or an empty string if the format has no such getter.
/// Format settings are taken from `format_settings_` when provided, otherwise derived from `context`.
/// Throws UNKNOWN_FORMAT (via getCreators) if the name is not registered.
String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_)
{
    const auto & info_getter = getCreators(name).additional_info_for_schema_cache_getter;
    if (info_getter)
    {
        auto effective_settings = format_settings_.has_value() ? *format_settings_ : getFormatSettings(context);
        return info_getter(effective_settings);
    }

    return "";
}
2021-09-16 17:18:34 +00:00
bool FormatFactory : : isInputFormat ( const String & name ) const
{
auto it = dict . find ( name ) ;
2021-10-11 16:11:50 +00:00
return it ! = dict . end ( ) & & it - > second . input_creator ;
2021-09-16 17:18:34 +00:00
}
bool FormatFactory : : isOutputFormat ( const String & name ) const
{
auto it = dict . find ( name ) ;
2021-10-11 16:11:50 +00:00
return it ! = dict . end ( ) & & it - > second . output_creator ;
2021-09-16 17:18:34 +00:00
}
2022-05-20 14:57:27 +00:00
bool FormatFactory : : checkIfFormatHasSchemaReader ( const String & name ) const
2021-12-15 11:30:57 +00:00
{
const auto & target = getCreators ( name ) ;
return bool ( target . schema_reader_creator ) ;
}
2022-05-20 14:57:27 +00:00
bool FormatFactory : : checkIfFormatHasExternalSchemaReader ( const String & name ) const
2021-12-15 11:30:57 +00:00
{
const auto & target = getCreators ( name ) ;
return bool ( target . external_schema_reader_creator ) ;
}
2022-05-20 14:57:27 +00:00
bool FormatFactory : : checkIfFormatHasAnySchemaReader ( const String & name ) const
2021-12-15 11:30:57 +00:00
{
return checkIfFormatHasSchemaReader ( name ) | | checkIfFormatHasExternalSchemaReader ( name ) ;
}
2022-05-23 12:48:48 +00:00
void FormatFactory : : checkFormatName ( const String & name ) const
{
auto it = dict . find ( name ) ;
if ( it = = dict . end ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : UNKNOWN_FORMAT , " Unknown format {} " , name ) ;
2022-05-23 12:48:48 +00:00
}
2019-08-22 03:24:05 +00:00
/// Returns the factory object, kept as a function-local static.
FormatFactory & FormatFactory::instance()
{
    static FormatFactory factory;
    return factory;
}
2018-06-10 19:22:49 +00:00
}