ClickHouse/src/Formats/FormatFactory.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

239 lines
9.2 KiB
C++
Raw Normal View History

#pragma once
#include <Common/Allocator.h>
2020-02-03 10:02:52 +00:00
#include <Columns/IColumn.h>
#include <Formats/FormatSettings.h>
#include <Interpreters/Context_fwd.h>
#include <IO/BufferWithOwnMemory.h>
2022-01-07 05:16:41 +00:00
#include <IO/CompressionMethod.h>
2021-10-02 07:13:14 +00:00
#include <base/types.h>
#include <Core/NamesAndTypes.h>
#include <boost/noncopyable.hpp>
#include <functional>
#include <memory>
#include <unordered_map>
namespace DB
{
/// Forward declarations. This header only mentions these types in function signatures
/// (by reference) and inside smart-pointer aliases, so complete definitions are not needed here.
class Block;
struct Settings;
struct FormatFactorySettings;
class ReadBuffer;
class WriteBuffer;
2019-03-26 18:28:37 +00:00
class IProcessor;
/// Shared-ownership handle to a processor.
using ProcessorPtr = std::shared_ptr<IProcessor>;
2019-02-19 18:41:18 +00:00
class IInputFormat;
class IOutputFormat;
struct RowInputFormatParams;
struct RowOutputFormatParams;
2019-02-19 18:41:18 +00:00
class ISchemaReader;
class IExternalSchemaReader;
/// Shared-ownership handles to the two kinds of schema readers.
using SchemaReaderPtr = std::shared_ptr<ISchemaReader>;
using ExternalSchemaReaderPtr = std::shared_ptr<IExternalSchemaReader>;
2019-02-19 18:41:18 +00:00
/// Shared-ownership handles to input and output formats; these are what the factory returns.
using InputFormatPtr = std::shared_ptr<IInputFormat>;
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
/// Memory buffer template parameterized by an allocator; used below as DB::Memory<Allocator<false>>.
template <typename Allocator>
struct Memory;
2021-06-01 12:20:52 +00:00
/// Returns the FormatSettings to use for the given context.
/// NOTE(review): presumably derived from the settings stored in `context` - confirm in the definition.
FormatSettings getFormatSettings(ContextPtr context);
/// Overload that additionally takes an explicit settings object of arbitrary type `T`.
/// NOTE(review): presumably `settings` is used instead of the settings held by `context` - confirm.
template <typename T>
2021-06-01 12:20:52 +00:00
FormatSettings getFormatSettings(ContextPtr context, const T & settings);
2021-10-19 09:58:10 +00:00
/** Allows to create an IInputFormat or IOutputFormat by the name of the format.
* Formats are registered under a name through the register*() / mark*() methods and
* are later looked up by that name through the get*() / checkIf*() / is*() methods.
* The factory is non-copyable and is obtained through instance().
* Note: format and compression are independent things.
*/
class FormatFactory final : private boost::noncopyable
{
public:
2019-06-14 17:19:02 +00:00
/// This callback allows to perform some additional actions after reading a single row.
/// Its initial purpose was to extract payload for virtual columns from Kafka Consumer ReadBuffer.
using ReadCallback = std::function<void()>;
2019-10-01 10:48:46 +00:00
/** Quickly reads data from the buffer and saves the result to memory.
* Reads at least `min_bytes` and some more until the end of the chunk, depending on the format.
* If `max_rows` is non-zero the function also stops after reading the `max_rows` number of rows
* (even if the `min_bytes` boundary isn't reached yet).
2022-05-09 19:13:02 +00:00
* Used in ParallelParsingInputFormat.
* NOTE(review): the meaning of the returned pair is not documented here; presumably
* {whether more data remains, number of rows read} - confirm against the engine implementations.
2019-10-01 10:48:46 +00:00
*/
2020-11-30 16:42:41 +00:00
using FileSegmentationEngine = std::function<std::pair<bool, size_t>(
2019-10-01 10:48:46 +00:00
ReadBuffer & buf,
DB::Memory<Allocator<false>> & memory,
size_t min_bytes,
size_t max_rows)>;
2019-10-01 10:48:46 +00:00
/// This callback allows to perform some additional actions after writing a single row.
/// Its initial purpose was to flush Kafka message for each row.
/// It is passed `columns` and `row`; presumably the columns being written and the index
/// of the row that has just been written - TODO confirm.
2020-02-03 10:02:52 +00:00
using WriteCallback = std::function<void(
const Columns & columns,
size_t row)>;
2021-10-20 12:47:20 +00:00
private:
2021-11-10 17:29:52 +00:00
/// Factory function of an input format: builds an IInputFormat from the buffer to read from,
/// a header block, row input parameters and format settings.
using InputCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,
const RowInputFormatParams & params,
const FormatSettings & settings)>;
2019-02-19 18:41:18 +00:00
2021-10-11 16:11:50 +00:00
/// Factory function of an output format: builds an IOutputFormat from the buffer to write to,
/// a sample block, row output parameters and format settings.
using OutputCreator = std::function<OutputFormatPtr(
2019-02-19 18:41:18 +00:00
WriteBuffer & buf,
const Block & sample,
const RowOutputFormatParams & params,
2019-02-19 18:41:18 +00:00
const FormatSettings & settings)>;
/// Some input formats can have non trivial readPrefix() and readSuffix(),
/// so in some cases there is no possibility to use parallel parsing.
/// The checker should return true if parallel parsing should be disabled.
using NonTrivialPrefixAndSuffixChecker = std::function<bool(ReadBuffer & buf)>;
2022-01-14 15:16:18 +00:00
/// Some formats can support append depending on settings.
/// The checker should return true if the format supports append.
using AppendSupportChecker = std::function<bool(const FormatSettings & settings)>;
/// Factory function of a schema reader that is given the buffer to read from.
using SchemaReaderCreator = std::function<SchemaReaderPtr(ReadBuffer & in, const FormatSettings & settings)>;
/// Factory function of an external schema reader; it is given only the format settings (no read buffer).
using ExternalSchemaReaderCreator = std::function<ExternalSchemaReaderPtr(const FormatSettings & settings)>;
/// Some formats can extract different schemas from the same source depending on
/// some settings. To process this case in schema cache we should add some additional
/// information to a cache key. This getter should return some string with information
/// about such settings. For example, for Protobuf format it's the path to the schema
/// and the name of the message.
using AdditionalInfoForSchemaCacheGetter = std::function<String(const FormatSettings & settings)>;
2019-08-02 14:41:19 +00:00
/// Everything that can be registered for a single format name.
/// The std::function members are empty and the flags are false by default;
/// presumably an empty member means that the corresponding feature was not registered.
struct Creators
{
2019-10-01 10:48:46 +00:00
InputCreator input_creator;
2019-08-02 14:41:19 +00:00
OutputCreator output_creator;
2019-10-01 10:48:46 +00:00
FileSegmentationEngine file_segmentation_engine;
SchemaReaderCreator schema_reader_creator;
ExternalSchemaReaderCreator external_schema_reader_creator;
2020-10-06 14:02:01 +00:00
bool supports_parallel_formatting{false};
2022-05-20 14:57:27 +00:00
bool supports_subset_of_columns{false};
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
2022-01-14 15:16:18 +00:00
AppendSupportChecker append_support_checker;
AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter;
2019-08-02 14:41:19 +00:00
};
/// Maps a String key (the format name used by the register*/get* methods) to its Creators.
using FormatsDictionary = std::unordered_map<String, Creators>;
2022-01-07 05:16:41 +00:00
/// String-to-String map; presumably file extension -> format name, as filled by registerFileExtension().
using FileExtensionFormats = std::unordered_map<String, String>;
public:
/// Returns a reference to the factory.
/// NOTE(review): presumably a single process-wide instance - confirm in the definition.
static FormatFactory & instance();
2020-05-18 10:00:22 +00:00
/// Creates an input format named `name` that reads from `buf`.
/// If `format_settings` is not provided, the settings are presumably taken from `context`.
/// NOTE(review): the difference from getInputFormat() is not visible in this header;
/// presumably this variant may use parallel parsing - confirm in FormatFactory.cpp.
InputFormatPtr getInput(
const String & name,
ReadBuffer & buf,
const Block & sample,
2021-06-01 12:20:52 +00:00
ContextPtr context,
UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
/// Creates an input format named `name` that reads from `buf`; same parameters as getInput().
InputFormatPtr getInputFormat(
const String & name,
ReadBuffer & buf,
const Block & sample,
2021-06-01 12:20:52 +00:00
ContextPtr context,
UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
2019-02-19 18:41:18 +00:00
2020-12-30 03:07:30 +00:00
/// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done.
OutputFormatPtr getOutputFormatParallelIfPossible(
const String & name,
WriteBuffer & buf,
const Block & sample,
2021-06-01 12:20:52 +00:00
ContextPtr context,
WriteCallback callback = {},
2020-12-30 03:07:30 +00:00
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
/// Creates an output format named `name` that writes to `buf`.
/// `callback` is optional (empty by default); see WriteCallback.
OutputFormatPtr getOutputFormat(
const String & name,
WriteBuffer & buf,
const Block & sample,
2021-06-01 12:20:52 +00:00
ContextPtr context,
WriteCallback callback = {},
const std::optional<FormatSettings> & _format_settings = std::nullopt) const;
/// Returns a content type string for the format named `name`.
/// NOTE(review): presumably the MIME type reported by the output format - confirm.
String getContentType(
const String & name,
ContextPtr context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
2019-02-19 18:41:18 +00:00
/// Creates a schema reader for the format named `name` that reads from `buf`.
/// NOTE(review): `context` is taken by non-const reference here, unlike the other getters
/// which take ContextPtr by value - looks unintentional, confirm before relying on it.
SchemaReaderPtr getSchemaReader(
const String & name,
ReadBuffer & buf,
ContextPtr & context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
/// Creates an external schema reader (one that is not given a read buffer) for the format named `name`.
/// NOTE(review): same non-const `ContextPtr &` parameter as getSchemaReader().
ExternalSchemaReaderPtr getExternalSchemaReader(
const String & name,
ContextPtr & context,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
2019-10-01 10:48:46 +00:00
/// Register optional per-format helpers under the format name; see the corresponding type aliases.
void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine);
void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker);
2022-01-14 15:16:18 +00:00
void registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker);
2022-01-14 15:16:18 +00:00
/// If a format never supports append, you can use this method instead of
/// registerAppendSupportChecker with an append_support_checker that always returns false.
2022-01-24 13:27:04 +00:00
void markFormatHasNoAppendSupport(const String & name);
2022-01-14 15:16:18 +00:00
/// Tells whether the format named `name` supports append.
/// NOTE(review): presumably consults the registered AppendSupportChecker with the effective settings - confirm.
bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_ = std::nullopt);
2021-10-11 16:11:50 +00:00
/// Register format by its name.
void registerInputFormat(const String & name, InputCreator input_creator);
void registerOutputFormat(const String & name, OutputCreator output_creator);
2019-02-19 18:41:18 +00:00
2022-01-07 05:16:41 +00:00
/// Register file extension for format
void registerFileExtension(const String & extension, const String & format_name);
/// Determine the format name from a file name or from an open file descriptor.
/// NOTE(review): presumably based on the registered file extensions; what is returned when
/// nothing matches and `throw_if_not_found` is false is not visible here - confirm.
String getFormatFromFileName(String file_name, bool throw_if_not_found = false);
String getFormatFromFileDescriptor(int fd);
2022-01-07 05:16:41 +00:00
/// Register schema readers for format by its name.
void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator);
void registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator);
2020-10-06 14:02:01 +00:00
/// Mark the format as having the named capability
/// (presumably sets the matching flag in Creators - confirm).
void markOutputFormatSupportsParallelFormatting(const String & name);
2022-05-13 18:39:19 +00:00
void markFormatSupportsSubsetOfColumns(const String & name);
2021-03-30 21:25:37 +00:00
2022-05-20 14:57:27 +00:00
/// Capability queries by format name.
bool checkIfFormatSupportsSubsetOfColumns(const String & name) const;
2020-10-06 14:02:01 +00:00
2022-05-20 14:57:27 +00:00
bool checkIfFormatHasSchemaReader(const String & name) const;
bool checkIfFormatHasExternalSchemaReader(const String & name) const;
bool checkIfFormatHasAnySchemaReader(const String & name) const;
/// Register / query the extra schema-cache key information; see AdditionalInfoForSchemaCacheGetter.
void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter);
String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional<FormatSettings> & format_settings_ = std::nullopt);
2019-08-02 14:41:19 +00:00
/// Read-only access to the whole dictionary of registered formats.
const FormatsDictionary & getAllFormats() const
2018-07-20 15:59:11 +00:00
{
2019-08-02 14:41:19 +00:00
return dict;
}
2021-09-16 17:18:34 +00:00
/// Tell whether `name` can be used as an input / output format.
/// NOTE(review): presumably true when the corresponding creator is registered - confirm.
bool isInputFormat(const String & name) const;
bool isOutputFormat(const String & name) const;
2022-05-23 12:48:48 +00:00
/// Check that format with specified name exists and throw an exception otherwise.
void checkFormatName(const String & name) const;
private:
2019-08-02 14:41:19 +00:00
/// All registered formats; exposed read-only through getAllFormats().
FormatsDictionary dict;
2022-01-07 05:16:41 +00:00
/// Presumably filled by registerFileExtension() and read by getFormatFromFileName() - confirm.
FileExtensionFormats file_extension_formats;
2019-08-02 14:41:19 +00:00
/// Returns the Creators registered under `name`.
/// NOTE(review): behavior for an unknown name is not visible here; presumably throws - confirm.
const Creators & getCreators(const String & name) const;
};
}