2018-06-10 19:22:49 +00:00
|
|
|
#pragma once
|
|
|
|
|
2020-02-03 10:02:52 +00:00
|
|
|
#include <Columns/IColumn.h>
|
2019-05-17 14:34:25 +00:00
|
|
|
#include <DataStreams/IBlockStream_fwd.h>
|
2020-11-02 07:50:38 +00:00
|
|
|
#include <Formats/FormatSettings.h>
|
2021-04-10 23:33:54 +00:00
|
|
|
#include <Interpreters/Context_fwd.h>
|
2019-10-01 10:48:46 +00:00
|
|
|
#include <IO/BufferWithOwnMemory.h>
|
2021-04-10 23:33:54 +00:00
|
|
|
#include <common/types.h>
|
|
|
|
|
|
|
|
#include <boost/noncopyable.hpp>
|
2019-05-17 14:34:25 +00:00
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
#include <functional>
|
2019-05-17 14:34:25 +00:00
|
|
|
#include <memory>
|
2018-06-10 19:22:49 +00:00
|
|
|
#include <unordered_map>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Forward declarations of the classes named by the declarations in this header.
class Block;
class ReadBuffer;
class WriteBuffer;
class IProcessor;
class IInputFormat;
class IOutputFormat;

/// Forward declarations of the settings / parameter structs.
struct Settings;
struct FormatFactorySettings;
struct RowInputFormatParams;
struct RowOutputFormatParams;

/// Shared-ownership handles to the forward-declared processor and format types.
using ProcessorPtr = std::shared_ptr<IProcessor>;
using InputFormatPtr = std::shared_ptr<IInputFormat>;
using OutputFormatPtr = std::shared_ptr<IOutputFormat>;
|
|
|
|
|
2021-06-01 12:20:52 +00:00
|
|
|
/// Returns the FormatSettings to use for the given context.
/// NOTE(review): presumably derived from the settings stored in the context — the definition is not in this header.
FormatSettings getFormatSettings(ContextPtr context);

/// Overload that additionally receives an explicit settings object.
/// NOTE(review): presumably T is Settings or FormatFactorySettings (both are forward-declared in this header) — confirm in the implementation file.
template <typename T>
FormatSettings getFormatSettings(ContextPtr context, const T & settings);
|
2018-06-10 19:22:49 +00:00
|
|
|
|
|
|
|
/** Allows to create an IBlockInputStream or IBlockOutputStream by the name of the format.
|
|
|
|
* Note: format and compression are independent things.
|
|
|
|
*/
|
2019-08-22 03:24:05 +00:00
|
|
|
class FormatFactory final : private boost::noncopyable
|
2018-06-10 19:22:49 +00:00
|
|
|
{
|
2019-05-23 11:15:18 +00:00
|
|
|
public:
|
2019-06-14 17:19:02 +00:00
|
|
|
/// This callback allows to perform some additional actions after reading a single row.
|
|
|
|
/// It's initial purpose was to extract payload for virtual columns from Kafka Consumer ReadBuffer.
|
2019-05-23 13:20:25 +00:00
|
|
|
using ReadCallback = std::function<void()>;
|
2019-05-23 11:15:18 +00:00
|
|
|
|
2019-10-01 10:48:46 +00:00
|
|
|
/** Fast reading data from buffer and save result to memory.
|
2019-11-18 19:25:17 +00:00
|
|
|
* Reads at least min_chunk_bytes and some more until the end of the chunk, depends on the format.
|
2019-10-25 12:28:24 +00:00
|
|
|
* Used in ParallelParsingBlockInputStream.
|
2019-10-01 10:48:46 +00:00
|
|
|
*/
|
2020-11-30 16:42:41 +00:00
|
|
|
using FileSegmentationEngine = std::function<std::pair<bool, size_t>(
|
2019-10-01 10:48:46 +00:00
|
|
|
ReadBuffer & buf,
|
|
|
|
DB::Memory<> & memory,
|
2019-11-18 19:25:17 +00:00
|
|
|
size_t min_chunk_bytes)>;
|
2019-10-01 10:48:46 +00:00
|
|
|
|
2019-08-20 11:17:57 +00:00
|
|
|
/// This callback allows to perform some additional actions after writing a single row.
|
|
|
|
/// It's initial purpose was to flush Kafka message for each row.
|
2020-02-03 10:02:52 +00:00
|
|
|
using WriteCallback = std::function<void(
|
|
|
|
const Columns & columns,
|
|
|
|
size_t row)>;
|
2019-08-20 11:17:57 +00:00
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
private:
|
|
|
|
using InputCreator = std::function<BlockInputStreamPtr(
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
2019-02-10 16:55:12 +00:00
|
|
|
UInt64 max_block_size,
|
2019-05-23 13:20:25 +00:00
|
|
|
ReadCallback callback,
|
2018-06-10 19:22:49 +00:00
|
|
|
const FormatSettings & settings)>;
|
|
|
|
|
|
|
|
using OutputCreator = std::function<BlockOutputStreamPtr(
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2019-08-20 11:17:57 +00:00
|
|
|
WriteCallback callback,
|
2018-06-10 19:22:49 +00:00
|
|
|
const FormatSettings & settings)>;
|
|
|
|
|
2020-05-18 10:00:22 +00:00
|
|
|
using InputProcessorCreatorFunc = InputFormatPtr(
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & header,
|
|
|
|
const RowInputFormatParams & params,
|
|
|
|
const FormatSettings & settings);
|
|
|
|
|
|
|
|
using InputProcessorCreator = std::function<InputProcessorCreatorFunc>;
|
2019-02-19 18:41:18 +00:00
|
|
|
|
|
|
|
using OutputProcessorCreator = std::function<OutputFormatPtr(
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2020-10-06 12:47:52 +00:00
|
|
|
const RowOutputFormatParams & params,
|
2019-02-19 18:41:18 +00:00
|
|
|
const FormatSettings & settings)>;
|
|
|
|
|
2021-06-23 13:17:34 +00:00
|
|
|
/// Some input formats can have non trivial readPrefix() and readSuffix(),
|
|
|
|
/// so in some cases there is no possibility to use parallel parsing.
|
|
|
|
/// The checker should return true if parallel parsing should be disabled.
|
|
|
|
using NonTrivialPrefixAndSuffixChecker = std::function<bool(ReadBuffer & buf)>;
|
|
|
|
|
2019-08-02 14:41:19 +00:00
|
|
|
struct Creators
|
|
|
|
{
|
2019-10-01 10:48:46 +00:00
|
|
|
InputCreator input_creator;
|
2019-08-02 14:41:19 +00:00
|
|
|
OutputCreator output_creator;
|
|
|
|
InputProcessorCreator input_processor_creator;
|
|
|
|
OutputProcessorCreator output_processor_creator;
|
2019-10-01 10:48:46 +00:00
|
|
|
FileSegmentationEngine file_segmentation_engine;
|
2020-10-06 14:02:01 +00:00
|
|
|
bool supports_parallel_formatting{false};
|
2021-03-30 21:25:37 +00:00
|
|
|
bool is_column_oriented{false};
|
2021-06-23 13:17:34 +00:00
|
|
|
NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker;
|
2019-08-02 14:41:19 +00:00
|
|
|
};
|
2018-06-10 19:22:49 +00:00
|
|
|
|
|
|
|
using FormatsDictionary = std::unordered_map<String, Creators>;
|
|
|
|
|
|
|
|
public:
|
2019-08-22 03:24:05 +00:00
|
|
|
static FormatFactory & instance();
|
|
|
|
|
2020-05-18 10:00:22 +00:00
|
|
|
InputFormatPtr getInput(
|
2019-05-23 11:15:18 +00:00
|
|
|
const String & name,
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2019-05-23 11:15:18 +00:00
|
|
|
UInt64 max_block_size,
|
2020-11-07 08:53:39 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2020-12-30 03:07:30 +00:00
|
|
|
/// Checks all preconditions. Returns ordinary stream if parallel formatting cannot be done.
|
2020-12-30 15:21:58 +00:00
|
|
|
/// Currently used only in Client. Don't use it something else! Better look at getOutputFormatParallelIfPossible.
|
2021-04-10 23:33:54 +00:00
|
|
|
BlockOutputStreamPtr getOutputStreamParallelIfPossible(
|
|
|
|
const String & name,
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2021-04-10 23:33:54 +00:00
|
|
|
WriteCallback callback = {},
|
2020-12-09 23:22:53 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
|
|
|
|
2020-12-30 15:21:58 +00:00
|
|
|
/// Currently used only in Client. Don't use it something else! Better look at getOutputFormat.
|
2021-04-10 23:33:54 +00:00
|
|
|
BlockOutputStreamPtr getOutputStream(
|
|
|
|
const String & name,
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2021-04-10 23:33:54 +00:00
|
|
|
WriteCallback callback = {},
|
2020-11-07 08:53:39 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2019-07-24 18:00:09 +00:00
|
|
|
InputFormatPtr getInputFormat(
|
|
|
|
const String & name,
|
|
|
|
ReadBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2019-07-24 18:00:09 +00:00
|
|
|
UInt64 max_block_size,
|
2020-11-07 08:53:39 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
2019-02-19 18:41:18 +00:00
|
|
|
|
2020-12-30 03:07:30 +00:00
|
|
|
/// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done.
|
|
|
|
OutputFormatPtr getOutputFormatParallelIfPossible(
|
2021-04-10 23:33:54 +00:00
|
|
|
const String & name,
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2021-04-10 23:33:54 +00:00
|
|
|
WriteCallback callback = {},
|
2020-12-30 03:07:30 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
|
|
|
|
2019-08-20 11:17:57 +00:00
|
|
|
OutputFormatPtr getOutputFormat(
|
2021-04-10 23:33:54 +00:00
|
|
|
const String & name,
|
|
|
|
WriteBuffer & buf,
|
|
|
|
const Block & sample,
|
2021-06-01 12:20:52 +00:00
|
|
|
ContextPtr context,
|
2021-04-10 23:33:54 +00:00
|
|
|
WriteCallback callback = {},
|
2020-11-07 08:53:39 +00:00
|
|
|
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
|
2019-02-19 18:41:18 +00:00
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
/// Register format by its name.
|
|
|
|
void registerInputFormat(const String & name, InputCreator input_creator);
|
|
|
|
void registerOutputFormat(const String & name, OutputCreator output_creator);
|
2019-10-01 10:48:46 +00:00
|
|
|
void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine);
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2021-06-23 13:17:34 +00:00
|
|
|
void registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker);
|
|
|
|
|
2019-02-19 18:41:18 +00:00
|
|
|
void registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator);
|
|
|
|
void registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator);
|
|
|
|
|
2020-10-06 14:02:01 +00:00
|
|
|
void markOutputFormatSupportsParallelFormatting(const String & name);
|
2021-03-30 21:25:37 +00:00
|
|
|
void markFormatAsColumnOriented(const String & name);
|
|
|
|
|
|
|
|
bool checkIfFormatIsColumnOriented(const String & name);
|
2020-10-06 14:02:01 +00:00
|
|
|
|
2019-08-02 14:41:19 +00:00
|
|
|
const FormatsDictionary & getAllFormats() const
|
2018-07-20 15:59:11 +00:00
|
|
|
{
|
2019-08-02 14:41:19 +00:00
|
|
|
return dict;
|
2018-07-20 10:00:56 +00:00
|
|
|
}
|
|
|
|
|
2018-06-10 19:22:49 +00:00
|
|
|
private:
|
2019-08-02 14:41:19 +00:00
|
|
|
FormatsDictionary dict;
|
2018-06-10 19:22:49 +00:00
|
|
|
|
2019-08-02 14:41:19 +00:00
|
|
|
const Creators & getCreators(const String & name) const;
|
2018-06-10 19:22:49 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|