// ClickHouse/src/Processors/Formats/IInputFormat.h

#pragma once
#include <Processors/ISource.h>
#include <memory>
namespace DB
{
2021-03-01 19:58:55 +00:00
/// Used to pass info from header between different InputFormats in ParallelParsing
struct ColumnMapping
{
2021-03-02 13:31:19 +00:00
/// Non-atomic because there is strict `happens-before` between read and write access
/// See InputFormatParallelParsing
2021-06-10 13:57:19 +00:00
bool is_set{false};
2021-03-01 19:58:55 +00:00
/// Maps indexes of columns in the input file to indexes of table columns
using OptionalIndexes = std::vector<std::optional<size_t>>;
OptionalIndexes column_indexes_for_input_fields;
/// Tracks which columns we have read in a single read() call.
/// For columns that are never read, it is initialized to false when we
/// read the file header, and never changed afterwards.
/// For other columns, it is updated on each read() call.
std::vector<UInt8> read_columns;
2021-06-10 13:57:19 +00:00
/// Whether we have any columns that are not read from file at all,
/// and must be always initialized with defaults.
bool have_always_default_columns{false};
2021-03-01 19:58:55 +00:00
};
using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;
class ReadBuffer;
/** Input format is a source, that reads data from ReadBuffer.
*/
class IInputFormat : public ISource
{
2019-02-19 18:41:18 +00:00
protected:
2019-02-05 13:01:40 +00:00
/// Skip GCC warning: maybe_unused attribute ignored
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
ReadBuffer & in [[maybe_unused]];
2019-02-05 13:01:40 +00:00
#pragma GCC diagnostic pop
public:
IInputFormat(Block header, ReadBuffer & in_);
/** In some usecase (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
* The recreating of parser for each small stream takes too long, so we introduce a method
2019-12-26 15:59:38 +00:00
* resetParser() which allow to reset the state of parser to continue reading of
* source stream w/o recreating that.
* That should be called after current buffer was fully read.
*/
virtual void resetParser();
2019-07-30 18:48:40 +00:00
virtual const BlockMissingValues & getMissingValues() const
{
static const BlockMissingValues none;
return none;
}
2021-03-01 19:58:55 +00:00
/// Must be called from ParallelParsingInputFormat after readSuffix
ColumnMappingPtr getColumnMapping() const { return column_mapping; }
/// Must be called from ParallelParsingInputFormat before readPrefix
2021-03-01 22:32:11 +00:00
void setColumnMapping(ColumnMappingPtr column_mapping_) { column_mapping = column_mapping_; }
2021-03-01 19:58:55 +00:00
size_t getCurrentUnitNumber() const { return current_unit_number; }
void setCurrentUnitNumber(size_t current_unit_number_) { current_unit_number = current_unit_number_; }
2021-07-20 18:18:43 +00:00
void addBuffer(std::unique_ptr<ReadBuffer> buffer) { owned_buffers.emplace_back(std::move(buffer)); }
void setReadBuffer(ReadBuffer & in_);
2021-07-20 18:18:43 +00:00
2021-03-01 19:58:55 +00:00
protected:
ColumnMappingPtr column_mapping{};
private:
/// Number of currently parsed chunk (if parallel parsing is enabled)
size_t current_unit_number = 0;
2021-07-20 18:18:43 +00:00
std::vector<std::unique_ptr<ReadBuffer>> owned_buffers;
};
using InputFormatPtr = std::shared_ptr<IInputFormat>;
}