2018-05-24 01:02:16 +00:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <Processors/ISource.h>
|
2021-08-27 03:00:12 +00:00
|
|
|
|
#include <IO/ReadBuffer.h>
|
2022-04-26 10:42:56 +00:00
|
|
|
|
#include <Interpreters/Context.h>
|
|
|
|
|
#include <Formats/ColumnMapping.h>
|
2021-03-01 19:58:55 +00:00
|
|
|
|
|
2018-05-24 01:02:16 +00:00
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
2021-03-01 19:58:55 +00:00
|
|
|
|
|
|
|
|
|
/// Shared-ownership pointer (std::shared_ptr) to a ColumnMapping.
using ColumnMappingPtr = std::shared_ptr<ColumnMapping>;
|
2018-05-24 01:02:16 +00:00
|
|
|
|
|
2022-08-23 04:17:36 +00:00
|
|
|
|
struct InputFormatErrorRow
|
|
|
|
|
{
|
|
|
|
|
String time;
|
|
|
|
|
size_t offset;
|
|
|
|
|
String reason;
|
|
|
|
|
String raw_data;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/// A sequence of error-row records.
using InputFormatErrorRows = std::vector<InputFormatErrorRow>;
|
|
|
|
|
|
2018-05-24 01:02:16 +00:00
|
|
|
|
/** Input format is a source that reads data from ReadBuffer.
|
|
|
|
|
*/
|
|
|
|
|
class IInputFormat : public ISource
|
|
|
|
|
{
|
2019-02-19 18:41:18 +00:00
|
|
|
|
protected:
|
2019-02-05 13:01:40 +00:00
|
|
|
|
|
|
|
|
|
/// Skip GCC warning: ‘maybe_unused’ attribute ignored
|
|
|
|
|
#pragma GCC diagnostic push
|
|
|
|
|
#pragma GCC diagnostic ignored "-Wattributes"
|
|
|
|
|
|
2021-08-27 03:00:12 +00:00
|
|
|
|
/// Buffer this format reads from; exposed read-only through getReadBuffer().
ReadBuffer * in [[maybe_unused]];
|
2018-05-24 01:02:16 +00:00
|
|
|
|
|
2019-02-05 13:01:40 +00:00
|
|
|
|
#pragma GCC diagnostic pop
|
|
|
|
|
|
2018-05-24 01:02:16 +00:00
|
|
|
|
public:
|
2019-11-26 23:46:19 +00:00
|
|
|
|
/// Constructs an input format from a header block and the buffer to read from.
/// NOTE(review): the definition is not in this header; presumably `header` is passed on to ISource
/// and the address of `in_` is stored in `in` - confirm against the implementation.
IInputFormat(Block header, ReadBuffer & in_);
|
|
|
|
|
|
|
|
|
|
/** In some use cases (hello Kafka) we need to read a lot of tiny streams in exactly the same format.
|
|
|
|
|
* The recreating of parser for each small stream takes too long, so we introduce a method
|
2019-12-26 15:59:38 +00:00
|
|
|
|
* resetParser() which allows resetting the state of the parser to continue reading of
|
2022-04-17 23:02:49 +00:00
|
|
|
|
* source stream without recreating that.
|
2019-11-26 23:46:19 +00:00
|
|
|
|
* That should be called after current buffer was fully read.
|
|
|
|
|
*/
|
|
|
|
|
virtual void resetParser();
|
2019-07-30 18:48:40 +00:00
|
|
|
|
|
2021-10-20 14:17:20 +00:00
|
|
|
|
/// Supplies a (new) buffer for this format to read from.
/// NOTE(review): defined outside this header; presumably rebinds `in` to `in_` - confirm.
virtual void setReadBuffer(ReadBuffer & in_);
|
2022-04-15 23:56:45 +00:00
|
|
|
|
/// Read-only access to the buffer that `in` currently points to.
const ReadBuffer & getReadBuffer() const
{
    return *in;
}
|
2021-10-20 14:17:20 +00:00
|
|
|
|
|
2019-07-30 18:48:40 +00:00
|
|
|
|
/// Default implementation: returns a reference to a single shared,
/// default-constructed BlockMissingValues object.
/// Declared virtual, so derived formats can override it.
virtual const BlockMissingValues & getMissingValues() const
{
    static const BlockMissingValues no_missing_values;
    return no_missing_values;
}
|
2020-02-07 13:16:51 +00:00
|
|
|
|
|
2021-03-01 19:58:55 +00:00
|
|
|
|
/// Must be called from ParallelParsingInputFormat after readSuffix
|
|
|
|
|
/// Returns the stored column mapping pointer (null if none has been set).
ColumnMappingPtr getColumnMapping() const
{
    return column_mapping;
}
|
|
|
|
|
/// Must be called from ParallelParsingInputFormat before readPrefix
|
2021-03-01 22:32:11 +00:00
|
|
|
|
/// Stores the given column mapping.
/// The pointer is taken by value and moved into the member, so no extra
/// shared_ptr copy (atomic ref-count increment/decrement) is made.
void setColumnMapping(ColumnMappingPtr column_mapping_)
{
    column_mapping = std::move(column_mapping_);
}
|
2021-03-01 19:58:55 +00:00
|
|
|
|
|
2020-02-07 13:16:51 +00:00
|
|
|
|
/// Returns the value last stored by setCurrentUnitNumber() (0 initially).
size_t getCurrentUnitNumber() const
{
    return current_unit_number;
}
|
|
|
|
|
/// Records the number of the unit currently being parsed.
void setCurrentUnitNumber(size_t current_unit_number_)
{
    current_unit_number = current_unit_number_;
}
|
|
|
|
|
|
2021-07-20 18:18:43 +00:00
|
|
|
|
/// Takes ownership of `buffer` by appending it to `owned_buffers`.
void addBuffer(std::unique_ptr<ReadBuffer> buffer)
{
    owned_buffers.push_back(std::move(buffer));
}
|
|
|
|
|
|
2022-08-23 04:17:36 +00:00
|
|
|
|
/// Appends one error row to `error_rows`.
/// A named rvalue-reference parameter is itself an lvalue, so it has to be
/// moved explicitly; otherwise the row (and its strings) would be copied.
void addErrorRow(InputFormatErrorRow && error_row)
{
    error_rows.emplace_back(std::move(error_row));
}
|
|
|
|
|
/// Mutable access to the error rows collected through addErrorRow().
InputFormatErrorRows & getErrorRows()
{
    return error_rows;
}
|
|
|
|
|
|
|
|
|
|
/// Appends a new batch to `multi_error_rows` and transfers the contents of
/// `source_error_rows` into it. The transfer is a swap with a freshly
/// constructed (empty) vector, so `source_error_rows` is empty afterwards.
void addErrorRows(InputFormatErrorRows & source_error_rows)
{
    multi_error_rows.emplace_back().swap(source_error_rows);
}
|
|
|
|
|
/// Read-only view of all batches added through addErrorRows().
const std::list<InputFormatErrorRows> & getMultiErrorRows()
{
    return multi_error_rows;
}
|
|
|
|
|
|
|
|
|
|
/// True when `error_rows` holds no entries.
bool isEmptyErrorRows()
{
    return error_rows.empty();
}
|
|
|
|
|
/// True when no batch has been added to `multi_error_rows`.
bool isEmptyMultiErrorRows()
{
    return multi_error_rows.empty();
}
|
|
|
|
|
|
2021-03-01 19:58:55 +00:00
|
|
|
|
protected:
|
|
|
|
|
/// Column mapping read via getColumnMapping() and written via setColumnMapping(); null until one is set.
ColumnMappingPtr column_mapping{};
|
|
|
|
|
|
2020-02-07 13:16:51 +00:00
|
|
|
|
private:
|
|
|
|
|
/// Number of currently parsed chunk (if parallel parsing is enabled)
|
|
|
|
|
/// Read via getCurrentUnitNumber(), written via setCurrentUnitNumber().
size_t current_unit_number = 0;
|
2021-07-20 18:18:43 +00:00
|
|
|
|
|
2022-08-23 04:17:36 +00:00
|
|
|
|
/// Error rows accumulated one at a time by addErrorRow().
InputFormatErrorRows error_rows;
|
|
|
|
|
/// Batches of error rows handed over through addErrorRows(); one list element per call.
std::list<InputFormatErrorRows> multi_error_rows;
|
|
|
|
|
|
2021-07-20 18:18:43 +00:00
|
|
|
|
/// Buffers owned by this format; filled by addBuffer().
std::vector<std::unique_ptr<ReadBuffer>> owned_buffers;
|
2018-05-24 01:02:16 +00:00
|
|
|
|
};
|
|
|
|
|
|
2021-08-26 00:31:46 +00:00
|
|
|
|
/// Shared-ownership pointer (std::shared_ptr) to an IInputFormat.
using InputFormatPtr = std::shared_ptr<IInputFormat>;
|
|
|
|
|
|
2018-05-24 01:02:16 +00:00
|
|
|
|
}
|