ClickHouse/src/DataStreams/ParallelParsingBlockInputStream.h

176 lines
5.7 KiB
C++
Raw Normal View History

2019-10-01 10:51:17 +00:00
#pragma once
#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <Common/ThreadPool.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
namespace DB
{
class ReadBuffer;
/**
 * ORDER-PRESERVING parallel parsing of data formats.
 * It splits the original data into chunks. Each chunk is then parsed by a different thread.
 * The number of chunks is equal to the number of parser threads.
 * The size of a chunk is equal to the min_chunk_bytes_for_parallel_parsing setting.
 *
 * This stream has three kinds of threads: one segmentator, multiple parsers,
 * and one reader thread -- that is, the one from which readImpl() is called.
 * They operate one after another on parts of data called "processing units".
 * One unit consists of a buffer with raw data from the file, filled by the
 * segmentator thread. This raw data is then parsed by a parser thread to form
 * a number of Blocks. These Blocks are returned to the parent stream from
 * readImpl(). After being read out, a processing unit is reused, to save on
 * allocating memory for the raw buffer. The processing units are organized
 * into a circular array to facilitate reuse and to apply backpressure on the
 * segmentator thread -- after it runs out of processing units, it has to wait
 * for the reader to read out the previous blocks.
 *
 * The outline of what the threads do is as follows:
 * segmentator thread:
 *  1) wait for the next processing unit to become empty
 *  2) fill it with a part of the input file
 *  3) start a parser thread
 *  4) repeat until eof
 * parser thread:
 *  1) parse the given raw buffer without any synchronization
 *  2) signal that the given unit is ready to read
 *  3) finish
 * readImpl():
 *  1) wait for the next processing unit to become ready to read
 *  2) take the blocks from the processing unit to return them to the caller
 *  3) signal that the processing unit is empty
 *  4) repeat until it encounters a unit that is marked as "past_the_end"
 *
 * All threads must also check for cancel/eof/exception flags.
 */
class ParallelParsingBlockInputStream : public IBlockInputStream
{
private:
using ReadCallback = std::function<void()>;
using InputProcessorCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,
const RowInputFormatParams & params,
const FormatSettings & settings)>;
public:
struct InputCreatorParams
{
const Block & sample;
const RowInputFormatParams & row_input_format_params;
2019-10-01 10:51:17 +00:00
const FormatSettings &settings;
};
struct Params
2019-10-01 10:51:17 +00:00
{
2019-11-11 11:20:11 +00:00
ReadBuffer & read_buffer;
const InputProcessorCreator & input_processor_creator;
const InputCreatorParams & input_creator_params;
2019-10-01 10:51:17 +00:00
FormatFactory::FileSegmentationEngine file_segmentation_engine;
int max_threads;
2019-11-18 19:25:17 +00:00
size_t min_chunk_bytes;
2019-10-01 10:51:17 +00:00
};
explicit ParallelParsingBlockInputStream(const Params & params);
~ParallelParsingBlockInputStream() override;
2019-10-01 10:51:17 +00:00
String getName() const override { return "ParallelParsing"; }
Block getHeader() const override { return header; }
2019-10-01 10:51:17 +00:00
void cancel(bool kill) override;
2019-10-01 10:51:17 +00:00
protected:
// Reader routine
2019-10-01 10:51:17 +00:00
Block readImpl() override;
const BlockMissingValues & getMissingValues() const override
{
2019-10-22 18:01:44 +00:00
return last_block_missing_values;
2019-10-01 10:51:17 +00:00
}
private:
2019-10-24 14:00:51 +00:00
const Block header;
const RowInputFormatParams row_input_format_params;
const FormatSettings format_settings;
const InputProcessorCreator input_processor_creator;
2019-10-01 10:51:17 +00:00
2019-11-18 19:25:17 +00:00
const size_t min_chunk_bytes;
2019-10-01 10:51:17 +00:00
/*
* This is declared as atomic to avoid UB, because parser threads access it
* without synchronization.
*/
std::atomic<bool> finished{false};
2019-10-01 10:51:17 +00:00
2019-10-22 18:01:44 +00:00
BlockMissingValues last_block_missing_values;
2019-10-01 10:51:17 +00:00
// Original ReadBuffer to read from.
ReadBuffer & original_buffer;
//Non-atomic because it is used in one thread.
2019-11-14 15:53:20 +00:00
std::optional<size_t> next_block_in_current_unit;
2019-10-01 10:51:17 +00:00
size_t segmentator_ticket_number{0};
2019-11-14 15:53:20 +00:00
size_t reader_ticket_number{0};
2019-10-01 10:51:17 +00:00
std::mutex mutex;
std::condition_variable reader_condvar;
std::condition_variable segmentator_condvar;
// There are multiple "parsers", that's why we use thread pool.
ThreadPool pool;
// Reading and segmentating the file
ThreadFromGlobalPool segmentator_thread;
// Function to segment the file. Then "parsers" will parse that segments.
FormatFactory::FileSegmentationEngine file_segmentation_engine;
enum ProcessingUnitStatus
{
READY_TO_INSERT,
READY_TO_PARSE,
READY_TO_READ
};
2019-10-22 18:01:44 +00:00
struct BlockExt
2019-10-01 10:51:17 +00:00
{
2019-10-23 13:15:03 +00:00
std::vector<Block> block;
std::vector<BlockMissingValues> block_missing_values;
2019-10-01 10:51:17 +00:00
};
2019-11-11 11:20:11 +00:00
struct ProcessingUnit
{
explicit ProcessingUnit()
: status(ProcessingUnitStatus::READY_TO_INSERT)
2019-11-11 11:20:11 +00:00
{
}
2019-10-23 10:39:33 +00:00
2019-11-11 11:20:11 +00:00
BlockExt block_ext;
2019-11-18 13:10:14 +00:00
Memory<> segment;
2019-11-11 11:20:11 +00:00
std::atomic<ProcessingUnitStatus> status;
bool is_last{false};
2019-11-11 11:20:11 +00:00
};
2019-10-01 10:51:17 +00:00
2019-11-14 15:53:20 +00:00
std::exception_ptr background_exception = nullptr;
// We use deque instead of vector, because it does not require a move
// constructor, which is absent for atomics that are inside ProcessingUnit.
2019-11-11 11:20:11 +00:00
std::deque<ProcessingUnit> processing_units;
2019-10-01 10:51:17 +00:00
2019-11-14 15:53:20 +00:00
void scheduleParserThreadForUnitWithNumber(size_t ticket_number);
void finishAndWait();
2019-10-02 14:26:15 +00:00
void segmentatorThreadFunction(ThreadGroupStatusPtr thread_group);
void parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number);
2019-11-14 15:53:20 +00:00
// Save/log a background exception, set termination flag, wake up all
// threads. This function is used by segmentator and parsed threads.
// readImpl() is called from the main thread, so the exception handling
// is different.
void onBackgroundException();
2019-10-01 10:51:17 +00:00
};
2019-12-05 14:34:35 +00:00
}