ClickHouse/dbms/src/DataStreams/ParallelParsingBlockInputStream.h

#pragma once

#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <Common/ThreadPool.h>
#include <Common/setThreadName.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadBuffer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <Interpreters/Context.h>

namespace DB
{

/**
 * ORDER-PRESERVING parallel parsing of data formats.
 * It splits original data into chunks. Then each chunk is parsed by different thread.
 * The number of chunks equals to max_threads_for_parallel_parsing setting.
 * The size of chunk is equal to min_chunk_size_for_parallel_parsing setting.
 *
 * This stream has three kinds of threads: one segmentator, multiple parsers
 * (max_threads_for_parallel_parsing) and one reader thread -- that is, the one
 * from which readImpl() is called.
 * They operate one after another on parts of data called "processing units".
 * One unit consists of buffer with raw data from file, filled by segmentator
 * thread. This raw data is then parsed by a parser thread to form a number of
 * Blocks. These Blocks are returned to the parent stream from readImpl().
 * After being read out, a processing unit is reused, to save on allocating
 * memory for the raw buffer. The processing units are organized into a circular
 * array to facilitate reuse and to apply backpressure on the segmentator thread
 * -- after it runs out of processing units, it has to wait for the reader to
 * read out the previous blocks.
 * The outline of what the threads do is as follows:
 * segmentator thread:
 *  1) wait for the next processing unit to become empty
 *  2) fill it with a part of input file
 *  3) start a parser thread
 *  4) repeat until eof
 * parser thread:
 *  1) parse the given raw buffer without any synchronization
 *  2) signal that the given unit is ready to read
 *  3) finish
 * readImpl():
 *  1) wait for the next processing unit to become ready to read
 *  2) take the blocks from the processing unit to return them to the caller
 *  3) signal that the processing unit is empty
 *  4) repeat until it encounters unit that is marked as "past_the_end"
 * All threads must also check for cancel/eof/exception flags.
 */
class ParallelParsingBlockInputStream : public IBlockInputStream
{
private:
    using ReadCallback = std::function<void()>;

    using InputProcessorCreator = std::function<InputFormatPtr(
            ReadBuffer & buf,
            const Block & header,
            const Context & context,
            const RowInputFormatParams & params,
            const FormatSettings & settings)>;
public:
    struct InputCreatorParams
    {
        const Block &sample;
        const Context &context;
        const RowInputFormatParams& row_input_format_params;
        const FormatSettings &settings;
    };

    struct Builder
    {
        ReadBuffer & read_buffer;
        const InputProcessorCreator &input_processor_creator;
        const InputCreatorParams &input_creator_params;
        FormatFactory::FileSegmentationEngine file_segmentation_engine;
        size_t max_threads_to_use;
        size_t min_chunk_size;
    };

    explicit ParallelParsingBlockInputStream(const Builder & builder)
            : header(builder.input_creator_params.sample),
              context(builder.input_creator_params.context),
              row_input_format_params(builder.input_creator_params.row_input_format_params),
              format_settings(builder.input_creator_params.settings),
              input_processor_creator(builder.input_processor_creator),
              min_chunk_size(builder.min_chunk_size),
              original_buffer(builder.read_buffer),
              pool(builder.max_threads_to_use),
              file_segmentation_engine(builder.file_segmentation_engine)
    {
        // Allocate more units than threads to decrease segmentator
        // waiting on reader on wraparound. The number is random.
        processing_units.resize(builder.max_threads_to_use + 4);

        segmentator_thread = ThreadFromGlobalPool([this] { segmentatorThreadFunction(); });
    }

    String getName() const override { return "ParallelParsing"; }

    ~ParallelParsingBlockInputStream() override
    {
        finishAndWait();
    }

    void cancel(bool kill) override
    {
        /**
          * Can be called multiple times, from different threads. Saturate the
          * the kill flag with OR.
          */
        if (kill)
            is_killed = true;
        is_cancelled = true;

        /*
         * The format parsers themselves are not being cancelled here, so we'll
         * have to wait until they process the current block. Given that the
         * chunk size is on the order of megabytes, this should't be too long.
         * We can't call IInputFormat->cancel here, because the parser object is
         * local to the parser thread, and we don't want to introduce any
         * synchronization between parser threads and the other threads to get
         * better performance. An ideal solution would be to add a callback to
         * IInputFormat that checks whether it was cancelled.
         */

        finishAndWait();
    }

    Block getHeader() const override
    {
        return header;
    }

protected:
    //Reader routine
    Block readImpl() override;

    const BlockMissingValues & getMissingValues() const override
    {
        return last_block_missing_values;
    }

private:
    const Block header;
    const Context context;
    const RowInputFormatParams row_input_format_params;
    const FormatSettings format_settings;
    const InputProcessorCreator input_processor_creator;

    const size_t min_chunk_size;

    /*
     * This is declared as atomic to avoid UB, because parser threads access it
     * without synchronization.
     */
    std::atomic<bool> finished{false};

    BlockMissingValues last_block_missing_values;

    // Original ReadBuffer to read from.
    ReadBuffer & original_buffer;

    //Non-atomic because it is used in one thread.
    std::optional<size_t> next_block_in_current_unit;
    size_t segmentator_ticket_number{0};
    size_t reader_ticket_number{0};

    std::mutex mutex;
    std::condition_variable reader_condvar;
    std::condition_variable segmentator_condvar;

    // There are multiple "parsers", that's why we use thread pool.
    ThreadPool pool;
    // Reading and segmentating the file
    ThreadFromGlobalPool segmentator_thread;

    // Function to segment the file. Then "parsers" will parse that segments.
    FormatFactory::FileSegmentationEngine file_segmentation_engine;

    enum ProcessingUnitStatus
    {
        READY_TO_INSERT,
        READY_TO_PARSE,
        READY_TO_READ
    };

    struct BlockExt
    {
        std::vector<Block> block;
        std::vector<BlockMissingValues> block_missing_values;
    };

    struct ProcessingUnit
    {
        explicit ProcessingUnit()
            : status(ProcessingUnitStatus::READY_TO_INSERT)
        {
        }

        BlockExt block_ext;
        Memory<> segment;
        std::atomic<ProcessingUnitStatus> status;
        bool is_last{false};
    };

    std::exception_ptr background_exception = nullptr;

    // We use deque instead of vector, because it does not require a move
    // constructor, which is absent for atomics that are inside ProcessingUnit.
    std::deque<ProcessingUnit> processing_units;


    void scheduleParserThreadForUnitWithNumber(size_t unit_number)
    {
        pool.scheduleOrThrowOnError(std::bind(&ParallelParsingBlockInputStream::parserThreadFunction, this, unit_number));
    }

    void finishAndWait()
    {
        finished = true;

        {
            std::unique_lock lock(mutex);
            segmentator_condvar.notify_all();
            reader_condvar.notify_all();
        }

        if (segmentator_thread.joinable())
            segmentator_thread.join();

        try
        {
            pool.wait();
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }

    void segmentatorThreadFunction();
    void parserThreadFunction(size_t bucket_num);

    // Save/log a background exception, set termination flag, wake up all
    // threads. This function is used by segmentator and parsed threads.
    // readImpl() is called from the main thread, so the exception handling
    // is different.
    void onBackgroundException();
};

};
lost files 2019-10-01 10:51:17 +00:00			`#pragma once`

			`#include <DataStreams/IBlockInputStream.h>`
			`#include <Formats/FormatFactory.h>`
			`#include <Common/ThreadPool.h>`
			`#include <Common/setThreadName.h>`
			`#include <IO/BufferWithOwnMemory.h>`
			`#include <IO/ReadBuffer.h>`
			`#include <Processors/Formats/IRowInputFormat.h>`
			`#include <Processors/Formats/InputStreamFromInputFormat.h>`
better 2019-10-24 14:00:51 +00:00			`#include <Interpreters/Context.h>`
lost files 2019-10-01 10:51:17 +00:00
			`namespace DB`
			`{`

comments 2019-10-24 16:52:55 +00:00			`/**`
			`* ORDER-PRESERVING parallel parsing of data formats.`
			`* It splits original data into chunks. Then each chunk is parsed by different thread.`
docs 2019-10-25 14:32:57 +00:00			`* The number of chunks equals to max_threads_for_parallel_parsing setting.`
			`* The size of chunk is equal to min_chunk_size_for_parallel_parsing setting.`
Cleaup 2019-11-14 15:53:20 +00:00			`*`
			`* This stream has three kinds of threads: one segmentator, multiple parsers`
			`* (max_threads_for_parallel_parsing) and one reader thread -- that is, the one`
			`* from which readImpl() is called.`
			`* They operate one after another on parts of data called "processing units".`
			`* One unit consists of buffer with raw data from file, filled by segmentator`
			`* thread. This raw data is then parsed by a parser thread to form a number of`
			`* Blocks. These Blocks are returned to the parent stream from readImpl().`
			`* After being read out, a processing unit is reused, to save on allocating`
			`* memory for the raw buffer. The processing units are organized into a circular`
			`* array to facilitate reuse and to apply backpressure on the segmentator thread`
			`* -- after it runs out of processing units, it has to wait for the reader to`
			`* read out the previous blocks.`
			`* The outline of what the threads do is as follows:`
			`* segmentator thread:`
			`* 1) wait for the next processing unit to become empty`
			`* 2) fill it with a part of input file`
			`* 3) start a parser thread`
			`* 4) repeat until eof`
			`* parser thread:`
			`* 1) parse the given raw buffer without any synchronization`
			`* 2) signal that the given unit is ready to read`
			`* 3) finish`
			`* readImpl():`
			`* 1) wait for the next processing unit to become ready to read`
			`* 2) take the blocks from the processing unit to return them to the caller`
			`* 3) signal that the processing unit is empty`
			`* 4) repeat until it encounters unit that is marked as "past_the_end"`
			`* All threads must also check for cancel/eof/exception flags.`
comments 2019-10-24 16:52:55 +00:00			`*/`
lost files 2019-10-01 10:51:17 +00:00			`class ParallelParsingBlockInputStream : public IBlockInputStream`
			`{`
			`private:`
			`using ReadCallback = std::function<void()>;`

			`using InputProcessorCreator = std::function<InputFormatPtr(`
			`ReadBuffer & buf,`
			`const Block & header,`
			`const Context & context,`
			`const RowInputFormatParams & params,`
			`const FormatSettings & settings)>;`
			`public:`
			`struct InputCreatorParams`
			`{`
			`const Block &sample;`
			`const Context &context;`
			`const RowInputFormatParams& row_input_format_params;`
			`const FormatSettings &settings;`
			`};`

			`struct Builder`
			`{`
processing unit added 2019-11-11 11:20:11 +00:00			`ReadBuffer & read_buffer;`
lost files 2019-10-01 10:51:17 +00:00			`const InputProcessorCreator &input_processor_creator;`
			`const InputCreatorParams &input_creator_params;`
			`FormatFactory::FileSegmentationEngine file_segmentation_engine;`
			`size_t max_threads_to_use;`
			`size_t min_chunk_size;`
			`};`

better 2019-10-24 14:00:51 +00:00			`explicit ParallelParsingBlockInputStream(const Builder & builder)`
			`: header(builder.input_creator_params.sample),`
			`context(builder.input_creator_params.context),`
			`row_input_format_params(builder.input_creator_params.row_input_format_params),`
			`format_settings(builder.input_creator_params.settings),`
			`input_processor_creator(builder.input_processor_creator),`
lost files 2019-10-01 10:51:17 +00:00			`min_chunk_size(builder.min_chunk_size),`
			`original_buffer(builder.read_buffer),`
			`pool(builder.max_threads_to_use),`
			`file_segmentation_engine(builder.file_segmentation_engine)`
			`{`
Cleaup 2019-11-14 15:53:20 +00:00			`// Allocate more units than threads to decrease segmentator`
			`// waiting on reader on wraparound. The number is random.`
Make parser and read buffer local to parser thread 2019-11-18 13:41:40 +00:00			`processing_units.resize(builder.max_threads_to_use + 4);`
lost files 2019-10-01 10:51:17 +00:00
			`segmentator_thread = ThreadFromGlobalPool([this] { segmentatorThreadFunction(); });`
			`}`

			`String getName() const override { return "ParallelParsing"; }`

			`~ParallelParsingBlockInputStream() override`
			`{`
Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`finishAndWait();`
lost files 2019-10-01 10:51:17 +00:00			`}`

			`void cancel(bool kill) override`
			`{`
Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`/**`
			`* Can be called multiple times, from different threads. Saturate the`
			`* the kill flag with OR.`
			`*/`
lost files 2019-10-01 10:51:17 +00:00			`if (kill)`
			`is_killed = true;`
Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`is_cancelled = true;`
comment 2019-11-11 10:32:25 +00:00
Make parser and read buffer local to parser thread 2019-11-18 13:41:40 +00:00			`/*`
			`* The format parsers themselves are not being cancelled here, so we'll`
			`* have to wait until they process the current block. Given that the`
			`* chunk size is on the order of megabytes, this should't be too long.`
			`* We can't call IInputFormat->cancel here, because the parser object is`
			`* local to the parser thread, and we don't want to introduce any`
			`* synchronization between parser threads and the other threads to get`
			`* better performance. An ideal solution would be to add a callback to`
			`* IInputFormat that checks whether it was cancelled.`
			`*/`
lost files 2019-10-01 10:51:17 +00:00
Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`finishAndWait();`
lost files 2019-10-01 10:51:17 +00:00			`}`

			`Block getHeader() const override`
			`{`
better 2019-10-24 14:00:51 +00:00			`return header;`
lost files 2019-10-01 10:51:17 +00:00			`}`

			`protected:`
			`//Reader routine`
			`Block readImpl() override;`

			`const BlockMissingValues & getMissingValues() const override`
			`{`
some changes after review 2019-10-22 18:01:44 +00:00			`return last_block_missing_values;`
lost files 2019-10-01 10:51:17 +00:00			`}`

			`private:`
better 2019-10-24 14:00:51 +00:00			`const Block header;`
			`const Context context;`
			`const RowInputFormatParams row_input_format_params;`
			`const FormatSettings format_settings;`
			`const InputProcessorCreator input_processor_creator;`
lost files 2019-10-01 10:51:17 +00:00
			`const size_t min_chunk_size;`

Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`/*`
			`* This is declared as atomic to avoid UB, because parser threads access it`
			`* without synchronization.`
			`*/`
			`std::atomic<bool> finished{false};`
lost files 2019-10-01 10:51:17 +00:00
some changes after review 2019-10-22 18:01:44 +00:00			`BlockMissingValues last_block_missing_values;`
lost files 2019-10-01 10:51:17 +00:00
			`// Original ReadBuffer to read from.`
			`ReadBuffer & original_buffer;`

			`//Non-atomic because it is used in one thread.`
Cleaup 2019-11-14 15:53:20 +00:00			`std::optional<size_t> next_block_in_current_unit;`
lost files 2019-10-01 10:51:17 +00:00			`size_t segmentator_ticket_number{0};`
Cleaup 2019-11-14 15:53:20 +00:00			`size_t reader_ticket_number{0};`
lost files 2019-10-01 10:51:17 +00:00
			`std::mutex mutex;`
			`std::condition_variable reader_condvar;`
			`std::condition_variable segmentator_condvar;`

			`// There are multiple "parsers", that's why we use thread pool.`
			`ThreadPool pool;`
			`// Reading and segmentating the file`
			`ThreadFromGlobalPool segmentator_thread;`

			`// Function to segment the file. Then "parsers" will parse that segments.`
			`FormatFactory::FileSegmentationEngine file_segmentation_engine;`

			`enum ProcessingUnitStatus`
			`{`
			`READY_TO_INSERT,`
			`READY_TO_PARSE,`
			`READY_TO_READ`
			`};`

some changes after review 2019-10-22 18:01:44 +00:00			`struct BlockExt`
lost files 2019-10-01 10:51:17 +00:00			`{`
fix max_insert_block_size 2019-10-23 13:15:03 +00:00			`std::vector<Block> block;`
			`std::vector<BlockMissingValues> block_missing_values;`
lost files 2019-10-01 10:51:17 +00:00			`};`

processing unit added 2019-11-11 11:20:11 +00:00			`struct ProcessingUnit`
			`{`
Make parser and read buffer local to parser thread 2019-11-18 13:41:40 +00:00			`explicit ProcessingUnit()`
			`: status(ProcessingUnitStatus::READY_TO_INSERT)`
processing unit added 2019-11-11 11:20:11 +00:00			`{`
			`}`
fix data race on vector<bool> 2019-10-23 10:39:33 +00:00
processing unit added 2019-11-11 11:20:11 +00:00			`BlockExt block_ext;`
remove MemoryExt<> 2019-11-18 13:10:14 +00:00			`Memory<> segment;`
processing unit added 2019-11-11 11:20:11 +00:00			`std::atomic<ProcessingUnitStatus> status;`
wip: a saner segmentation function for TSV 2019-11-15 18:08:17 +00:00			`bool is_last{false};`
processing unit added 2019-11-11 11:20:11 +00:00			`};`
lost files 2019-10-01 10:51:17 +00:00
Cleaup 2019-11-14 15:53:20 +00:00			`std::exception_ptr background_exception = nullptr;`

			`// We use deque instead of vector, because it does not require a move`
			`// constructor, which is absent for atomics that are inside ProcessingUnit.`
processing unit added 2019-11-11 11:20:11 +00:00			`std::deque<ProcessingUnit> processing_units;`
lost files 2019-10-01 10:51:17 +00:00
Cleaup 2019-11-14 15:53:20 +00:00
lost files 2019-10-01 10:51:17 +00:00			`void scheduleParserThreadForUnitWithNumber(size_t unit_number)`
			`{`
some changes after review 2019-10-22 18:01:44 +00:00			`pool.scheduleOrThrowOnError(std::bind(&ParallelParsingBlockInputStream::parserThreadFunction, this, unit_number));`
lost files 2019-10-01 10:51:17 +00:00			`}`

Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00			`void finishAndWait()`
small changes after review 2019-10-02 14:26:15 +00:00			`{`
Cleaup 2019-11-14 15:53:20 +00:00			`finished = true;`
Some renames & remove weird logic from cancel() 2019-11-14 13:57:10 +00:00
small changes after review 2019-10-02 14:26:15 +00:00			`{`
			`std::unique_lock lock(mutex);`
			`segmentator_condvar.notify_all();`
			`reader_condvar.notify_all();`
			`}`

			`if (segmentator_thread.joinable())`
			`segmentator_thread.join();`

			`try`
			`{`
			`pool.wait();`
			`}`
			`catch (...)`
			`{`
			`tryLogCurrentException(__PRETTY_FUNCTION__);`
			`}`
			`}`

lost files 2019-10-01 10:51:17 +00:00			`void segmentatorThreadFunction();`
			`void parserThreadFunction(size_t bucket_num);`
Cleaup 2019-11-14 15:53:20 +00:00
			`// Save/log a background exception, set termination flag, wake up all`
			`// threads. This function is used by segmentator and parsed threads.`
			`// readImpl() is called from the main thread, so the exception handling`
			`// is different.`
			`void onBackgroundException();`
lost files 2019-10-01 10:51:17 +00:00			`};`

			`};`