delete stream parsing

2024-09-28 20:50:49 +00:00 · 2020-10-06 20:50:50 +03:00 · 2020-10-06 20:50:50 +03:00 · 7dbf71cf23
commit 7dbf71cf23
parent a89d6bc75a
2 changed files with 0 additions and 480 deletions
--- a/src/DataStreams/ParallelParsingBlockInputStream.cpp
+++ b/src/DataStreams/ParallelParsingBlockInputStream.cpp
@@ -1,306 +0,0 @@
-#include <DataStreams/ParallelParsingBlockInputStream.h>
-#include <IO/ReadBuffer.h>
-#include <Common/CurrentThread.h>
-#include <Common/setThreadName.h>
-#include <ext/scope_guard.h>
-
-namespace DB
-{
-
-ParallelParsingBlockInputStream::ParallelParsingBlockInputStream(const Params & params)
-    : header(params.input_creator_params.sample),
-      row_input_format_params(params.input_creator_params.row_input_format_params),
-      format_settings(params.input_creator_params.settings),
-      input_processor_creator(params.input_processor_creator),
-      min_chunk_bytes(params.min_chunk_bytes),
-      original_buffer(params.read_buffer),
-      // Subtract one thread that we use for segmentation and one for
-      // reading. After that, must have at least two threads left for
-      // parsing. See the assertion below.
-      pool(std::max(2, static_cast<int>(params.max_threads) - 2)),
-      file_segmentation_engine(params.file_segmentation_engine)
-{
-    // See comment above.
-    assert(params.max_threads >= 4);
-
-    // One unit for each thread, including segmentator and reader, plus a
-    // couple more units so that the segmentation thread doesn't spuriously
-    // bump into reader thread on wraparound.
-    processing_units.resize(params.max_threads + 2);
-
-    segmentator_thread = ThreadFromGlobalPool(
-        &ParallelParsingBlockInputStream::segmentatorThreadFunction, this, CurrentThread::getGroup());
-}
-
-ParallelParsingBlockInputStream::~ParallelParsingBlockInputStream()
-{
-    finishAndWait();
-}
-
-void ParallelParsingBlockInputStream::cancel(bool kill)
-{
-    /**
-      * Can be called multiple times, from different threads. Saturate
-      * the kill flag with OR.
-      */
-    if (kill)
-        is_killed = true;
-    is_cancelled = true;
-
-    /*
-     * The format parsers themselves are not being cancelled here, so we'll
-     * have to wait until they process the current block. Given that the
-     * chunk size is on the order of megabytes, this shouldn't be too long.
-     * We can't call IInputFormat->cancel here, because the parser object is
-     * local to the parser thread, and we don't want to introduce any
-     * synchronization between parser threads and the other threads to get
-     * better performance. An ideal solution would be to add a callback to
-     * IInputFormat that checks whether it was cancelled.
-     */
-
-    finishAndWait();
-}
-
-void ParallelParsingBlockInputStream::scheduleParserThreadForUnitWithNumber(size_t ticket_number)
-{
-    pool.scheduleOrThrowOnError([this, ticket_number, group = CurrentThread::getGroup()]()
-    {
-        parserThreadFunction(group, ticket_number);
-    });
-}
-
-void ParallelParsingBlockInputStream::finishAndWait()
-{
-    finished = true;
-
-    {
-        std::unique_lock<std::mutex> lock(mutex);
-        segmentator_condvar.notify_all();
-        reader_condvar.notify_all();
-    }
-
-    if (segmentator_thread.joinable())
-        segmentator_thread.join();
-
-    try
-    {
-        pool.wait();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
-}
-
-void ParallelParsingBlockInputStream::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
-{
-    SCOPE_EXIT(
-        if (thread_group)
-            CurrentThread::detachQueryIfNotDetached();
-    );
-    if (thread_group)
-        CurrentThread::attachTo(thread_group);
-
-    setThreadName("Segmentator");
-
-    try
-    {
-        while (!finished)
-        {
-            const auto current_unit_number = segmentator_ticket_number % processing_units.size();
-            auto & unit = processing_units[current_unit_number];
-
-            {
-                std::unique_lock<std::mutex> lock(mutex);
-                segmentator_condvar.wait(lock,
-                    [&]{ return unit.status == READY_TO_INSERT || finished; });
-            }
-
-            if (finished)
-            {
-                break;
-            }
-
-            assert(unit.status == READY_TO_INSERT);
-
-            // Segmentating the original input.
-            unit.segment.resize(0);
-
-            const bool have_more_data = file_segmentation_engine(original_buffer,
-                unit.segment, min_chunk_bytes);
-
-            unit.is_last = !have_more_data;
-            unit.status = READY_TO_PARSE;
-            scheduleParserThreadForUnitWithNumber(segmentator_ticket_number);
-            ++segmentator_ticket_number;
-
-            if (!have_more_data)
-            {
-                break;
-            }
-        }
-    }
-    catch (...)
-    {
-        onBackgroundException();
-    }
-}
-
-void ParallelParsingBlockInputStream::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
-{
-    SCOPE_EXIT(
-        if (thread_group)
-            CurrentThread::detachQueryIfNotDetached();
-    );
-    if (thread_group)
-        CurrentThread::attachTo(thread_group);
-
-    setThreadName("ChunkParser");
-
-    try
-    {
-        const auto current_unit_number = current_ticket_number % processing_units.size();
-        auto & unit = processing_units[current_unit_number];
-
-        /*
-         * This is kind of suspicious -- the input_processor_creator contract with
-         * respect to multithreaded use is not clear, but we hope that it is
-         * just a 'normal' factory class that doesn't have any state, and so we
-         * can use it from multiple threads simultaneously.
-         */
-        ReadBuffer read_buffer(unit.segment.data(), unit.segment.size(), 0);
-        auto format = input_processor_creator(read_buffer, header, row_input_format_params, format_settings);
-        auto parser = std::make_unique<InputStreamFromInputFormat>(std::move(format));
-
-        unit.block_ext.block.clear();
-        unit.block_ext.block_missing_values.clear();
-
-        // We don't know how many blocks there will be, so we have to read
-        // them all until an empty block occurs.
-        Block block;
-        while (!finished && (block = parser->read()) != Block())
-        {
-            unit.block_ext.block.emplace_back(block);
-            unit.block_ext.block_missing_values.emplace_back(parser->getMissingValues());
-        }
-
-        // We suppose we will get at least some blocks for a non-empty buffer,
-        // except at the end of file. Also see a matching assert in readImpl().
-        assert(unit.is_last || !unit.block_ext.block.empty());
-
-        std::unique_lock<std::mutex> lock(mutex);
-        unit.status = READY_TO_READ;
-        reader_condvar.notify_all();
-    }
-    catch (...)
-    {
-        onBackgroundException();
-    }
-}
-
-void ParallelParsingBlockInputStream::onBackgroundException()
-{
-    tryLogCurrentException(__PRETTY_FUNCTION__);
-
-    std::unique_lock<std::mutex> lock(mutex);
-    if (!background_exception)
-    {
-        background_exception = std::current_exception();
-    }
-    finished = true;
-    reader_condvar.notify_all();
-    segmentator_condvar.notify_all();
-}
-
-Block ParallelParsingBlockInputStream::readImpl()
-{
-    if (isCancelledOrThrowIfKilled() || finished)
-    {
-        /**
-          * Check for background exception and rethrow it before we return.
-          */
-        std::unique_lock<std::mutex> lock(mutex);
-        if (background_exception)
-        {
-            lock.unlock();
-            cancel(false);
-            std::rethrow_exception(background_exception);
-        }
-
-        return Block{};
-    }
-
-    const auto current_unit_number = reader_ticket_number % processing_units.size();
-    auto & unit = processing_units[current_unit_number];
-
-    if (!next_block_in_current_unit.has_value())
-    {
-        // We have read out all the Blocks from the previous Processing Unit,
-        // wait for the current one to become ready.
-        std::unique_lock<std::mutex> lock(mutex);
-        reader_condvar.wait(lock, [&](){ return unit.status == READY_TO_READ || finished; });
-
-        if (finished)
-        {
-            /**
-              * Check for background exception and rethrow it before we return.
-              */
-            if (background_exception)
-            {
-                lock.unlock();
-                cancel(false);
-                std::rethrow_exception(background_exception);
-            }
-
-            return Block{};
-        }
-
-        assert(unit.status == READY_TO_READ);
-        next_block_in_current_unit = 0;
-    }
-
-    if (unit.block_ext.block.empty())
-    {
-        /*
-         * Can we get zero blocks for an entire segment, when the format parser
-         * skips its entire content and does not create any blocks? Probably not,
-         * but if we ever do, we should add a loop around the above if, to skip
-         * these. Also see a matching assert in the parser thread.
-         */
-        assert(unit.is_last);
-        finished = true;
-        return Block{};
-    }
-
-    assert(next_block_in_current_unit.value() < unit.block_ext.block.size());
-
-    Block res = std::move(unit.block_ext.block.at(*next_block_in_current_unit));
-    last_block_missing_values = std::move(unit.block_ext.block_missing_values[*next_block_in_current_unit]);
-
-    next_block_in_current_unit.value() += 1;
-
-    if (*next_block_in_current_unit == unit.block_ext.block.size())
-    {
-        // Finished reading this Processing Unit, move to the next one.
-        next_block_in_current_unit.reset();
-        ++reader_ticket_number;
-
-        if (unit.is_last)
-        {
-            // If it was the last unit, we're finished.
-            finished = true;
-        }
-        else
-        {
-            // Pass the unit back to the segmentator.
-            std::unique_lock<std::mutex> lock(mutex);
-            unit.status = READY_TO_INSERT;
-            segmentator_condvar.notify_all();
-        }
-    }
-
-    return res;
-}
-
-
-}
--- a/src/DataStreams/ParallelParsingBlockInputStream.h
+++ b/src/DataStreams/ParallelParsingBlockInputStream.h
@@ -1,174 +0,0 @@
-#pragma once
-
-#include <DataStreams/IBlockInputStream.h>
-#include <Formats/FormatFactory.h>
-#include <Common/ThreadPool.h>
-#include <Processors/Formats/IRowInputFormat.h>
-#include <Processors/Formats/InputStreamFromInputFormat.h>
-
-namespace DB
-{
-
-class ReadBuffer;
-
-/**
- * ORDER-PRESERVING parallel parsing of data formats.
- * It splits original data into chunks. Then each chunk is parsed by different thread.
- * The number of chunks is equal to the number of parser threads.
- * The size of chunk is equal to min_chunk_bytes_for_parallel_parsing setting.
- *
- * This stream has three kinds of threads: one segmentator, multiple parsers,
- * and one reader thread -- that is, the one from which readImpl() is called.
- * They operate one after another on parts of data called "processing units".
- * One unit consists of buffer with raw data from file, filled by segmentator
- * thread. This raw data is then parsed by a parser thread to form a number of
- * Blocks. These Blocks are returned to the parent stream from readImpl().
- * After being read out, a processing unit is reused, to save on allocating
- * memory for the raw buffer. The processing units are organized into a circular
- * array to facilitate reuse and to apply backpressure on the segmentator thread
- * -- after it runs out of processing units, it has to wait for the reader to
- * read out the previous blocks.
- * The outline of what the threads do is as follows:
- * segmentator thread:
- *  1) wait for the next processing unit to become empty
- *  2) fill it with a part of input file
- *  3) start a parser thread
- *  4) repeat until eof
- * parser thread:
- *  1) parse the given raw buffer without any synchronization
- *  2) signal that the given unit is ready to read
- *  3) finish
- * readImpl():
- *  1) wait for the next processing unit to become ready to read
- *  2) take the blocks from the processing unit to return them to the caller
- *  3) signal that the processing unit is empty
- *  4) repeat until it encounters a unit that is marked as the last one ("is_last")
- * All threads must also check for cancel/eof/exception flags.
- */
-class ParallelParsingBlockInputStream : public IBlockInputStream
-{
-private:
-    using ReadCallback = std::function<void()>;
-
-    using InputProcessorCreator = std::function<InputFormatPtr(
-            ReadBuffer & buf,
-            const Block & header,
-            const RowInputFormatParams & params,
-            const FormatSettings & settings)>;
-public:
-    struct InputCreatorParams
-    {
-        const Block & sample;
-        const RowInputFormatParams & row_input_format_params;
-        const FormatSettings &settings;
-    };
-
-    struct Params
-    {
-        ReadBuffer & read_buffer;
-        const InputProcessorCreator & input_processor_creator;
-        const InputCreatorParams & input_creator_params;
-        FormatFactory::FileSegmentationEngine file_segmentation_engine;
-        size_t max_threads;
-        size_t min_chunk_bytes;
-    };
-
-    explicit ParallelParsingBlockInputStream(const Params & params);
-    ~ParallelParsingBlockInputStream() override;
-
-    String getName() const override { return "ParallelParsing"; }
-    Block getHeader() const override { return header; }
-
-    void cancel(bool kill) override;
-
-protected:
-    // Reader routine
-    Block readImpl() override;
-
-    const BlockMissingValues & getMissingValues() const override
-    {
-        return last_block_missing_values;
-    }
-
-private:
-    const Block header;
-    const RowInputFormatParams row_input_format_params;
-    const FormatSettings format_settings;
-    const InputProcessorCreator input_processor_creator;
-
-    const size_t min_chunk_bytes;
-
-    /*
-     * This is declared as atomic to avoid UB, because parser threads access it
-     * without synchronization.
-     */
-    std::atomic<bool> finished{false};
-
-    BlockMissingValues last_block_missing_values;
-
-    // Original ReadBuffer to read from.
-    ReadBuffer & original_buffer;
-
-    //Non-atomic because it is used in one thread.
-    std::optional<size_t> next_block_in_current_unit;
-    size_t segmentator_ticket_number{0};
-    size_t reader_ticket_number{0};
-
-    std::mutex mutex;
-    std::condition_variable reader_condvar;
-    std::condition_variable segmentator_condvar;
-
-    // There are multiple "parsers", that's why we use thread pool.
-    ThreadPool pool;
-    // Reading and segmentating the file
-    ThreadFromGlobalPool segmentator_thread;
-
-    // Function to segment the file. Then "parsers" will parse those segments.
-    FormatFactory::FileSegmentationEngine file_segmentation_engine;
-
-    enum ProcessingUnitStatus
-    {
-        READY_TO_INSERT,
-        READY_TO_PARSE,
-        READY_TO_READ
-    };
-
-    struct BlockExt
-    {
-        std::vector<Block> block;
-        std::vector<BlockMissingValues> block_missing_values;
-    };
-
-    struct ProcessingUnit
-    {
-        explicit ProcessingUnit()
-            : status(ProcessingUnitStatus::READY_TO_INSERT)
-        {
-        }
-
-        BlockExt block_ext;
-        Memory<> segment;
-        std::atomic<ProcessingUnitStatus> status;
-        bool is_last{false};
-    };
-
-    std::exception_ptr background_exception = nullptr;
-
-    // We use deque instead of vector, because it does not require a move
-    // constructor, which is absent for atomics that are inside ProcessingUnit.
-    std::deque<ProcessingUnit> processing_units;
-
-    void scheduleParserThreadForUnitWithNumber(size_t ticket_number);
-    void finishAndWait();
-
-    void segmentatorThreadFunction(ThreadGroupStatusPtr thread_group);
-    void parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number);
-
-    // Save/log a background exception, set termination flag, wake up all
-    // threads. This function is used by segmentator and parser threads.
-    // readImpl() is called from the main thread, so the exception handling
-    // is different.
-    void onBackgroundException();
-};
-
-}