ClickHouse/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

258 lines
8.3 KiB
C++
Raw Normal View History

2020-06-11 00:36:57 +00:00
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
2020-06-10 12:02:34 +00:00
#include <IO/ReadHelpers.h>
2022-04-15 23:56:45 +00:00
#include <IO/WithFileName.h>
2020-10-06 17:49:57 +00:00
#include <Common/CurrentThread.h>
#include <Common/setThreadName.h>
2022-11-28 20:31:55 +00:00
#include <Common/scope_guard_safe.h>
2020-05-18 10:00:22 +00:00
namespace DB
{
2020-10-06 17:49:57 +00:00
/// Body of the segmentator thread.
/// Walks over processing_units in ring order (ticket number modulo the number of units):
/// waits until the current unit is READY_TO_INSERT, fills its `segment` from the input
/// with file_segmentation_engine, marks it READY_TO_PARSE and schedules a parser for it.
/// The loop ends when parsing_finished is set or the engine reports that no data is left.
/// Any exception is handed to onBackgroundException() instead of escaping the thread.
void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
{
    SCOPE_EXIT_SAFE(
        if (thread_group)
            CurrentThread::detachQueryIfNotDetached();
    );

    if (thread_group)
        CurrentThread::attachTo(thread_group);

    setThreadName("Segmentator");

    try
    {
        for (;;)
        {
            if (parsing_finished)
                break;

            const auto slot = segmentator_ticket_number % processing_units.size();
            auto & current_unit = processing_units[slot];

            {
                /// The lock is held only while waiting; segmentation itself runs without it.
                std::unique_lock lock(mutex);
                segmentator_condvar.wait(lock, [&] { return current_unit.status == READY_TO_INSERT || parsing_finished; });
            }

            /// The wait may have been interrupted by the finish flag rather than by a free unit.
            if (parsing_finished)
                break;

            assert(current_unit.status == READY_TO_INSERT);

            /// Cut the next segment out of the original input into the unit's buffer.
            current_unit.segment.resize(0);
            auto [has_more, rows_in_segment]
                = file_segmentation_engine(*in, current_unit.segment, min_chunk_bytes, max_block_size);

            /// Remember how many rows preceded this segment, then account for its own rows.
            current_unit.offset = successfully_read_rows_count;
            successfully_read_rows_count += rows_in_segment;

            current_unit.is_last = !has_more;
            current_unit.status = READY_TO_PARSE;
            scheduleParserThreadForUnitWithNumber(segmentator_ticket_number);
            ++segmentator_ticket_number;

            if (!has_more)
                break;
        }
    }
    catch (...)
    {
        onBackgroundException(successfully_read_rows_count);
    }
}
2020-10-06 17:49:57 +00:00
/// Body of a parser task for the unit selected by `current_ticket_number`
/// (ticket number modulo the number of processing units).
/// Parses the unit's `segment` into chunks, stores them together with their missing-values
/// info in `unit.chunk_ext`, then marks the unit READY_TO_READ and wakes up the reader.
/// Any exception is handed to onBackgroundException() with the unit's row offset.
void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
{
    SCOPE_EXIT_SAFE(
        if (thread_group)
            CurrentThread::detachQueryIfNotDetached();
    );

    if (thread_group)
        CurrentThread::attachToIfDetached(thread_group);

    const auto slot = current_ticket_number % processing_units.size();
    auto & unit = processing_units[slot];

    try
    {
        setThreadName("ChunkParser");

        const bool is_first_ticket = (current_ticket_number == 0);

        /// The contract of internal_parser_creator with respect to multithreaded use is not clear.
        /// We hope it is a plain stateless factory, so that calling it from several threads
        /// simultaneously is fine.
        ReadBuffer segment_buffer(unit.segment.data(), unit.segment.size(), 0);

        InputFormatPtr format = internal_parser_creator(segment_buffer);
        format->setCurrentUnitNumber(current_ticket_number);
        format->setErrorsLogger(errors_logger);
        InternalParser parser(format);

        auto & parsed_chunks = unit.chunk_ext.chunk;
        auto & parsed_missing_values = unit.chunk_ext.block_missing_values;
        parsed_chunks.clear();
        parsed_missing_values.clear();

        /// Every parser except the first one receives the shared column_mapping.
        /// Note: column_mapping is used only for *WithNames types.
        if (!is_first_ticket)
            format->setColumnMapping(column_mapping);

        /// The number of blocks is not known in advance: keep reading until an empty chunk
        /// comes back or parsing is finished elsewhere.
        while (!parsing_finished)
        {
            Chunk parsed = parser.getChunk();
            if (!parsed)
                break;

            parsed_chunks.emplace_back(std::move(parsed));
            parsed_missing_values.emplace_back(parser.getMissingValues());
        }

        /// The first parser publishes its column_mapping so that the others can pick it up.
        if (is_first_ticket)
        {
            column_mapping = format->getColumnMapping();
            column_mapping->is_set = true;
            first_parser_finished.set();
        }

        /// A non-empty buffer is expected to give at least one block, except at the end of file.
        /// There is a matching assert on the reading side.
        assert(unit.is_last || !parsed_chunks.empty() || parsing_finished);

        std::lock_guard lock(mutex);
        unit.status = READY_TO_READ;
        reader_condvar.notify_all();
    }
    catch (...)
    {
        onBackgroundException(unit.offset);
    }
}
2020-12-28 12:53:58 +00:00
void ParallelParsingInputFormat::onBackgroundException(size_t offset)
2020-05-18 10:00:22 +00:00
{
std::lock_guard lock(mutex);
2020-05-18 10:00:22 +00:00
if (!background_exception)
{
background_exception = std::current_exception();
2020-12-28 12:53:58 +00:00
if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
2022-04-15 23:56:45 +00:00
{
/// NOTE: it is not that safe to use line number hack here (may exceed INT_MAX)
2020-12-28 12:53:58 +00:00
if (e->getLineNumber() != -1)
e->setLineNumber(static_cast<int>(e->getLineNumber() + offset));
2022-04-15 23:56:45 +00:00
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
if (!file_name.empty())
e->setFileName(file_name);
}
2020-05-18 10:00:22 +00:00
}
2022-04-15 23:56:45 +00:00
if (is_server)
tryLogCurrentException(__PRETTY_FUNCTION__);
2022-04-15 23:56:45 +00:00
2020-05-18 10:00:22 +00:00
parsing_finished = true;
first_parser_finished.set();
2020-05-18 10:00:22 +00:00
reader_condvar.notify_all();
segmentator_condvar.notify_all();
}
2020-06-11 00:36:57 +00:00
/// Returns the next parsed chunk, or an empty chunk when there is nothing more to return.
/// The first call launches the segmentator thread. Chunks are taken from processing units in
/// ring order (reader_ticket_number modulo the number of units); once a unit is drained it is
/// handed back to the segmentator, or parsing is marked finished if it was the last unit.
/// An exception stored by a background thread is rethrown here.
Chunk ParallelParsingInputFormat::generate()
{
    /// Delayed launching of segmentator thread
    if (unlikely(!parsing_started.exchange(true)))
        segmentator_thread = ThreadFromGlobalPool(
            &ParallelParsingInputFormat::segmentatorThreadFunction, this, CurrentThread::getGroup());

    if (isCancelled() || parsing_finished)
    {
        /// Before returning, check for a background exception and rethrow it.
        std::unique_lock lock(mutex);
        if (!background_exception)
            return {};

        lock.unlock();
        /// NOTE(review): this path calls onCancel() while the similar path further down calls
        /// cancel() -- confirm that the difference is intentional.
        onCancel();
        std::rethrow_exception(background_exception);
    }

    const auto slot = reader_ticket_number % processing_units.size();
    auto & unit = processing_units[slot];

    if (!next_block_in_current_unit.has_value())
    {
        /// All blocks of the previous processing unit have been read out,
        /// so wait until the current one becomes ready.
        std::unique_lock lock(mutex);
        reader_condvar.wait(lock, [&] { return unit.status == READY_TO_READ || parsing_finished; });

        if (parsing_finished)
        {
            /// Before returning, check for a background exception and rethrow it.
            if (!background_exception)
                return {};

            lock.unlock();
            cancel();
            std::rethrow_exception(background_exception);
        }

        assert(unit.status == READY_TO_READ);
        next_block_in_current_unit = 0;
    }

    auto & ready_chunks = unit.chunk_ext.chunk;
    auto & ready_missing_values = unit.chunk_ext.block_missing_values;

    if (ready_chunks.empty())
    {
        /// Can a whole segment give zero blocks, because the format parser skipped its entire
        /// content? Probably not. If it ever does, the waiting code above needs a loop that
        /// skips such units. There is a matching assert in the parser thread.
        assert(unit.is_last);
        parsing_finished = true;
        return {};
    }

    const auto block_index = *next_block_in_current_unit;
    assert(block_index < ready_chunks.size());

    Chunk res = std::move(ready_chunks.at(block_index));
    last_block_missing_values = std::move(ready_missing_values[block_index]);

    ++*next_block_in_current_unit;
    if (*next_block_in_current_unit != ready_chunks.size())
        return res;

    /// This processing unit is fully read, move on to the next one.
    next_block_in_current_unit.reset();
    ++reader_ticket_number;

    if (!unit.is_last)
    {
        /// Pass the unit back to the segmentator.
        std::lock_guard lock(mutex);
        unit.status = READY_TO_INSERT;
        segmentator_condvar.notify_all();
    }
    else
    {
        /// That was the last unit, so parsing is finished.
        parsing_finished = true;
    }

    return res;
}
}