2020-06-11 00:36:57 +00:00
|
|
|
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
|
2020-06-10 12:02:34 +00:00
|
|
|
#include <IO/ReadHelpers.h>
|
2022-04-15 23:56:45 +00:00
|
|
|
#include <IO/WithFileName.h>
|
2020-10-06 17:49:57 +00:00
|
|
|
#include <Common/CurrentThread.h>
|
|
|
|
#include <Common/setThreadName.h>
|
2022-11-28 20:31:55 +00:00
|
|
|
#include <Common/scope_guard_safe.h>
|
2020-05-18 10:00:22 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2020-10-06 17:49:57 +00:00
|
|
|
/// Body of the segmentator thread.
/// Repeatedly takes the processing unit that corresponds to the current segmentator ticket,
/// waits until that unit is free (READY_TO_INSERT), fills its segment from the input buffer
/// and schedules a parser thread for it. Any exception is forwarded to onBackgroundException().
void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
{
    /// Registered before attaching, so the detach runs on every way out of this function.
    SCOPE_EXIT_SAFE(
        if (thread_group)
            CurrentThread::detachQueryIfNotDetached();
    );
    if (thread_group)
        CurrentThread::attachTo(thread_group);

    setThreadName("Segmentator");

    try
    {
        while (true)
        {
            if (parsing_finished)
                break;

            /// Units are used in a round-robin manner, indexed by ticket number.
            const auto slot_index = segmentator_ticket_number % processing_units.size();
            auto & slot = processing_units[slot_index];

            {
                /// Sleep until the unit is handed back to us, or until parsing is over.
                std::unique_lock lock(mutex);
                while (slot.status != READY_TO_INSERT && !parsing_finished)
                    segmentator_condvar.wait(lock);
            }

            if (parsing_finished)
                break;

            assert(slot.status == READY_TO_INSERT);

            /// Cut the next segment out of the original input.
            slot.segment.resize(0);

            auto [more_input_left, rows_in_segment]
                = file_segmentation_engine(*in, slot.segment, min_chunk_bytes, max_block_size);

            /// The unit remembers how many rows were read before it;
            /// the parser thread passes this offset to onBackgroundException() on failure.
            slot.offset = successfully_read_rows_count;
            successfully_read_rows_count += rows_in_segment;

            slot.is_last = !more_input_left;
            slot.status = READY_TO_PARSE;
            scheduleParserThreadForUnitWithNumber(segmentator_ticket_number);
            ++segmentator_ticket_number;

            if (!more_input_left)
                break;
        }
    }
    catch (...)
    {
        onBackgroundException(successfully_read_rows_count);
    }
}
|
|
|
|
|
2020-10-06 17:49:57 +00:00
|
|
|
/// Body of a parser thread.
/// Parses the segment stored in the processing unit selected by `current_ticket_number`,
/// collects the resulting chunks (and their missing-values info) in that unit and marks
/// it READY_TO_READ. Any exception is forwarded to onBackgroundException().
void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
{
    /// Registered before attaching, so the detach runs on every way out of this function.
    SCOPE_EXIT_SAFE(
        if (thread_group)
            CurrentThread::detachQueryIfNotDetached();
    );
    if (thread_group)
        CurrentThread::attachToIfDetached(thread_group);

    const auto slot_index = current_ticket_number % processing_units.size();
    auto & slot = processing_units[slot_index];

    try
    {
        setThreadName("ChunkParser");

        const bool is_first_ticket = (current_ticket_number == 0);

        /// NOTE: internal_parser_creator is invoked here from several parser threads at once.
        /// Its contract with respect to multithreaded use is not clear; we hope it is
        /// a 'normal' stateless factory that can be used from multiple threads simultaneously.
        ReadBuffer segment_buffer(slot.segment.data(), slot.segment.size(), 0);

        InputFormatPtr segment_format = internal_parser_creator(segment_buffer);
        segment_format->setCurrentUnitNumber(current_ticket_number);
        segment_format->setErrorsLogger(errors_logger);
        InternalParser segment_parser(segment_format);

        auto & parsed = slot.chunk_ext;
        parsed.chunk.clear();
        parsed.block_missing_values.clear();

        /// Every parser except the first one receives the shared column_mapping.
        /// Note: column_mapping is used only for *WithNames types.
        if (!is_first_ticket)
            segment_format->setColumnMapping(column_mapping);

        /// The number of blocks is not known in advance,
        /// so keep reading until an empty chunk is returned (or parsing is stopped).
        while (true)
        {
            if (parsing_finished)
                break;

            Chunk next_chunk = segment_parser.getChunk();
            if (!next_chunk)
                break;

            parsed.chunk.emplace_back(std::move(next_chunk));
            parsed.block_missing_values.emplace_back(segment_parser.getMissingValues());
        }

        /// The first parser publishes its column_mapping for the other parsers.
        if (is_first_ticket)
        {
            column_mapping = segment_format->getColumnMapping();
            column_mapping->is_set = true;
            first_parser_finished.set();
        }

        /// A non-empty buffer is expected to produce at least one block, except at the end of file.
        /// generate() has a matching assert.
        assert(slot.is_last || !parsed.chunk.empty() || parsing_finished);

        std::lock_guard lock(mutex);
        slot.status = READY_TO_READ;
        reader_condvar.notify_all();
    }
    catch (...)
    {
        onBackgroundException(slot.offset);
    }
}
|
|
|
|
|
|
|
|
|
2020-12-28 12:53:58 +00:00
|
|
|
void ParallelParsingInputFormat::onBackgroundException(size_t offset)
|
2020-05-18 10:00:22 +00:00
|
|
|
{
|
2022-06-28 19:19:06 +00:00
|
|
|
std::lock_guard lock(mutex);
|
2020-05-18 10:00:22 +00:00
|
|
|
if (!background_exception)
|
|
|
|
{
|
|
|
|
background_exception = std::current_exception();
|
2020-12-28 12:53:58 +00:00
|
|
|
if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
|
2022-04-15 23:56:45 +00:00
|
|
|
{
|
2022-10-07 10:46:45 +00:00
|
|
|
/// NOTE: it is not that safe to use line number hack here (may exceed INT_MAX)
|
2020-12-28 12:53:58 +00:00
|
|
|
if (e->getLineNumber() != -1)
|
2022-10-07 10:46:45 +00:00
|
|
|
e->setLineNumber(static_cast<int>(e->getLineNumber() + offset));
|
2022-04-15 23:56:45 +00:00
|
|
|
|
|
|
|
auto file_name = getFileNameFromReadBuffer(getReadBuffer());
|
|
|
|
if (!file_name.empty())
|
|
|
|
e->setFileName(file_name);
|
|
|
|
}
|
2020-05-18 10:00:22 +00:00
|
|
|
}
|
2022-04-15 23:56:45 +00:00
|
|
|
|
2022-01-11 15:37:07 +00:00
|
|
|
if (is_server)
|
|
|
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
2022-04-15 23:56:45 +00:00
|
|
|
|
2020-05-18 10:00:22 +00:00
|
|
|
parsing_finished = true;
|
2021-03-10 23:15:16 +00:00
|
|
|
first_parser_finished.set();
|
2020-05-18 10:00:22 +00:00
|
|
|
reader_condvar.notify_all();
|
|
|
|
segmentator_condvar.notify_all();
|
|
|
|
}
|
|
|
|
|
2020-06-11 00:36:57 +00:00
|
|
|
/// Returns the next parsed chunk, in ticket order.
/// Starts the segmentator thread on the first call. Returns an empty chunk when the format is
/// cancelled or parsing is finished; rethrows the stored background exception if there is one.
Chunk ParallelParsingInputFormat::generate()
{
    /// Delayed launching of segmentator thread
    const bool already_started = parsing_started.exchange(true);
    if (unlikely(!already_started))
    {
        segmentator_thread = ThreadFromGlobalPool(
            &ParallelParsingInputFormat::segmentatorThreadFunction, this, CurrentThread::getGroup());
    }

    if (isCancelled() || parsing_finished)
    {
        /// Before returning, check for a background exception and rethrow it.
        std::unique_lock lock(mutex);
        if (!background_exception)
            return {};

        lock.unlock();
        onCancel();
        std::rethrow_exception(background_exception);
    }

    const auto slot_index = reader_ticket_number % processing_units.size();
    auto & slot = processing_units[slot_index];

    if (!next_block_in_current_unit.has_value())
    {
        /// All blocks of the previous processing unit have been read out,
        /// so wait for the current one to become ready.
        std::unique_lock lock(mutex);
        while (slot.status != READY_TO_READ && !parsing_finished)
            reader_condvar.wait(lock);

        if (parsing_finished)
        {
            /// Before returning, check for a background exception and rethrow it.
            if (!background_exception)
                return {};

            lock.unlock();
            cancel();
            std::rethrow_exception(background_exception);
        }

        assert(slot.status == READY_TO_READ);
        next_block_in_current_unit = 0;
    }

    auto & ready_chunks = slot.chunk_ext.chunk;
    auto & ready_missing_values = slot.chunk_ext.block_missing_values;

    if (ready_chunks.empty())
    {
        /// Can a whole segment yield zero blocks because the format parser skipped its entire
        /// content? Probably not, but if that ever happens, the wait above has to become a loop
        /// that skips such units. The parser thread has a matching assert.
        assert(slot.is_last);
        parsing_finished = true;
        return {};
    }

    assert(next_block_in_current_unit.value() < ready_chunks.size());

    const auto block_index = *next_block_in_current_unit;

    Chunk res = std::move(ready_chunks.at(block_index));
    last_block_missing_values = std::move(ready_missing_values[block_index]);

    next_block_in_current_unit = block_index + 1;

    /// More blocks remain in this unit: keep it for the next call.
    if (*next_block_in_current_unit != ready_chunks.size())
        return res;

    /// Finished reading this processing unit, move to the next one.
    next_block_in_current_unit.reset();
    ++reader_ticket_number;

    if (slot.is_last)
    {
        /// If it was the last unit, we are done.
        parsing_finished = true;
    }
    else
    {
        /// Pass the unit back to the segmentator.
        std::lock_guard lock(mutex);
        slot.status = READY_TO_INSERT;
        segmentator_condvar.notify_all();
    }

    return res;
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|