2012-02-27 06:28:20 +00:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <DB/Interpreters/Aggregator.h>
|
|
|
|
|
#include <DB/DataStreams/IProfilingBlockInputStream.h>
|
2014-11-30 18:22:57 +00:00
|
|
|
|
#include <DB/DataStreams/ParallelInputsProcessor.h>
|
2012-02-27 06:28:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
using Poco::SharedPtr;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Агрегирует несколько источников параллельно.
|
2014-11-30 18:22:57 +00:00
|
|
|
|
* Производит агрегацию блоков из разных источников независимо в разных потоках, затем объединяет результаты.
|
2014-11-30 06:59:03 +00:00
|
|
|
|
* Если final == false, агрегатные функции не финализируются, то есть, не заменяются на своё значение, а содержат промежуточное состояние вычислений.
|
2012-02-27 06:28:20 +00:00
|
|
|
|
* Это необходимо, чтобы можно было продолжить агрегацию (например, объединяя потоки частично агрегированных данных).
|
|
|
|
|
*/
|
|
|
|
|
class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream
|
|
|
|
|
{
|
|
|
|
|
public:
|
2014-11-30 18:22:57 +00:00
|
|
|
|
ParallelAggregatingBlockInputStream(BlockInputStreams inputs, const ColumnNumbers & keys_,
|
|
|
|
|
AggregateDescriptions & aggregates_, bool overflow_row_, bool final_, size_t max_threads_,
|
2015-01-10 02:30:03 +00:00
|
|
|
|
size_t max_rows_to_group_by_, OverflowMode group_by_overflow_mode_,
|
|
|
|
|
Compiler * compiler_, UInt32 min_count_to_compile_)
|
|
|
|
|
: aggregator(keys_, aggregates_, overflow_row_, max_rows_to_group_by_, group_by_overflow_mode_, compiler_, min_count_to_compile_),
|
2015-01-02 03:16:28 +00:00
|
|
|
|
final(final_), max_threads(std::min(inputs.size(), max_threads_)),
|
2014-11-30 18:22:57 +00:00
|
|
|
|
keys_size(keys_.size()), aggregates_size(aggregates_.size()),
|
2014-12-14 23:07:57 +00:00
|
|
|
|
handler(*this), processor(inputs, max_threads, handler)
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
2014-11-30 18:22:57 +00:00
|
|
|
|
children.insert(children.end(), inputs.begin(), inputs.end());
|
2012-02-27 06:28:20 +00:00
|
|
|
|
}
|
|
|
|
|
|
2014-02-26 11:44:54 +00:00
|
|
|
|
/** Столбцы из key_names и аргументы агрегатных функций, уже должны быть вычислены.
|
2012-02-27 06:28:20 +00:00
|
|
|
|
*/
|
2014-11-30 18:22:57 +00:00
|
|
|
|
ParallelAggregatingBlockInputStream(BlockInputStreams inputs, const Names & key_names,
|
|
|
|
|
const AggregateDescriptions & aggregates, bool overflow_row_, bool final_, size_t max_threads_,
|
2015-01-10 02:30:03 +00:00
|
|
|
|
size_t max_rows_to_group_by_, OverflowMode group_by_overflow_mode_,
|
|
|
|
|
Compiler * compiler_, UInt32 min_count_to_compile_)
|
|
|
|
|
: aggregator(key_names, aggregates, overflow_row_, max_rows_to_group_by_, group_by_overflow_mode_, compiler_, min_count_to_compile_),
|
2015-01-08 18:52:48 +00:00
|
|
|
|
final(final_), max_threads(std::min(inputs.size(), max_threads_)),
|
2014-11-30 18:22:57 +00:00
|
|
|
|
keys_size(key_names.size()), aggregates_size(aggregates.size()),
|
2014-12-14 23:07:57 +00:00
|
|
|
|
handler(*this), processor(inputs, max_threads, handler)
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
2014-11-30 18:22:57 +00:00
|
|
|
|
children.insert(children.end(), inputs.begin(), inputs.end());
|
2012-02-27 06:28:20 +00:00
|
|
|
|
}
|
|
|
|
|
|
2014-11-08 23:52:18 +00:00
|
|
|
|
String getName() const override { return "ParallelAggregatingBlockInputStream"; }
|
2012-10-20 02:10:47 +00:00
|
|
|
|
|
2014-11-08 23:52:18 +00:00
|
|
|
|
String getID() const override
|
2013-05-03 10:20:53 +00:00
|
|
|
|
{
|
|
|
|
|
std::stringstream res;
|
|
|
|
|
res << "ParallelAggregating(";
|
|
|
|
|
|
|
|
|
|
Strings children_ids(children.size());
|
|
|
|
|
for (size_t i = 0; i < children.size(); ++i)
|
|
|
|
|
children_ids[i] = children[i]->getID();
|
|
|
|
|
|
|
|
|
|
/// Порядок не имеет значения.
|
|
|
|
|
std::sort(children_ids.begin(), children_ids.end());
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < children_ids.size(); ++i)
|
|
|
|
|
res << (i == 0 ? "" : ", ") << children_ids[i];
|
|
|
|
|
|
2015-01-08 18:52:48 +00:00
|
|
|
|
res << ", " << aggregator.getID() << ")";
|
2013-05-03 10:20:53 +00:00
|
|
|
|
return res.str();
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-30 18:22:57 +00:00
|
|
|
|
void cancel() override
|
|
|
|
|
{
|
|
|
|
|
if (!__sync_bool_compare_and_swap(&is_cancelled, false, true))
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
processor.cancel();
|
|
|
|
|
}
|
|
|
|
|
|
2012-10-20 02:10:47 +00:00
|
|
|
|
protected:
|
2014-11-08 23:52:18 +00:00
|
|
|
|
Block readImpl() override
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
2015-01-02 03:16:28 +00:00
|
|
|
|
if (!executed)
|
2014-11-30 18:22:57 +00:00
|
|
|
|
{
|
2015-01-02 03:16:28 +00:00
|
|
|
|
executed = true;
|
|
|
|
|
AggregatedDataVariantsPtr data_variants = executeAndMerge();
|
2014-12-02 00:31:07 +00:00
|
|
|
|
|
2015-01-02 03:16:28 +00:00
|
|
|
|
if (data_variants)
|
2015-01-08 18:52:48 +00:00
|
|
|
|
blocks = aggregator.convertToBlocks(*data_variants, final, max_threads);
|
2015-01-02 03:16:28 +00:00
|
|
|
|
|
|
|
|
|
it = blocks.begin();
|
2014-11-30 18:22:57 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-01-02 03:16:28 +00:00
|
|
|
|
Block res;
|
|
|
|
|
if (isCancelled() || it == blocks.end())
|
|
|
|
|
return res;
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
2015-01-02 03:16:28 +00:00
|
|
|
|
res = *it;
|
|
|
|
|
++it;
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
2015-01-02 03:16:28 +00:00
|
|
|
|
return res;
|
2012-02-27 06:28:20 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
2015-01-08 18:52:48 +00:00
|
|
|
|
Aggregator aggregator;
|
2013-11-03 23:35:18 +00:00
|
|
|
|
bool final;
|
2012-02-27 06:28:20 +00:00
|
|
|
|
size_t max_threads;
|
|
|
|
|
|
2014-11-30 18:22:57 +00:00
|
|
|
|
size_t keys_size;
|
|
|
|
|
size_t aggregates_size;
|
|
|
|
|
|
|
|
|
|
/** Используется, если есть ограничение на максимальное количество строк при агрегации,
|
|
|
|
|
* и если group_by_overflow_mode == ANY.
|
|
|
|
|
* В этом случае, новые ключи не добавляются в набор, а производится агрегация только по
|
|
|
|
|
* ключам, которые уже успели попасть в набор.
|
|
|
|
|
*/
|
|
|
|
|
bool no_more_keys = false;
|
|
|
|
|
|
2015-01-02 03:16:28 +00:00
|
|
|
|
bool executed = false;
|
|
|
|
|
BlocksList blocks;
|
|
|
|
|
BlocksList::iterator it;
|
|
|
|
|
|
2014-11-30 18:22:57 +00:00
|
|
|
|
Logger * log = &Logger::get("ParallelAggregatingBlockInputStream");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct Handler
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
2014-11-30 18:22:57 +00:00
|
|
|
|
Handler(ParallelAggregatingBlockInputStream & parent_)
|
|
|
|
|
: parent(parent_) {}
|
2014-05-03 22:57:43 +00:00
|
|
|
|
|
2014-11-30 18:22:57 +00:00
|
|
|
|
void onBlock(Block & block, size_t thread_num)
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
2015-01-08 18:52:48 +00:00
|
|
|
|
parent.aggregator.executeOnBlock(block, *parent.many_data[thread_num],
|
2014-12-14 23:07:57 +00:00
|
|
|
|
parent.threads_data[thread_num].key_columns, parent.threads_data[thread_num].aggregate_columns,
|
|
|
|
|
parent.threads_data[thread_num].key_sizes, parent.threads_data[thread_num].key, parent.no_more_keys);
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
2014-12-14 23:07:57 +00:00
|
|
|
|
parent.threads_data[thread_num].src_rows += block.rowsInFirstColumn();
|
|
|
|
|
parent.threads_data[thread_num].src_bytes += block.bytes();
|
2012-02-27 06:28:20 +00:00
|
|
|
|
}
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
|
|
|
|
void onFinish()
|
2012-02-27 06:28:20 +00:00
|
|
|
|
{
|
|
|
|
|
}
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
|
|
|
|
void onException(ExceptionPtr & exception, size_t thread_num)
|
|
|
|
|
{
|
|
|
|
|
parent.exceptions[thread_num] = exception;
|
|
|
|
|
parent.cancel();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ParallelAggregatingBlockInputStream & parent;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
Handler handler;
|
|
|
|
|
ParallelInputsProcessor<Handler> processor;
|
|
|
|
|
|
|
|
|
|
ManyAggregatedDataVariants many_data;
|
|
|
|
|
Exceptions exceptions;
|
|
|
|
|
|
2014-12-14 23:07:57 +00:00
|
|
|
|
struct ThreadData
|
|
|
|
|
{
|
|
|
|
|
size_t src_rows = 0;
|
|
|
|
|
size_t src_bytes = 0;
|
|
|
|
|
|
|
|
|
|
StringRefs key;
|
|
|
|
|
ConstColumnPlainPtrs key_columns;
|
|
|
|
|
Aggregator::AggregateColumns aggregate_columns;
|
|
|
|
|
Sizes key_sizes;
|
|
|
|
|
|
|
|
|
|
ThreadData(size_t keys_size, size_t aggregates_size)
|
|
|
|
|
{
|
|
|
|
|
key.resize(keys_size);
|
|
|
|
|
key_columns.resize(keys_size);
|
|
|
|
|
aggregate_columns.resize(aggregates_size);
|
|
|
|
|
key_sizes.resize(keys_size);
|
|
|
|
|
}
|
|
|
|
|
};
|
2014-11-30 18:22:57 +00:00
|
|
|
|
|
2014-12-14 23:07:57 +00:00
|
|
|
|
std::vector<ThreadData> threads_data;
|
2015-01-02 03:16:28 +00:00
|
|
|
|
|
|
|
|
|
AggregatedDataVariantsPtr executeAndMerge()
|
|
|
|
|
{
|
|
|
|
|
many_data.resize(max_threads);
|
|
|
|
|
exceptions.resize(max_threads);
|
|
|
|
|
|
|
|
|
|
for (size_t i = 0; i < max_threads; ++i)
|
|
|
|
|
threads_data.emplace_back(keys_size, aggregates_size);
|
|
|
|
|
|
|
|
|
|
LOG_TRACE(log, "Aggregating");
|
|
|
|
|
|
|
|
|
|
Stopwatch watch;
|
|
|
|
|
|
|
|
|
|
for (auto & elem : many_data)
|
|
|
|
|
elem = new AggregatedDataVariants;
|
|
|
|
|
|
|
|
|
|
processor.process();
|
|
|
|
|
processor.wait();
|
|
|
|
|
|
|
|
|
|
rethrowFirstException(exceptions);
|
|
|
|
|
|
|
|
|
|
if (isCancelled())
|
|
|
|
|
return nullptr;
|
|
|
|
|
|
|
|
|
|
double elapsed_seconds = watch.elapsedSeconds();
|
|
|
|
|
|
|
|
|
|
size_t total_src_rows = 0;
|
|
|
|
|
size_t total_src_bytes = 0;
|
|
|
|
|
for (size_t i = 0; i < max_threads; ++i)
|
|
|
|
|
{
|
|
|
|
|
size_t rows = many_data[i]->size();
|
|
|
|
|
LOG_TRACE(log, std::fixed << std::setprecision(3)
|
|
|
|
|
<< "Aggregated. " << threads_data[i].src_rows << " to " << rows << " rows"
|
|
|
|
|
<< " (from " << threads_data[i].src_bytes / 1048576.0 << " MiB)"
|
|
|
|
|
<< " in " << elapsed_seconds << " sec."
|
|
|
|
|
<< " (" << threads_data[i].src_rows / elapsed_seconds << " rows/sec., "
|
|
|
|
|
<< threads_data[i].src_bytes / elapsed_seconds / 1048576.0 << " MiB/sec.)");
|
|
|
|
|
|
|
|
|
|
total_src_rows += threads_data[i].src_rows;
|
|
|
|
|
total_src_bytes += threads_data[i].src_bytes;
|
|
|
|
|
}
|
|
|
|
|
LOG_TRACE(log, std::fixed << std::setprecision(3)
|
|
|
|
|
<< "Total aggregated. " << total_src_rows << " rows (from " << total_src_bytes / 1048576.0 << " MiB)"
|
|
|
|
|
<< " in " << elapsed_seconds << " sec."
|
|
|
|
|
<< " (" << total_src_rows / elapsed_seconds << " rows/sec., " << total_src_bytes / elapsed_seconds / 1048576.0 << " MiB/sec.)");
|
|
|
|
|
|
|
|
|
|
if (isCancelled())
|
|
|
|
|
return nullptr;
|
|
|
|
|
|
2015-01-08 18:52:48 +00:00
|
|
|
|
return aggregator.merge(many_data, max_threads);
|
2015-01-02 03:16:28 +00:00
|
|
|
|
}
|
2012-02-27 06:28:20 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
}
|