ClickHouse/dbms/include/DB/DataStreams/ParallelAggregatingBlockInputStream.h

#pragma once

#include <DB/Interpreters/Aggregator.h>
#include <DB/IO/ReadBufferFromFile.h>
#include <DB/IO/CompressedReadBuffer.h>
#include <DB/DataStreams/IProfilingBlockInputStream.h>
#include <DB/DataStreams/ParallelInputsProcessor.h>


namespace DB
{


/** Aggregates several sources in parallel.
  * Blocks from different sources are aggregated independently in different threads,
  *  and then the results are merged.
  * If final == false, the aggregate functions are not finalized; that is, they are not replaced
  *  by their value but keep the intermediate state of the computation.
  * This is needed so that aggregation can be continued later
  *  (for example, when merging streams of partially aggregated data).
  */
class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream
{
public:
	/** The key columns and the arguments of the aggregate functions must already be computed.
	  */
	ParallelAggregatingBlockInputStream(
		BlockInputStreams inputs, BlockInputStreamPtr additional_input_at_end,
		const Aggregator::Params & params_, bool final_, size_t max_threads_, size_t temporary_data_merge_threads_);

	String getName() const override { return "ParallelAggregating"; }

	String getID() const override;

	void cancel() override;

protected:
	/// Intentionally does nothing, so that preparation for query execution is done in parallel, in ParallelInputsProcessor.
	void readPrefix() override
	{
	}

	Block readImpl() override;

private:
	/// NOTE: the order of the data members below is significant - members are initialized in declaration order.
	Aggregator::Params params;
	Aggregator aggregator;
	bool final;
	size_t max_threads;
	size_t temporary_data_merge_threads;

	size_t keys_size;
	size_t aggregates_size;

	/** Used if there is a limit on the maximum number of rows during aggregation
	  *  and group_by_overflow_mode == ANY.
	  * In that case new keys are not added to the set; aggregation is performed only for
	  *  the keys that have already made it into the set.
	  */
	bool no_more_keys = false;

	std::atomic<bool> executed {false};

	/// For reading data that has been flushed to a temporary file.
	struct TemporaryFileStream
	{
		ReadBufferFromFile file_in;
		CompressedReadBuffer compressed_in;
		BlockInputStreamPtr block_in;

		/// explicit: a path string must never be silently converted into an open file stream.
		explicit TemporaryFileStream(const std::string & path);
	};
	std::vector<std::unique_ptr<TemporaryFileStream>> temporary_inputs;

	Logger * log = &Logger::get("ParallelAggregatingBlockInputStream");


	ManyAggregatedDataVariants many_data;
	Exceptions exceptions;

	/// Per-thread counters and scratch buffers; one element of threads_data per thread.
	struct ThreadData
	{
		size_t src_rows = 0;
		size_t src_bytes = 0;

		StringRefs key;
		ConstColumnPlainPtrs key_columns;
		Aggregator::AggregateColumns aggregate_columns;
		Sizes key_sizes;

		/// Pre-sizes the buffers: keys_size elements for the key-related ones, aggregates_size for aggregate_columns.
		ThreadData(size_t keys_size, size_t aggregates_size)
		{
			key.resize(keys_size);
			key_columns.resize(keys_size);
			aggregate_columns.resize(aggregates_size);
			key_sizes.resize(keys_size);
		}
	};

	std::vector<ThreadData> threads_data;


	/** Callback object used as the template argument of ParallelInputsProcessor.
	  * Holds a reference to the owning stream, so it must not outlive it.
	  * NOTE(review): the callbacks are presumably invoked by ParallelInputsProcessor - see its header for the exact contract.
	  */
	struct Handler
	{
		/// explicit: the stream must never be silently converted into a Handler.
		explicit Handler(ParallelAggregatingBlockInputStream & parent_)
			: parent(parent_) {}

		void onBlock(Block & block, size_t thread_num);
		void onFinishThread(size_t thread_num);
		void onFinish();
		void onException(std::exception_ptr & exception, size_t thread_num);

		ParallelAggregatingBlockInputStream & parent;
	};

	/// handler is declared (and therefore initialized) before processor.
	/// NOTE(review): presumably processor is constructed with a reference to handler - confirm in the .cpp before reordering.
	Handler handler;
	ParallelInputsProcessor<Handler> processor;


	void execute();


	/** Ready blocks are fetched from here after the aggregation.
	  */
	std::unique_ptr<IBlockInputStream> impl;
};

}
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+								#pragma once
 								#include <DB/Interpreters/Aggregator.h>
-												dbms: external aggregation: initial implementation [#METR-17000].

											
										
										
											2015-12-01 14:09:05 +00:00
+								#include <DB/IO/ReadBufferFromFile.h>
 								#include <DB/IO/CompressedReadBuffer.h>
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+								#include <DB/DataStreams/IProfilingBlockInputStream.h>
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+								#include <DB/DataStreams/ParallelInputsProcessor.h>
-												Separate compilation of ProfileEvents; comments [#METR-23237].

											
										
										
											2016-10-24 02:02:37 +00:00
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+								namespace DB
 								{
 								/** Агрегирует несколько источников параллельно.
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+								  * Производит агрегацию блоков из разных источников независимо в разных потоках, затем объединяет результаты.
-												dbms: tiny fix [#METR-2944].

											
										
										
											2014-11-30 06:59:03 +00:00
+								  * Если final == false, агрегатные функции не финализируются, то есть, не заменяются на своё значение, а содержат промежуточное состояние вычислений.
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+								  * Это необходимо, чтобы можно было продолжить агрегацию (например, объединяя потоки частично агрегированных данных).
 								  */
 								class ParallelAggregatingBlockInputStream : public IProfilingBlockInputStream
 								{
 								public:
-												clickhouse: totals mode: development. [#METR-9365]

											
										
										
											2014-02-26 11:44:54 +00:00
+									/** Столбцы из key_names и аргументы агрегатных функций, уже должны быть вычислены.
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+									  */
-												dbms: FULL and RIGHT JOIN: preparation [#METR-15418].

											
										
										
											2015-07-21 21:29:02 +00:00
+									ParallelAggregatingBlockInputStream(
 										BlockInputStreams inputs, BlockInputStreamPtr additional_input_at_end,
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+										const Aggregator::Params & params_, bool final_, size_t max_threads_, size_t temporary_data_merge_threads_);
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
-												dbms: removed useless code [#METR-16739].

											
										
										
											2015-06-08 20:22:02 +00:00
+									String getName() const override { return "ParallelAggregating"; }
-												dbms: development [#CONV-2944].



											
										
										
											2012-10-20 02:10:47 +00:00
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+									String getID() const override;
-												dbms: development of multi-queries [#CONV-2944].



											
										
										
											2013-05-03 10:20:53 +00:00
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+									void cancel() override;
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
-												dbms: development [#CONV-2944].



											
										
										
											2012-10-20 02:10:47 +00:00
+								protected:
-												Second attempt [#METR-21408].

											
										
										
											2016-05-20 20:43:07 +00:00
+									/// Ничего не делаем, чтобы подготовка к выполнению запроса делалась параллельно, в ParallelInputsProcessor.
 									void readPrefix() override
 									{
 									}
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+									Block readImpl() override;
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
 								private:
-												dbms: external aggregation: initial implementation [#METR-17000].

											
										
										
											2015-12-01 14:09:05 +00:00
+									Aggregator::Params params;
-												dbms: tiny modifications [#METR-2944].

											
										
										
											2015-01-08 18:52:48 +00:00
+									Aggregator aggregator;
-												dbms: merged convertToBlock and finalize aggregation steps whenever possible [#METR-2944].



											
										
										
											2013-11-03 23:35:18 +00:00
+									bool final;
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+									size_t max_threads;
-												dbms: added setting 'aggregation_memory_efficient_merge_threads' [#METR-17000].

											
										
										
											2015-12-03 05:03:23 +00:00
+									size_t temporary_data_merge_threads;
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+									size_t keys_size;
 									size_t aggregates_size;
 									/** Используется, если есть ограничение на максимальное количество строк при агрегации,
 									  *  и если group_by_overflow_mode == ANY.
 									  * В этом случае, новые ключи не добавляются в набор, а производится агрегация только по
 									  *  ключам, которые уже успели попасть в набор.
 									  */
 									bool no_more_keys = false;
-												dbms: fixed build [#METR-19278].

											
										
										
											2015-12-16 20:21:52 +00:00
+									std::atomic<bool> executed {false};
-												dbms: external aggregation: initial implementation [#METR-17000].

											
										
										
											2015-12-01 14:09:05 +00:00
 									/// Для чтения сброшенных во временный файл данных.
 									struct TemporaryFileStream
 									{
 										ReadBufferFromFile file_in;
 										CompressedReadBuffer compressed_in;
 										BlockInputStreamPtr block_in;
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+										TemporaryFileStream(const std::string & path);
-												dbms: external aggregation: initial implementation [#METR-17000].

											
										
										
											2015-12-01 14:09:05 +00:00
+									};
 									std::vector<std::unique_ptr<TemporaryFileStream>> temporary_inputs;
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+									Logger * log = &Logger::get("ParallelAggregatingBlockInputStream");
-												dbms: fixed possible error [#METR-15530].

											
										
										
											2015-03-18 02:48:36 +00:00
+									ManyAggregatedDataVariants many_data;
 									Exceptions exceptions;
 									struct ThreadData
 									{
 										size_t src_rows = 0;
 										size_t src_bytes = 0;
 										StringRefs key;
 										ConstColumnPlainPtrs key_columns;
 										Aggregator::AggregateColumns aggregate_columns;
 										Sizes key_sizes;
 										ThreadData(size_t keys_size, size_t aggregates_size)
 										{
 											key.resize(keys_size);
 											key_columns.resize(keys_size);
 											aggregate_columns.resize(aggregates_size);
 											key_sizes.resize(keys_size);
 										}
 									};
 									std::vector<ThreadData> threads_data;
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+									struct Handler
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+									{
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
+										Handler(ParallelAggregatingBlockInputStream & parent_)
 											: parent(parent_) {}
-												dbms: tracking amount of memory usage per query [#METR-11015].

											
										
										
											2014-05-03 22:57:43 +00:00
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+										void onBlock(Block & block, size_t thread_num);
 										void onFinishThread(size_t thread_num);
 										void onFinish();
 										void onException(std::exception_ptr & exception, size_t thread_num);
-												dbms: more uniform load of threads while aggragation [#METR-2944].

											
										
										
											2014-11-30 18:22:57 +00:00
 										ParallelAggregatingBlockInputStream & parent;
 									};
 									Handler handler;
 									ParallelInputsProcessor<Handler> processor;
-												dbms: more scalable aggregator: development [#METR-2944].

											
										
										
											2015-01-02 03:16:28 +00:00
-												Do not include ClickHouseRevision.h in header files [#METR-2944].

											
										
										
											2016-12-12 05:14:46 +00:00
+									void execute();
-												dbms: little better (but doesn't fix any error) [#METR-19278].

											
										
										
											2015-12-16 04:04:23 +00:00
 									/** Отсюда будем доставать готовые блоки после агрегации.
 									  */
 									std::unique_ptr<IBlockInputStream> impl;
-												dbms: development [#CONV-2944].



											
										
										
											2012-02-27 06:28:20 +00:00
+								};
 								}