// ClickHouse/dbms/include/DB/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h

#pragma once

#include <common/threadpool.hpp>
#include <DB/Interpreters/Aggregator.h>
#include <DB/DataStreams/IProfilingBlockInputStream.h>
#include <DB/Common/ConcurrentBoundedQueue.h>


namespace DB
{


/** Finishes aggregation of (merges) streams of partially aggregated blocks, keeping in memory
  * only one or a few (up to merging_threads) blocks from each source.
  * This saves RAM when two-level aggregation is used, where each stream will contain
  * up to 256 blocks with parts of the result.
  *
  * Aggregate functions in the blocks must not be finalized, so that their states can be merged.
  */
class MergingAggregatedMemoryEfficientBlockInputStream : public IProfilingBlockInputStream
{
public:
	MergingAggregatedMemoryEfficientBlockInputStream(
		BlockInputStreams inputs_, const Aggregator::Params & params, bool final_,
		size_t reading_threads_, size_t merging_threads_);

	~MergingAggregatedMemoryEfficientBlockInputStream();

	String getName() const override { return "MergingAggregatedMemoryEfficient"; }

	String getID() const override;

	/// Sends the query (initiates the computation) earlier than read.
	void readPrefix() override;

protected:
	Block readImpl() override;

private:
	/// NOTE: member declaration order defines initialization order; do not reorder.
	Aggregator aggregator;
	bool final;
	size_t reading_threads;
	size_t merging_threads;

	bool started = false;

	/// Mode flags. They used to be `volatile bool`; volatile gives neither atomicity nor
	/// inter-thread ordering, so std::atomic is used instead.
	/// NOTE(review): presumably written from reading_pool threads and read by the
	/// merging code - confirm against the .cpp.
	std::atomic<bool> has_two_level {false};
	std::atomic<bool> has_overflows {false};
	int current_bucket_num = -1;

	/// Per-source state: the source stream plus the blocks currently held for it.
	struct Input
	{
		BlockInputStreamPtr stream;
		Block block;
		Block overflow_block;
		std::vector<Block> splitted_blocks;
		bool is_exhausted = false;

		Input(BlockInputStreamPtr & stream_) : stream(stream_) {}
	};

	std::vector<Input> inputs;

	using BlocksToMerge = Poco::SharedPtr<BlocksList>;

	void start();

	/// Get the blocks that can be merged. This allows merging them in parallel in separate threads.
	BlocksToMerge getNextBlocksToMerge();

	std::unique_ptr<boost::threadpool::pool> reading_pool;

	/// For parallel merge.
	/// One element of the result queue: either a block or a captured exception.
	struct OutputData
	{
		Block block;
		std::exception_ptr exception;

		OutputData() {}
		OutputData(Block && block_) : block(std::move(block_)) {}
		OutputData(std::exception_ptr && exception_) : exception(std::move(exception_)) {}
	};

	/// State shared by the merging threads. The pool size, the result queue capacity and
	/// the initial active thread count are all set to max_threads.
	struct ParallelMergeData
	{
		boost::threadpool::pool pool;
		std::mutex get_next_blocks_mutex;
		ConcurrentBoundedQueue<OutputData> result_queue;
		/// NOTE(review): the two flags below are plain bool; presumably they are only
		/// accessed under get_next_blocks_mutex - confirm against the .cpp.
		bool exhausted = false;	/// There is no more data.
		bool finish = false;	/// Work must be finished before the data has run out.
		std::atomic<size_t> active_threads;

		ParallelMergeData(size_t max_threads) : pool(max_threads), result_queue(max_threads), active_threads(max_threads) {}
	};

	std::unique_ptr<ParallelMergeData> parallel_merge_data;

	void mergeThread(MemoryTracker * memory_tracker);
};

}
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
+								#pragma once
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+								#include <common/threadpool.hpp>
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
+								#include <DB/Interpreters/Aggregator.h>
 								#include <DB/DataStreams/IProfilingBlockInputStream.h>
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+								#include <DB/Common/ConcurrentBoundedQueue.h>
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
 								namespace DB
 								{
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+								/** Доагрегирует потоки блоков, держа в оперативной памяти только по одному или несколько (до merging_threads) блоков из каждого источника.
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
+								  * Это экономит оперативку в случае использования двухуровневой агрегации, где в каждом потоке будет до 256 блоков с частями результата.
 								  *
 								  * Агрегатные функции в блоках не должны быть финализированы, чтобы их состояния можно было объединить.
 								  */
 								class MergingAggregatedMemoryEfficientBlockInputStream : public IProfilingBlockInputStream
 								{
 								public:
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+									MergingAggregatedMemoryEfficientBlockInputStream(
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+										BlockInputStreams inputs_, const Aggregator::Params & params, bool final_,
 										size_t reading_threads_, size_t merging_threads_);
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
-												dbms: fixed error [#METR-19262].

											
										
										
											2015-12-05 03:04:13 +00:00
+									~MergingAggregatedMemoryEfficientBlockInputStream();
-												dbms: memory efficient distributed aggregation: development [#METR-17536].

											
										
										
											2015-09-07 20:08:02 +00:00
+									String getName() const override { return "MergingAggregatedMemoryEfficient"; }
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
-												dbms: improvement [#METR-17536].

											
										
										
											2015-09-08 19:53:16 +00:00
+									String getID() const override;
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+									/// Отправляет запрос (инициирует вычисления) раньше, чем read.
 									void readPrefix() override;
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
+								protected:
-												dbms: improvement [#METR-17536].

											
										
										
											2015-09-08 19:53:16 +00:00
+									Block readImpl() override;
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
 								private:
 									Aggregator aggregator;
 									bool final;
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+									size_t reading_threads;
 									size_t merging_threads;
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
-												dbms: development of memory efficient distributed aggregation method [#METR-17536].

											
										
										
											2015-09-07 07:40:14 +00:00
+									bool started = false;
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+									volatile bool has_two_level = false;
 									volatile bool has_overflows = false;
-												dbms: development of memory efficient distributed aggregation method [#METR-17536].

											
										
										
											2015-09-07 07:40:14 +00:00
+									int current_bucket_num = -1;
 									struct Input
 									{
 										BlockInputStreamPtr stream;
 										Block block;
 										Block overflow_block;
 										std::vector<Block> splitted_blocks;
 										bool is_exhausted = false;
 										Input(BlockInputStreamPtr & stream_) : stream(stream_) {}
 									};
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
-												dbms: development of memory efficient distributed aggregation method [#METR-17536].

											
										
										
											2015-09-07 07:40:14 +00:00
+									std::vector<Input> inputs;
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
 									using BlocksToMerge = Poco::SharedPtr<BlocksList>;
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+									void start();
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+									/// Получить блоки, которые можно мерджить. Это позволяет мерджить их параллельно в отдельных потоках.
 									BlocksToMerge getNextBlocksToMerge();
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: reading data in parallel [#METR-17000].

											
										
										
											2015-12-05 04:20:37 +00:00
+									std::unique_ptr<boost::threadpool::pool> reading_pool;
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+									/// Для параллельного мерджа.
 									struct OutputData
 									{
 										Block block;
 										std::exception_ptr exception;
 										OutputData() {}
 										OutputData(Block && block_) : block(std::move(block_)) {}
 										OutputData(std::exception_ptr && exception_) : exception(std::move(exception_)) {}
 									};
 									struct ParallelMergeData
 									{
 										boost::threadpool::pool pool;
 										std::mutex get_next_blocks_mutex;
 										ConcurrentBoundedQueue<OutputData> result_queue;
-												dbms: fixed error [#METR-19316].

											
										
										
											2015-12-09 04:28:01 +00:00
+										bool exhausted = false;	/// Данных больше нет.
 										bool finish = false;	/// Нужно завершить работу раньше, чем данные закончились.
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: fixed two errors [#METR-17000].

											
										
										
											2015-12-03 02:43:40 +00:00
+										std::atomic<size_t> active_threads;
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
-												dbms: MergingAggregatedMemoryEfficientBlockInputStream: fixed two errors [#METR-17000].

											
										
										
											2015-12-03 02:43:40 +00:00
+										ParallelMergeData(size_t max_threads) : pool(max_threads), result_queue(max_threads), active_threads(max_threads) {}
-												dbms: allowed to merge partial-aggregated streams memory-efficient in parallel [#METR-17000].

											
										
										
											2015-12-01 22:35:48 +00:00
+									};
 									std::unique_ptr<ParallelMergeData> parallel_merge_data;
 									void mergeThread(MemoryTracker * memory_tracker);
-												dbms: added memory-efficient mode of distributed aggregation [#METR-17536].

											
										
										
											2015-07-30 23:41:02 +00:00
+								};
 								}