2011-09-04 01:42:14 +00:00
|
|
|
#pragma once
|
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
#include <queue>
|
2015-01-07 17:19:23 +00:00
|
|
|
#include <Poco/TemporaryFile.h>
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2015-09-29 19:19:54 +00:00
|
|
|
#include <common/logger_useful.h>
|
2012-09-05 19:51:09 +00:00
|
|
|
|
2011-09-04 01:42:14 +00:00
|
|
|
#include <DB/Core/SortDescription.h>
|
|
|
|
|
2011-09-04 21:23:19 +00:00
|
|
|
#include <DB/DataStreams/IProfilingBlockInputStream.h>
|
2015-01-07 17:19:23 +00:00
|
|
|
#include <DB/DataStreams/NativeBlockInputStream.h>
|
|
|
|
|
|
|
|
#include <DB/IO/ReadBufferFromFile.h>
|
|
|
|
#include <DB/IO/CompressedReadBuffer.h>
|
2011-09-04 01:42:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/** Merges a stream of blocks, each sorted separately, into a stream of blocks sorted as a whole.
|
|
|
|
* If there is too much data to sort, external sorting with temporary files could be used.
|
2011-09-04 01:42:14 +00:00
|
|
|
*/
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/** Part of implementation. Merging array of ready (already read from somewhere) blocks.
|
|
|
|
* Returns result of merge as stream of blocks, not more than 'max_merged_block_size' rows in each.
|
2015-01-07 15:30:11 +00:00
|
|
|
*/
|
|
|
|
class MergeSortingBlocksBlockInputStream : public IProfilingBlockInputStream
|
|
|
|
{
|
|
|
|
public:
|
2016-07-25 20:57:05 +00:00
|
|
|
/// limit - if not 0, allowed to return just first 'limit' rows in sorted order.
|
2015-01-07 15:30:11 +00:00
|
|
|
MergeSortingBlocksBlockInputStream(Blocks & blocks_, SortDescription & description_,
|
|
|
|
size_t max_merged_block_size_, size_t limit_ = 0);
|
|
|
|
|
2015-06-08 20:22:02 +00:00
|
|
|
String getName() const override { return "MergeSortingBlocks"; }
|
2015-01-07 15:30:11 +00:00
|
|
|
String getID() const override { return getName(); }
|
|
|
|
|
|
|
|
protected:
|
|
|
|
Block readImpl() override;
|
|
|
|
|
|
|
|
private:
|
|
|
|
Blocks & blocks;
|
|
|
|
SortDescription description;
|
|
|
|
size_t max_merged_block_size;
|
|
|
|
size_t limit;
|
|
|
|
size_t total_merged_rows = 0;
|
|
|
|
|
|
|
|
using CursorImpls = std::vector<SortCursorImpl>;
|
|
|
|
CursorImpls cursors;
|
|
|
|
|
|
|
|
bool has_collation = false;
|
|
|
|
|
|
|
|
std::priority_queue<SortCursor> queue;
|
|
|
|
std::priority_queue<SortCursorWithCollation> queue_with_collation;
|
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/** Two different cursors are supported - with and without Collation.
|
|
|
|
* Templates are used (instead of virtual functions in SortCursor) for zero-overhead.
|
2015-01-07 15:30:11 +00:00
|
|
|
*/
|
|
|
|
template <typename TSortCursor>
|
|
|
|
Block mergeImpl(std::priority_queue<TSortCursor> & queue);
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2011-09-04 21:23:19 +00:00
|
|
|
class MergeSortingBlockInputStream : public IProfilingBlockInputStream
|
2011-09-04 01:42:14 +00:00
|
|
|
{
|
|
|
|
public:
|
2016-07-25 20:57:05 +00:00
|
|
|
/// limit - if not 0, allowed to return just first 'limit' rows in sorted order.
|
2015-01-07 17:19:23 +00:00
|
|
|
MergeSortingBlockInputStream(BlockInputStreamPtr input_, SortDescription & description_,
|
|
|
|
size_t max_merged_block_size_, size_t limit_,
|
2015-05-28 03:49:28 +00:00
|
|
|
size_t max_bytes_before_external_sort_, const std::string & tmp_path_)
|
2015-01-07 17:19:23 +00:00
|
|
|
: description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_),
|
2015-05-28 03:49:28 +00:00
|
|
|
max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_path(tmp_path_)
|
2011-09-04 21:23:19 +00:00
|
|
|
{
|
2013-05-04 04:05:15 +00:00
|
|
|
children.push_back(input_);
|
2011-09-04 21:23:19 +00:00
|
|
|
}
|
2011-09-04 01:42:14 +00:00
|
|
|
|
2015-06-08 20:22:02 +00:00
|
|
|
String getName() const override { return "MergeSorting"; }
|
2011-09-04 01:42:14 +00:00
|
|
|
|
2014-11-08 23:52:18 +00:00
|
|
|
String getID() const override
|
2013-05-03 10:20:53 +00:00
|
|
|
{
|
|
|
|
std::stringstream res;
|
2013-05-04 05:20:07 +00:00
|
|
|
res << "MergeSorting(" << children.back()->getID();
|
2014-11-08 23:52:18 +00:00
|
|
|
|
2013-05-03 10:20:53 +00:00
|
|
|
for (size_t i = 0; i < description.size(); ++i)
|
|
|
|
res << ", " << description[i].getID();
|
2014-11-08 23:52:18 +00:00
|
|
|
|
2013-05-03 10:20:53 +00:00
|
|
|
res << ")";
|
|
|
|
return res.str();
|
|
|
|
}
|
|
|
|
|
2012-10-20 02:10:47 +00:00
|
|
|
protected:
|
2014-11-08 23:52:18 +00:00
|
|
|
Block readImpl() override;
|
2012-10-20 02:10:47 +00:00
|
|
|
|
2011-09-04 01:42:14 +00:00
|
|
|
private:
|
|
|
|
SortDescription description;
|
2015-01-07 17:19:23 +00:00
|
|
|
size_t max_merged_block_size;
|
2013-09-16 05:44:47 +00:00
|
|
|
size_t limit;
|
2011-09-04 01:42:14 +00:00
|
|
|
|
2015-01-07 17:19:23 +00:00
|
|
|
size_t max_bytes_before_external_sort;
|
|
|
|
const std::string tmp_path;
|
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
Logger * log = &Logger::get("MergeSortingBlockInputStream");
|
2014-11-08 23:52:18 +00:00
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
Blocks blocks;
|
2015-01-07 17:19:23 +00:00
|
|
|
size_t sum_bytes_in_blocks = 0;
|
|
|
|
std::unique_ptr<IBlockInputStream> impl;
|
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/// Before operation, will remove constant columns from blocks. And after, place constant columns back.
|
|
|
|
/// (to avoid excessive virtual function calls and because constants cannot be serialized in Native format for temporary files)
|
|
|
|
/// Save original block structure here.
|
|
|
|
Block sample_block;
|
|
|
|
|
|
|
|
/// Everything below is for external sorting.
|
2015-01-07 17:19:23 +00:00
|
|
|
std::vector<std::unique_ptr<Poco::TemporaryFile>> temporary_files;
|
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/// For reading data from temporary file.
|
2015-01-07 17:19:23 +00:00
|
|
|
struct TemporaryFileStream
|
|
|
|
{
|
|
|
|
ReadBufferFromFile file_in;
|
|
|
|
CompressedReadBuffer compressed_in;
|
|
|
|
BlockInputStreamPtr block_in;
|
|
|
|
|
2015-05-28 03:49:28 +00:00
|
|
|
TemporaryFileStream(const std::string & path)
|
2016-05-28 12:22:22 +00:00
|
|
|
: file_in(path), compressed_in(file_in), block_in(std::make_shared<NativeBlockInputStream>(compressed_in)) {}
|
2015-01-07 17:19:23 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<std::unique_ptr<TemporaryFileStream>> temporary_inputs;
|
|
|
|
|
|
|
|
BlockInputStreams inputs_to_merge;
|
2011-09-04 01:42:14 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|