2011-09-04 01:42:14 +00:00
|
|
|
#pragma once
|
|
|
|
|
2015-09-29 19:19:54 +00:00
|
|
|
#include <common/logger_useful.h>
|
2012-09-05 19:51:09 +00:00
|
|
|
|
2019-10-15 16:31:49 +00:00
|
|
|
#include <Common/filesystemHelpers.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/SortDescription.h>
|
|
|
|
#include <Core/SortCursor.h>
|
2011-09-04 01:42:14 +00:00
|
|
|
|
2019-01-23 14:48:50 +00:00
|
|
|
#include <DataStreams/IBlockInputStream.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <DataStreams/NativeBlockInputStream.h>
|
2015-01-07 17:19:23 +00:00
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/ReadBufferFromFile.h>
|
2018-12-28 18:15:26 +00:00
|
|
|
#include <Compression/CompressedReadBuffer.h>
|
2011-09-04 01:42:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2019-10-15 16:31:49 +00:00
|
|
|
/// Forward declarations: these types are only used through smart pointers in this header,
/// so their full definitions are not required here.
struct TemporaryFileStream;

class Volume;
using VolumePtr = std::shared_ptr<Volume>;

namespace ErrorCodes
{
}
|
2016-07-25 20:57:05 +00:00
|
|
|
/** Merges a stream of blocks, each of which is sorted separately, into a stream of blocks sorted as a whole.
  * If there is too much data to sort, external sorting with temporary files may be used.
  */
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2016-07-25 20:57:05 +00:00
|
|
|
/** Part of implementation. Merging array of ready (already read from somewhere) blocks.
|
|
|
|
* Returns result of merge as stream of blocks, not more than 'max_merged_block_size' rows in each.
|
2015-01-07 15:30:11 +00:00
|
|
|
*/
|
2019-01-23 14:48:50 +00:00
|
|
|
class MergeSortingBlocksBlockInputStream : public IBlockInputStream
|
2015-01-07 15:30:11 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
/// limit - if not 0, allowed to return just first 'limit' rows in sorted order.
|
2019-10-15 16:31:49 +00:00
|
|
|
MergeSortingBlocksBlockInputStream(Blocks & blocks_, const SortDescription & description_,
|
2019-02-10 15:17:45 +00:00
|
|
|
size_t max_merged_block_size_, UInt64 limit_ = 0);
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
String getName() const override { return "MergeSortingBlocks"; }
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2017-04-07 19:47:21 +00:00
|
|
|
bool isSortedOutput() const override { return true; }
|
|
|
|
const SortDescription & getSortDescription() const override { return description; }
|
|
|
|
|
2018-02-19 04:29:56 +00:00
|
|
|
Block getHeader() const override { return header; }
|
2018-01-06 18:10:44 +00:00
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
protected:
|
2017-04-01 07:20:54 +00:00
|
|
|
Block readImpl() override;
|
2015-01-07 15:30:11 +00:00
|
|
|
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Blocks & blocks;
|
2018-02-19 04:29:56 +00:00
|
|
|
Block header;
|
2017-04-01 07:20:54 +00:00
|
|
|
SortDescription description;
|
|
|
|
size_t max_merged_block_size;
|
2019-02-10 15:17:45 +00:00
|
|
|
UInt64 limit;
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t total_merged_rows = 0;
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2019-12-20 23:57:57 +00:00
|
|
|
SortCursorImpls cursors;
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
bool has_collation = false;
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2019-12-20 23:57:57 +00:00
|
|
|
SortingHeap<SortCursor> queue_without_collation;
|
2019-12-22 01:49:38 +00:00
|
|
|
SortingHeap<SimpleSortCursor> queue_simple;
|
2019-12-20 23:57:57 +00:00
|
|
|
SortingHeap<SortCursorWithCollation> queue_with_collation;
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/** Two different cursors are supported - with and without Collation.
|
|
|
|
* Templates are used (instead of virtual functions in SortCursor) for zero-overhead.
|
|
|
|
*/
|
2019-12-20 23:57:57 +00:00
|
|
|
template <typename TSortingHeap>
|
|
|
|
Block mergeImpl(TSortingHeap & queue);
|
2015-01-07 15:30:11 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
2019-01-23 14:48:50 +00:00
|
|
|
class MergeSortingBlockInputStream : public IBlockInputStream
|
2011-09-04 01:42:14 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
/// limit - if not 0, allowed to return just first 'limit' rows in sorted order.
|
2017-09-08 02:29:47 +00:00
|
|
|
MergeSortingBlockInputStream(const BlockInputStreamPtr & input, SortDescription & description_,
|
2019-02-10 15:17:45 +00:00
|
|
|
size_t max_merged_block_size_, UInt64 limit_,
|
2018-09-24 20:07:30 +00:00
|
|
|
size_t max_bytes_before_remerge_,
|
2020-01-19 14:26:28 +00:00
|
|
|
size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_,
|
2019-08-27 00:23:07 +00:00
|
|
|
size_t min_free_disk_space_);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
String getName() const override { return "MergeSorting"; }
|
|
|
|
|
2017-04-07 19:47:21 +00:00
|
|
|
bool isSortedOutput() const override { return true; }
|
|
|
|
const SortDescription & getSortDescription() const override { return description; }
|
|
|
|
|
2018-04-08 04:38:27 +00:00
|
|
|
Block getHeader() const override { return header; }
|
2018-01-06 18:10:44 +00:00
|
|
|
|
2012-10-20 02:10:47 +00:00
|
|
|
protected:
|
2017-04-01 07:20:54 +00:00
|
|
|
Block readImpl() override;
|
2012-10-20 02:10:47 +00:00
|
|
|
|
2011-09-04 01:42:14 +00:00
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
SortDescription description;
|
|
|
|
size_t max_merged_block_size;
|
2019-02-10 15:17:45 +00:00
|
|
|
UInt64 limit;
|
2011-09-04 01:42:14 +00:00
|
|
|
|
2018-09-24 20:07:30 +00:00
|
|
|
size_t max_bytes_before_remerge;
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t max_bytes_before_external_sort;
|
2020-01-19 14:26:28 +00:00
|
|
|
VolumePtr tmp_volume;
|
2019-08-27 00:23:07 +00:00
|
|
|
size_t min_free_disk_space;
|
2015-01-07 17:19:23 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Logger * log = &Logger::get("MergeSortingBlockInputStream");
|
2014-11-08 23:52:18 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Blocks blocks;
|
2018-09-26 01:30:07 +00:00
|
|
|
size_t sum_rows_in_blocks = 0;
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t sum_bytes_in_blocks = 0;
|
|
|
|
std::unique_ptr<IBlockInputStream> impl;
|
2015-01-07 17:19:23 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Before operation, will remove constant columns from blocks. And after, place constant columns back.
|
|
|
|
/// (to avoid excessive virtual function calls and because constants cannot be serialized in Native format for temporary files)
|
|
|
|
/// Save original block structure here.
|
2018-02-19 00:45:32 +00:00
|
|
|
Block header;
|
2018-04-08 04:38:27 +00:00
|
|
|
Block header_without_constants;
|
2016-07-25 20:57:05 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Everything below is for external sorting.
|
2019-10-01 18:51:33 +00:00
|
|
|
std::vector<std::unique_ptr<TemporaryFile>> temporary_files;
|
2017-04-01 07:20:54 +00:00
|
|
|
std::vector<std::unique_ptr<TemporaryFileStream>> temporary_inputs;
|
2015-01-07 17:19:23 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
BlockInputStreams inputs_to_merge;
|
2018-09-24 20:07:30 +00:00
|
|
|
|
|
|
|
/// Merge all accumulated blocks to keep no more than limit rows.
|
|
|
|
void remerge();
|
|
|
|
/// If remerge doesn't save memory at least several times, mark it as useless and don't do it anymore.
|
|
|
|
bool remerge_is_useful = true;
|
2011-09-04 01:42:14 +00:00
|
|
|
};
|
|
|
|
}
|