2017-04-01 09:19:00 +00:00
|
|
|
#include <DataStreams/MergeSortingBlockInputStream.h>
|
|
|
|
#include <DataStreams/MergingSortedBlockInputStream.h>
|
|
|
|
#include <DataStreams/NativeBlockOutputStream.h>
|
|
|
|
#include <DataStreams/copyData.h>
|
2018-10-19 12:02:31 +00:00
|
|
|
#include <DataStreams/processConstants.h>
|
2018-09-24 20:07:30 +00:00
|
|
|
#include <Common/formatReadable.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/WriteBufferFromFile.h>
|
|
|
|
#include <IO/CompressedWriteBuffer.h>
|
2018-10-04 10:24:51 +00:00
|
|
|
#include <Interpreters/sortBlock.h>
|
2011-09-04 01:42:14 +00:00
|
|
|
|
|
|
|
|
2016-10-24 02:02:37 +00:00
|
|
|
namespace ProfileEvents
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
extern const Event ExternalSortWritePart;
|
|
|
|
extern const Event ExternalSortMerge;
|
2016-10-24 02:02:37 +00:00
|
|
|
}
|
|
|
|
|
2011-09-04 01:42:14 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2018-04-07 01:46:50 +00:00
|
|
|
/** Wraps `input` as the single child stream and remembers the sorting parameters.
  *
  * The sort description is copied into the stream; the copy (not the caller's object)
  * is then passed to removeConstantsFromSortDescription together with the child's header.
  * A second copy of the header, passed through removeConstantsFromBlock, is kept
  * as `header_without_constants` and is used for the temporary files.
  */
MergeSortingBlockInputStream::MergeSortingBlockInputStream(
    const BlockInputStreamPtr & input,
    SortDescription & description_,
    size_t max_merged_block_size_,
    size_t limit_,
    size_t max_bytes_before_remerge_,
    size_t max_bytes_before_external_sort_,
    const std::string & tmp_path_)
    : description(description_)
    , max_merged_block_size(max_merged_block_size_)
    , limit(limit_)
    , max_bytes_before_remerge(max_bytes_before_remerge_)
    , max_bytes_before_external_sort(max_bytes_before_external_sort_)
    , tmp_path(tmp_path_)
{
    children.push_back(input);

    /// The header is taken from the child stream that was just registered.
    header = children.at(0)->getHeader();

    /// Keep a separate copy of the header that goes through removeConstantsFromBlock.
    header_without_constants = header;
    removeConstantsFromBlock(header_without_constants);

    /// Adjust our own copy of the sort description against the full header.
    removeConstantsFromSortDescription(header, description);
}
|
2018-02-19 00:45:32 +00:00
|
|
|
|
2018-04-07 01:46:50 +00:00
|
|
|
|
|
|
|
/** Returns the next block of the sorted result.
  *
  * Algorithm (executed on the first call, while `impl` is not yet created):
  * - read blocks from the source stream into memory;
  * - if a limit is set and a lot of data was accumulated, re-merge the blocks in memory (see remerge());
  * - if too many bytes are accumulated and external sorting is enabled,
  *   merge all blocks in memory into a sorted stream and write it to a temporary file;
  * - at the end, merge all sorted streams from temporary files and also from the rest of blocks in memory.
  *
  * Subsequent calls just pull the next block from the prepared `impl` stream.
  *
  * Special cases visible here:
  * - if the sort description is empty, source blocks are returned as is, one per call;
  * - an empty Block is returned if nothing was accumulated or if isCancelledOrThrowIfKilled() reports true.
  */
Block MergeSortingBlockInputStream::readImpl()
{
    /// The source has not been consumed yet: read it fully and prepare the merging stream.
    if (!impl)
    {
        while (Block block = children.back()->read())
        {
            /// If there were only const columns in sort description, then there is no need to sort.
            /// Return the blocks as is.
            if (description.empty())
                return block;

            removeConstantsFromBlock(block);

            blocks.push_back(block);
            sum_rows_in_blocks += block.rows();
            sum_bytes_in_blocks += block.allocatedBytes();

            /** If significant amount of data was accumulated, perform preliminary merging step.
              */
            if (blocks.size() > 1
                && limit
                && limit * 2 < sum_rows_in_blocks   /// 2 is just a guess.
                && remerge_is_useful
                && max_bytes_before_remerge
                && sum_bytes_in_blocks > max_bytes_before_remerge)
            {
                remerge();
            }

            /** If too many of them and if external sorting is enabled,
              *  will merge blocks that we have in memory at this moment and write merged stream to temporary (compressed) file.
              * NOTE. It's possible to check free space in filesystem.
              */
            if (max_bytes_before_external_sort && sum_bytes_in_blocks > max_bytes_before_external_sort)
            {
                Poco::File(tmp_path).createDirectories();

                /// Construct the owning pointer first: a naked `new` passed to emplace_back
                /// would leak the TemporaryFile if growing the container throws.
                temporary_files.emplace_back(std::make_unique<Poco::TemporaryFile>(tmp_path));
                const std::string & path = temporary_files.back()->path();

                WriteBufferFromFile file_buf(path);
                CompressedWriteBuffer compressed_buf(file_buf);
                NativeBlockOutputStream block_out(compressed_buf, 0, header_without_constants);
                MergeSortingBlocksBlockInputStream block_in(blocks, description, max_merged_block_size, limit);

                LOG_INFO(log, "Sorting and writing part of data into temporary file " + path);
                ProfileEvents::increment(ProfileEvents::ExternalSortWritePart);
                copyData(block_in, block_out, &is_cancelled);    /// NOTE. Possibly limit disk usage.
                LOG_INFO(log, "Done writing part of data into temporary file " + path);

                /// Everything accumulated so far now lives in the temporary file.
                blocks.clear();
                sum_bytes_in_blocks = 0;
                sum_rows_in_blocks = 0;
            }
        }

        /// Nothing to return: either no data at all, or the query was cancelled.
        if ((blocks.empty() && temporary_files.empty()) || isCancelledOrThrowIfKilled())
            return Block();

        if (temporary_files.empty())
        {
            /// Everything fits in memory: merge the accumulated blocks directly.
            impl = std::make_unique<MergeSortingBlocksBlockInputStream>(blocks, description, max_merged_block_size, limit);
        }
        else
        {
            /// There were temporary files.
            ProfileEvents::increment(ProfileEvents::ExternalSortMerge);

            LOG_INFO(log, "There are " << temporary_files.size() << " temporary sorted parts to merge.");

            /// Create sorted streams to merge.
            for (const auto & file : temporary_files)
            {
                temporary_inputs.emplace_back(std::make_unique<TemporaryFileStream>(file->path(), header_without_constants));
                inputs_to_merge.emplace_back(temporary_inputs.back()->block_in);
            }

            /// Rest of blocks in memory.
            if (!blocks.empty())
                inputs_to_merge.emplace_back(std::make_shared<MergeSortingBlocksBlockInputStream>(blocks, description, max_merged_block_size, limit));

            /// Will merge those sorted streams.
            impl = std::make_unique<MergingSortedBlockInputStream>(inputs_to_merge, description, max_merged_block_size, limit);
        }
    }

    Block res = impl->read();

    /// Constants were stripped from the blocks before sorting; pass non-empty results
    /// through enrichBlockWithConstants with the full header (presumably restores them).
    if (res)
        enrichBlockWithConstants(res, header);
    return res;
}
|
|
|
|
|
2012-07-23 06:23:29 +00:00
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
/** Prepares an in-memory merge of `blocks_`.
  *
  * Blocks with zero rows are dropped from the list (the list is replaced by its non-empty subset),
  * a sort cursor is created for every remaining block, and all cursors are pushed into
  * the priority queue that matches whether any cursor reports collation.
  *
  * `blocks_` must contain at least one block: the header is cloned from the first one via at(0).
  */
MergeSortingBlocksBlockInputStream::MergeSortingBlocksBlockInputStream(
    Blocks & blocks_, SortDescription & description_, size_t max_merged_block_size_, size_t limit_)
    : blocks(blocks_), header(blocks.at(0).cloneEmpty()), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_)
{
    Blocks blocks_with_rows;

    for (const auto & candidate : blocks)
    {
        if (candidate.rows() != 0)
        {
            blocks_with_rows.push_back(candidate);
            cursors.emplace_back(candidate, description);

            /// One cursor with collation is enough to switch the whole merge to the collation-aware queue.
            if (cursors.back().has_collation)
                has_collation = true;
        }
    }

    blocks.swap(blocks_with_rows);

    /// Queues store pointers into `cursors`, so they are filled only after `cursors` stopped growing.
    if (has_collation)
    {
        for (auto & cursor : cursors)
            queue_with_collation.push(SortCursorWithCollation(&cursor));
    }
    else
    {
        for (auto & cursor : cursors)
            queue_without_collation.push(SortCursor(&cursor));
    }
}
|
2014-08-22 02:31:54 +00:00
|
|
|
|
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
/** Returns the next merged block, or an empty Block when there is nothing left.
  *
  * - no blocks: empty Block;
  * - exactly one block: that block is returned as is and the list is cleared;
  * - otherwise: rows are merged through the queue selected by `has_collation`.
  */
Block MergeSortingBlocksBlockInputStream::readImpl()
{
    const size_t num_blocks = blocks.size();

    if (num_blocks == 0)
        return Block();

    if (num_blocks == 1)
    {
        /// Nothing to merge with: hand the only block over and mark the stream as exhausted.
        Block single = blocks[0];
        blocks.clear();
        return single;
    }

    if (has_collation)
        return mergeImpl<SortCursorWithCollation>(queue_with_collation);

    return mergeImpl<SortCursor>(queue_without_collation);
}
|
2012-07-23 06:23:29 +00:00
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2013-09-16 05:44:47 +00:00
|
|
|
template <typename TSortCursor>
|
2015-01-07 15:30:11 +00:00
|
|
|
Block MergeSortingBlocksBlockInputStream::mergeImpl(std::priority_queue<TSortCursor> & queue)
|
2013-05-28 16:56:05 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t num_columns = blocks[0].columns();
|
|
|
|
|
2017-12-15 00:06:56 +00:00
|
|
|
MutableColumns merged_columns = blocks[0].cloneEmptyColumns();
|
2017-12-15 00:01:59 +00:00
|
|
|
/// TODO: reserve (in each column)
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
/// Take rows from queue in right order and push to 'merged'.
|
|
|
|
size_t merged_rows = 0;
|
|
|
|
while (!queue.empty())
|
|
|
|
{
|
|
|
|
TSortCursor current = queue.top();
|
|
|
|
queue.pop();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < num_columns; ++i)
|
|
|
|
merged_columns[i]->insertFrom(*current->all_columns[i], current->pos);
|
|
|
|
|
|
|
|
if (!current->isLast())
|
|
|
|
{
|
|
|
|
current->next();
|
|
|
|
queue.push(current);
|
|
|
|
}
|
|
|
|
|
|
|
|
++total_merged_rows;
|
|
|
|
if (limit && total_merged_rows == limit)
|
|
|
|
{
|
2017-12-17 08:44:06 +00:00
|
|
|
auto res = blocks[0].cloneWithColumns(std::move(merged_columns));
|
2017-04-01 07:20:54 +00:00
|
|
|
blocks.clear();
|
2017-12-17 08:44:06 +00:00
|
|
|
return res;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
++merged_rows;
|
|
|
|
if (merged_rows == max_merged_block_size)
|
2017-12-15 00:06:56 +00:00
|
|
|
return blocks[0].cloneWithColumns(std::move(merged_columns));
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (merged_rows == 0)
|
2017-12-15 00:01:59 +00:00
|
|
|
return {};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-15 00:06:56 +00:00
|
|
|
return blocks[0].cloneWithColumns(std::move(merged_columns));
|
2012-07-23 20:01:29 +00:00
|
|
|
}
|
2012-07-23 06:23:29 +00:00
|
|
|
|
2015-01-07 15:30:11 +00:00
|
|
|
|
2018-09-24 20:07:30 +00:00
|
|
|
void MergeSortingBlockInputStream::remerge()
|
|
|
|
{
|
2018-09-26 01:30:07 +00:00
|
|
|
LOG_DEBUG(log, "Re-merging intermediate ORDER BY data (" << blocks.size() << " blocks with " << sum_rows_in_blocks << " rows) to save memory consumption");
|
2018-09-24 20:07:30 +00:00
|
|
|
|
2018-09-24 20:30:02 +00:00
|
|
|
/// NOTE Maybe concat all blocks and partial sort will be faster than merge?
|
2018-09-24 20:07:30 +00:00
|
|
|
MergeSortingBlocksBlockInputStream merger(blocks, description, max_merged_block_size, limit);
|
|
|
|
|
|
|
|
Blocks new_blocks;
|
2018-09-26 01:30:07 +00:00
|
|
|
size_t new_sum_rows_in_blocks = 0;
|
2018-09-24 20:07:30 +00:00
|
|
|
size_t new_sum_bytes_in_blocks = 0;
|
|
|
|
|
|
|
|
merger.readPrefix();
|
|
|
|
while (Block block = merger.read())
|
|
|
|
{
|
2018-09-26 01:30:07 +00:00
|
|
|
new_sum_rows_in_blocks += block.rows();
|
2018-09-24 20:07:30 +00:00
|
|
|
new_sum_bytes_in_blocks += block.allocatedBytes();
|
|
|
|
new_blocks.emplace_back(std::move(block));
|
|
|
|
}
|
|
|
|
merger.readSuffix();
|
|
|
|
|
|
|
|
LOG_DEBUG(log, "Memory usage is lowered from "
|
|
|
|
<< formatReadableSizeWithBinarySuffix(sum_bytes_in_blocks) << " to "
|
|
|
|
<< formatReadableSizeWithBinarySuffix(new_sum_bytes_in_blocks));
|
|
|
|
|
|
|
|
/// If the memory consumption was not lowered enough - we will not perform remerge anymore. 2 is a guess.
|
|
|
|
if (new_sum_bytes_in_blocks * 2 > sum_bytes_in_blocks)
|
|
|
|
remerge_is_useful = false;
|
|
|
|
|
|
|
|
blocks = std::move(new_blocks);
|
2018-09-26 01:30:07 +00:00
|
|
|
sum_rows_in_blocks = new_sum_rows_in_blocks;
|
2018-09-24 20:07:30 +00:00
|
|
|
sum_bytes_in_blocks = new_sum_bytes_in_blocks;
|
|
|
|
}
|
2011-09-04 01:42:14 +00:00
|
|
|
}
|