ClickHouse/dbms/src/DataStreams/MergeSortingBlockInputStream.cpp

173 lines
4.2 KiB
C++
Raw Normal View History

2012-07-23 06:23:29 +00:00
#include <queue>
2012-07-24 18:17:44 +00:00
#include <iomanip>
#include <statdaemons/Stopwatch.h>
2012-07-23 06:23:29 +00:00
2011-09-04 01:42:14 +00:00
#include <DB/DataStreams/MergeSortingBlockInputStream.h>
namespace DB
{
2011-09-04 21:23:19 +00:00
Block MergeSortingBlockInputStream::readImpl()
2011-09-04 01:42:14 +00:00
{
2012-07-23 20:01:29 +00:00
/** Достаточно простой алгоритм:
2011-09-04 01:42:14 +00:00
* - прочитать в оперативку все блоки;
2012-07-24 18:44:23 +00:00
* - объединить их всех;
2011-09-04 01:42:14 +00:00
*/
2012-03-05 02:34:20 +00:00
if (has_been_read)
return Block();
has_been_read = true;
2011-09-04 01:42:14 +00:00
2012-07-23 06:23:29 +00:00
Blocks blocks;
2011-09-04 01:42:14 +00:00
while (Block block = input->read())
blocks.push_back(block);
2012-07-25 20:33:43 +00:00
if (blocks.empty())
return Block();
else if (blocks.size() == 1)
return blocks[0];
else if (blocks.size() == 2)
{
merge(blocks[0], blocks[1]);
return blocks[0];
}
else
return merge(blocks);
2011-09-04 01:42:14 +00:00
}
2012-07-23 20:01:29 +00:00
Block MergeSortingBlockInputStream::merge(Blocks & blocks)
2011-09-04 01:42:14 +00:00
{
2012-07-24 18:29:45 +00:00
Stopwatch watch;
2011-09-04 01:42:14 +00:00
Block merged;
2012-07-23 06:23:29 +00:00
if (!blocks.size())
return merged;
2012-07-23 20:01:29 +00:00
if (blocks.size() == 1)
return blocks[0];
2012-07-25 20:24:38 +00:00
LOG_DEBUG(log, "Merge sorting");
2012-07-23 06:23:29 +00:00
merged = blocks[0].cloneEmpty();
2012-07-25 19:53:43 +00:00
typedef std::priority_queue<SortCursor> Queue;
2012-07-23 06:23:29 +00:00
Queue queue;
typedef std::vector<SortCursorImpl> CursorImpls;
CursorImpls cursors(blocks.size());
2012-07-24 17:46:55 +00:00
size_t i = 0;
2012-07-23 06:23:29 +00:00
size_t num_columns = blocks[0].columns();
2012-07-24 17:46:55 +00:00
for (Blocks::const_iterator it = blocks.begin(); it != blocks.end(); ++it, ++i)
2012-07-23 06:23:29 +00:00
{
if (!*it)
continue;
cursors[i] = SortCursorImpl(*it, description);
queue.push(SortCursor(&cursors[i]));
2012-07-23 06:23:29 +00:00
}
2012-07-23 20:01:29 +00:00
ColumnPlainPtrs merged_columns;
for (size_t i = 0; i < num_columns; ++i)
merged_columns.push_back(&*merged.getByPosition(i).column);
/// Вынимаем строки в нужном порядке и кладём в merged.
while (!queue.empty())
2012-07-23 06:23:29 +00:00
{
2012-07-25 19:53:43 +00:00
SortCursor current = queue.top();
2012-07-23 20:01:29 +00:00
queue.pop();
2012-07-23 06:23:29 +00:00
2012-07-23 20:01:29 +00:00
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insert((*current->all_columns[i])[current->pos]);
2012-07-23 20:01:29 +00:00
if (!current->isLast())
{
current->next();
queue.push(current);
}
2012-07-23 06:23:29 +00:00
}
2012-07-24 18:29:45 +00:00
LOG_DEBUG(log, std::fixed << std::setprecision(2)
<< "Merge sorted " << blocks.size() << " blocks, " << merged.rows() << " rows"
<< " in " << watch.elapsedSeconds() << " sec., "
<< merged.rows() / watch.elapsedSeconds() << " rows/sec., "
<< merged.bytes() / 1000000.0 / watch.elapsedSeconds() << " MiB/sec.");
2012-07-23 06:23:29 +00:00
return merged;
2012-07-23 20:01:29 +00:00
}
2012-07-23 06:23:29 +00:00
2012-07-25 20:33:43 +00:00
void MergeSortingBlockInputStream::merge(Block & left, Block & right)
{
Block merged = left.cloneEmpty();
size_t left_size = left.rows();
size_t right_size = right.rows();
size_t left_pos = 0;
size_t right_pos = 0;
/// Все столбцы блоков.
ConstColumnPlainPtrs left_columns;
ConstColumnPlainPtrs right_columns;
ColumnPlainPtrs merged_columns;
/// Столбцы, по которым идёт сортировка.
ConstColumnPlainPtrs left_sort_columns;
ConstColumnPlainPtrs right_sort_columns;
size_t num_columns = left.columns();
for (size_t i = 0; i < num_columns; ++i)
{
left_columns.push_back(&*left.getByPosition(i).column);
right_columns.push_back(&*right.getByPosition(i).column);
merged_columns.push_back(&*merged.getByPosition(i).column);
}
for (size_t i = 0, size = description.size(); i < size; ++i)
{
size_t column_number = !description[i].column_name.empty()
? left.getPositionByName(description[i].column_name)
: description[i].column_number;
left_sort_columns.push_back(&*left.getByPosition(column_number).column);
right_sort_columns.push_back(&*right.getByPosition(column_number).column);
}
/// Объединяем.
while (right_pos < right_size || left_pos < left_size)
{
/// Откуда брать строку - из левого или из правого блока?
int res = 0;
if (right_pos == right_size)
res = -1;
else if (left_pos == left_size)
res = 1;
else
for (size_t i = 0, size = description.size(); i < size; ++i)
if ((res = description[i].direction * left_sort_columns[i]->compareAt(left_pos, right_pos, *right_sort_columns[i])))
break;
/// Вставляем строку в объединённый блок.
if (res <= 0)
{
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insert((*left_columns[i])[left_pos]);
++left_pos;
}
else
{
for (size_t i = 0; i < num_columns; ++i)
merged_columns[i]->insert((*right_columns[i])[right_pos]);
++right_pos;
}
}
left = merged;
}
2011-09-04 01:42:14 +00:00
}