ClickHouse/src/Processors/LimitTransform.cpp

379 lines
11 KiB
C++
Raw Normal View History

#include <Processors/LimitTransform.h>
namespace DB
{
2020-03-12 17:02:12 +00:00
/// Error codes referenced by this translation unit.
/// The constant is only declared here (`extern`); its definition is not in this file.
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
2019-04-09 10:17:25 +00:00
/// Creates a LIMIT/OFFSET processor with `num_streams` input ports and the same number of output ports.
///
/// header_               - block structure used to create every input and output port.
/// limit_, offset_       - stored as `limit` and `offset`.
/// num_streams           - number of input/output port pairs.
/// always_read_till_end_ - stored as `always_read_till_end`.
/// with_ties_            - stored as `with_ties`; only allowed when num_streams == 1.
/// description_          - sort description; its columns are resolved to positions in `header_`.
///
/// Throws LOGICAL_ERROR if `with_ties_` is set and `num_streams` is not 1.
LimitTransform::LimitTransform(
    const Block & header_, UInt64 limit_, UInt64 offset_, size_t num_streams,
    bool always_read_till_end_, bool with_ties_,
    SortDescription description_)
    : IProcessor(InputPorts(num_streams, header_), OutputPorts(num_streams, header_))
    , limit(limit_), offset(offset_)
    , always_read_till_end(always_read_till_end_)
    , with_ties(with_ties_), description(std::move(description_))
{
    if (num_streams != 1 && with_ties)
        throw Exception("Cannot use LimitTransform with multiple ports and ties.", ErrorCodes::LOGICAL_ERROR);

    /// One PortsData entry per pair: the i-th input is paired with the i-th output.
    ports_data.resize(num_streams);

    size_t cur_stream = 0;
    for (auto & input : inputs)
    {
        ports_data[cur_stream].input_port = &input;
        ++cur_stream;
    }

    cur_stream = 0;
    for (auto & output : outputs)
    {
        ports_data[cur_stream].output_port = &output;
        ++cur_stream;
    }

    /// Resolve every sort column to its position in the header once, up front.
    /// The final size is known, so allocate it in one go.
    sort_column_positions.reserve(description.size());
    for (const auto & desc : description)
    {
        if (!desc.column_name.empty())
            sort_column_positions.push_back(header_.getPositionByName(desc.column_name));
        else
            sort_column_positions.push_back(desc.column_number);
    }
}
2020-07-12 05:18:01 +00:00
/// Builds a one-row chunk that contains only the sort columns of row `row` of `chunk`.
///
/// chunk - source chunk; `row` must be less than its number of rows (checked by assert).
/// row   - index of the row to copy.
///
/// Returns a chunk with one column per sort column and exactly one row.
Chunk LimitTransform::makeChunkWithPreviousRow(const Chunk & chunk, UInt64 row) const
{
    assert(row < chunk.getNumRows());

    ColumnRawPtrs current_columns = extractSortColumns(chunk.getColumns());

    MutableColumns last_row_sort_columns;
    /// The number of result columns is known in advance; avoid reallocations while filling.
    last_row_sort_columns.reserve(current_columns.size());

    for (size_t i = 0; i < current_columns.size(); ++i)
    {
        /// Start from an empty column of the same kind, then copy the single requested row into it.
        last_row_sort_columns.emplace_back(current_columns[i]->cloneEmpty());
        last_row_sort_columns[i]->insertFrom(*current_columns[i], row);
    }

    return Chunk(std::move(last_row_sort_columns), 1);
}
2020-03-12 15:29:35 +00:00
/// Runs preparePair() for every port pair whose input or output was updated and
/// combines the per-pair results into a single status for the whole processor.
///
/// Returns Finished when every pair is finished or the limit has been reached,
/// PortFull when at least one pair reported PortFull, and NeedData otherwise.
/// Throws LOGICAL_ERROR if preparePair() returns any other status.
IProcessor::Status LimitTransform::prepare(
    const PortNumbers & updated_input_ports,
    const PortNumbers & updated_output_ports)
{
    bool any_port_full = false;

    auto handle_pair = [&](size_t pair_index)
    {
        auto & pair = ports_data[pair_index];
        const auto pair_status = preparePair(pair);

        if (pair_status == IProcessor::Status::NeedData)
            return;

        if (pair_status == IProcessor::Status::PortFull)
        {
            any_port_full = true;
            return;
        }

        if (pair_status == IProcessor::Status::Finished)
        {
            /// Count each pair only once, no matter how many times it reports Finished.
            if (!pair.is_finished)
            {
                pair.is_finished = true;
                ++num_finished_port_pairs;
            }
            return;
        }

        throw Exception(
            "Unexpected status for LimitTransform::preparePair : " + IProcessor::statusToName(pair_status),
            ErrorCodes::LOGICAL_ERROR);
    };

    for (auto pair_index : updated_input_ports)
        handle_pair(pair_index);

    for (auto pair_index : updated_output_ports)
        handle_pair(pair_index);

    /// All ports are finished. It may happen even before we reached the limit (has less data than limit).
    if (num_finished_port_pairs == ports_data.size())
        return Status::Finished;

    /// `offset + limit` would overflow UInt64, so the limit can never be reached.
    const bool limit_is_unreachable = (limit > std::numeric_limits<UInt64>::max() - offset);
    const bool limit_reached = !limit_is_unreachable && rows_read >= offset + limit;

    /// If we reached limit for some port, then close others. Otherwise some sources may infinitely read data.
    /// Example: SELECT * FROM system.numbers_mt WHERE number = 1000000 LIMIT 1
    if (limit_reached && !previous_row_chunk && !always_read_till_end)
    {
        for (auto & input : inputs)
            input.close();

        for (auto & output : outputs)
            output.finish();

        return Status::Finished;
    }

    return any_port_full ? Status::PortFull : Status::NeedData;
}
2020-03-13 18:43:21 +00:00
/// Argument-less prepare(): only valid for a transform with exactly one port pair.
/// Forwards to the multi-port overload with pair 0 marked as updated on both sides.
/// Throws LOGICAL_ERROR when the transform has any other number of port pairs.
LimitTransform::Status LimitTransform::prepare()
{
    const bool has_single_pair = (ports_data.size() == 1);
    if (!has_single_pair)
        throw Exception("prepare without arguments is not supported for multi-port LimitTransform.",
                        ErrorCodes::LOGICAL_ERROR);

    return prepare({0}, {0});
}
2020-03-12 16:59:49 +00:00
/// Advances one input/output port pair by at most one chunk.
///
/// Pulls a chunk from the pair's input (if possible), applies OFFSET / LIMIT / WITH TIES
/// accounting against the shared `rows_read` counter, and either pushes the (possibly cut)
/// chunk to the output or drops it.
///
/// Returns:
///   Finished - this pair has nothing more to do;
///   PortFull - the output cannot accept data now, or a chunk has just been pushed;
///   NeedData - waiting for the next chunk from the input.
LimitTransform::Status LimitTransform::preparePair(PortsData & data)
{
    auto & in = *data.input_port;
    auto & out = *data.output_port;

    /// Check can output.
    const bool output_finished = out.isFinished();
    if (output_finished && !always_read_till_end)
    {
        in.close();
        return Status::Finished;
    }

    if (!output_finished && !out.canPush())
    {
        in.setNotNeeded();
        return Status::PortFull;
    }

    /// `offset + limit` would overflow UInt64, so the limit can never be reached.
    const bool limit_is_unreachable = (limit > std::numeric_limits<UInt64>::max() - offset);

    /// Check if we are done with pushing.
    /// A pending `previous_row_chunk` means WITH TIES may still accept rows.
    const bool is_limit_reached = !limit_is_unreachable && rows_read >= offset + limit && !previous_row_chunk;
    if (is_limit_reached && !always_read_till_end)
    {
        out.finish();
        in.close();
        return Status::Finished;
    }

    /// Check can input.
    if (in.isFinished())
    {
        out.finish();
        return Status::Finished;
    }

    in.setNeeded();
    if (!in.hasData())
        return Status::NeedData;

    data.current_chunk = in.pull(true);

    const auto rows = data.current_chunk.getNumRows();
    if (rows_before_limit_at_least)
        rows_before_limit_at_least->add(rows);

    /// Drops the chunk that was just pulled and decides whether this pair is done or waits for more.
    auto discard_chunk = [&]() -> Status
    {
        data.current_chunk.clear();

        if (in.isFinished())
        {
            out.finish();
            return Status::Finished;
        }

        /// Now, we pulled from input, and it must be empty.
        in.setNeeded();
        return Status::NeedData;
    };

    /// Skip block (for 'always_read_till_end' case).
    if (is_limit_reached || output_finished)
        return discard_chunk();

    /// Process block.
    rows_read += rows;

    /// The whole chunk is still inside the OFFSET part: nothing to output.
    if (rows_read <= offset)
        return discard_chunk();

    /// True when the chunk lies entirely inside [offset, offset + limit] and can be passed through uncut.
    /// The first comparison guards `offset + rows` against overflow.
    const bool whole_chunk_fits
        = rows <= std::numeric_limits<UInt64>::max() - offset
        && rows_read >= offset + rows
        && !limit_is_unreachable
        && rows_read <= offset + limit;

    if (whole_chunk_fits)
    {
        /// Return the whole chunk.

        /// Save the last row of current chunk to check if next block begins with the same row (for WITH TIES).
        if (with_ties && rows_read == offset + limit)
            previous_row_chunk = makeChunkWithPreviousRow(data.current_chunk, data.current_chunk.getNumRows() - 1);
    }
    else
    {
        /// This function may be heavy to execute in prepare. But it happens no more than twice, and make code simpler.
        splitChunk(data);
    }

    /// Evaluated after the chunk was processed, because `previous_row_chunk` may have changed above.
    const bool may_need_more_data_for_ties = previous_row_chunk || rows_read - rows <= offset + limit;

    /// No more data is needed.
    if (!always_read_till_end && !limit_is_unreachable && rows_read >= offset + limit && !may_need_more_data_for_ties)
        in.close();

    out.push(std::move(data.current_chunk));

    return Status::PortFull;
}
2020-03-12 16:59:49 +00:00
/// Cuts `data.current_chunk` in place so that it keeps only the rows that belong to the result.
///
/// Two modes:
///  * A tie row is pending (`previous_row_chunk` is set) and the limit is already reached:
///    keep only the leading rows whose sort columns equal the saved row.
///  * Otherwise: keep the part of the chunk that falls into [offset, offset + limit) of the
///    overall row stream, extended by trailing equal rows when WITH TIES is enabled.
///
/// Expects `rows_read` to already include the rows of the current chunk.
void LimitTransform::splitChunk(PortsData & data)
{
    /// Raw pointers into the chunk's columns; taken before the columns are detached below.
    auto sort_columns = extractSortColumns(data.current_chunk.getColumns());
    const UInt64 num_rows = data.current_chunk.getNumRows();

    /// `offset + limit` would overflow UInt64, so the limit can never be reached.
    const bool limit_is_unreachable = (limit > std::numeric_limits<UInt64>::max() - offset);

    if (previous_row_chunk && !limit_is_unreachable && rows_read >= offset + limit)
    {
        /// Scan until the first row, which is not equal to previous_row_chunk (for WITH TIES)
        UInt64 tied_rows = 0;
        while (tied_rows < num_rows && sortColumnsEqualAt(sort_columns, tied_rows))
            ++tied_rows;

        auto detached = data.current_chunk.detachColumns();

        /// A differing row was found: the run of ties ends inside this chunk.
        if (tied_rows < num_rows)
        {
            previous_row_chunk = {};
            for (auto & column : detached)
                column = column->cut(0, tied_rows);
        }

        data.current_chunk.setColumns(std::move(detached), tied_rows);
        return;
    }

    /// Return a piece of the block.
    ///
    /// The chunk covers global row numbers [rows_read - num_rows, rows_read).
    /// `start` is the number of leading rows of the chunk that still belong to OFFSET
    /// (zero when OFFSET ended before this chunk).
    assert(offset < rows_read);
    const UInt64 start = (offset + num_rows > rows_read) ? (offset + num_rows - rows_read) : 0;

    /// `length` is the number of rows to keep starting from `start`:
    /// everything up to the end of the chunk, unless the limit ends inside (or before) the chunk.
    UInt64 length = num_rows - start;
    if (!limit_is_unreachable && offset + limit < rows_read)
    {
        const UInt64 rows_before_chunk = rows_read - num_rows;
        if (offset + limit < rows_before_chunk)
            length = 0;
        else
            length = offset + limit - rows_before_chunk - start;
    }

    /// Check if other rows in current block equals to last one in limit.
    if (with_ties && length)
    {
        UInt64 end_row = start + length;
        previous_row_chunk = makeChunkWithPreviousRow(data.current_chunk, end_row - 1);

        while (end_row < num_rows && sortColumnsEqualAt(sort_columns, end_row))
            ++end_row;

        /// The scan stopped on a differing row, so the ties cannot continue into the next chunk.
        if (end_row < num_rows)
            previous_row_chunk = {};

        length = end_row - start;
    }

    /// Nothing to cut: the whole chunk is kept.
    if (length == num_rows)
        return;

    auto detached = data.current_chunk.detachColumns();
    for (auto & column : detached)
        column = column->cut(start, length);

    data.current_chunk.setColumns(std::move(detached), length);
}
2020-02-28 20:20:39 +00:00
/// Picks the sort columns out of `columns`, using the positions stored in `sort_column_positions`.
/// Returns non-owning raw pointers, in the order of `sort_column_positions`.
ColumnRawPtrs LimitTransform::extractSortColumns(const Columns & columns) const
{
    ColumnRawPtrs sort_columns;
    sort_columns.reserve(description.size());

    for (const size_t position : sort_column_positions)
    {
        const auto & column = columns[position];
        sort_columns.push_back(column.get());
    }

    return sort_columns;
}
2020-07-12 05:18:01 +00:00
bool LimitTransform::sortColumnsEqualAt(const ColumnRawPtrs & current_chunk_sort_columns, UInt64 current_chunk_row_num) const
2020-03-02 12:56:47 +00:00
{
assert(current_chunk_sort_columns.size() == previous_row_chunk.getNumColumns());
size_t size = current_chunk_sort_columns.size();
const auto & previous_row_sort_columns = previous_row_chunk.getColumns();
for (size_t i = 0; i < size; ++i)
if (0 != current_chunk_sort_columns[i]->compareAt(current_chunk_row_num, 0, *previous_row_sort_columns[i], 1))
return false;
return true;
}
}