2022-06-07 21:00:34 +00:00
|
|
|
#include <sstream>
|
2019-10-10 16:30:30 +00:00
|
|
|
#include <Storages/MergeTree/IMergeTreeReader.h>
|
2018-02-13 19:34:15 +00:00
|
|
|
#include <Columns/FilterDescription.h>
|
2022-01-17 04:33:47 +00:00
|
|
|
#include <Columns/ColumnConst.h>
|
2018-02-13 19:34:15 +00:00
|
|
|
#include <Columns/ColumnsCommon.h>
|
2022-05-27 17:54:11 +00:00
|
|
|
#include <Common/TargetSpecific.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/range.h>
|
2021-08-06 13:39:11 +00:00
|
|
|
#include <Interpreters/castColumn.h>
|
2018-11-16 12:22:51 +00:00
|
|
|
#include <DataTypes/DataTypeNothing.h>
|
2018-02-13 19:34:15 +00:00
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE2__
|
2018-02-13 19:34:15 +00:00
|
|
|
#include <emmintrin.h>
|
|
|
|
#endif
|
2017-06-14 10:50:22 +00:00
|
|
|
|
2022-06-15 13:19:29 +00:00
|
|
|
#if defined(__aarch64__) && defined(__ARM_NEON)
|
|
|
|
# include <arm_neon.h>
|
|
|
|
# ifdef HAS_RESERVED_IDENTIFIER
|
|
|
|
# pragma clang diagnostic ignored "-Wreserved-identifier"
|
|
|
|
# endif
|
|
|
|
#endif
|
2022-05-27 17:54:11 +00:00
|
|
|
|
2017-06-14 10:50:22 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:10:48 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int LOGICAL_ERROR;
|
2021-12-09 10:39:28 +00:00
|
|
|
extern const int BAD_ARGUMENTS;
|
2020-02-25 18:10:48 +00:00
|
|
|
}
|
2017-06-14 10:50:22 +00:00
|
|
|
|
2020-03-18 00:57:00 +00:00
|
|
|
|
|
|
|
/// Apply `filter` to every non-null column in place.
/// If filtering leaves the columns empty, drop them all and return early.
static void filterColumns(Columns & columns, const IColumn::Filter & filter)
{
    for (auto & column : columns)
    {
        if (!column)
            continue;

        column = column->filter(filter, -1);

        /// All columns share one filter, so if one of them filtered down to
        /// zero rows, the whole result is empty.
        if (column->empty())
        {
            columns.clear();
            return;
        }
    }
}
|
2022-06-07 07:03:11 +00:00
|
|
|
/*
|
2021-02-20 11:00:16 +00:00
|
|
|
static void filterColumns(Columns & columns, const ColumnPtr & filter)
|
|
|
|
{
|
|
|
|
ConstantFilterDescription const_descr(*filter);
|
|
|
|
if (const_descr.always_true)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (const_descr.always_false)
|
|
|
|
{
|
|
|
|
for (auto & col : columns)
|
|
|
|
if (col)
|
|
|
|
col = col->cloneEmpty();
|
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
FilterDescription descr(*filter);
|
|
|
|
filterColumns(columns, *descr.data);
|
|
|
|
}
|
2022-06-07 07:03:11 +00:00
|
|
|
*/
|
2020-03-18 00:57:00 +00:00
|
|
|
|
2022-05-06 13:06:56 +00:00
|
|
|
/// Return the largest `range.end` across all ranges of the current task,
/// i.e. the furthest mark any of them reaches.
size_t MergeTreeRangeReader::ReadResult::getLastMark(const MergeTreeRangeReader::ReadResult::RangesInfo & ranges)
{
    size_t last_mark = 0;
    for (const auto & info : ranges)
        last_mark = std::max(last_mark, info.range.end);
    return last_mark;
}
|
|
|
|
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Lazy reading stream: remembers read requests and performs the actual
/// reading only on finalize()/readRows(), coalescing adjacent requests.
MergeTreeRangeReader::DelayedStream::DelayedStream(
    size_t from_mark,
    size_t current_task_last_mark_,
    IMergeTreeReader * merge_tree_reader_)
    : current_mark(from_mark), current_offset(0), num_delayed_rows(0)
    , current_task_last_mark(current_task_last_mark_)
    , merge_tree_reader(merge_tree_reader_)
    /// Granularity info is owned by the data part; we only keep a pointer.
    , index_granularity(&(merge_tree_reader->data_part->index_granularity))
    , continue_reading(false), is_finished(false)
{
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Absolute row position this stream will be at once all delayed rows are
/// read: rows before the current mark, plus the offset inside the mark,
/// plus the rows requested but not yet materialised.
size_t MergeTreeRangeReader::DelayedStream::position() const
{
    const size_t rows_before_mark = index_granularity->getMarkStartingRow(current_mark);
    return rows_before_mark + current_offset + num_delayed_rows;
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Read up to `num_rows` rows into `columns` from the underlying reader.
/// Returns the number of rows actually read (0 when nothing was requested).
size_t MergeTreeRangeReader::DelayedStream::readRows(Columns & columns, size_t num_rows)
{
    if (!num_rows)
        return 0;

    size_t rows_read = merge_tree_reader->readRows(
        current_mark, current_task_last_mark, continue_reading, num_rows, columns);
    continue_reading = true;

    /// Zero rows_read maybe either because reading has finished
    /// or because there is no columns we can read in current part (for example, all columns are default).
    /// In the last case we can't finish reading, but it's also ok for the first case
    /// because we can finish reading by calculation the number of pending rows.
    if (0 < rows_read && rows_read < num_rows)
        is_finished = true;

    return rows_read;
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Request `num_rows` rows starting at (`from_mark`, `offset`).
/// If the request continues exactly where the previous one ended, it is only
/// recorded (lazy); otherwise the pending rows are flushed first.
/// Returns the number of rows read during the flush, 0 if nothing was read.
size_t MergeTreeRangeReader::DelayedStream::read(Columns & columns, size_t from_mark, size_t offset, size_t num_rows)
{
    const size_t rows_before_from_mark = index_granularity->getMarkStartingRow(from_mark);

    /// We already stand accurately in required position,
    /// so because stream is lazy, we don't read anything
    /// and only increment amount delayed_rows
    if (position() == rows_before_from_mark + offset)
    {
        num_delayed_rows += num_rows;
        return 0;
    }

    /// Non-contiguous request: materialise what was delayed, then reposition.
    const size_t flushed_rows = finalize(columns);

    continue_reading = false;
    current_mark = from_mark;
    current_offset = offset;
    num_delayed_rows = num_rows;

    return flushed_rows;
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Materialise all delayed rows into `columns` and return how many were read.
/// If a fresh read starts mid-granule, the leading rows are skipped first.
size_t MergeTreeRangeReader::DelayedStream::finalize(Columns & columns)
{
    /// We need to skip some rows before reading
    if (current_offset && !continue_reading)
    {
        /// Normalise (current_mark, current_offset): advance over whole
        /// granules while the offset still covers them.
        for (size_t mark_num : collections::range(current_mark, index_granularity->getMarksCount()))
        {
            size_t mark_index_granularity = index_granularity->getMarkRows(mark_num);
            if (current_offset >= mark_index_granularity)
            {
                current_offset -= mark_index_granularity;
                current_mark++;
            }
            else
                break;
        }

        /// Skip some rows from begin of granule.
        /// We don't know size of rows in compressed granule,
        /// so have to read them and throw out.
        if (current_offset)
        {
            Columns tmp_columns;
            tmp_columns.resize(columns.size());
            readRows(tmp_columns, current_offset);
        }
    }

    size_t rows_to_read = num_delayed_rows;
    current_offset += num_delayed_rows;
    num_delayed_rows = 0;

    return readRows(columns, rows_to_read);
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
|
|
|
|
/// Stream over the mark range [from_mark, to_mark); validates the range
/// against the part's total mark count.
MergeTreeRangeReader::Stream::Stream(
    size_t from_mark, size_t to_mark, size_t current_task_last_mark, IMergeTreeReader * merge_tree_reader_)
    : current_mark(from_mark), offset_after_current_mark(0)
    , last_mark(to_mark)
    , merge_tree_reader(merge_tree_reader_)
    , index_granularity(&(merge_tree_reader->data_part->index_granularity))
    , current_mark_index_granularity(index_granularity->getMarkRows(from_mark))
    , stream(from_mark, current_task_last_mark, merge_tree_reader)
{
    size_t marks_count = index_granularity->getMarksCount();
    if (from_mark >= marks_count)
        throw Exception("Trying create stream to read from mark №"+ toString(current_mark) + " but total marks count is "
            + toString(marks_count), ErrorCodes::LOGICAL_ERROR);

    /// `last_mark == marks_count` is allowed: it means "read to the end".
    if (last_mark > marks_count)
        throw Exception("Trying create stream to read to mark №"+ toString(current_mark) + " but total marks count is "
            + toString(marks_count), ErrorCodes::LOGICAL_ERROR);
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Throw if the stream already consumed its whole mark range; reading past
/// the assigned range is a caller error.
void MergeTreeRangeReader::Stream::checkNotFinished() const
{
    if (isFinished())
        throw Exception("Cannot read out of marks range.", ErrorCodes::BAD_ARGUMENTS);
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Throw if reading `num_rows` more rows would cross the current granule
/// boundary — callers must never read past a single granule at once.
void MergeTreeRangeReader::Stream::checkEnoughSpaceInCurrentGranule(size_t num_rows) const
{
    if (num_rows + offset_after_current_mark > current_mark_index_granularity)
        throw Exception("Cannot read from granule more than index_granularity.", ErrorCodes::LOGICAL_ERROR);
}
|
2017-06-14 10:50:22 +00:00
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Read `num_rows` rows at the current position via the delayed stream.
/// Marks this stream finished once the underlying stream is exhausted.
size_t MergeTreeRangeReader::Stream::readRows(Columns & columns, size_t num_rows)
{
    const size_t rows_read = stream.read(columns, current_mark, offset_after_current_mark, num_rows);

    if (stream.isFinished())
        finish();

    return rows_read;
}
|
|
|
|
|
2018-11-14 11:26:44 +00:00
|
|
|
/// Advance to the next mark and reset the in-granule offset.
void MergeTreeRangeReader::Stream::toNextMark()
{
    ++current_mark;

    size_t total_marks_count = index_granularity->getMarksCount();
    if (current_mark < total_marks_count)
        current_mark_index_granularity = index_granularity->getMarkRows(current_mark);
    else if (current_mark == total_marks_count)
        /// One-past-the-end position: there is no granule, so its size is 0.
        current_mark_index_granularity = 0; /// HACK?
    else
        throw Exception("Trying to read from mark " + toString(current_mark) + ", but total marks count " + toString(total_marks_count), ErrorCodes::LOGICAL_ERROR);

    offset_after_current_mark = 0;
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Read `num_rows` rows from the current granule (must fit inside it).
/// When `skip_remaining_rows_in_current_granule` is set, the rest of the
/// granule is skipped and the stream moves to the next mark.
/// Returns the number of rows actually read.
size_t MergeTreeRangeReader::Stream::read(Columns & columns, size_t num_rows, bool skip_remaining_rows_in_current_granule)
{
    checkEnoughSpaceInCurrentGranule(num_rows);

    if (!num_rows)
    {
        /// Nothing to read.
        if (skip_remaining_rows_in_current_granule)
        {
            /// Skip the rest of the rows in granule and start new one.
            checkNotFinished();
            toNextMark();
        }
        return 0;
    }

    checkNotFinished();

    const size_t rows_read = readRows(columns, num_rows);
    offset_after_current_mark += num_rows;

    /// Start new granule; skipped_rows_after_offset is already zero.
    if (offset_after_current_mark == current_mark_index_granularity || skip_remaining_rows_in_current_granule)
        toNextMark();

    return rows_read;
}
|
2017-07-19 16:39:18 +00:00
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Account for `num_rows` rows as consumed without reading them.
/// The skip must stay within the current granule.
void MergeTreeRangeReader::Stream::skip(size_t num_rows)
{
    if (!num_rows)
        return;

    checkNotFinished();
    checkEnoughSpaceInCurrentGranule(num_rows);

    offset_after_current_mark += num_rows;

    /// Start new granule; skipped_rows_after_offset is already zero.
    if (offset_after_current_mark == current_mark_index_granularity)
        toNextMark();
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Flush the delayed stream into `columns`; returns rows read.
/// Marks this stream finished when the underlying stream is exhausted.
size_t MergeTreeRangeReader::Stream::finalize(Columns & columns)
{
    const size_t rows_read = stream.finalize(columns);

    if (stream.isFinished())
        finish();

    return rows_read;
}
|
|
|
|
|
2018-02-13 19:34:15 +00:00
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Record a freshly read granule of `num_rows_` rows and keep the running
/// total in sync.
void MergeTreeRangeReader::ReadResult::addGranule(size_t num_rows_)
{
    total_rows_per_granule += num_rows_;
    rows_per_granule.push_back(num_rows_);
}
|
|
|
|
|
2018-02-20 13:37:04 +00:00
|
|
|
void MergeTreeRangeReader::ReadResult::adjustLastGranule()
|
2018-02-13 19:34:15 +00:00
|
|
|
{
|
2018-03-05 14:41:43 +00:00
|
|
|
size_t num_rows_to_subtract = total_rows_per_granule - num_read_rows;
|
2018-02-20 13:37:04 +00:00
|
|
|
|
2018-02-13 19:34:15 +00:00
|
|
|
if (rows_per_granule.empty())
|
2021-12-09 10:39:28 +00:00
|
|
|
throw Exception("Can't adjust last granule because no granules were added", ErrorCodes::LOGICAL_ERROR);
|
2018-02-13 19:34:15 +00:00
|
|
|
|
|
|
|
if (num_rows_to_subtract > rows_per_granule.back())
|
2021-10-15 08:36:26 +00:00
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
|
|
|
"Can't adjust last granule because it has {} rows, but try to subtract {} rows.",
|
|
|
|
toString(rows_per_granule.back()), toString(num_rows_to_subtract));
|
2018-02-13 19:34:15 +00:00
|
|
|
|
|
|
|
rows_per_granule.back() -= num_rows_to_subtract;
|
2018-03-05 14:41:43 +00:00
|
|
|
total_rows_per_granule -= num_rows_to_subtract;
|
2018-02-13 19:34:15 +00:00
|
|
|
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Drop all rows and the filter while preserving granule-count bookkeeping,
/// so subsequent readers still know how many rows to skip per granule.
void MergeTreeRangeReader::ReadResult::clear()
{
    /// Need to save information about the number of granules.
    /// NOTE(review): assumes rows_per_granule is non-empty here — calling
    /// back() on an empty vector is UB; confirm all callers add granules first.
    num_rows_to_skip_in_last_granule += rows_per_granule.back();
    rows_per_granule.assign(rows_per_granule.size(), 0);
    total_rows_per_granule = 0;
    filter_holder = nullptr;
    filter = nullptr;
}
|
|
|
|
|
2019-11-15 03:38:35 +00:00
|
|
|
/// Rebuild each column keeping only the first rows_per_granule[j] rows of
/// every granule (the zero tails were removed from the bookkeeping earlier).
void MergeTreeRangeReader::ReadResult::shrink(Columns & old_columns)
{
    for (auto & column : old_columns)
    {
        if (!column)
            continue;

        /// A constant column needs no per-granule copying — just resize it.
        if (const auto * column_const = typeid_cast<const ColumnConst *>(column.get()))
        {
            column = column_const->cloneResized(total_rows_per_granule);
            continue;
        }

        auto new_column = column->cloneEmpty();
        new_column->reserve(total_rows_per_granule);
        /// `pos` walks the old column using the ORIGINAL (pre-shrink) granule
        /// sizes; only the kept prefix of each granule is copied over.
        for (size_t j = 0, pos = 0; j < rows_per_granule_original.size(); pos += rows_per_granule_original[j++])
        {
            if (rows_per_granule[j])
                new_column->insertRangeFrom(*column, pos, rows_per_granule[j]);
        }
        column = std::move(new_column);
    }
}
|
|
|
|
|
|
|
|
/// Replace the filter with a constant "all rows pass" column of num_rows 1s.
void MergeTreeRangeReader::ReadResult::setFilterConstTrue()
{
    clearFilter();
    filter_holder = DataTypeUInt8().createColumnConst(num_rows, 1u);
}
|
|
|
|
|
|
|
|
void MergeTreeRangeReader::ReadResult::setFilterConstFalse()
|
|
|
|
{
|
|
|
|
clearFilter();
|
|
|
|
columns.clear();
|
|
|
|
num_rows = 0;
|
|
|
|
}
|
|
|
|
|
2022-05-18 18:22:42 +00:00
|
|
|
/// Try to simplify the filter after it was computed:
///  - all-zero filter  -> drop all data;
///  - all-one filter   -> constant-true filter, no filtering needed;
///  - mostly-zero tails-> shrink granule tails and rebuild a smaller filter.
void MergeTreeRangeReader::ReadResult::optimize(bool can_read_incomplete_granules, bool allow_filter_columns)
{
    if (total_rows_per_granule == 0 || filter == nullptr)
        return;

    NumRows zero_tails;
    auto total_zero_rows_in_tails = countZeroTails(filter->getData(), zero_tails, can_read_incomplete_granules);

    /// Every row is filtered out — the whole result is empty.
    if (total_zero_rows_in_tails == filter->size())
    {
        clear();
        return;
    }
    /// No zero tails and every filter byte is set — filter is trivially true.
    else if (total_zero_rows_in_tails == 0 && countBytesInResultFilter(filter->getData()) == filter->size())
    {
        setFilterConstTrue();
        return;
    }
    /// Just a guess. If only a few rows may be skipped, it's better not to skip at all.
    else if (2 * total_zero_rows_in_tails > filter->size())
    {
        /// Remember original granule sizes, then cut the zero tails off.
        for (auto i : collections::range(0, rows_per_granule.size()))
        {
            rows_per_granule_original.push_back(rows_per_granule[i]);
            rows_per_granule[i] -= zero_tails[i];
        }
        num_rows_to_skip_in_last_granule += rows_per_granule_original.back() - rows_per_granule.back();

        /// Keep the full-size filter around for later use.
        filter_original = filter;
        filter_holder_original = std::move(filter_holder);

        /// Check if const 1 after shrink
        if (allow_filter_columns && countBytesInResultFilter(filter->getData()) + total_zero_rows_in_tails == total_rows_per_granule)
        {
            total_rows_per_granule = total_rows_per_granule - total_zero_rows_in_tails;
            num_rows = total_rows_per_granule;
            setFilterConstTrue();
            shrink(columns); /// shrink acts as filtering in such case
        }
        else
        {
            /// Build a shorter filter with the zero tails squeezed out.
            auto new_filter = ColumnUInt8::create(filter->size() - total_zero_rows_in_tails);
            IColumn::Filter & new_data = new_filter->getData();

            collapseZeroTails(filter->getData(), new_data);
            total_rows_per_granule = new_filter->size();
            num_rows = total_rows_per_granule;
            filter = new_filter.get();
            filter_holder = std::move(new_filter);
        }
        need_filter = true;
    }
    /// Another guess, if it's worth filtering at PREWHERE
    else if (countBytesInResultFilter(filter->getData()) < 0.6 * filter->size())
        need_filter = true;
}
|
|
|
|
|
2022-05-18 18:22:42 +00:00
|
|
|
/// For each read granule, count the zero bytes at the tail of its slice of
/// the filter; fills `zero_tails` per granule and returns the overall total.
size_t MergeTreeRangeReader::ReadResult::countZeroTails(const IColumn::Filter & filter_vec, NumRows & zero_tails, bool can_read_incomplete_granules) const
{
    zero_tails.resize(0);
    zero_tails.reserve(rows_per_granule.size());

    const auto * filter_data = filter_vec.data();

    size_t total_zero_rows_in_tails = 0;

    for (auto rows_to_read : rows_per_granule)
    {
        /// Count the number of zeros at the end of filter for rows were read from current granule.
        size_t zero_tail = numZerosInTail(filter_data, filter_data + rows_to_read);
        /// If the reader can't start mid-granule, a tail may only be dropped
        /// when it covers the whole granule.
        if (!can_read_incomplete_granules && zero_tail != rows_to_read)
            zero_tail = 0;
        zero_tails.push_back(zero_tail);
        total_zero_rows_in_tails += zero_tails.back();
        filter_data += rows_to_read;
    }

    return total_zero_rows_in_tails;
}
|
|
|
|
|
2019-11-15 03:38:35 +00:00
|
|
|
/// Copy the kept prefix of every granule's filter slice into `new_filter_vec`,
/// squeezing out the zero tails. Source advances by the original granule
/// size, destination by the shrunk size.
void MergeTreeRangeReader::ReadResult::collapseZeroTails(const IColumn::Filter & filter_vec, IColumn::Filter & new_filter_vec)
{
    const auto * filter_data = filter_vec.data();
    auto * new_filter_data = new_filter_vec.data();

    for (auto i : collections::range(0, rows_per_granule.size()))
    {
        memcpySmallAllowReadWriteOverflow15(new_filter_data, filter_data, rows_per_granule[i]);
        filter_data += rows_per_granule_original[i];
        new_filter_data += rows_per_granule[i];
    }

    /// Trim to the number of bytes actually written.
    new_filter_vec.resize(new_filter_data - new_filter_vec.data());
}
|
|
|
|
|
2022-06-02 09:23:44 +00:00
|
|
|
DECLARE_AVX512BW_SPECIFIC_CODE(
/// Count trailing zero bytes in [begin, end), AVX-512BW variant.
/// Processes 64 bytes per iteration from the end of the range.
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
    size_t count = 0;
    const __m512i zero64 = _mm512_setzero_epi32();
    while (end - begin >= 64)
    {
        end -= 64;
        const auto * pos = end;
        /// Bit i of the mask is set when byte i of the chunk equals zero.
        UInt64 val = static_cast<UInt64>(_mm512_cmp_epi8_mask(
            _mm512_loadu_si512(reinterpret_cast<const __m512i *>(pos)),
            zero64,
            _MM_CMPINT_EQ));
        val = ~val;
        if (val == 0)
            count += 64;
        else
        {
            /// Leading zero bits of the inverted mask == trailing zero bytes.
            count += __builtin_clzll(val);
            return count;
        }
    }
    /// Scalar tail for the remaining < 64 bytes.
    while (end > begin && *(--end) == 0)
    {
        ++count;
    }
    return count;
}
) /// DECLARE_AVX512BW_SPECIFIC_CODE
|
|
|
|
|
2022-06-02 11:01:13 +00:00
|
|
|
DECLARE_AVX2_SPECIFIC_CODE(
/// Count trailing zero bytes in [begin, end), AVX2 variant.
/// Two 32-byte compares are combined into one 64-bit mask per iteration.
size_t numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
    size_t count = 0;
    const __m256i zero32 = _mm256_setzero_si256();
    while (end - begin >= 64)
    {
        end -= 64;
        const auto * pos = end;
        /// Each movemask yields 32 bits (one per byte equal to zero).
        UInt64 val =
            (static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos)),
            zero32))) & 0xffffffffu)
            | (static_cast<UInt64>(_mm256_movemask_epi8(_mm256_cmpeq_epi8(
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(pos + 32)),
            zero32))) << 32u);

        val = ~val;
        if (val == 0)
            count += 64;
        else
        {
            /// Leading zero bits of the inverted mask == trailing zero bytes.
            count += __builtin_clzll(val);
            return count;
        }
    }
    /// Scalar tail for the remaining < 64 bytes.
    while (end > begin && *(--end) == 0)
    {
        ++count;
    }
    return count;
}
) /// DECLARE_AVX2_SPECIFIC_CODE
|
|
|
|
|
2022-06-02 09:23:44 +00:00
|
|
|
/// Count trailing zero bytes in [begin, end).
/// Dispatches at runtime to AVX-512BW/AVX2 variants when available,
/// otherwise uses compile-time SSE2 or NEON paths, with a scalar fallback.
size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, const UInt8 * end)
{
#if USE_MULTITARGET_CODE
    /// check if cpu support avx512 dynamically, haveAVX512BW contains check of haveAVX512F
    if (isArchSupported(TargetArch::AVX512BW))
        return TargetSpecific::AVX512BW::numZerosInTail(begin, end);
    else if (isArchSupported(TargetArch::AVX2))
        return TargetSpecific::AVX2::numZerosInTail(begin, end);
#endif

    size_t count = 0;

#if defined(__SSE2__) && defined(__POPCNT__)
    const __m128i zero16 = _mm_setzero_si128();
    while (end - begin >= 64)
    {
        end -= 64;
        const auto * pos = end;
        /// Combine four 16-byte compares into a 64-bit "byte is zero" mask.
        UInt64 val =
            static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)),
                zero16)))
            | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 16)),
                zero16))) << 16u)
            | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 32)),
                zero16))) << 32u)
            | (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpeq_epi8(
                _mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 48)),
                zero16))) << 48u);
        val = ~val;
        if (val == 0)
            count += 64;
        else
        {
            /// Leading zero bits of the inverted mask == trailing zero bytes.
            count += __builtin_clzll(val);
            return count;
        }
    }
#elif defined(__aarch64__) && defined(__ARM_NEON)
    /// NEON has no movemask; the per-byte compare results are AND-ed with a
    /// bit pattern and pairwise-added down to one 64-bit mask.
    const uint8x16_t bitmask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
    while (end - begin >= 64)
    {
        end -= 64;
        const auto * src = reinterpret_cast<const unsigned char *>(end);
        const uint8x16_t p0 = vceqzq_u8(vld1q_u8(src));
        const uint8x16_t p1 = vceqzq_u8(vld1q_u8(src + 16));
        const uint8x16_t p2 = vceqzq_u8(vld1q_u8(src + 32));
        const uint8x16_t p3 = vceqzq_u8(vld1q_u8(src + 48));
        uint8x16_t t0 = vandq_u8(p0, bitmask);
        uint8x16_t t1 = vandq_u8(p1, bitmask);
        uint8x16_t t2 = vandq_u8(p2, bitmask);
        uint8x16_t t3 = vandq_u8(p3, bitmask);
        uint8x16_t sum0 = vpaddq_u8(t0, t1);
        uint8x16_t sum1 = vpaddq_u8(t2, t3);
        sum0 = vpaddq_u8(sum0, sum1);
        sum0 = vpaddq_u8(sum0, sum0);
        UInt64 val = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
        val = ~val;
        if (val == 0)
            count += 64;
        else
        {
            count += __builtin_clzll(val);
            return count;
        }
    }
#endif

    /// Scalar fallback / tail for the remaining bytes.
    while (end > begin && *(--end) == 0)
    {
        ++count;
    }
    return count;
}
|
|
|
|
|
2022-05-18 18:22:42 +00:00
|
|
|
/// Filter size must match total_rows_per_granule.
/// Install `new_filter` as the result's filter; constant true/false filters
/// are folded immediately (no-op or full clear respectively).
void MergeTreeRangeReader::ReadResult::setFilter(const ColumnPtr & new_filter)
{
    if (!new_filter && filter)
        throw Exception("Can't replace existing filter with empty.", ErrorCodes::LOGICAL_ERROR);

    if (filter)
    {
        size_t new_size = new_filter->size();

        if (new_size != total_rows_per_granule)
            throw Exception("Can't set filter because it's size is " + toString(new_size) + " but "
                + toString(total_rows_per_granule) + " rows was read.", ErrorCodes::LOGICAL_ERROR);
    }

    ConstantFilterDescription const_description(*new_filter);
    if (const_description.always_true)
    {
        setFilterConstTrue();
    }
    else if (const_description.always_false)
    {
        clear();
    }
    else
    {
        /// FilterDescription may unwrap Nullable/const wrappers; keep whichever
        /// column actually holds the data alive in filter_holder.
        FilterDescription filter_description(*new_filter);
        filter_holder = filter_description.data_holder ? filter_description.data_holder : new_filter;
        filter = typeid_cast<const ColumnUInt8 *>(filter_holder.get());
        if (!filter)
            throw Exception("setFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR);
    }
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
|
2019-11-15 03:38:35 +00:00
|
|
|
/// Count the set bytes in `filter_`, memoised by the filter's address so the
/// (linear) count is computed at most once per filter instance.
size_t MergeTreeRangeReader::ReadResult::countBytesInResultFilter(const IColumn::Filter & filter_)
{
    /// try_emplace performs a single lookup; the original find + operator[]
    /// pair looked the key up twice on a cache miss.
    auto [it, inserted] = filter_bytes_map.try_emplace(&filter_, 0);
    if (inserted)
        it->second = countBytesInFilter(filter_);
    return it->second;
}
|
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Build a range reader, computing its output sample block: the previous
/// reader's block (if chained) plus this reader's physical columns plus any
/// requested non-const virtual columns, transformed by the prewhere step.
MergeTreeRangeReader::MergeTreeRangeReader(
    IMergeTreeReader * merge_tree_reader_,
    MergeTreeRangeReader * prev_reader_,
    const PrewhereExprStep * prewhere_info_,
    bool last_reader_in_chain_,
    const Names & non_const_virtual_column_names_)
    : merge_tree_reader(merge_tree_reader_)
    , index_granularity(&(merge_tree_reader->data_part->index_granularity))
    , prev_reader(prev_reader_)
    , prewhere_info(prewhere_info_)
    , last_reader_in_chain(last_reader_in_chain_)
    , is_initialized(true)
    , non_const_virtual_column_names(non_const_virtual_column_names_)
{
    if (prev_reader)
        sample_block = prev_reader->getSampleBlock();

    for (const auto & name_and_type : merge_tree_reader->getColumns())
        sample_block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name});

    /// Add requested virtual columns that are not already present.
    for (const auto & column_name : non_const_virtual_column_names)
    {
        if (sample_block.has(column_name))
            continue;

        if (column_name == "_part_offset")
            sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), column_name));
    }

    if (prewhere_info)
    {
        /// Apply the single prewhere step to the sample block (dry run).
        // for (const auto & step : prewhere_info->steps)
        const auto & step = *prewhere_info;
        {
            if (step.actions)
                step.actions->execute(sample_block, true);

            if (step.remove_column)
                sample_block.erase(step.column_name);
        }

        /// NOTE(review): pre-refactoring row-level-filter / prewhere-actions
        /// logic kept for reference; superseded by the step handling above.
        /* if (prewhere_info->row_level_filter)
        {
            prewhere_info->row_level_filter->execute(sample_block, true);
            sample_block.erase(prewhere_info->row_level_column_name);
        }

        if (prewhere_info->prewhere_actions)
            prewhere_info->prewhere_actions->execute(sample_block, true);

        if (prewhere_info->remove_prewhere_column)
            sample_block.erase(prewhere_info->prewhere_column_name);
        */
    }
}
|
|
|
|
|
|
|
|
bool MergeTreeRangeReader::isReadingFinished() const
|
|
|
|
{
|
|
|
|
return prev_reader ? prev_reader->isReadingFinished() : stream.isFinished();
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Rows already consumed from the current granule; delegated to the first link of the chain.
size_t MergeTreeRangeReader::numReadRowsInCurrentGranule() const
{
    if (prev_reader)
        return prev_reader->numReadRowsInCurrentGranule();

    return stream.numReadRowsInCurrentGranule();
}
|
2019-07-18 14:41:11 +00:00
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Rows still unread in the current granule; delegated to the first link of the chain.
size_t MergeTreeRangeReader::numPendingRowsInCurrentGranule() const
{
    if (prev_reader)
        return prev_reader->numPendingRowsInCurrentGranule();

    const size_t pending = stream.numPendingRowsInCurrentGranule();

    /// Zero pending rows means the stream has not been initialized yet;
    /// in that case the whole (first) granule is still pending.
    return pending ? pending : numRowsInCurrentGranule();
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Total number of rows in the granule the stream currently points to.
size_t MergeTreeRangeReader::numRowsInCurrentGranule() const
{
    /// A non-zero value means the stream has been initialized and knows its granularity.
    const size_t current_granularity = stream.current_mark_index_granularity;
    if (current_granularity)
        return current_granularity;

    /// Nothing has been read yet — report the size of the first granule to be read.
    return index_granularity->getMarkRows(merge_tree_reader->getFirstMarkToRead());
}
|
|
|
|
|
2019-03-28 15:03:49 +00:00
|
|
|
/// Index of the mark the underlying stream currently points to.
size_t MergeTreeRangeReader::currentMark() const
{
    return stream.currentMark();
}
|
|
|
|
|
2019-03-28 08:52:09 +00:00
|
|
|
/// Rows remaining in this stream: all rows between the current and last mark,
/// minus the rows already consumed past the current mark.
size_t MergeTreeRangeReader::Stream::numPendingRows() const
{
    return index_granularity->getRowsCountInRange(current_mark, last_mark) - offset_after_current_mark;
}
|
|
|
|
|
2022-03-15 06:34:25 +00:00
|
|
|
/// Absolute row offset (within the part) of the next row this stream will produce.
UInt64 MergeTreeRangeReader::Stream::currentPartOffset() const
{
    const UInt64 mark_start_row = index_granularity->getMarkStartingRow(current_mark);
    return mark_start_row + offset_after_current_mark;
}
|
|
|
|
|
|
|
|
/// Absolute row offset (within the part) where this stream's range ends (exclusive).
UInt64 MergeTreeRangeReader::Stream::lastPartOffset() const
{
    const UInt64 end_row = index_granularity->getMarkStartingRow(last_mark);
    return end_row;
}
|
|
|
|
|
2019-12-02 17:10:22 +00:00
|
|
|
|
|
|
|
/// Rounds rows_num up to a whole number of granules, starting from the current mark.
/// Used when the reader cannot read a partial granule.
size_t MergeTreeRangeReader::Stream::ceilRowsToCompleteGranules(size_t rows_num) const
{
    /// FIXME suboptimal
    size_t total_rows = 0;
    for (size_t mark = current_mark; total_rows < rows_num && mark < last_mark; ++mark)
        total_rows += index_granularity->getMarkRows(mark);

    return total_rows;
}
|
|
|
|
|
|
|
|
|
2018-02-20 11:50:33 +00:00
|
|
|
bool MergeTreeRangeReader::isCurrentRangeFinished() const
|
2018-02-20 11:45:58 +00:00
|
|
|
{
|
|
|
|
return prev_reader ? prev_reader->isCurrentRangeFinished() : stream.isFinished();
|
|
|
|
}
|
2018-02-13 19:34:15 +00:00
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Reads up to max_rows rows from the given mark ranges, running the whole chain:
/// the previous link reads (and PREWHERE-filters) first, then this link continues
/// reading its own columns for exactly the same granules, and finally this link's
/// PREWHERE step is applied. Consumed ranges are popped from `ranges`.
MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, MarkRanges & ranges)
{
    if (max_rows == 0)
        throw Exception("Expected at least 1 row to read, got 0.", ErrorCodes::LOGICAL_ERROR);

    ReadResult read_result;

    if (prev_reader)
    {
        /// Let the previous link decide which granules/rows to read.
        read_result = prev_reader->read(max_rows, ranges);

        /// Read this link's columns for the same granules (pre-filter row count goes to num_read_rows).
        size_t num_read_rows;
        Columns columns = continueReadingChain(read_result, num_read_rows);

        /// Nothing to do. Return empty result.
        if (read_result.num_rows == 0)
            return read_result;

        bool has_columns = false;
        size_t total_bytes = 0;
        for (auto & column : columns)
        {
            if (column)
            {
                total_bytes += column->byteSize();
                has_columns = true;
            }
        }

        read_result.addNumBytesRead(total_bytes);

        bool should_evaluate_missing_defaults = false;

        if (has_columns)
        {
            /// num_read_rows >= read_result.num_rows
            /// We must filter block before adding columns to read_result.block

            /// Fill missing columns before filtering because some arrays from Nested may have empty data.
            merge_tree_reader->fillMissingColumns(columns, should_evaluate_missing_defaults, num_read_rows);

            /// Apply the filter produced by earlier PREWHERE steps so our columns line up
            /// with the (already filtered) columns from the previous link.
            if (read_result.getFilter())
                filterColumns(columns, read_result.getFilter()->getData());
        }
        else
        {
            size_t num_rows = read_result.num_rows;

            /// If block is empty, we still may need to add missing columns.
            /// In that case use number of rows in result block and don't filter block.
            if (num_rows)
                merge_tree_reader->fillMissingColumns(columns, should_evaluate_missing_defaults, num_rows);
        }

        if (!columns.empty())
        {
            /// If some columns absent in part, then evaluate default values
            if (should_evaluate_missing_defaults)
            {
                auto block = prev_reader->sample_block.cloneWithColumns(read_result.columns);
                auto block_before_prewhere = read_result.block_before_prewhere;
                /// Drop duplicates: keep the version of a column that survived PREWHERE.
                for (const auto & column : block)
                {
                    if (block_before_prewhere.has(column.name))
                        block_before_prewhere.erase(column.name);
                }

                if (block_before_prewhere)
                {
                    /// Columns saved before PREWHERE are unfiltered — bring them to the same
                    /// row count before merging them into the defaults-evaluation block.
                    if (read_result.need_filter)
                    {
                        auto old_columns = block_before_prewhere.getColumns();
                        filterColumns(old_columns, read_result.getFilterOriginal()->getData());
                        block_before_prewhere.setColumns(old_columns);
                    }

                    for (auto & column : block_before_prewhere)
                        block.insert(std::move(column));
                }
                merge_tree_reader->evaluateMissingDefaults(block, columns);
            }
            /// If columns not empty, then apply on-fly alter conversions if any required
            merge_tree_reader->performRequiredConversions(columns);
        }

        /// Append this link's columns after those produced by the previous link.
        read_result.columns.reserve(read_result.columns.size() + columns.size());
        for (auto & column : columns)
            read_result.columns.emplace_back(std::move(column));
    }
    else
    {
        /// First link of the chain: start reading from the mark ranges directly.
        read_result = startReadingChain(max_rows, ranges);
        read_result.num_rows = read_result.numReadRows();

        if (read_result.num_rows)
        {
            /// Physical columns go first and then some virtual columns follow
            const size_t physical_columns_count = read_result.columns.size() - non_const_virtual_column_names.size();
            Columns physical_columns(read_result.columns.begin(), read_result.columns.begin() + physical_columns_count);

            bool should_evaluate_missing_defaults;
            merge_tree_reader->fillMissingColumns(physical_columns, should_evaluate_missing_defaults,
                                                  read_result.num_rows);

            /// If some columns absent in part, then evaluate default values
            if (should_evaluate_missing_defaults)
                merge_tree_reader->evaluateMissingDefaults({}, physical_columns);

            /// If result not empty, then apply on-fly alter conversions if any required
            merge_tree_reader->performRequiredConversions(physical_columns);

            for (size_t i = 0; i < physical_columns.size(); ++i)
                read_result.columns[i] = std::move(physical_columns[i]);
        }
        else
            read_result.columns.clear();

        size_t total_bytes = 0;
        for (auto & column : read_result.columns)
            total_bytes += column->byteSize();

        read_result.addNumBytesRead(total_bytes);
    }

    if (read_result.num_rows == 0)
        return read_result;

    /// Run this link's own PREWHERE step (filtering and column removal).
    executePrewhereActionsAndFilterColumns(read_result);

    return read_result;
}
|
|
|
|
|
2018-02-22 12:43:57 +00:00
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
/// First-link read: pulls up to max_rows rows out of `ranges`, granule by granule,
/// restarting the stream whenever a range is exhausted. Records started ranges and
/// per-granule row counts in the result, and fills _part_offset if requested.
MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t max_rows, MarkRanges & ranges)
{
    ReadResult result;
    result.columns.resize(merge_tree_reader->getColumns().size());

    size_t current_task_last_mark = getLastMark(ranges);

    /// The stream could be unfinished by the previous read request because of max_rows limit.
    /// In this case it will have some rows from the previously started range. We need to save their begin and
    /// end offsets to properly fill _part_offset column.
    UInt64 leading_begin_part_offset = 0;
    UInt64 leading_end_part_offset = 0;
    if (!stream.isFinished())
    {
        leading_begin_part_offset = stream.currentPartOffset();
        leading_end_part_offset = stream.lastPartOffset();
    }

    /// Stream is lazy. result.num_added_rows is the number of rows added to block which is not equal to
    /// result.num_rows_read until call to stream.finalize(). Also result.num_added_rows may be less than
    /// result.num_rows_read if the last granule in range also the last in part (so we have to adjust last granule).
    {
        size_t space_left = max_rows;
        while (space_left && (!stream.isFinished() || !ranges.empty()))
        {
            /// Current range exhausted — flush it and start the next one.
            if (stream.isFinished())
            {
                result.addRows(stream.finalize(result.columns));
                stream = Stream(ranges.front().begin, ranges.front().end, current_task_last_mark, merge_tree_reader);
                result.addRange(ranges.front());
                ranges.pop_front();
            }

            size_t current_space = space_left;

            /// If reader can't read part of granule, we have to increase number of reading rows
            /// to read complete granules and exceed max_rows a bit.
            if (!merge_tree_reader->canReadIncompleteGranules())
                current_space = stream.ceilRowsToCompleteGranules(space_left);

            auto rows_to_read = std::min(current_space, stream.numPendingRowsInCurrentGranule());

            bool last = rows_to_read == space_left;
            result.addRows(stream.read(result.columns, rows_to_read, !last));
            result.addGranule(rows_to_read);
            /// rows_to_read can exceed space_left when whole granules are forced (see above).
            space_left = (rows_to_read > space_left ? 0 : space_left - rows_to_read);
        }
    }

    result.addRows(stream.finalize(result.columns));

    /// Last granule may be incomplete.
    result.adjustLastGranule();

    for (const auto & column_name : non_const_virtual_column_names)
    {
        if (column_name == "_part_offset")
            fillPartOffsetColumn(result, leading_begin_part_offset, leading_end_part_offset);
    }

    return result;
}
|
|
|
|
|
2022-04-11 13:43:09 +00:00
|
|
|
/// Appends a UInt64 _part_offset column to result.columns containing the absolute
/// row numbers (within the part) of the rows that were just read: first the leftover
/// rows of a range started by a previous read request, then each newly started range.
void MergeTreeRangeReader::fillPartOffsetColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset)
{
    const size_t num_rows = result.numReadRows();

    auto offsets_column = ColumnUInt64::create(num_rows);
    ColumnUInt64::Container & data = offsets_column->getData();

    size_t write_pos = 0;

    /// Rows carried over from the range that was still open when this read started.
    for (UInt64 offset = leading_begin_part_offset; write_pos < num_rows && offset < leading_end_part_offset; ++offset)
        data[write_pos++] = offset;

    /// Rows from each range started during this read request.
    for (const auto & started_range : result.startedRanges())
    {
        UInt64 offset = index_granularity->getMarkStartingRow(started_range.range.begin);
        const UInt64 range_end_offset = index_granularity->getMarkStartingRow(started_range.range.end);

        for (; write_pos < num_rows && offset < range_end_offset; ++offset)
            data[write_pos++] = offset;
    }

    result.columns.emplace_back(std::move(offsets_column));
}
|
|
|
|
|
2019-09-23 19:22:02 +00:00
|
|
|
/// Non-first link read: replays exactly the granule sequence recorded by the previous
/// link in `result`, reading this link's own columns for the same rows. Returns the
/// columns and sets num_rows to the number of rows actually read (before filtering).
Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & num_rows)
{
    Columns columns;
    num_rows = 0;

    if (result.rowsPerGranule().empty())
    {
        /// If zero rows were read on prev step, than there is no more rows to read.
        /// Last granule may have less rows than index_granularity, so finish reading manually.
        stream.finish();
        return columns;
    }

    columns.resize(merge_tree_reader->numColumnsInResult());

    const auto & rows_per_granule = result.rowsPerGranule();
    const auto & started_ranges = result.startedRanges();

    size_t current_task_last_mark = ReadResult::getLastMark(started_ranges);
    size_t next_range_to_start = 0;

    auto size = rows_per_granule.size();
    for (auto i : collections::range(0, size))
    {
        /// Restart the stream at the same points where the previous link started new ranges.
        if (next_range_to_start < started_ranges.size()
            && i == started_ranges[next_range_to_start].num_granules_read_before_start)
        {
            num_rows += stream.finalize(columns);
            const auto & range = started_ranges[next_range_to_start].range;
            ++next_range_to_start;
            stream = Stream(range.begin, range.end, current_task_last_mark, merge_tree_reader);
        }

        bool last = i + 1 == size;
        num_rows += stream.read(columns, rows_per_granule[i], !last);
    }

    /// The previous link may have truncated the last granule; keep both streams in sync.
    stream.skip(result.numRowsToSkipInLastGranule());
    num_rows += stream.finalize(columns);

    // TODO: here we can verify that stream and prev_reader->stream are at exactly same offset

    /// added_rows may be zero if all columns were read in prewhere and it's ok.
    if (num_rows && num_rows != result.totalRowsPerGranule())
        throw Exception("RangeReader read " + toString(num_rows) + " rows, but "
                        + toString(result.totalRowsPerGranule()) + " expected.", ErrorCodes::LOGICAL_ERROR);

    return columns;
}
|
|
|
|
|
2021-08-09 09:09:09 +00:00
|
|
|
/// Sanity check for combineFilters: the second filter must have exactly one entry
/// per set byte of the first filter.
static void checkCombinedFiltersSize(size_t bytes_in_first_filter, size_t second_filter_size)
{
    if (bytes_in_first_filter == second_filter_size)
        return;

    throw Exception(ErrorCodes::LOGICAL_ERROR,
        "Cannot combine filters because number of bytes in a first filter ({}) "
        "does not match second filter size ({})", bytes_in_first_filter, second_filter_size);
}
|
|
|
|
|
2022-05-18 18:22:42 +00:00
|
|
|
/// Second filter size must be equal to number of 1s in the first filter.
/// The result size is equal to first filter size.
/// Conceptually: the second filter refines the rows the first filter already passed.
static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second)
{
    ConstantFilterDescription first_const_descr(*first);

    /// Const-true first filter: the second filter alone decides (sizes must match exactly).
    if (first_const_descr.always_true)
    {
        checkCombinedFiltersSize(first->size(), second->size());
        return second;
    }

    /// Const-false first filter: nothing passed, so the second filter must be empty.
    if (first_const_descr.always_false)
    {
        checkCombinedFiltersSize(0, second->size());
        return first;
    }

    FilterDescription first_descr(*first);

    size_t bytes_in_first_filter = countBytesInFilter(*first_descr.data);
    checkCombinedFiltersSize(bytes_in_first_filter, second->size());

    ConstantFilterDescription second_const_descr(*second);

    /// Const-true second filter: no refinement, the first filter stands.
    if (second_const_descr.always_true)
        return first;

    /// Const-false second filter: result is all-false with the first filter's size.
    if (second_const_descr.always_false)
        return second->cloneResized(first->size());

    FilterDescription second_descr(*second);

    /// Mutate the first filter in place (reusing its storage when possible):
    /// prefer the unpacked data_holder produced by FilterDescription, else the column itself.
    MutableColumnPtr mut_first;
    if (first_descr.data_holder)
        mut_first = IColumn::mutate(std::move(first_descr.data_holder));
    else
        mut_first = IColumn::mutate(std::move(first));

    auto & first_data = typeid_cast<ColumnUInt8 *>(mut_first.get())->getData();
    const auto * second_data = second_descr.data->data();

    /// For every set byte of the first filter, substitute the next value of the
    /// second filter; second_data advances only over rows the first filter passed.
    for (auto & val : first_data)
    {
        if (val)
        {
            val = *second_data;
            ++second_data;
        }
    }

    return mut_first;
}
|
|
|
|
|
2021-01-29 15:13:09 +00:00
|
|
|
/// Runs this link's PREWHERE step on the rows accumulated in `result`:
/// rebuilds a Block from result.columns, executes the step's actions, extracts the
/// produced filter column, combines it with filters from earlier steps, and (depending
/// on position in the chain) either filters the columns now or defers to WHERE.
/// NOTE(review): contains commented-out transitional code from the ongoing split of
/// PREWHERE into multiple steps (row_level_filter handling) — kept deliberately.
void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result)
{
    if (!prewhere_info)
        return;

    const auto & header = merge_tree_reader->getColumns();
    size_t num_columns = header.size();

    // TODO: properly check that we have columns from previous steps and newly read required columns
    if (result.columns.size() < num_columns + non_const_virtual_column_names.size())
        throw Exception("Invalid number of columns passed to MergeTreeRangeReader. "
                        "Expected " + toString(num_columns) + ", "
                        "got " + toString(result.columns.size()), ErrorCodes::LOGICAL_ERROR);

    /// Filter produced by this step only.
    ColumnPtr current_filter;
    /// Filter combined with the filters of earlier steps.
    ColumnPtr filter;
    // ColumnPtr row_level_filter;
    size_t prewhere_column_pos;

    {
        /// Restore block from columns list.
        Block block;
        size_t pos = 0;

        /// Columns from earlier links come first, in the previous link's sample order.
        if (prev_reader)
        {
            for (const auto & col : prev_reader->getSampleBlock())
            {
                block.insert({result.columns[pos], col.type, col.name});
                ++pos;
            }
        }

        // for (auto name_and_type = header.begin(); pos < num_columns; ++pos, ++name_and_type)
        //     block.insert({result.columns[pos], name_and_type->type, name_and_type->name});
        /// Then the physical columns this link read itself.
        for (const auto & name_and_type : header) {
            block.insert({result.columns[pos], name_and_type.type, name_and_type.name});
            ++pos;
        }

        /// Finally the per-row virtual columns (only _part_offset is supported).
        for (const auto & column_name : non_const_virtual_column_names)
        {
            if (column_name == "_part_offset")
                block.insert({result.columns[pos], std::make_shared<DataTypeUInt64>(), column_name});
            else
                throw Exception("Unexpected non-const virtual column: " + column_name, ErrorCodes::LOGICAL_ERROR);
            ++pos;
        }

        /// Columns might be projected out. We need to store them here so that default columns can be evaluated later.
        result.block_before_prewhere = block;

        /* if (prewhere_info->row_level_filter)
        {
            prewhere_info->row_level_filter->execute(block);
            auto row_level_filter_pos = block.getPositionByName(prewhere_info->row_level_column_name);
            row_level_filter = block.getByPosition(row_level_filter_pos).column;
            block.erase(row_level_filter_pos);

            auto columns = block.getColumns();
            filterColumns(columns, row_level_filter);
            if (columns.empty())
                block = block.cloneEmpty();
            else
                block.setColumns(columns);
        }
        */

        prewhere_info->actions->execute(block);

        prewhere_column_pos = block.getPositionByName(prewhere_info->column_name);

        result.columns.clear();
        result.columns.reserve(block.columns());
        for (auto & col : block)
            result.columns.emplace_back(std::move(col.column));

        /// Take the filter column out of the result (its slot becomes empty for now).
        current_filter.swap(result.columns[prewhere_column_pos]);
        filter = current_filter;

    }

    /// Merge with the filter accumulated by earlier steps, if any.
    if (result.getFilter())
    {
        ColumnPtr prev_filter = result.getFilterHolder();
        filter = combineFilters(prev_filter, std::move(filter));

        // /// TODO: implement for prewhere chain.
        // /// In order to do it we need combine filter and result.filter, where filter filters only '1' in result.filter.
        // throw Exception("MergeTreeRangeReader chain with several prewhere actions in not implemented.",
        //                 ErrorCodes::LOGICAL_ERROR);
    }

    // if (filter && row_level_filter)
    // {
    //     row_level_filter = combineFilters(std::move(row_level_filter), filter);
    //     result.setFilter(row_level_filter);
    // }
    // else
    result.setFilter(filter);

    /// If there is a WHERE, we filter in there, and only optimize IO and shrink columns here
    if (!last_reader_in_chain)
        result.optimize(merge_tree_reader->canReadIncompleteGranules(), true); // TODO: prewhere_info->row_level_filter == nullptr);

    /// If we read nothing or filter gets optimized to nothing
    if (result.totalRowsPerGranule() == 0)
        result.setFilterConstFalse();
    /// If we need to filter in PREWHERE
    else if (prewhere_info->need_filter || result.need_filter)// || prewhere_info->row_level_filter)
    {
        /// If there is a filter and without optimized
        if (result.getFilter() && last_reader_in_chain)
        {
            const auto * result_filter = result.getFilter();
            /// optimize is not called, need to check const 1 and const 0
            size_t bytes_in_filter = result.countBytesInResultFilter(result_filter->getData());
            if (bytes_in_filter == 0)
                result.setFilterConstFalse();
            else if (bytes_in_filter == result.num_rows)
                result.setFilterConstTrue();
        }

        /// If there is still a filter, do the filtering now
        if (result.getFilter())
        {
            /// filter might be shrunk while columns not
            const auto * result_filter = result.getFilterOriginal();

            // if (row_level_filter)
            //     filterColumns(result.columns, filter);
            // else
            //     filterColumns(result.columns, result_filter->getData());

            {
                FilterDescription current_filter_descr(*current_filter);
                // TODO: need to filter by current filter column that has num_rows size, not the original size

                // TODO: properly handle const true and const false cases
                if (current_filter_descr.countBytesInFilter() == 0)
                    result.columns.clear();
                else if (current_filter_descr.data)
                    filterColumns(result.columns, *current_filter_descr.data);
            }

            result.need_filter = true;

            bool has_column = false;
            for (auto & column : result.columns)
            {
                if (column)
                {
                    has_column = true;
                    result.num_rows = column->size();
                    break;
                }
            }

            /// There is only one filter column. Record the actual number
            if (!has_column)
                result.num_rows = result.countBytesInResultFilter(result_filter->getData());
        }

        /// Check if the PREWHERE column is needed
        if (!result.columns.empty())
        {
            if (prewhere_info->remove_column)
                result.columns.erase(result.columns.begin() + prewhere_column_pos);
            else
                result.columns[prewhere_column_pos] =
                        getSampleBlock().getByName(prewhere_info->column_name).type->
                                createColumnConst(result.num_rows, 1u)->convertToFullColumnIfConst();
        }
    }
    /// Filter in WHERE instead
    else
    {
        if (prewhere_info->remove_column)
            result.columns.erase(result.columns.begin() + prewhere_column_pos);
        else
        {
            /// Keep the filter as a real UInt8 column (cast to the declared type) so WHERE can use it.
            auto type = getSampleBlock().getByName(prewhere_info->column_name).type;
            ColumnWithTypeAndName col(result.getFilterHolder()->convertToFullColumnIfConst(), std::make_shared<DataTypeUInt8>(), "");
            result.columns[prewhere_column_pos] = castColumn(col, type);
            result.clearFilter(); // Acting as a flag to not filter in PREWHERE
        }
    }
}
|
|
|
|
|
2022-06-07 21:00:34 +00:00
|
|
|
std::string PrewhereExprInfo::dump() const
|
|
|
|
{
|
|
|
|
std::ostringstream s;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < steps.size(); ++i)
|
|
|
|
{
|
|
|
|
s << "STEP " << i << ":\n"
|
|
|
|
<< " ACTIONS: " << steps[i].actions->dumpActions() << "\n"
|
|
|
|
<< " COLUMN: " << steps[i].column_name << "\n"
|
|
|
|
<< " REMOVE_COLUMN: " << steps[i].remove_column << "\n"
|
|
|
|
<< " NEED_FILTER: " << steps[i].need_filter << "\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
return s.str();
|
|
|
|
}
|
|
|
|
|
2017-06-14 10:50:22 +00:00
|
|
|
}
|