2017-03-24 13:52:50 +00:00
|
|
|
#pragma once
|
2018-03-14 03:19:23 +00:00
|
|
|
|
|
|
|
#include <optional>
|
2017-03-24 13:52:50 +00:00
|
|
|
#include <Core/NamesAndTypes.h>
|
|
|
|
#include <Storages/MergeTree/RangesInDataPart.h>
|
2017-06-14 10:50:22 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeRangeReader.h>
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2018-03-14 03:19:23 +00:00
|
|
|
|
2017-03-24 13:52:50 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2017-04-15 04:03:12 +00:00
|
|
|
class MergeTreeData;
|
2017-03-24 13:52:50 +00:00
|
|
|
struct MergeTreeReadTask;
|
2017-04-15 04:03:12 +00:00
|
|
|
struct MergeTreeBlockSizePredictor;
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2017-04-06 17:21:45 +00:00
|
|
|
/// Exclusively-owning (std::unique_ptr) handle to a MergeTreeReadTask.
using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
|
2017-05-18 16:03:08 +00:00
|
|
|
/// Exclusively-owning (std::unique_ptr) handle to a MergeTreeBlockSizePredictor.
using MergeTreeBlockSizePredictorPtr = std::unique_ptr<MergeTreeBlockSizePredictor>;
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** If some of the requested columns are not in the part,
|
|
|
|
* then find out which columns may need to be read further,
|
|
|
|
* so that you can calculate the DEFAULT expression for these columns.
|
|
|
|
* Adds them to the `columns`.
|
|
|
|
*/
|
2020-06-17 16:39:58 +00:00
|
|
|
NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & part, Names & columns);
|
2017-03-24 13:52:50 +00:00
|
|
|
|
|
|
|
|
2018-11-29 09:19:42 +00:00
|
|
|
/// A batch of work for MergeTreeThreadSelectBlockInputStream
|
2017-03-24 13:52:50 +00:00
|
|
|
struct MergeTreeReadTask
|
|
|
|
{
|
|
|
|
/// data part which should be read while performing this task
|
|
|
|
MergeTreeData::DataPartPtr data_part;
|
2020-02-10 12:36:01 +00:00
|
|
|
/// Ranges to read from `data_part`.
|
2017-03-24 13:52:50 +00:00
|
|
|
MarkRanges mark_ranges;
|
|
|
|
/// for virtual `part_index` virtual column
|
2017-07-21 06:35:58 +00:00
|
|
|
size_t part_index_in_query;
|
2017-03-24 13:52:50 +00:00
|
|
|
/// ordered list of column names used in this query, allows returning blocks with consistent ordering
|
|
|
|
const Names & ordered_names;
|
|
|
|
/// used to determine whether column should be filtered during PREWHERE or WHERE
|
|
|
|
const NameSet & column_name_set;
|
|
|
|
/// column names to read during WHERE
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList & columns;
|
2017-03-24 13:52:50 +00:00
|
|
|
/// column names to read during PREWHERE
|
2017-12-25 21:57:29 +00:00
|
|
|
const NamesAndTypesList & pre_columns;
|
2017-03-24 13:52:50 +00:00
|
|
|
/// should PREWHERE column be returned to requesting side?
|
|
|
|
const bool remove_prewhere_column;
|
|
|
|
/// resulting block may require reordering in accordance with `ordered_names`
|
|
|
|
const bool should_reorder;
|
2017-04-06 17:21:45 +00:00
|
|
|
/// Used to satistfy preferred_block_size_bytes limitation
|
|
|
|
MergeTreeBlockSizePredictorPtr size_predictor;
|
2018-02-20 11:45:58 +00:00
|
|
|
/// Used to save current range processing status
|
|
|
|
MergeTreeRangeReader range_reader;
|
|
|
|
MergeTreeRangeReader pre_range_reader;
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2018-02-20 11:45:58 +00:00
|
|
|
bool isFinished() const { return mark_ranges.empty() && range_reader.isCurrentRangeFinished(); }
|
2017-06-15 17:01:13 +00:00
|
|
|
|
2017-03-24 13:52:50 +00:00
|
|
|
MergeTreeReadTask(
|
2019-08-03 11:02:40 +00:00
|
|
|
const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, const size_t part_index_in_query_,
|
|
|
|
const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_,
|
|
|
|
const NamesAndTypesList & pre_columns_, const bool remove_prewhere_column_, const bool should_reorder_,
|
|
|
|
MergeTreeBlockSizePredictorPtr && size_predictor_);
|
2017-04-05 20:34:19 +00:00
|
|
|
|
|
|
|
virtual ~MergeTreeReadTask();
|
2017-03-24 13:52:50 +00:00
|
|
|
};
|
|
|
|
|
2019-07-19 14:56:00 +00:00
|
|
|
struct MergeTreeReadTaskColumns
|
|
|
|
{
|
|
|
|
/// column names to read during WHERE
|
|
|
|
NamesAndTypesList columns;
|
|
|
|
/// column names to read during PREWHERE
|
|
|
|
NamesAndTypesList pre_columns;
|
|
|
|
/// resulting block may require reordering in accordance with `ordered_names`
|
|
|
|
bool should_reorder;
|
|
|
|
};
|
|
|
|
|
2020-06-17 16:39:58 +00:00
|
|
|
/// Returns a MergeTreeReadTaskColumns (WHERE columns, PREWHERE columns, reorder flag)
/// for reading `required_columns` from `data_part`.
/// NOTE(review): how `prewhere_info` and `check_columns` influence the result is defined by the
/// implementation, which is not part of this header - confirm there.
MergeTreeReadTaskColumns getReadTaskColumns(
    const MergeTreeData & storage, const StorageMetadataPtr & metadata_snapshot,
    const MergeTreeData::DataPartPtr & data_part, const Names & required_columns,
    const PrewhereInfoPtr & prewhere_info, bool check_columns);
|
2017-03-24 13:52:50 +00:00
|
|
|
|
|
|
|
struct MergeTreeBlockSizePredictor
|
|
|
|
{
|
2017-05-18 16:03:08 +00:00
|
|
|
MergeTreeBlockSizePredictor(const MergeTreeData::DataPartPtr & data_part_, const Names & columns, const Block & sample_block);
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
/// Reset some values for correct statistics calculating
|
2017-04-06 17:21:45 +00:00
|
|
|
void startBlock();
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
/// Updates statistic for more accurate prediction
|
2019-09-26 17:29:41 +00:00
|
|
|
void update(const Block & sample_block, const Columns & columns, size_t num_rows, double decay = DECAY());
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
/// Return current block size (after update())
|
|
|
|
inline size_t getBlockSize() const
|
2017-03-24 13:52:50 +00:00
|
|
|
{
|
2017-04-10 14:06:44 +00:00
|
|
|
return block_size_bytes;
|
2017-03-24 13:52:50 +00:00
|
|
|
}
|
|
|
|
|
2017-06-30 16:28:27 +00:00
|
|
|
|
|
|
|
/// Predicts what number of rows should be read to exhaust byte quota per column
|
|
|
|
inline size_t estimateNumRowsForMaxSizeColumn(size_t bytes_quota) const
|
2017-06-28 09:44:02 +00:00
|
|
|
{
|
|
|
|
double max_size_per_row = std::max<double>(std::max<size_t>(max_size_per_row_fixed, 1), max_size_per_row_dynamic);
|
|
|
|
return (bytes_quota > block_size_rows * max_size_per_row)
|
2017-07-11 09:32:39 +00:00
|
|
|
? static_cast<size_t>(bytes_quota / max_size_per_row) - block_size_rows
|
|
|
|
: 0;
|
2017-06-28 09:44:02 +00:00
|
|
|
}
|
|
|
|
|
2017-06-30 16:28:27 +00:00
|
|
|
/// Predicts what number of rows should be read to exhaust byte quota per block
|
2017-04-10 14:06:44 +00:00
|
|
|
inline size_t estimateNumRows(size_t bytes_quota) const
|
2017-03-24 13:52:50 +00:00
|
|
|
{
|
2017-04-10 14:06:44 +00:00
|
|
|
return (bytes_quota > block_size_bytes)
|
2017-08-31 18:49:48 +00:00
|
|
|
? static_cast<size_t>((bytes_quota - block_size_bytes) / std::max<size_t>(1, bytes_per_row_current))
|
2017-07-11 09:32:39 +00:00
|
|
|
: 0;
|
2017-03-24 13:52:50 +00:00
|
|
|
}
|
|
|
|
|
2017-06-30 16:28:27 +00:00
|
|
|
inline void updateFilteredRowsRation(size_t rows_was_read, size_t rows_was_filtered, double decay = DECAY())
|
|
|
|
{
|
|
|
|
double alpha = std::pow(1. - decay, rows_was_read);
|
2018-06-04 19:46:47 +00:00
|
|
|
double current_ration = rows_was_filtered / std::max(1.0, static_cast<double>(rows_was_read));
|
2017-07-11 09:32:39 +00:00
|
|
|
filtered_rows_ratio = current_ration < filtered_rows_ratio
|
|
|
|
? current_ration
|
|
|
|
: alpha * filtered_rows_ratio + (1.0 - alpha) * current_ration;
|
2017-06-30 16:28:27 +00:00
|
|
|
}
|
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
/// Aggressiveness of bytes_per_row updates. See update() implementation.
|
|
|
|
/// After n=NUM_UPDATES_TO_TARGET_WEIGHT updates v_{n} = (1 - TARGET_WEIGHT) * v_{0} + TARGET_WEIGHT * v_{target}
|
|
|
|
static constexpr double TARGET_WEIGHT = 0.5;
|
|
|
|
static constexpr size_t NUM_UPDATES_TO_TARGET_WEIGHT = 8192;
|
2017-04-15 04:03:12 +00:00
|
|
|
static double DECAY() { return 1. - std::pow(TARGET_WEIGHT, 1. / NUM_UPDATES_TO_TARGET_WEIGHT); }
|
2017-04-07 11:43:24 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
protected:
|
|
|
|
|
|
|
|
MergeTreeData::DataPartPtr data_part;
|
2017-03-24 13:52:50 +00:00
|
|
|
|
|
|
|
struct ColumnInfo
|
|
|
|
{
|
|
|
|
String name;
|
|
|
|
double bytes_per_row_global = 0;
|
|
|
|
double bytes_per_row = 0;
|
|
|
|
size_t size_bytes = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::vector<ColumnInfo> dynamic_columns_infos;
|
|
|
|
size_t fixed_columns_bytes_per_row = 0;
|
|
|
|
|
2017-06-28 09:44:02 +00:00
|
|
|
size_t max_size_per_row_fixed = 0;
|
|
|
|
double max_size_per_row_dynamic = 0;
|
2017-06-30 16:28:27 +00:00
|
|
|
|
2017-07-20 11:43:12 +00:00
|
|
|
size_t number_of_rows_in_part;
|
2017-07-19 19:19:27 +00:00
|
|
|
|
|
|
|
bool is_initialized_in_update = false;
|
|
|
|
|
2019-09-26 17:29:41 +00:00
|
|
|
void initialize(const Block & sample_block, const Columns & columns, const Names & names, bool from_update = false);
|
2017-07-19 19:19:27 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
public:
|
|
|
|
|
2017-03-24 13:52:50 +00:00
|
|
|
size_t block_size_bytes = 0;
|
|
|
|
size_t block_size_rows = 0;
|
|
|
|
|
|
|
|
/// Total statistics
|
|
|
|
double bytes_per_row_current = 0;
|
|
|
|
double bytes_per_row_global = 0;
|
2017-06-30 16:28:27 +00:00
|
|
|
|
2017-07-11 09:32:39 +00:00
|
|
|
double filtered_rows_ratio = 0;
|
2017-03-24 13:52:50 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|