2017-03-24 13:52:50 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeData.h>
|
2021-04-24 04:09:01 +00:00
|
|
|
#include <DataTypes/NestedUtils.h>
|
2020-10-23 17:57:17 +00:00
|
|
|
#include <Core/NamesAndTypes.h>
|
2020-09-16 13:24:07 +00:00
|
|
|
#include <Common/checkStackSize.h>
|
2017-07-19 19:19:27 +00:00
|
|
|
#include <Common/typeid_cast.h>
|
|
|
|
#include <Columns/ColumnConst.h>
|
2017-07-20 20:07:59 +00:00
|
|
|
#include <unordered_set>
|
2017-03-24 13:52:50 +00:00
|
|
|
|
|
|
|
|
2018-01-10 00:04:08 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2020-02-25 18:10:48 +00:00
|
|
|
/// Error codes thrown from this file. They are only declared here (`extern`);
/// the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int NO_SUCH_COLUMN_IN_TABLE;
}
|
2017-03-24 13:52:50 +00:00
|
|
|
|
2020-09-15 11:17:58 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Columns absent in part may depend on other absent columns so we are
|
2020-09-15 13:29:11 +00:00
|
|
|
/// searching all required physical columns recursively. Return true if found at
|
|
|
|
/// least one existing (physical) column in part.
|
2020-09-15 11:17:58 +00:00
|
|
|
bool injectRequiredColumnsRecursively(
|
|
|
|
const String & column_name,
|
2022-03-28 17:21:47 +00:00
|
|
|
const StorageSnapshotPtr & storage_snapshot,
|
2020-09-15 11:17:58 +00:00
|
|
|
const MergeTreeData::AlterConversions & alter_conversions,
|
|
|
|
const MergeTreeData::DataPartPtr & part,
|
|
|
|
Names & columns,
|
|
|
|
NameSet & required_columns,
|
|
|
|
NameSet & injected_columns)
|
|
|
|
{
|
2020-09-16 13:24:07 +00:00
|
|
|
/// This is needed to prevent stack overflow in case of cyclic defaults or
|
|
|
|
/// huge AST which for some reason was not validated on parsing/interpreter
|
|
|
|
/// stages.
|
|
|
|
checkStackSize();
|
2020-09-15 11:17:58 +00:00
|
|
|
|
2022-03-28 17:21:47 +00:00
|
|
|
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects();
|
|
|
|
auto column_in_storage = storage_snapshot->tryGetColumn(options, column_name);
|
2021-07-15 03:12:37 +00:00
|
|
|
if (column_in_storage)
|
2020-09-15 11:17:58 +00:00
|
|
|
{
|
2021-07-15 03:12:37 +00:00
|
|
|
auto column_name_in_part = column_in_storage->getNameInStorage();
|
2020-10-23 17:57:17 +00:00
|
|
|
if (alter_conversions.isColumnRenamed(column_name_in_part))
|
|
|
|
column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
|
|
|
|
|
|
|
|
auto column_in_part = NameAndTypePair(
|
2021-07-15 03:12:37 +00:00
|
|
|
column_name_in_part, column_in_storage->getSubcolumnName(),
|
|
|
|
column_in_storage->getTypeInStorage(), column_in_storage->type);
|
2020-10-23 17:57:17 +00:00
|
|
|
|
|
|
|
/// column has files and hence does not require evaluation
|
|
|
|
if (part->hasColumnFiles(column_in_part))
|
2020-09-15 11:17:58 +00:00
|
|
|
{
|
2020-10-23 17:57:17 +00:00
|
|
|
/// ensure each column is added only once
|
2022-04-18 10:18:43 +00:00
|
|
|
if (!required_columns.contains(column_name))
|
2020-10-23 17:57:17 +00:00
|
|
|
{
|
|
|
|
columns.emplace_back(column_name);
|
|
|
|
required_columns.emplace(column_name);
|
|
|
|
injected_columns.emplace(column_name);
|
|
|
|
}
|
|
|
|
return true;
|
2020-09-15 11:17:58 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Column doesn't have default value and don't exist in part
|
|
|
|
/// don't need to add to required set.
|
2022-03-28 17:21:47 +00:00
|
|
|
auto metadata_snapshot = storage_snapshot->getMetadataForQuery();
|
|
|
|
const auto column_default = metadata_snapshot->getColumns().getDefault(column_name);
|
2020-09-15 11:17:58 +00:00
|
|
|
if (!column_default)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/// collect identifiers required for evaluation
|
|
|
|
IdentifierNameSet identifiers;
|
|
|
|
column_default->expression->collectIdentifierNames(identifiers);
|
|
|
|
|
|
|
|
bool result = false;
|
|
|
|
for (const auto & identifier : identifiers)
|
2022-03-28 17:21:47 +00:00
|
|
|
result |= injectRequiredColumnsRecursively(identifier, storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns);
|
2020-09-15 11:17:58 +00:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
2019-11-21 16:10:22 +00:00
|
|
|
|
2022-03-28 17:21:47 +00:00
|
|
|
/// Extends `columns` with everything that has to be read from `part` in order to produce the
/// requested columns, e.g. the columns referenced by default expressions of columns that have
/// no files in the part.
///
/// If no requested column resolves to a column with files in the part, the column returned by
/// `part->getColumnNameWithMinimumCompressedSize()` is appended, so that at least the number of
/// rows can be determined.
///
/// Returns the set of names that were appended to `columns`.
/// Throws NO_SUCH_COLUMN_IN_TABLE if an entry of `columns` is not a physical column or subcolumn.
NameSet injectRequiredColumns(
    const MergeTreeData & storage,
    const StorageSnapshotPtr & storage_snapshot,
    const MergeTreeData::DataPartPtr & part,
    Names & columns)
{
    NameSet already_required{columns.begin(), columns.end()};
    NameSet injected;

    /// Alter conversions are only requested for non-projection parts.
    MergeTreeData::AlterConversions alter_conversions;
    if (!part->isProjectionPart())
        alter_conversions = storage.getAlterConversionsForPart(part);

    /// We are going to fetch only physical columns.
    auto physical_options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects();

    bool found_physical_column = false;

    /// Index-based loop on purpose: the recursive injection appends to `columns`, and the
    /// appended entries are validated and processed by later iterations as well.
    for (size_t pos = 0; pos < columns.size(); ++pos)
    {
        if (!storage_snapshot->tryGetColumn(physical_options, columns[pos]))
            throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[pos]);

        if (injectRequiredColumnsRecursively(
                columns[pos], storage_snapshot, alter_conversions, part, columns, already_required, injected))
            found_physical_column = true;
    }

    /** Add a column of the minimum size.
      * Used when no column is needed or the files are missing, but the number of rows still has to be known.
      * The column is appended to `columns` and reported as injected.
      */
    if (!found_physical_column)
    {
        const auto smallest_column_name = part->getColumnNameWithMinimumCompressedSize(storage_snapshot);
        columns.push_back(smallest_column_name);
        injected.insert(smallest_column_name);
    }

    return injected;
}
|
|
|
|
|
|
|
|
|
2017-04-06 17:21:45 +00:00
|
|
|
MergeTreeReadTask::MergeTreeReadTask(
|
2022-03-13 12:23:51 +00:00
|
|
|
const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_,
|
2019-08-03 11:02:40 +00:00
|
|
|
const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_,
|
2022-03-13 12:23:51 +00:00
|
|
|
const NamesAndTypesList & pre_columns_, bool remove_prewhere_column_, bool should_reorder_,
|
2021-02-13 22:07:13 +00:00
|
|
|
MergeTreeBlockSizePredictorPtr && size_predictor_)
|
2019-08-03 11:02:40 +00:00
|
|
|
: data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_},
|
|
|
|
ordered_names{ordered_names_}, column_name_set{column_name_set_}, columns{columns_}, pre_columns{pre_columns_},
|
2021-02-13 22:07:13 +00:00
|
|
|
remove_prewhere_column{remove_prewhere_column_}, should_reorder{should_reorder_}, size_predictor{std::move(size_predictor_)}
|
2020-02-07 21:07:18 +00:00
|
|
|
{
|
|
|
|
}
|
2017-04-06 17:21:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
MergeTreeBlockSizePredictor::MergeTreeBlockSizePredictor(
|
2017-05-18 16:03:08 +00:00
|
|
|
const MergeTreeData::DataPartPtr & data_part_, const Names & columns, const Block & sample_block)
|
2017-07-19 19:19:27 +00:00
|
|
|
: data_part(data_part_)
|
2017-04-06 17:21:45 +00:00
|
|
|
{
|
2017-10-24 14:11:53 +00:00
|
|
|
number_of_rows_in_part = data_part->rows_count;
|
2019-01-22 19:56:53 +00:00
|
|
|
/// Initialize with sample block until update won't called.
|
2019-09-26 17:29:41 +00:00
|
|
|
initialize(sample_block, {}, columns);
|
2017-07-19 19:19:27 +00:00
|
|
|
}
|
|
|
|
|
2019-09-26 17:29:41 +00:00
|
|
|
void MergeTreeBlockSizePredictor::initialize(const Block & sample_block, const Columns & columns, const Names & names, bool from_update)
|
2017-07-19 19:19:27 +00:00
|
|
|
{
|
|
|
|
fixed_columns_bytes_per_row = 0;
|
|
|
|
dynamic_columns_infos.clear();
|
|
|
|
|
2017-07-20 20:07:59 +00:00
|
|
|
std::unordered_set<String> names_set;
|
|
|
|
if (!from_update)
|
2019-09-26 17:29:41 +00:00
|
|
|
names_set.insert(names.begin(), names.end());
|
2017-07-20 20:07:59 +00:00
|
|
|
|
2019-09-26 17:29:41 +00:00
|
|
|
size_t num_columns = sample_block.columns();
|
|
|
|
for (size_t pos = 0; pos < num_columns; ++pos)
|
2017-04-06 17:21:45 +00:00
|
|
|
{
|
2019-09-26 17:29:41 +00:00
|
|
|
const auto & column_with_type_and_name = sample_block.getByPosition(pos);
|
2017-07-19 19:19:27 +00:00
|
|
|
const String & column_name = column_with_type_and_name.name;
|
2019-09-26 17:29:41 +00:00
|
|
|
const ColumnPtr & column_data = from_update ? columns[pos]
|
|
|
|
: column_with_type_and_name.column;
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2022-04-18 10:18:43 +00:00
|
|
|
if (!from_update && !names_set.contains(column_name))
|
2017-07-20 20:07:59 +00:00
|
|
|
continue;
|
|
|
|
|
2017-07-19 19:19:27 +00:00
|
|
|
/// At least PREWHERE filter column might be const.
|
2017-07-21 06:40:05 +00:00
|
|
|
if (typeid_cast<const ColumnConst *>(column_data.get()))
|
2017-07-19 19:19:27 +00:00
|
|
|
continue;
|
2017-05-18 16:03:08 +00:00
|
|
|
|
2017-12-09 10:14:45 +00:00
|
|
|
if (column_data->valuesHaveFixedSize())
|
2017-04-06 17:21:45 +00:00
|
|
|
{
|
2017-12-09 10:14:45 +00:00
|
|
|
size_t size_of_value = column_data->sizeOfValueIfFixed();
|
|
|
|
fixed_columns_bytes_per_row += column_data->sizeOfValueIfFixed();
|
|
|
|
max_size_per_row_fixed = std::max<size_t>(max_size_per_row_fixed, size_of_value);
|
2017-04-06 17:21:45 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ColumnInfo info;
|
2017-05-18 16:03:08 +00:00
|
|
|
info.name = column_name;
|
2017-07-19 19:19:27 +00:00
|
|
|
/// If column isn't fixed and doesn't have checksum, than take first
|
2021-12-09 10:39:28 +00:00
|
|
|
ColumnSize column_size = data_part->getColumnSize(column_name);
|
2018-03-26 14:18:04 +00:00
|
|
|
|
|
|
|
info.bytes_per_row_global = column_size.data_uncompressed
|
|
|
|
? column_size.data_uncompressed / number_of_rows_in_part
|
2017-07-19 19:19:27 +00:00
|
|
|
: column_data->byteSize() / std::max<size_t>(1, column_data->size());
|
2017-04-06 17:21:45 +00:00
|
|
|
|
|
|
|
dynamic_columns_infos.emplace_back(info);
|
|
|
|
}
|
2017-07-19 19:19:27 +00:00
|
|
|
}
|
2017-04-06 17:21:45 +00:00
|
|
|
|
|
|
|
bytes_per_row_global = fixed_columns_bytes_per_row;
|
|
|
|
for (auto & info : dynamic_columns_infos)
|
|
|
|
{
|
|
|
|
info.bytes_per_row = info.bytes_per_row_global;
|
|
|
|
bytes_per_row_global += info.bytes_per_row_global;
|
2017-06-28 09:44:02 +00:00
|
|
|
|
|
|
|
max_size_per_row_dynamic = std::max<double>(max_size_per_row_dynamic, info.bytes_per_row);
|
2017-04-06 17:21:45 +00:00
|
|
|
}
|
|
|
|
bytes_per_row_current = bytes_per_row_global;
|
|
|
|
}
|
|
|
|
|
|
|
|
void MergeTreeBlockSizePredictor::startBlock()
|
|
|
|
{
|
|
|
|
block_size_bytes = 0;
|
|
|
|
block_size_rows = 0;
|
|
|
|
for (auto & info : dynamic_columns_infos)
|
|
|
|
info.size_bytes = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-18 16:03:08 +00:00
|
|
|
/// TODO: add last_read_row_in_part parameter to take into account gaps between adjacent ranges
|
2019-09-26 17:29:41 +00:00
|
|
|
void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Columns & columns, size_t num_rows, double decay)
|
2017-04-07 11:43:24 +00:00
|
|
|
{
|
2019-09-26 17:29:41 +00:00
|
|
|
if (columns.size() != sample_block.columns())
|
|
|
|
throw Exception("Inconsistent number of columns passed to MergeTreeBlockSizePredictor. "
|
|
|
|
"Have " + toString(sample_block.columns()) + " in sample block "
|
|
|
|
"and " + toString(columns.size()) + " columns in list", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
2017-07-19 19:19:27 +00:00
|
|
|
if (!is_initialized_in_update)
|
|
|
|
{
|
|
|
|
/// Reinitialize with read block to update estimation for DEFAULT and MATERIALIZED columns without data.
|
2019-09-26 17:29:41 +00:00
|
|
|
initialize(sample_block, columns, {}, true);
|
2017-07-19 19:19:27 +00:00
|
|
|
is_initialized_in_update = true;
|
|
|
|
}
|
2019-09-26 17:29:41 +00:00
|
|
|
|
|
|
|
if (num_rows < block_size_rows)
|
2017-04-10 14:06:44 +00:00
|
|
|
{
|
2019-09-26 17:29:41 +00:00
|
|
|
throw Exception("Updated block has less rows (" + toString(num_rows) + ") than previous one (" + toString(block_size_rows) + ")",
|
2017-04-10 14:06:44 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
|
2019-09-26 17:29:41 +00:00
|
|
|
size_t diff_rows = num_rows - block_size_rows;
|
|
|
|
block_size_bytes = num_rows * fixed_columns_bytes_per_row;
|
2017-04-07 11:43:24 +00:00
|
|
|
bytes_per_row_current = fixed_columns_bytes_per_row;
|
2019-09-26 17:29:41 +00:00
|
|
|
block_size_rows = num_rows;
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2017-04-10 14:06:44 +00:00
|
|
|
/// Make recursive updates for each read row: v_{i+1} = (1 - decay) v_{i} + decay v_{target}
|
2019-01-22 19:56:53 +00:00
|
|
|
/// Use sum of geometric sequence formula to update multiple rows: v{n} = (1 - decay)^n v_{0} + (1 - (1 - decay)^n) v_{target}
|
2017-07-19 19:19:27 +00:00
|
|
|
/// NOTE: DEFAULT and MATERIALIZED columns without data has inaccurate estimation of v_{target}
|
2017-04-15 03:32:33 +00:00
|
|
|
double alpha = std::pow(1. - decay, diff_rows);
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2017-06-28 09:44:02 +00:00
|
|
|
max_size_per_row_dynamic = 0;
|
2017-04-07 11:43:24 +00:00
|
|
|
for (auto & info : dynamic_columns_infos)
|
|
|
|
{
|
2019-09-26 17:29:41 +00:00
|
|
|
size_t new_size = columns[sample_block.getPositionByName(info.name)]->byteSize();
|
2017-04-15 03:32:33 +00:00
|
|
|
size_t diff_size = new_size - info.size_bytes;
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2017-04-15 03:32:33 +00:00
|
|
|
double local_bytes_per_row = static_cast<double>(diff_size) / diff_rows;
|
2017-04-07 11:43:24 +00:00
|
|
|
info.bytes_per_row = alpha * info.bytes_per_row + (1. - alpha) * local_bytes_per_row;
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2017-04-07 11:43:24 +00:00
|
|
|
info.size_bytes = new_size;
|
|
|
|
block_size_bytes += new_size;
|
|
|
|
bytes_per_row_current += info.bytes_per_row;
|
2017-06-28 09:44:02 +00:00
|
|
|
|
|
|
|
max_size_per_row_dynamic = std::max<double>(max_size_per_row_dynamic, info.bytes_per_row);
|
2017-04-06 17:21:45 +00:00
|
|
|
}
|
2017-04-07 11:43:24 +00:00
|
|
|
}
|
2017-04-06 17:21:45 +00:00
|
|
|
|
2019-07-19 14:56:00 +00:00
|
|
|
|
2020-06-17 16:39:58 +00:00
|
|
|
/// Splits the columns needed to read `required_columns` from `data_part` into the columns read
/// at the PREWHERE stage (`pre_columns`) and the remaining ones (`columns`).
///
/// Both lists are extended by injectRequiredColumns(). `should_reorder` is set when either
/// call injected at least one column.
MergeTreeReadTaskColumns getReadTaskColumns(
    const MergeTreeData & storage,
    const StorageSnapshotPtr & storage_snapshot,
    const MergeTreeData::DataPartPtr & data_part,
    const Names & required_columns,
    const PrewhereInfoPtr & prewhere_info)
{
    Names column_names = required_columns;
    Names pre_column_names;

    /// Inject columns required for defaults evaluation.
    bool should_reorder = !injectRequiredColumns(storage, storage_snapshot, data_part, column_names).empty();

    if (prewhere_info)
    {
        if (prewhere_info->alias_actions)
        {
            pre_column_names = prewhere_info->alias_actions->getRequiredColumnsNames();
        }
        else
        {
            pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames();

            if (prewhere_info->row_level_filter)
            {
                /// Add the columns of the row-level filter that PREWHERE does not already require.
                NameSet names(pre_column_names.begin(), pre_column_names.end());

                for (const auto & name : prewhere_info->row_level_filter->getRequiredColumnsNames())
                {
                    if (!names.contains(name))
                        pre_column_names.push_back(name);
                }
            }
        }

        /// `column_names` is never empty here: injectRequiredColumns() appends a column of the
        /// minimum size when nothing else can be read from the part.
        if (pre_column_names.empty())
            pre_column_names.push_back(column_names[0]);

        const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot, data_part, pre_column_names);
        if (!injected_pre_columns.empty())
            should_reorder = true;

        /// Columns read at the PREWHERE stage must not be read again afterwards.
        const NameSet pre_name_set(pre_column_names.begin(), pre_column_names.end());

        Names post_column_names;
        for (const auto & name : column_names)
            if (!pre_name_set.contains(name))
                post_column_names.push_back(name);

        column_names = std::move(post_column_names);
    }

    MergeTreeReadTaskColumns result;

    auto options = GetColumnsOptions(GetColumnsOptions::All).withSubcolumns().withExtendedObjects();
    result.pre_columns = storage_snapshot->getColumnsByNames(options, pre_column_names);
    result.columns = storage_snapshot->getColumnsByNames(options, column_names);
    result.should_reorder = should_reorder;
    return result;
}
|
|
|
|
|
2017-03-24 13:52:50 +00:00
|
|
|
}
|