#include <boost/rational.hpp> /// For calculations related to sampling coefficients.
#include <ext/scope_guard.h>
#include <optional>
#include <unordered_set>

#include <Poco/File.h>

#include <Common/FieldVisitors.h>

#include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeReverseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h>
#include <Storages/MergeTree/MergeTreeIndices.h>
#include <Storages/MergeTree/MergeTreeIndexReader.h>
#include <Storages/MergeTree/KeyCondition.h>
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
#include <Storages/ReadInOrderOptimizer.h>

#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTSampleRatio.h>
#include <Parsers/parseIdentifierOrStringLiteral.h>

#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/Context.h>

#include <Processors/ConcatProcessor.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Processors/QueryPlan/FilterStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/ReadFromMergeTree.h>
#include <Processors/QueryPlan/MergingSortedStep.h>
#include <Processors/QueryPlan/UnionStep.h>
#include <Processors/QueryPlan/MergingFinal.h>

#include <Core/UUID.h>

#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesNumber.h>

#include <Storages/VirtualColumnUtils.h>

namespace ProfileEvents
{
    extern const Event SelectedParts;
    extern const Event SelectedRanges;
    extern const Event SelectedMarks;
}


namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int INDEX_NOT_USED;
    extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
    extern const int ILLEGAL_COLUMN;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int TOO_MANY_ROWS;
    extern const int CANNOT_PARSE_TEXT;
    extern const int TOO_MANY_PARTITIONS;
    extern const int DUPLICATED_PART_UUIDS;
}


MergeTreeDataSelectExecutor::MergeTreeDataSelectExecutor(const MergeTreeData & data_)
    : data(data_), log(&Poco::Logger::get(data.getLogName() + " (SelectExecutor)"))
{
}


/// Construct a block consisting only of possible values of virtual columns.
static Block getBlockWithVirtualPartColumns(const MergeTreeData::DataPartsVector & parts, bool with_uuid)
{
    auto part_column = ColumnString::create();
    auto part_uuid_column = ColumnUUID::create();

    for (const auto & part : parts)
    {
        part_column->insert(part->name);
        if (with_uuid)
            part_uuid_column->insert(part->uuid);
    }

    if (with_uuid)
    {
        return Block(std::initializer_list<ColumnWithTypeAndName>{
            ColumnWithTypeAndName(std::move(part_column), std::make_shared<DataTypeString>(), "_part"),
            ColumnWithTypeAndName(std::move(part_uuid_column), std::make_shared<DataTypeUUID>(), "_part_uuid"),
        });
    }

    return Block{ColumnWithTypeAndName(std::move(part_column), std::make_shared<DataTypeString>(), "_part")};
}


size_t MergeTreeDataSelectExecutor::getApproximateTotalRowsToRead(
    const MergeTreeData::DataPartsVector & parts,
    const StorageMetadataPtr & metadata_snapshot,
    const KeyCondition & key_condition,
    const Settings & settings) const
{
    size_t rows_count = 0;

    /// We will find out how many rows we would have read without sampling.
    LOG_DEBUG(log, "Preliminary index scan with condition: {}", key_condition.toString());

    for (const auto & part : parts)
    {
        MarkRanges ranges = markRangesFromPKRange(part, metadata_snapshot, key_condition, settings, log);

        /** In order to get a lower bound on the number of rows that match the condition on PK,
          * consider only guaranteed full marks.
          * That is, do not take into account the first and last marks, which may be incomplete.
          */
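        /// E.g. for a mark range [10, 20) only the rows of marks 11..18 are counted
        /// (the sub-range {range.begin + 1, range.end - 1} below), since the first and
        /// the last mark may be only partially covered by the condition.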
        for (const auto & range : ranges)
            if (range.end - range.begin > 2)
                rows_count += part->index_granularity.getRowsCountInRange({range.begin + 1, range.end - 1});
    }

    return rows_count;
}


using RelativeSize = boost::rational<ASTSampleRatio::BigNum>;

static std::string toString(const RelativeSize & x)
{
    return ASTSampleRatio::toString(x.numerator()) + "/" + ASTSampleRatio::toString(x.denominator());
}

/// Converts sample size from an approximate number of rows (e.g. `SAMPLE 1000000`) to a relative value (e.g. `SAMPLE 0.1`).
static RelativeSize convertAbsoluteSampleSizeToRelative(const ASTPtr & node, size_t approx_total_rows)
{
    if (approx_total_rows == 0)
        return 1;

    const auto & node_sample = node->as<ASTSampleRatio &>();

    auto absolute_sample_size = node_sample.ratio.numerator / node_sample.ratio.denominator;
    return std::min(RelativeSize(1), RelativeSize(absolute_sample_size) / RelativeSize(approx_total_rows));
}
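
/// Example: `SAMPLE 1000000` over a table with approx_total_rows = 20000000
/// yields min(1, 1000000/20000000) = 1/20, i.e. the same as `SAMPLE 0.05`.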


QueryPlanPtr MergeTreeDataSelectExecutor::read(
    const Names & column_names_to_return,
    const StorageMetadataPtr & metadata_snapshot,
    const SelectQueryInfo & query_info,
    const Context & context,
    const UInt64 max_block_size,
    const unsigned num_streams,
    const PartitionIdToMaxBlock * max_block_numbers_to_read) const
{
    return readFromParts(
        data.getDataPartsVector(), column_names_to_return, metadata_snapshot,
        query_info, context, max_block_size, num_streams,
        max_block_numbers_to_read);
}


QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts(
    MergeTreeData::DataPartsVector parts,
    const Names & column_names_to_return,
    const StorageMetadataPtr & metadata_snapshot,
    const SelectQueryInfo & query_info,
    const Context & context,
    const UInt64 max_block_size,
    const unsigned num_streams,
    const PartitionIdToMaxBlock * max_block_numbers_to_read) const
{
    /// If the query contains restrictions on the virtual column `_part` or `_part_index`, select only parts suitable for it.
    /// The virtual column `_sample_factor` (which is equal to 1 / used sample rate) can be requested in the query.
    Names virt_column_names;
    Names real_column_names;

    size_t total_parts = parts.size();
    bool part_column_queried = false;
    bool part_uuid_column_queried = false;

    bool sample_factor_column_queried = false;
    Float64 used_sample_factor = 1;

    for (const String & name : column_names_to_return)
    {
        if (name == "_part")
        {
            part_column_queried = true;
            virt_column_names.push_back(name);
        }
        else if (name == "_part_index")
        {
            virt_column_names.push_back(name);
        }
        else if (name == "_partition_id")
        {
            virt_column_names.push_back(name);
        }
        else if (name == "_part_uuid")
        {
            part_uuid_column_queried = true;
            virt_column_names.push_back(name);
        }
        else if (name == "_sample_factor")
        {
            sample_factor_column_queried = true;
            virt_column_names.push_back(name);
        }
        else
        {
            real_column_names.push_back(name);
        }
    }

    NamesAndTypesList available_real_columns = metadata_snapshot->getColumns().getAllPhysical();

    /// If there are only virtual columns in the query, you must request at least one non-virtual one.
    if (real_column_names.empty())
        real_column_names.push_back(ExpressionActions::getSmallestColumn(available_real_columns));

    /// If the `_part` or `_part_uuid` virtual columns are requested, we try to filter out data by them.
    Block virtual_columns_block = getBlockWithVirtualPartColumns(parts, part_uuid_column_queried);
    if (part_column_queried || part_uuid_column_queried)
        VirtualColumnUtils::filterBlockWithQuery(query_info.query, virtual_columns_block, context);

    auto part_values = VirtualColumnUtils::extractSingleValueFromBlock<String>(virtual_columns_block, "_part");

    metadata_snapshot->check(real_column_names, data.getVirtuals(), data.getStorageID());

    const Settings & settings = context.getSettingsRef();
    const auto & primary_key = metadata_snapshot->getPrimaryKey();
    Names primary_key_columns = primary_key.column_names;

    KeyCondition key_condition(query_info, context, primary_key_columns, primary_key.expression);

    if (settings.force_primary_key && key_condition.alwaysUnknownOrTrue())
    {
        throw Exception(ErrorCodes::INDEX_NOT_USED, "Primary key ({}) is not used and setting 'force_primary_key' is set.",
            boost::algorithm::join(primary_key_columns, ", "));
    }

    std::optional<KeyCondition> minmax_idx_condition;
    std::optional<PartitionPruner> partition_pruner;
    DataTypes minmax_columns_types;
    if (metadata_snapshot->hasPartitionKey())
    {
        const auto & partition_key = metadata_snapshot->getPartitionKey();
        auto minmax_columns_names = data.getMinMaxColumnsNames(partition_key);
        minmax_columns_types = data.getMinMaxColumnsTypes(partition_key);

        minmax_idx_condition.emplace(query_info, context, minmax_columns_names, data.getMinMaxExpr(partition_key, ExpressionActionsSettings::fromContext(context)));
        partition_pruner.emplace(metadata_snapshot->getPartitionKey(), query_info, context, false /* strict */);
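
        /// Illustrative example: for PARTITION BY toYYYYMM(date), each part stores the min/max
        /// values of `date`, so a condition like `date >= '2021-01-01'` allows the min-max
        /// condition and the partition pruner to drop entire parts without reading them.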

        if (settings.force_index_by_date && (minmax_idx_condition->alwaysUnknownOrTrue() && partition_pruner->isUseless()))
        {
            String msg = "Neither MinMax index by columns (";
            bool first = true;
            for (const String & col : minmax_columns_names)
            {
                if (first)
                    first = false;
                else
                    msg += ", ";
                msg += col;
            }
            msg += ") nor partition expr is used and setting 'force_index_by_date' is set";

            throw Exception(msg, ErrorCodes::INDEX_NOT_USED);
        }
    }

    const Context & query_context = context.hasQueryContext() ? context.getQueryContext() : context;

    PartFilterCounters part_filter_counters;
    auto index_stats = std::make_unique<ReadFromMergeTree::IndexStats>();

    if (query_context.getSettingsRef().allow_experimental_query_deduplication)
        selectPartsToReadWithUUIDFilter(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read, query_context, part_filter_counters);
    else
        selectPartsToRead(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read, part_filter_counters);

    index_stats->emplace_back(ReadFromMergeTree::IndexStat{
        .type = ReadFromMergeTree::IndexType::None,
        .num_parts_after = part_filter_counters.num_initial_selected_parts,
        .num_granules_after = part_filter_counters.num_initial_selected_granules});

    if (minmax_idx_condition)
    {
        index_stats->emplace_back(ReadFromMergeTree::IndexStat{
            .type = ReadFromMergeTree::IndexType::MinMax,
            .description = minmax_idx_condition->toString(),
            .num_parts_after = part_filter_counters.num_parts_after_minmax,
            .num_granules_after = part_filter_counters.num_granules_after_minmax});
    }

    if (partition_pruner)
    {
        index_stats->emplace_back(ReadFromMergeTree::IndexStat{
            .type = ReadFromMergeTree::IndexType::Partition,
            .description = partition_pruner->toString(),
            .num_parts_after = part_filter_counters.num_parts_after_partition_pruner,
            .num_granules_after = part_filter_counters.num_granules_after_partition_pruner});
    }

    /// Sampling.
    Names column_names_to_read = real_column_names;
    std::shared_ptr<ASTFunction> filter_function;
    ActionsDAGPtr filter_expression;

    RelativeSize relative_sample_size = 0;
    RelativeSize relative_sample_offset = 0;

    const auto & select = query_info.query->as<ASTSelectQuery &>();

    auto select_sample_size = select.sampleSize();
    auto select_sample_offset = select.sampleOffset();

    if (select_sample_size)
    {
        relative_sample_size.assign(
            select_sample_size->as<ASTSampleRatio &>().ratio.numerator,
            select_sample_size->as<ASTSampleRatio &>().ratio.denominator);

        if (relative_sample_size < 0)
            throw Exception("Negative sample size", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        relative_sample_offset = 0;
        if (select_sample_offset)
            relative_sample_offset.assign(
                select_sample_offset->as<ASTSampleRatio &>().ratio.numerator,
                select_sample_offset->as<ASTSampleRatio &>().ratio.denominator);

        if (relative_sample_offset < 0)
            throw Exception("Negative sample offset", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        /// Convert the absolute value of the sampling (in the form `SAMPLE 1000000` - how many rows to read) into the relative `SAMPLE 0.1` (how much data to read).
        size_t approx_total_rows = 0;
        if (relative_sample_size > 1 || relative_sample_offset > 1)
            approx_total_rows = getApproximateTotalRowsToRead(parts, metadata_snapshot, key_condition, settings);

        if (relative_sample_size > 1)
        {
            relative_sample_size = convertAbsoluteSampleSizeToRelative(select_sample_size, approx_total_rows);
            LOG_DEBUG(log, "Selected relative sample size: {}", toString(relative_sample_size));
        }

        /// SAMPLE 1 is the same as the absence of SAMPLE.
        if (relative_sample_size == RelativeSize(1))
            relative_sample_size = 0;

        if (relative_sample_offset > 0 && RelativeSize(0) == relative_sample_size)
            throw Exception("Sampling offset is incorrect because no sampling", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        if (relative_sample_offset > 1)
        {
            relative_sample_offset = convertAbsoluteSampleSizeToRelative(select_sample_offset, approx_total_rows);
            LOG_DEBUG(log, "Selected relative sample offset: {}", toString(relative_sample_offset));
        }
    }

    /** Which range of sampling key values do I need to read?
      * First, in the whole range ("universe") we select the interval
      *  of relative `relative_sample_size` size, offset from the beginning by `relative_sample_offset`.
      *
      * Example: SAMPLE 0.4 OFFSET 0.3
      *
      * [------********------]
      *        ^ - offset
      *        <------> - size
      *
      * If the interval passes through the end of the universe, then cut its right side.
      *
      * Example: SAMPLE 0.4 OFFSET 0.8
      *
      * [----------------****]
      *                  ^ - offset
      *                  <------> - size
      *
      * Next, if the `parallel_replicas_count`, `parallel_replica_offset` settings are set,
      *  then it is necessary to break the received interval into pieces of the number `parallel_replicas_count`,
      *  and select a piece with the number `parallel_replica_offset` (from zero).
      *
      * Example: SAMPLE 0.4 OFFSET 0.3, parallel_replicas_count = 2, parallel_replica_offset = 1
      *
      * [----------****------]
      *        ^ - offset
      *        <------> - size
      *        <--><--> - pieces for different `parallel_replica_offset`, select the second one.
      *
      * It is very important that the intervals for different `parallel_replica_offset` cover the entire range without gaps and overlaps.
      * It is also important that the entire universe can be covered using SAMPLE 0.1 OFFSET 0, ... OFFSET 0.9 and similar decimals.
      */

    /// Parallel replicas have been requested but there is no way to sample data.
    /// Select all data from the first replica and no data from other replicas.
    if (settings.parallel_replicas_count > 1 && !data.supportsSampling() && settings.parallel_replica_offset > 0)
    {
        LOG_DEBUG(log, "Will use no data on this replica because parallel replicas processing has been requested"
            " (the setting 'max_parallel_replicas') but the table does not support sampling and this replica is not the first.");
        return {};
    }

    bool use_sampling = relative_sample_size > 0 || (settings.parallel_replicas_count > 1 && data.supportsSampling());
    bool no_data = false;   /// There is nothing left after sampling.

    if (use_sampling)
    {
        if (sample_factor_column_queried && relative_sample_size != RelativeSize(0))
            used_sample_factor = 1.0 / boost::rational_cast<Float64>(relative_sample_size);

        RelativeSize size_of_universum = 0;
        const auto & sampling_key = metadata_snapshot->getSamplingKey();
        DataTypePtr sampling_column_type = sampling_key.data_types[0];

        if (sampling_key.data_types.size() == 1)
        {
            if (typeid_cast<const DataTypeUInt64 *>(sampling_column_type.get()))
                size_of_universum = RelativeSize(std::numeric_limits<UInt64>::max()) + RelativeSize(1);
            else if (typeid_cast<const DataTypeUInt32 *>(sampling_column_type.get()))
                size_of_universum = RelativeSize(std::numeric_limits<UInt32>::max()) + RelativeSize(1);
            else if (typeid_cast<const DataTypeUInt16 *>(sampling_column_type.get()))
                size_of_universum = RelativeSize(std::numeric_limits<UInt16>::max()) + RelativeSize(1);
            else if (typeid_cast<const DataTypeUInt8 *>(sampling_column_type.get()))
                size_of_universum = RelativeSize(std::numeric_limits<UInt8>::max()) + RelativeSize(1);
        }
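
        /// Example: for a UInt32 sampling column the universe is [0, 2^32),
        /// so `SAMPLE 1/2` selects the keys in [0, 2^31).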

        if (size_of_universum == RelativeSize(0))
            throw Exception(
                "Invalid sampling column type in storage parameters: " + sampling_column_type->getName()
                + ". Must be one unsigned integer type",
                ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);

        if (settings.parallel_replicas_count > 1)
        {
            if (relative_sample_size == RelativeSize(0))
                relative_sample_size = 1;

            relative_sample_size /= settings.parallel_replicas_count.value;
            relative_sample_offset += relative_sample_size * RelativeSize(settings.parallel_replica_offset.value);
        }
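
        /// Example: SAMPLE 0.4 OFFSET 0.3 with parallel_replicas_count = 2 and
        /// parallel_replica_offset = 1 becomes size 0.2, offset 0.3 + 0.2 * 1 = 0.5,
        /// i.e. this replica reads the second half of the interval: [0.5, 0.7).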

        if (relative_sample_offset >= RelativeSize(1))
            no_data = true;

        /// Calculate the half-interval of `[lower, upper)` column values.
        bool has_lower_limit = false;
        bool has_upper_limit = false;

        RelativeSize lower_limit_rational = relative_sample_offset * size_of_universum;
        RelativeSize upper_limit_rational = (relative_sample_offset + relative_sample_size) * size_of_universum;

        UInt64 lower = boost::rational_cast<ASTSampleRatio::BigNum>(lower_limit_rational);
        UInt64 upper = boost::rational_cast<ASTSampleRatio::BigNum>(upper_limit_rational);

        if (lower > 0)
            has_lower_limit = true;

        if (upper_limit_rational < size_of_universum)
            has_upper_limit = true;

        /*std::cerr << std::fixed << std::setprecision(100)
            << "relative_sample_size: " << relative_sample_size << "\n"
            << "relative_sample_offset: " << relative_sample_offset << "\n"
            << "lower_limit_float: " << lower_limit_rational << "\n"
            << "upper_limit_float: " << upper_limit_rational << "\n"
            << "lower: " << lower << "\n"
            << "upper: " << upper << "\n";*/

        if ((has_upper_limit && upper == 0)
            || (has_lower_limit && has_upper_limit && lower == upper))
            no_data = true;

        if (no_data || (!has_lower_limit && !has_upper_limit))
        {
            use_sampling = false;
        }
        else
        {
            /// Let's add the conditions to cut off something else when the index is scanned again and when the request is processed.

            std::shared_ptr<ASTFunction> lower_function;
            std::shared_ptr<ASTFunction> upper_function;

            /// If sample and final are used together, there is no need to calculate the sampling expression twice.
            /// The first time it was calculated for final, because the sample key is a part of the PK.
            /// So, assume that we already have the calculated column.
            ASTPtr sampling_key_ast = metadata_snapshot->getSamplingKeyAST();

            if (select.final())
            {
                sampling_key_ast = std::make_shared<ASTIdentifier>(sampling_key.column_names[0]);
                /// We do spoil available_real_columns here, but it is not used later.
                available_real_columns.emplace_back(sampling_key.column_names[0], std::move(sampling_column_type));
            }

            if (has_lower_limit)
            {
                if (!key_condition.addCondition(sampling_key.column_names[0], Range::createLeftBounded(lower, true)))
                    throw Exception("Sampling column not in primary key", ErrorCodes::ILLEGAL_COLUMN);

                ASTPtr args = std::make_shared<ASTExpressionList>();
                args->children.push_back(sampling_key_ast);
                args->children.push_back(std::make_shared<ASTLiteral>(lower));

                lower_function = std::make_shared<ASTFunction>();
                lower_function->name = "greaterOrEquals";
                lower_function->arguments = args;
                lower_function->children.push_back(lower_function->arguments);

                filter_function = lower_function;
            }

            if (has_upper_limit)
            {
                if (!key_condition.addCondition(sampling_key.column_names[0], Range::createRightBounded(upper, false)))
                    throw Exception("Sampling column not in primary key", ErrorCodes::ILLEGAL_COLUMN);

                ASTPtr args = std::make_shared<ASTExpressionList>();
                args->children.push_back(sampling_key_ast);
                args->children.push_back(std::make_shared<ASTLiteral>(upper));

                upper_function = std::make_shared<ASTFunction>();
                upper_function->name = "less";
                upper_function->arguments = args;
                upper_function->children.push_back(upper_function->arguments);

                filter_function = upper_function;
            }

            if (has_lower_limit && has_upper_limit)
            {
                ASTPtr args = std::make_shared<ASTExpressionList>();
                args->children.push_back(lower_function);
                args->children.push_back(upper_function);

                filter_function = std::make_shared<ASTFunction>();
                filter_function->name = "and";
                filter_function->arguments = args;
                filter_function->children.push_back(filter_function->arguments);
            }
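
            /// The resulting filter is of the form
            /// `and(greaterOrEquals(<sampling key>, lower), less(<sampling key>, upper))`,
            /// or just one of the two comparisons when only one bound is present.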

            ASTPtr query = filter_function;
            auto syntax_result = TreeRewriter(context).analyze(query, available_real_columns);
            filter_expression = ExpressionAnalyzer(filter_function, syntax_result, context).getActionsDAG(false);

            if (!select.final())
            {
                /// Add columns needed for `sample_by_ast` to `column_names_to_read`.
                /// Skip this if final was used, because such columns were already added from PK.
                std::vector<String> add_columns = filter_expression->getRequiredColumns().getNames();
                column_names_to_read.insert(column_names_to_read.end(), add_columns.begin(), add_columns.end());
                std::sort(column_names_to_read.begin(), column_names_to_read.end());
                column_names_to_read.erase(std::unique(column_names_to_read.begin(), column_names_to_read.end()),
                    column_names_to_read.end());
            }
        }
    }

    if (no_data)
    {
        LOG_DEBUG(log, "Sampling yields no data.");
        return std::make_unique<QueryPlan>();
    }

    LOG_DEBUG(log, "Key condition: {}", key_condition.toString());
    if (minmax_idx_condition)
        LOG_DEBUG(log, "MinMax index condition: {}", minmax_idx_condition->toString());

    MergeTreeReaderSettings reader_settings =
    {
        .min_bytes_to_use_direct_io = settings.min_bytes_to_use_direct_io,
        .min_bytes_to_use_mmap_io = settings.min_bytes_to_use_mmap_io,
        .mmap_cache = context.getMMappedFileCache(),
        .max_read_buffer_size = settings.max_read_buffer_size,
        .save_marks_in_cache = true,
        .checksum_on_read = settings.checksum_on_read,
    };

    struct DataSkippingIndexAndCondition
    {
        MergeTreeIndexPtr index;
        MergeTreeIndexConditionPtr condition;
        std::atomic<size_t> total_granules{0};
        std::atomic<size_t> granules_dropped{0};
        std::atomic<size_t> total_parts{0};
        std::atomic<size_t> parts_dropped{0};

        DataSkippingIndexAndCondition(MergeTreeIndexPtr index_, MergeTreeIndexConditionPtr condition_)
            : index(index_)
            , condition(condition_)
        {
        }
    };
    std::list<DataSkippingIndexAndCondition> useful_indices;

    for (const auto & index : metadata_snapshot->getSecondaryIndices())
    {
        auto index_helper = MergeTreeIndexFactory::instance().get(index);
        auto condition = index_helper->createIndexCondition(query_info, context);
        if (!condition->alwaysUnknownOrTrue())
            useful_indices.emplace_back(index_helper, condition);
    }

    if (settings.force_data_skipping_indices.changed)
    {
        const auto & indices = settings.force_data_skipping_indices.toString();

        Strings forced_indices;
        {
            Tokens tokens(&indices[0], &indices[indices.size()], settings.max_query_size);
            IParser::Pos pos(tokens, settings.max_parser_depth);
            Expected expected;
            if (!parseIdentifiersOrStringLiterals(pos, expected, forced_indices))
                throw Exception(ErrorCodes::CANNOT_PARSE_TEXT,
                    "Cannot parse force_data_skipping_indices ('{}')", indices);
        }

        if (forced_indices.empty())
            throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "No indices parsed from force_data_skipping_indices ('{}')", indices);

        std::unordered_set<std::string> useful_indices_names;
        for (const auto & useful_index : useful_indices)
            useful_indices_names.insert(useful_index.index->index.name);

        for (const auto & index_name : forced_indices)
        {
            if (!useful_indices_names.count(index_name))
            {
                throw Exception(ErrorCodes::INDEX_NOT_USED,
                    "Index {} is not used and setting 'force_data_skipping_indices' contains it",
                    backQuote(index_name));
            }
        }
    }
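
    /// For example, `SET force_data_skipping_indices = 'idx_a, idx_b'` (names may also be quoted)
    /// requires both indices to be applicable to the query; otherwise INDEX_NOT_USED is thrown above.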

    RangesInDataParts parts_with_ranges(parts.size());
    size_t sum_marks = 0;
    std::atomic<size_t> sum_marks_pk = 0;
    std::atomic<size_t> sum_parts_pk = 0;
    std::atomic<size_t> total_marks_pk = 0;

    size_t sum_ranges = 0;

    /// Let's find what range to read from each part.
    {
        std::atomic<size_t> total_rows {0};

        SizeLimits limits;
        if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read)
            limits = SizeLimits(settings.max_rows_to_read, 0, settings.read_overflow_mode);

        SizeLimits leaf_limits;
        if (settings.read_overflow_mode_leaf == OverflowMode::THROW && settings.max_rows_to_read_leaf)
            leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, 0, settings.read_overflow_mode_leaf);

        auto process_part = [&](size_t part_index)
        {
            auto & part = parts[part_index];

            RangesInDataPart ranges(part, part_index);

            size_t total_marks_count = part->getMarksCount();
            if (total_marks_count && part->index_granularity.hasFinalMark())
                --total_marks_count;

            total_marks_pk.fetch_add(total_marks_count, std::memory_order_relaxed);

            if (metadata_snapshot->hasPrimaryKey())
                ranges.ranges = markRangesFromPKRange(part, metadata_snapshot, key_condition, settings, log);
            else if (total_marks_count)
                ranges.ranges = MarkRanges{MarkRange{0, total_marks_count}};

            sum_marks_pk.fetch_add(ranges.getMarksCount(), std::memory_order_relaxed);

            if (!ranges.ranges.empty())
                sum_parts_pk.fetch_add(1, std::memory_order_relaxed);

            for (auto & index_and_condition : useful_indices)
            {
                if (ranges.ranges.empty())
                    break;

                index_and_condition.total_parts.fetch_add(1, std::memory_order_relaxed);

                size_t total_granules = 0;
                size_t granules_dropped = 0;
                ranges.ranges = filterMarksUsingIndex(
                    index_and_condition.index, index_and_condition.condition,
                    part, ranges.ranges,
                    settings, reader_settings,
                    total_granules, granules_dropped,
                    log);

                index_and_condition.total_granules.fetch_add(total_granules, std::memory_order_relaxed);
                index_and_condition.granules_dropped.fetch_add(granules_dropped, std::memory_order_relaxed);

                if (ranges.ranges.empty())
                    index_and_condition.parts_dropped.fetch_add(1, std::memory_order_relaxed);
            }

            if (!ranges.ranges.empty())
            {
                if (limits.max_rows || leaf_limits.max_rows)
                {
                    /// Fail fast if the estimated number of rows to read exceeds the limit.
                    auto current_rows_estimate = ranges.getRowsCount();
                    size_t prev_total_rows_estimate = total_rows.fetch_add(current_rows_estimate);
                    size_t total_rows_estimate = current_rows_estimate + prev_total_rows_estimate;
                    limits.check(total_rows_estimate, 0, "rows (controlled by 'max_rows_to_read' setting)", ErrorCodes::TOO_MANY_ROWS);
                    leaf_limits.check(total_rows_estimate, 0, "rows (controlled by 'max_rows_to_read_leaf' setting)", ErrorCodes::TOO_MANY_ROWS);
                }

                parts_with_ranges[part_index] = std::move(ranges);
            }
        };

        size_t num_threads = std::min(size_t(num_streams), parts.size());

        if (num_threads <= 1)
        {
            for (size_t part_index = 0; part_index < parts.size(); ++part_index)
                process_part(part_index);
        }
        else
        {
            /// Parallel loading of data parts.
            ThreadPool pool(num_threads);

            for (size_t part_index = 0; part_index < parts.size(); ++part_index)
                pool.scheduleOrThrowOnError([&, part_index, thread_group = CurrentThread::getGroup()]
                {
                    SCOPE_EXIT(
                        if (thread_group)
                            CurrentThread::detachQueryIfNotDetached();
                    );
                    if (thread_group)
                        CurrentThread::attachTo(thread_group);

                    process_part(part_index);
                });

            pool.wait();
        }

        /// Skip empty ranges.
        size_t next_part = 0;
        for (size_t part_index = 0; part_index < parts.size(); ++part_index)
        {
            auto & part = parts_with_ranges[part_index];
            if (!part.data_part)
                continue;

            sum_ranges += part.ranges.size();
            sum_marks += part.getMarksCount();

            if (next_part != part_index)
                std::swap(parts_with_ranges[next_part], part);

            ++next_part;
        }

        parts_with_ranges.resize(next_part);
    }

    if (metadata_snapshot->hasPrimaryKey())
    {
        index_stats->emplace_back(ReadFromMergeTree::IndexStat{
            .type = ReadFromMergeTree::IndexType::PrimaryKey,
            .description = key_condition.toString(),
            .num_parts_after = sum_parts_pk.load(std::memory_order_relaxed),
            .num_granules_after = sum_marks_pk.load(std::memory_order_relaxed)});
    }

    for (const auto & index_and_condition : useful_indices)
    {
        const auto & index_name = index_and_condition.index->index.name;
        LOG_DEBUG(log, "Index {} has dropped {}/{} granules.",
            backQuote(index_name),
            index_and_condition.granules_dropped, index_and_condition.total_granules);

        std::string description = index_and_condition.index->index.type
            + " GRANULARITY " + std::to_string(index_and_condition.index->index.granularity);

        index_stats->emplace_back(ReadFromMergeTree::IndexStat{
            .type = ReadFromMergeTree::IndexType::Skip,
            .name = index_name,
            .description = std::move(description),
            .num_parts_after = index_and_condition.total_parts - index_and_condition.parts_dropped,
            .num_granules_after = index_and_condition.total_granules - index_and_condition.granules_dropped});
    }

    LOG_DEBUG(log, "Selected {}/{} parts by partition key, {} parts by primary key, {}/{} marks by primary key, {} marks to read from {} ranges",
        parts.size(), total_parts, parts_with_ranges.size(),
        sum_marks_pk.load(std::memory_order_relaxed),
        total_marks_pk.load(std::memory_order_relaxed),
        sum_marks, sum_ranges);

    if (parts_with_ranges.empty())
        return std::make_unique<QueryPlan>();

    const auto data_settings = data.getSettings();
    auto max_partitions_to_read
        = settings.max_partitions_to_read.changed ? settings.max_partitions_to_read : data_settings->max_partitions_to_read;
    if (max_partitions_to_read > 0)
    {
        std::set<String> partitions;
        for (auto & part_with_ranges : parts_with_ranges)
            partitions.insert(part_with_ranges.data_part->info.partition_id);
        if (partitions.size() > size_t(max_partitions_to_read))
            throw Exception(
                ErrorCodes::TOO_MANY_PARTITIONS,
                "Too many partitions to read. Current {}, max {}",
                partitions.size(),
                max_partitions_to_read);
    }

    String query_id;
    if (data_settings->max_concurrent_queries > 0)
    {
        if (data_settings->min_marks_to_honor_max_concurrent_queries > 0
            && sum_marks >= data_settings->min_marks_to_honor_max_concurrent_queries)
        {
            query_id = context.getCurrentQueryId();
            if (!query_id.empty())
                data.insertQueryIdOrThrow(query_id, data_settings->max_concurrent_queries);
        }
    }

    ProfileEvents::increment(ProfileEvents::SelectedParts, parts_with_ranges.size());
    ProfileEvents::increment(ProfileEvents::SelectedRanges, sum_ranges);
    ProfileEvents::increment(ProfileEvents::SelectedMarks, sum_marks);

    QueryPlanPtr plan;

    /// Projection that is needed to drop columns which have appeared during the execution
    /// of some extra expressions, and to allow the same expressions to be executed later.
    /// NOTE: It may lead to double computation of expressions.
    ActionsDAGPtr result_projection;

    if (select.final())
    {
        /// Add columns needed to calculate the sorting expression and the sign.
        std::vector<String> add_columns = metadata_snapshot->getColumnsRequiredForSortingKey();
        column_names_to_read.insert(column_names_to_read.end(), add_columns.begin(), add_columns.end());

        if (!data.merging_params.sign_column.empty())
            column_names_to_read.push_back(data.merging_params.sign_column);
        if (!data.merging_params.version_column.empty())
            column_names_to_read.push_back(data.merging_params.version_column);

        std::sort(column_names_to_read.begin(), column_names_to_read.end());
        column_names_to_read.erase(std::unique(column_names_to_read.begin(), column_names_to_read.end()), column_names_to_read.end());

        plan = spreadMarkRangesAmongStreamsFinal(
            std::move(parts_with_ranges),
            std::move(index_stats),
            num_streams,
            column_names_to_read,
            metadata_snapshot,
            max_block_size,
            settings.use_uncompressed_cache,
            query_info,
            virt_column_names,
            settings,
            reader_settings,
            result_projection,
            query_id);
    }
    else if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order) && query_info.input_order_info)
    {
        size_t prefix_size = query_info.input_order_info->order_key_prefix_descr.size();
        auto order_key_prefix_ast = metadata_snapshot->getSortingKey().expression_list_ast->clone();
        order_key_prefix_ast->children.resize(prefix_size);

        auto syntax_result = TreeRewriter(context).analyze(order_key_prefix_ast, metadata_snapshot->getColumns().getAllPhysical());
        auto sorting_key_prefix_expr = ExpressionAnalyzer(order_key_prefix_ast, syntax_result, context).getActionsDAG(false);

        plan = spreadMarkRangesAmongStreamsWithOrder(
            std::move(parts_with_ranges),
            std::move(index_stats),
            num_streams,
            column_names_to_read,
            metadata_snapshot,
            max_block_size,
            settings.use_uncompressed_cache,
            query_info,
            sorting_key_prefix_expr,
            virt_column_names,
            settings,
            reader_settings,
            result_projection,
            query_id);
    }
    else
    {
        plan = spreadMarkRangesAmongStreams(
            std::move(parts_with_ranges),
            std::move(index_stats),
            num_streams,
            column_names_to_read,
            metadata_snapshot,
            max_block_size,
            settings.use_uncompressed_cache,
            query_info,
            virt_column_names,
            settings,
            reader_settings,
            query_id);
    }

    if (!plan)
        return std::make_unique<QueryPlan>();

    if (use_sampling)
    {
        auto sampling_step = std::make_unique<FilterStep>(
            plan->getCurrentDataStream(),
            filter_expression,
            filter_function->getColumnName(),
            false);

        sampling_step->setStepDescription("Sampling");
        plan->addStep(std::move(sampling_step));
    }

    if (result_projection)
    {
        auto projection_step = std::make_unique<ExpressionStep>(plan->getCurrentDataStream(), result_projection);
        projection_step->setStepDescription("Remove unused columns after reading from storage");
        plan->addStep(std::move(projection_step));
    }

    /// By the way, if a distributed query or a query to a Merge table is made, then the `_sample_factor` column can have different values.
    if (sample_factor_column_queried)
    {
        ColumnWithTypeAndName column;
        column.name = "_sample_factor";
        column.type = std::make_shared<DataTypeFloat64>();
        column.column = column.type->createColumnConst(0, Field(used_sample_factor));

        auto adding_column_action = ActionsDAG::makeAddingColumnActions(std::move(column));

        auto adding_column = std::make_unique<ExpressionStep>(plan->getCurrentDataStream(), std::move(adding_column_action));
        adding_column->setStepDescription("Add _sample_factor column");
        plan->addStep(std::move(adding_column));
    }

    if (query_info.prewhere_info && query_info.prewhere_info->remove_columns_actions)
    {
        auto expression_step = std::make_unique<ExpressionStep>(
            plan->getCurrentDataStream(),
            query_info.prewhere_info->remove_columns_actions->getActionsDAG().clone());

        expression_step->setStepDescription("Remove unused columns after PREWHERE");
        plan->addStep(std::move(expression_step));
    }

    return plan;
}


namespace
{

/// Marks are placed whenever threshold on rows or bytes is met.
/// So we have to return the number of marks on whatever estimate is higher - by rows or by bytes.
size_t roundRowsOrBytesToMarks(
    size_t rows_setting,
    size_t bytes_setting,
    size_t rows_granularity,
    size_t bytes_granularity)
{
    size_t res = (rows_setting + rows_granularity - 1) / rows_granularity;

    if (bytes_granularity == 0)
        return res;
    else
        return std::max(res, (bytes_setting + bytes_granularity - 1) / bytes_granularity);
}
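
/// Example: rows_setting = 1000000 and rows_granularity = 8192 round up to 123 marks;
/// with bytes_granularity == 0 (non-adaptive granularity) that value is returned as is,
/// otherwise the larger of the row-based and byte-based estimates wins.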

/// Same as roundRowsOrBytesToMarks() but do not return more than max_marks.
size_t minMarksForConcurrentRead(
    size_t rows_setting,
    size_t bytes_setting,
    size_t rows_granularity,
    size_t bytes_granularity,
    size_t max_marks)
{
    size_t marks = 1;

    if (rows_setting + rows_granularity <= rows_setting) /// overflow
        marks = max_marks;
    else if (rows_setting)
        marks = (rows_setting + rows_granularity - 1) / rows_granularity;

    if (bytes_granularity == 0)
        return marks;
    else
    {
        if (bytes_setting + bytes_granularity <= bytes_setting) /// overflow
            return max_marks;
        if (bytes_setting)
            return std::max(marks, (bytes_setting + bytes_granularity - 1) / bytes_granularity);
        else
            return marks;
    }
}

}


QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams(
    RangesInDataParts && parts,
    ReadFromMergeTree::IndexStatPtr index_stats,
    size_t num_streams,
    const Names & column_names,
    const StorageMetadataPtr & metadata_snapshot,
    UInt64 max_block_size,
    bool use_uncompressed_cache,
    const SelectQueryInfo & query_info,
    const Names & virt_columns,
    const Settings & settings,
    const MergeTreeReaderSettings & reader_settings,
    const String & query_id) const
{
    /// Count marks for each part.
    std::vector<size_t> sum_marks_in_parts(parts.size());
    size_t sum_marks = 0;
    size_t total_rows = 0;

    const auto data_settings = data.getSettings();
    size_t adaptive_parts = 0;
    for (size_t i = 0; i < parts.size(); ++i)
    {
        total_rows += parts[i].getRowsCount();
        sum_marks_in_parts[i] = parts[i].getMarksCount();
        sum_marks += sum_marks_in_parts[i];

        if (parts[i].data_part->index_granularity_info.is_adaptive)
            ++adaptive_parts;
    }

    size_t index_granularity_bytes = 0;
    if (adaptive_parts > parts.size() / 2)
        index_granularity_bytes = data_settings->index_granularity_bytes;

    const size_t max_marks_to_use_cache = roundRowsOrBytesToMarks(
        settings.merge_tree_max_rows_to_use_cache,
        settings.merge_tree_max_bytes_to_use_cache,
        data_settings->index_granularity,
        index_granularity_bytes);

    const size_t min_marks_for_concurrent_read = minMarksForConcurrentRead(
        settings.merge_tree_min_rows_for_concurrent_read,
        settings.merge_tree_min_bytes_for_concurrent_read,
        data_settings->index_granularity,
        index_granularity_bytes,
        sum_marks);

    if (sum_marks > max_marks_to_use_cache)
        use_uncompressed_cache = false;

    if (0 == sum_marks)
        return {};

    ReadFromMergeTree::Settings step_settings
    {
        .max_block_size = max_block_size,
        .preferred_block_size_bytes = settings.preferred_block_size_bytes,
        .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes,
        .min_marks_for_concurrent_read = min_marks_for_concurrent_read,
        .use_uncompressed_cache = use_uncompressed_cache,
        .reader_settings = reader_settings,
        .backoff_settings = MergeTreeReadPool::BackoffSettings(settings),
    };

    if (num_streams > 1)
    {
        /// Reduce the number of num_streams if the data is small.
        if (sum_marks < num_streams * min_marks_for_concurrent_read && parts.size() < num_streams)
            num_streams = std::max((sum_marks + min_marks_for_concurrent_read - 1) / min_marks_for_concurrent_read, parts.size());
    }
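    /// Illustrative example (hypothetical numbers): with num_streams = 8,
    /// min_marks_for_concurrent_read = 32, sum_marks = 100 and 2 parts, both conditions
    /// hold (100 < 8 * 32 and 2 < 8), so num_streams becomes max(ceil(100 / 32), 2) = 4.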

    auto plan = std::make_unique<QueryPlan>();
    auto step = std::make_unique<ReadFromMergeTree>(
        data, metadata_snapshot, query_id,
        column_names, std::move(parts), std::move(index_stats), query_info.prewhere_info, virt_columns,
        step_settings, num_streams, ReadFromMergeTree::ReadType::Default);

    plan->addStep(std::move(step));
    return plan;
}


static ActionsDAGPtr createProjection(const Block & header)
{
    auto projection = std::make_shared<ActionsDAG>(header.getNamesAndTypesList());
    projection->removeUnusedActions(header.getNames());
    projection->projectInput();
    return projection;
}
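/// Example of intended use (column names are illustrative): if the header before adding
/// a sorting-key expression step is (a, b), the resulting DAG projects exactly (a, b),
/// so applying it after that step drops any temporary sorting-key columns added in between.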


QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder(
    RangesInDataParts && parts,
    ReadFromMergeTree::IndexStatPtr index_stats,
    size_t num_streams,
    const Names & column_names,
    const StorageMetadataPtr & metadata_snapshot,
    UInt64 max_block_size,
    bool use_uncompressed_cache,
    const SelectQueryInfo & query_info,
    const ActionsDAGPtr & sorting_key_prefix_expr,
    const Names & virt_columns,
    const Settings & settings,
    const MergeTreeReaderSettings & reader_settings,
    ActionsDAGPtr & out_projection,
    const String & query_id) const
{
    size_t sum_marks = 0;
    const InputOrderInfoPtr & input_order_info = query_info.input_order_info;

    size_t adaptive_parts = 0;
    std::vector<size_t> sum_marks_in_parts(parts.size());
    const auto data_settings = data.getSettings();

    for (size_t i = 0; i < parts.size(); ++i)
    {
        sum_marks_in_parts[i] = parts[i].getMarksCount();
        sum_marks += sum_marks_in_parts[i];

        if (parts[i].data_part->index_granularity_info.is_adaptive)
            ++adaptive_parts;
    }

    size_t index_granularity_bytes = 0;
    if (adaptive_parts > parts.size() / 2)
        index_granularity_bytes = data_settings->index_granularity_bytes;

    const size_t max_marks_to_use_cache = roundRowsOrBytesToMarks(
        settings.merge_tree_max_rows_to_use_cache,
        settings.merge_tree_max_bytes_to_use_cache,
        data_settings->index_granularity,
        index_granularity_bytes);

    const size_t min_marks_for_concurrent_read = minMarksForConcurrentRead(
        settings.merge_tree_min_rows_for_concurrent_read,
        settings.merge_tree_min_bytes_for_concurrent_read,
        data_settings->index_granularity,
        index_granularity_bytes,
        sum_marks);

    if (sum_marks > max_marks_to_use_cache)
        use_uncompressed_cache = false;

    Pipes res;

    if (sum_marks == 0)
        return {};

    /// Let's split ranges to avoid reading much data.
    auto split_ranges = [rows_granularity = data_settings->index_granularity, max_block_size](const auto & ranges, int direction)
    {
        MarkRanges new_ranges;
        const size_t max_marks_in_range = (max_block_size + rows_granularity - 1) / rows_granularity;
        size_t marks_in_range = 1;

        if (direction == 1)
        {
            /// Split first few ranges to avoid reading much data.
            bool split = false;
            for (auto range : ranges)
            {
                while (!split && range.begin + marks_in_range < range.end)
                {
                    new_ranges.emplace_back(range.begin, range.begin + marks_in_range);
                    range.begin += marks_in_range;
                    marks_in_range *= 2;

                    if (marks_in_range > max_marks_in_range)
                        split = true;
                }
                new_ranges.emplace_back(range.begin, range.end);
            }
        }
        else
        {
            /// Split all ranges to avoid reading much data, because we have to
            /// store whole range in memory to reverse it.
            for (auto it = ranges.rbegin(); it != ranges.rend(); ++it)
            {
                auto range = *it;
                while (range.begin + marks_in_range < range.end)
                {
                    new_ranges.emplace_front(range.end - marks_in_range, range.end);
                    range.end -= marks_in_range;
                    marks_in_range = std::min(marks_in_range * 2, max_marks_in_range);
                }
                new_ranges.emplace_front(range.begin, range.end);
            }
        }

        return new_ranges;
    };
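    /// Illustrative walk-through (hypothetical numbers): with rows_granularity = 8192 and
    /// max_block_size = 65536, max_marks_in_range = 8; for direction == 1 a single range
    /// [0, 20) becomes [0,1), [1,3), [3,7), [7,15), [15,20) - the leading chunks grow
    /// geometrically, so a query that terminates early reads only a small prefix.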

    const size_t min_marks_per_stream = (sum_marks - 1) / num_streams + 1;
    bool need_preliminary_merge = (parts.size() > settings.read_in_order_two_level_merge_threshold);
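    /// min_marks_per_stream is ceil(sum_marks / num_streams): e.g. 1000 marks over
    /// 4 streams give 250 marks per stream. When there are more parts than
    /// read_in_order_two_level_merge_threshold, each stream pre-merges its own mark
    /// ranges below, so downstream processing sees fewer sorted inputs.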

    std::vector<QueryPlanPtr> plans;

    for (size_t i = 0; i < num_streams && !parts.empty(); ++i)
    {
        size_t need_marks = min_marks_per_stream;
        RangesInDataParts new_parts;

        /// Loop over parts.
        /// We will iteratively take part or some subrange of a part from the back
        /// and assign a stream to read from it.
        while (need_marks > 0 && !parts.empty())
        {
            RangesInDataPart part = parts.back();
            parts.pop_back();

            size_t & marks_in_part = sum_marks_in_parts.back();

            /// We will not take too few rows from a part.
            if (marks_in_part >= min_marks_for_concurrent_read &&
                need_marks < min_marks_for_concurrent_read)
                need_marks = min_marks_for_concurrent_read;

            /// Do not leave too few rows in the part.
            if (marks_in_part > need_marks &&
                marks_in_part - need_marks < min_marks_for_concurrent_read)
                need_marks = marks_in_part;

            MarkRanges ranges_to_get_from_part;

            /// We take the whole part if it is small enough.
            if (marks_in_part <= need_marks)
            {
                ranges_to_get_from_part = part.ranges;

                need_marks -= marks_in_part;
                sum_marks_in_parts.pop_back();
            }
            else
            {
                /// Loop through ranges in part. Take enough ranges to cover "need_marks".
                while (need_marks > 0)
                {
                    if (part.ranges.empty())
                        throw Exception("Unexpected end of ranges while spreading marks among streams", ErrorCodes::LOGICAL_ERROR);

                    MarkRange & range = part.ranges.front();

                    const size_t marks_in_range = range.end - range.begin;
                    const size_t marks_to_get_from_range = std::min(marks_in_range, need_marks);

                    ranges_to_get_from_part.emplace_back(range.begin, range.begin + marks_to_get_from_range);
                    range.begin += marks_to_get_from_range;
                    marks_in_part -= marks_to_get_from_range;
                    need_marks -= marks_to_get_from_range;
                    if (range.begin == range.end)
                        part.ranges.pop_front();
                }
                parts.emplace_back(part);
            }
            ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
            new_parts.emplace_back(part.data_part, part.part_index_in_query, std::move(ranges_to_get_from_part));
        }

        ReadFromMergeTree::Settings step_settings
        {
            .max_block_size = max_block_size,
            .preferred_block_size_bytes = settings.preferred_block_size_bytes,
            .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes,
            .min_marks_for_concurrent_read = min_marks_for_concurrent_read,
            .use_uncompressed_cache = use_uncompressed_cache,
            .reader_settings = reader_settings,
            .backoff_settings = MergeTreeReadPool::BackoffSettings(settings),
        };

        auto read_type = input_order_info->direction == 1
                       ? ReadFromMergeTree::ReadType::InOrder
                       : ReadFromMergeTree::ReadType::InReverseOrder;

        auto plan = std::make_unique<QueryPlan>();
        auto step = std::make_unique<ReadFromMergeTree>(
            data, metadata_snapshot, query_id,
            column_names, std::move(new_parts), std::move(index_stats), query_info.prewhere_info, virt_columns,
            step_settings, num_streams, read_type);

        plan->addStep(std::move(step));
        plans.emplace_back(std::move(plan));
    }

    if (need_preliminary_merge)
    {
        SortDescription sort_description;
        for (size_t j = 0; j < input_order_info->order_key_prefix_descr.size(); ++j)
            sort_description.emplace_back(metadata_snapshot->getSortingKey().column_names[j],
                                          input_order_info->direction, 1);

        for (auto & plan : plans)
        {
            /// Drop temporary columns, added by 'sorting_key_prefix_expr'
            out_projection = createProjection(plan->getCurrentDataStream().header);

            auto expression_step = std::make_unique<ExpressionStep>(
                plan->getCurrentDataStream(),
                sorting_key_prefix_expr);

            expression_step->setStepDescription("Calculate sorting key prefix");
            plan->addStep(std::move(expression_step));

            auto merging_sorted = std::make_unique<MergingSortedStep>(
                plan->getCurrentDataStream(),
                sort_description,
                max_block_size);

            merging_sorted->setStepDescription("Merge sorting mark ranges");
            plan->addStep(std::move(merging_sorted));
        }
    }

    if (plans.size() == 1)
        return std::move(plans.front());

    DataStreams input_streams;
    for (const auto & plan : plans)
        input_streams.emplace_back(plan->getCurrentDataStream());

    const auto & common_header = plans.front()->getCurrentDataStream().header;
    auto union_step = std::make_unique<UnionStep>(std::move(input_streams), common_header);

    auto plan = std::make_unique<QueryPlan>();
    plan->unitePlans(std::move(union_step), std::move(plans));

    return plan;
}


QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal(
    RangesInDataParts && parts,
    ReadFromMergeTree::IndexStatPtr index_stats,
    size_t num_streams,
    const Names & column_names,
    const StorageMetadataPtr & metadata_snapshot,
    UInt64 max_block_size,
    bool use_uncompressed_cache,
    const SelectQueryInfo & query_info,
    const Names & virt_columns,
    const Settings & settings,
    const MergeTreeReaderSettings & reader_settings,
    ActionsDAGPtr & out_projection,
    const String & query_id) const
{
    const auto data_settings = data.getSettings();
    size_t sum_marks = 0;
    size_t adaptive_parts = 0;
    for (const auto & part : parts)
    {
        for (const auto & range : part.ranges)
            sum_marks += range.end - range.begin;

        if (part.data_part->index_granularity_info.is_adaptive)
            ++adaptive_parts;
    }

    size_t index_granularity_bytes = 0;
    if (adaptive_parts >= parts.size() / 2)
        index_granularity_bytes = data_settings->index_granularity_bytes;

    const size_t max_marks_to_use_cache = roundRowsOrBytesToMarks(
        settings.merge_tree_max_rows_to_use_cache,
        settings.merge_tree_max_bytes_to_use_cache,
        data_settings->index_granularity,
        index_granularity_bytes);

    if (sum_marks > max_marks_to_use_cache)
        use_uncompressed_cache = false;

    if (num_streams > settings.max_final_threads)
        num_streams = settings.max_final_threads;

    /// If setting do_not_merge_across_partitions_select_final is true then we won't merge parts from different partitions.
    /// We have all parts in parts vector, where parts with same partition are nearby.
    /// So we will store iterators pointed to the beginning of each partition range (and parts.end()),
    /// then we will create a pipe for each partition that will run selecting processor and merging processor
    /// for the parts with this partition. In the end we will unite all the pipes.
    std::vector<RangesInDataParts::iterator> parts_to_merge_ranges;
    auto it = parts.begin();
    parts_to_merge_ranges.push_back(it);

    if (settings.do_not_merge_across_partitions_select_final)
    {
        while (it != parts.end())
        {
            it = std::find_if(
                it, parts.end(), [&it](auto & part) { return it->data_part->info.partition_id != part.data_part->info.partition_id; });
            parts_to_merge_ranges.push_back(it);
        }
        /// We divide threads for each partition equally. But we will create at least the number of partitions threads.
        /// (So, the total number of threads could be more than initial num_streams.)
        num_streams /= (parts_to_merge_ranges.size() - 1);
    }
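    /// Example of the division above (hypothetical numbers): 16 initial streams over
    /// 4 partitions leave num_streams = 4 per partition; since every partition gets its
    /// own plan, the total number of threads can exceed the initial num_streams.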
    else
    {
        /// If do_not_merge_across_partitions_select_final is false we just merge all the parts.
        parts_to_merge_ranges.push_back(parts.end());
    }

    std::vector<QueryPlanPtr> partition_plans;

    /// If do_not_merge_across_partitions_select_final is true and num_streams > 1
    /// we will store lonely parts with level > 0 to use parallel select on them.
    std::vector<RangesInDataPart> lonely_parts;
    size_t total_rows_in_lonely_parts = 0;
    size_t sum_marks_in_lonely_parts = 0;

    for (size_t range_index = 0; range_index < parts_to_merge_ranges.size() - 1; ++range_index)
    {
        QueryPlanPtr plan;

        {
            RangesInDataParts new_parts;

            /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition
            /// with level > 0 then we won't postprocess this part and if num_streams > 1 we
            /// can use parallel select on such parts. We save such parts in one vector and then use
            /// MergeTreeReadPool and MergeTreeThreadSelectBlockInputProcessor for parallel select.
            if (num_streams > 1 && settings.do_not_merge_across_partitions_select_final &&
                std::distance(parts_to_merge_ranges[range_index], parts_to_merge_ranges[range_index + 1]) == 1 &&
                parts_to_merge_ranges[range_index]->data_part->info.level > 0)
            {
                total_rows_in_lonely_parts += parts_to_merge_ranges[range_index]->getRowsCount();
                sum_marks_in_lonely_parts += parts_to_merge_ranges[range_index]->getMarksCount();
                lonely_parts.push_back(std::move(*parts_to_merge_ranges[range_index]));
                continue;
            }
            else
            {
                for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it)
                {
                    new_parts.emplace_back(part_it->data_part, part_it->part_index_in_query, part_it->ranges);
                }
            }

            if (new_parts.empty())
                continue;

            ReadFromMergeTree::Settings step_settings
            {
                .max_block_size = max_block_size,
                .preferred_block_size_bytes = settings.preferred_block_size_bytes,
                .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes,
                .min_marks_for_concurrent_read = 0, /// this setting is not used for reading in order
                .use_uncompressed_cache = use_uncompressed_cache,
                .reader_settings = reader_settings,
                .backoff_settings = MergeTreeReadPool::BackoffSettings(settings),
            };

            plan = std::make_unique<QueryPlan>();
            auto step = std::make_unique<ReadFromMergeTree>(
                data, metadata_snapshot, query_id,
                column_names, std::move(new_parts), std::move(index_stats), query_info.prewhere_info, virt_columns,
                step_settings, num_streams, ReadFromMergeTree::ReadType::InOrder);

            plan->addStep(std::move(step));

            /// Drop temporary columns, added by 'sorting_key_expr'
            if (!out_projection)
                out_projection = createProjection(plan->getCurrentDataStream().header);
        }

        auto expression_step = std::make_unique<ExpressionStep>(
            plan->getCurrentDataStream(),
            metadata_snapshot->getSortingKey().expression->getActionsDAG().clone());

        expression_step->setStepDescription("Calculate sorting key expression");
        plan->addStep(std::move(expression_step));

        /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition
        /// with level > 0 then we won't postprocess this part
        if (settings.do_not_merge_across_partitions_select_final &&
            std::distance(parts_to_merge_ranges[range_index], parts_to_merge_ranges[range_index + 1]) == 1 &&
            parts_to_merge_ranges[range_index]->data_part->info.level > 0)
        {
            partition_plans.emplace_back(std::move(plan));
            continue;
        }

        Names sort_columns = metadata_snapshot->getSortingKeyColumns();
        SortDescription sort_description;
        size_t sort_columns_size = sort_columns.size();
        sort_description.reserve(sort_columns_size);

        Names partition_key_columns = metadata_snapshot->getPartitionKey().column_names;

        const auto & header = plan->getCurrentDataStream().header;
        for (size_t i = 0; i < sort_columns_size; ++i)
            sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1);

        auto final_step = std::make_unique<MergingFinal>(
            plan->getCurrentDataStream(),
            std::min<size_t>(num_streams, settings.max_final_threads),
            sort_description,
            data.merging_params,
            partition_key_columns,
            max_block_size);

        final_step->setStepDescription("Merge rows for FINAL");
        plan->addStep(std::move(final_step));

        partition_plans.emplace_back(std::move(plan));
    }

    if (!lonely_parts.empty())
    {
        RangesInDataParts new_parts;

        size_t num_streams_for_lonely_parts = num_streams * lonely_parts.size();

        const size_t min_marks_for_concurrent_read = minMarksForConcurrentRead(
            settings.merge_tree_min_rows_for_concurrent_read,
            settings.merge_tree_min_bytes_for_concurrent_read,
            data_settings->index_granularity,
            index_granularity_bytes,
            sum_marks_in_lonely_parts);

        /// Reduce the number of num_streams_for_lonely_parts if the data is small.
        if (sum_marks_in_lonely_parts < num_streams_for_lonely_parts * min_marks_for_concurrent_read && lonely_parts.size() < num_streams_for_lonely_parts)
            num_streams_for_lonely_parts = std::max((sum_marks_in_lonely_parts + min_marks_for_concurrent_read - 1) / min_marks_for_concurrent_read, lonely_parts.size());

        ReadFromMergeTree::Settings step_settings
        {
            .max_block_size = max_block_size,
            .preferred_block_size_bytes = settings.preferred_block_size_bytes,
            .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes,
            .min_marks_for_concurrent_read = min_marks_for_concurrent_read,
            .use_uncompressed_cache = use_uncompressed_cache,
            .reader_settings = reader_settings,
            .backoff_settings = MergeTreeReadPool::BackoffSettings(settings),
        };

        auto plan = std::make_unique<QueryPlan>();
        auto step = std::make_unique<ReadFromMergeTree>(
            data, metadata_snapshot, query_id,
            column_names, std::move(lonely_parts), std::move(index_stats), query_info.prewhere_info, virt_columns,
            step_settings, num_streams_for_lonely_parts, ReadFromMergeTree::ReadType::Default);

        plan->addStep(std::move(step));

        /// Drop temporary columns, added by 'sorting_key_expr'
        if (!out_projection)
            out_projection = createProjection(plan->getCurrentDataStream().header);

        auto expression_step = std::make_unique<ExpressionStep>(
            plan->getCurrentDataStream(),
            metadata_snapshot->getSortingKey().expression->getActionsDAG().clone());

        expression_step->setStepDescription("Calculate sorting key expression");
        plan->addStep(std::move(expression_step));

        partition_plans.emplace_back(std::move(plan));
    }

    if (partition_plans.empty())
        return {};

    if (partition_plans.size() == 1)
        return std::move(partition_plans.front());

    auto result_header = partition_plans.front()->getCurrentDataStream().header;
    DataStreams input_streams;
    for (const auto & partition_plan : partition_plans)
        input_streams.push_back(partition_plan->getCurrentDataStream());

    auto union_step = std::make_unique<UnionStep>(std::move(input_streams), result_header);
    union_step->setStepDescription("Unite sources after FINAL");
    QueryPlanPtr plan = std::make_unique<QueryPlan>();
    plan->unitePlans(std::move(union_step), std::move(partition_plans));
    return plan;
}


/// Calculates a set of mark ranges that could possibly contain keys required by the condition.
/// In other words, it removes subranges from the whole range that definitely could not contain required keys.
MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
    const MergeTreeData::DataPartPtr & part,
    const StorageMetadataPtr & metadata_snapshot,
    const KeyCondition & key_condition,
    const Settings & settings,
    Poco::Logger * log)
{
    MarkRanges res;

    size_t marks_count = part->index_granularity.getMarksCount();
    const auto & index = part->index;
    if (marks_count == 0)
        return res;

    bool has_final_mark = part->index_granularity.hasFinalMark();

    /// If index is not used.
    if (key_condition.alwaysUnknownOrTrue())
    {
        if (has_final_mark)
            res.push_back(MarkRange(0, marks_count - 1));
        else
            res.push_back(MarkRange(0, marks_count));

        return res;
    }

    size_t used_key_size = key_condition.getMaxKeyColumn() + 1;

    std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
    /// If there are no monotonic functions, there is no need to save block reference.
    /// Passing explicit field to FieldRef allows to optimize ranges and shows better performance.
    const auto & primary_key = metadata_snapshot->getPrimaryKey();
    if (key_condition.hasMonotonicFunctionsChain())
    {
        auto index_columns = std::make_shared<ColumnsWithTypeAndName>();
        for (size_t i = 0; i < used_key_size; ++i)
            index_columns->emplace_back(ColumnWithTypeAndName{index[i], primary_key.data_types[i], primary_key.column_names[i]});

        create_field_ref = [index_columns](size_t row, size_t column, FieldRef & field)
        {
            field = {index_columns.get(), row, column};
        };
    }
    else
    {
        create_field_ref = [&index](size_t row, size_t column, FieldRef & field)
        {
            index[column]->get(row, field);
        };
    }
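    /// Summary of the two branches above: with a chain of monotonic functions, FieldRef
    /// keeps a reference into the shared index columns; otherwise a concrete Field is
    /// copied out of the index, which (as noted above) is the faster path.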
|
2018-05-23 19:34:37 +00:00
|
|
|
|
Optimize PK lookup for queries that match exact PK range
Existing code that looks up marks that match the query has a pathological
case, when most of the part does in fact match the query.
The code works by recursively splitting a part into ranges and then discarding
the ranges that definitely do not match the query, based on primary key.
The problem is that it requires visiting every mark that matches the query,
making the complexity of this sort of look up O(n).
For queries that match exact range on the primary key, we can find
both left and right parts of the range with O(log 2) complexity.
This change implements exactly that.
To engage this optimization, the query must:
* Have a prefix list of the primary key.
* Have only range or single set element constraints for columns.
* Have only AND as a boolean operator.
Consider a table with `(service, timestamp)` as the primary key.
The following conditions will be optimized:
* `service = 'foo'`
* `service = 'foo' and timestamp >= now() - 3600`
* `service in ('foo')`
* `service in ('foo') and timestamp >= now() - 3600 and timestamp <= now`
The following will fall back to previous lookup algorithm:
* `timestamp >= now() - 3600`
* `service in ('foo', 'bar') and timestamp >= now() - 3600`
* `service = 'foo'`
Note that the optimization won't engage when PK has a range expression
followed by a point expression, since in that case the range is not continuous.
Trace query logging provides the following messages types of messages,
each representing a different kind of PK usage for a part:
```
Used optimized inclusion search over index for part 20200711_5710108_5710108_0 with 9 steps
Used generic exclusion search over index for part 20200711_5710118_5710228_5 with 1495 steps
Not using index on part 20200710_5710473_5710473_0
```
Number of steps translates to computational complexity.
Here's a comparison for before and after for a query over 24h of data:
```
Read 4562944 rows, 148.05 MiB in 45.19249672 sec., 100966 rows/sec., 3.28 MiB/sec.
Read 4183040 rows, 135.78 MiB in 0.196279627 sec., 21311636 rows/sec., 691.75 MiB/sec.
```
This is especially useful for queries that read data in order
and terminate early to return "last X things" matching a query.
See #11564 for more thoughts on this.
2020-07-07 18:10:44 +00:00
|
|
|
/// NOTE Creating temporary Field objects to pass to KeyCondition.
|
|
|
|
std::vector<FieldRef> index_left(used_key_size);
|
|
|
|
std::vector<FieldRef> index_right(used_key_size);
|
2020-04-02 17:27:07 +00:00
|
|
|
|
Optimize PK lookup for queries that match exact PK range
Existing code that looks up marks that match the query has a pathological
case, when most of the part does in fact match the query.
The code works by recursively splitting a part into ranges and then discarding
the ranges that definitely do not match the query, based on primary key.
The problem is that it requires visiting every mark that matches the query,
making the complexity of this sort of look up O(n).
For queries that match exact range on the primary key, we can find
both left and right parts of the range with O(log 2) complexity.
This change implements exactly that.
To engage this optimization, the query must:
* Have a prefix list of the primary key.
* Have only range or single set element constraints for columns.
* Have only AND as a boolean operator.
Consider a table with `(service, timestamp)` as the primary key.
The following conditions will be optimized:
* `service = 'foo'`
* `service = 'foo' and timestamp >= now() - 3600`
* `service in ('foo')`
* `service in ('foo') and timestamp >= now() - 3600 and timestamp <= now`
The following will fall back to previous lookup algorithm:
* `timestamp >= now() - 3600`
* `service in ('foo', 'bar') and timestamp >= now() - 3600`
* `service = 'foo'`
Note that the optimization won't engage when PK has a range expression
followed by a point expression, since in that case the range is not continuous.
Trace query logging provides the following messages types of messages,
each representing a different kind of PK usage for a part:
```
Used optimized inclusion search over index for part 20200711_5710108_5710108_0 with 9 steps
Used generic exclusion search over index for part 20200711_5710118_5710228_5 with 1495 steps
Not using index on part 20200710_5710473_5710473_0
```
Number of steps translates to computational complexity.
Here's a comparison for before and after for a query over 24h of data:
```
Read 4562944 rows, 148.05 MiB in 45.19249672 sec., 100966 rows/sec., 3.28 MiB/sec.
Read 4183040 rows, 135.78 MiB in 0.196279627 sec., 21311636 rows/sec., 691.75 MiB/sec.
```
This is especially useful for queries that read data in order
and terminate early to return "last X things" matching a query.
See #11564 for more thoughts on this.
2020-07-07 18:10:44 +00:00
|
|
|
auto may_be_true_in_range = [&](MarkRange & range)
|
|
|
|
{
|
|
|
|
if (range.end == marks_count && !has_final_mark)
|
2020-04-06 10:36:56 +00:00
|
|
|
{
|
|
|
|
for (size_t i = 0; i < used_key_size; ++i)
|
Optimize PK lookup for queries that match exact PK range
Existing code that looks up marks that match the query has a pathological
case, when most of the part does in fact match the query.
The code works by recursively splitting a part into ranges and then discarding
the ranges that definitely do not match the query, based on primary key.
The problem is that it requires visiting every mark that matches the query,
making the complexity of this sort of look up O(n).
For queries that match exact range on the primary key, we can find
both left and right parts of the range with O(log 2) complexity.
This change implements exactly that.
To engage this optimization, the query must:
* Have a prefix list of the primary key.
* Have only range or single set element constraints for columns.
* Have only AND as a boolean operator.
Consider a table with `(service, timestamp)` as the primary key.
The following conditions will be optimized:
* `service = 'foo'`
* `service = 'foo' and timestamp >= now() - 3600`
* `service in ('foo')`
* `service in ('foo') and timestamp >= now() - 3600 and timestamp <= now`
The following will fall back to previous lookup algorithm:
* `timestamp >= now() - 3600`
* `service in ('foo', 'bar') and timestamp >= now() - 3600`
* `service = 'foo'`
Note that the optimization won't engage when PK has a range expression
followed by a point expression, since in that case the range is not continuous.
Trace query logging provides the following messages types of messages,
each representing a different kind of PK usage for a part:
```
Used optimized inclusion search over index for part 20200711_5710108_5710108_0 with 9 steps
Used generic exclusion search over index for part 20200711_5710118_5710228_5 with 1495 steps
Not using index on part 20200710_5710473_5710473_0
```
Number of steps translates to computational complexity.
Here's a comparison for before and after for a query over 24h of data:
```
Read 4562944 rows, 148.05 MiB in 45.19249672 sec., 100966 rows/sec., 3.28 MiB/sec.
Read 4183040 rows, 135.78 MiB in 0.196279627 sec., 21311636 rows/sec., 691.75 MiB/sec.
```
This is especially useful for queries that read data in order
and terminate early to return "last X things" matching a query.
See #11564 for more thoughts on this.
                create_field_ref(range.begin, i, index_left[i]);

            return key_condition.mayBeTrueAfter(
                used_key_size, index_left.data(), primary_key.data_types);
        }
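
        /// Otherwise the range has a right bound in the index as well, so the condition is
        /// checked against both endpoint key values via mayBeTrueInRange below.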

        if (has_final_mark && range.end == marks_count)
            range.end -= 1; /// Remove final empty mark. It's useful only for primary key condition.

        for (size_t i = 0; i < used_key_size; ++i)
        {
            create_field_ref(range.begin, i, index_left[i]);
            create_field_ref(range.end, i, index_right[i]);
        }

        return key_condition.mayBeTrueInRange(
            used_key_size, index_left.data(), index_right.data(), primary_key.data_types);
    };

    if (!key_condition.matchesExactContinuousRange())
    {
        // Do exclusion search, where we drop ranges that do not match

        size_t min_marks_for_seek = roundRowsOrBytesToMarks(
            settings.merge_tree_min_rows_for_seek,
            settings.merge_tree_min_bytes_for_seek,
            part->index_granularity_info.fixed_index_granularity,
            part->index_granularity_info.index_granularity_bytes);

        /** There will always be disjoint suspicious segments on the stack, the leftmost one at the top (back).
          * At each step, take the left segment and check if it fits.
          * If it fits, split it into smaller ones and put them on the stack. If not, discard it.
          * If the segment is already of one mark length, add it to the response and discard it.
          */
        std::vector<MarkRange> ranges_stack = { {0, marks_count} };

        size_t steps = 0;

        while (!ranges_stack.empty())
        {
            MarkRange range = ranges_stack.back();
            ranges_stack.pop_back();

            steps++;

            if (!may_be_true_in_range(range))
                continue;

            if (range.end == range.begin + 1)
            {
                /// We saw a useful gap between neighboring marks. Either add it to the last range, or start a new range.
                if (res.empty() || range.begin - res.back().end > min_marks_for_seek)
                    res.push_back(range);
                else
                    res.back().end = range.end;
            }
            else
            {
                /// Break the segment and put the result on the stack from right to left.
                size_t step = (range.end - range.begin - 1) / settings.merge_tree_coarse_index_granularity + 1;
                size_t end;

                for (end = range.end; end > range.begin + step; end -= step)
                    ranges_stack.emplace_back(end - step, end);

                ranges_stack.emplace_back(range.begin, end);
            }
        }

        LOG_TRACE(log, "Used generic exclusion search over index for part {} with {} steps", part->name, steps);
    }
    else
    {
        /// In the case when the SELECT predicate defines a single continuous interval of keys,
        /// we can use a binary search to find the left and right endpoint key marks of such an interval.
        /// The returned value is the minimum range of marks containing all keys for which KeyCondition holds.

        LOG_TRACE(log, "Running binary search on index range for part {} ({} marks)", part->name, marks_count);

        size_t steps = 0;

        MarkRange result_range;
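        /// The two binary searches below delimit result_range: first its left boundary, then the right one.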

        size_t searched_left = 0;
        size_t searched_right = marks_count;
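
        /// Find the left boundary: marks in [0, searched_left) are known not to match the
        /// condition, while the prefix [0, searched_right) may still contain matching keys.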
        while (searched_left + 1 < searched_right)
        {
            const size_t middle = (searched_left + searched_right) / 2;
            MarkRange range(0, middle);
            if (may_be_true_in_range(range))
                searched_right = middle;
            else
                searched_left = middle;
            ++steps;
        }

        result_range.begin = searched_left;
        LOG_TRACE(log, "Found (LEFT) boundary mark: {}", searched_left);
        searched_right = marks_count;
        while (searched_left + 1 < searched_right)
        {
            const size_t middle = (searched_left + searched_right) / 2;
            MarkRange range(middle, marks_count);
            if (may_be_true_in_range(range))
                searched_left = middle;
            else
                searched_right = middle;
            ++steps;
        }

        result_range.end = searched_right;
        LOG_TRACE(log, "Found (RIGHT) boundary mark: {}", searched_right);

        if (result_range.begin < result_range.end && may_be_true_in_range(result_range))
            res.emplace_back(std::move(result_range));

        LOG_TRACE(log, "Found {} range in {} steps", res.empty() ? "empty" : "continuous", steps);
    }

    return res;
}


MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
    MergeTreeIndexPtr index_helper,
    MergeTreeIndexConditionPtr condition,
    MergeTreeData::DataPartPtr part,
    const MarkRanges & ranges,
    const Settings & settings,
    const MergeTreeReaderSettings & reader_settings,
    size_t & total_granules,
    size_t & granules_dropped,
    Poco::Logger * log)
{
    if (!part->volume->getDisk()->exists(part->getFullRelativePath() + index_helper->getFileName() + ".idx"))
    {
        LOG_DEBUG(log, "File for index {} does not exist. Skipping it.", backQuote(index_helper->index.name));
        return ranges;
    }

    auto index_granularity = index_helper->index.granularity;

    const size_t min_marks_for_seek = roundRowsOrBytesToMarks(
        settings.merge_tree_min_rows_for_seek,
        settings.merge_tree_min_bytes_for_seek,
        part->index_granularity_info.fixed_index_granularity,
        part->index_granularity_info.index_granularity_bytes);

    size_t marks_count = part->getMarksCount();
    size_t final_mark = part->index_granularity.hasFinalMark();
    size_t index_marks_count = (marks_count - final_mark + index_granularity - 1) / index_granularity;

    MergeTreeIndexReader reader(
        index_helper, part,
        index_marks_count,
        ranges,
        reader_settings);

    MarkRanges res;

    /// Some granules can cover two or more ranges;
    /// this variable is stored to avoid reading the same granule twice.
    MergeTreeIndexGranulePtr granule = nullptr;
    size_t last_index_mark = 0;
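
    /// Each index granule covers `index_granularity` data granules, so each data mark range
    /// below is first mapped onto the corresponding range of index marks.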
    for (const auto & range : ranges)
    {
        MarkRange index_range(
            range.begin / index_granularity,
            (range.end + index_granularity - 1) / index_granularity);

        if (last_index_mark != index_range.begin || !granule)
            reader.seek(index_range.begin);

        total_granules += index_range.end - index_range.begin;

        for (size_t index_mark = index_range.begin; index_mark < index_range.end; ++index_mark)
        {
            if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
                granule = reader.read();

            MarkRange data_range(
                std::max(range.begin, index_mark * index_granularity),
                std::min(range.end, (index_mark + 1) * index_granularity));

            if (!condition->mayBeTrueOnGranule(granule))
            {
                ++granules_dropped;
                continue;
            }

            if (res.empty() || res.back().end - data_range.begin > min_marks_for_seek)
                res.push_back(data_range);
            else
                res.back().end = data_range.end;
        }

        last_index_mark = index_range.end - 1;
    }

    return res;
}


void MergeTreeDataSelectExecutor::selectPartsToRead(
    MergeTreeData::DataPartsVector & parts,
    const std::unordered_set<String> & part_values,
    const std::optional<KeyCondition> & minmax_idx_condition,
    const DataTypes & minmax_columns_types,
    std::optional<PartitionPruner> & partition_pruner,
    const PartitionIdToMaxBlock * max_block_numbers_to_read,
    PartFilterCounters & counters)
{
    auto prev_parts = parts;
    parts.clear();
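
    /// Parts pass through a funnel of filters: part_values, emptiness, max block number,
    /// the minmax index and the partition pruner. The counters record how many parts and
    /// granules survive each stage.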

    for (const auto & part : prev_parts)
    {
        if (part_values.find(part->name) == part_values.end())
            continue;

        if (part->isEmpty())
            continue;

        if (max_block_numbers_to_read)
        {
            auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id);
            if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second)
                continue;
        }

        size_t num_granules = part->getMarksCount();
        if (num_granules && part->index_granularity.hasFinalMark())
            --num_granules;

        counters.num_initial_selected_parts += 1;
        counters.num_initial_selected_granules += num_granules;

        if (minmax_idx_condition && !minmax_idx_condition->checkInHyperrectangle(
                part->minmax_idx.hyperrectangle, minmax_columns_types).can_be_true)
            continue;

        counters.num_parts_after_minmax += 1;
        counters.num_granules_after_minmax += num_granules;

        if (partition_pruner)
        {
            if (partition_pruner->canBePruned(part))
                continue;
        }

        counters.num_parts_after_partition_pruner += 1;
        counters.num_granules_after_partition_pruner += num_granules;

        parts.push_back(part);
    }
}

void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
    MergeTreeData::DataPartsVector & parts,
    const std::unordered_set<String> & part_values,
    const std::optional<KeyCondition> & minmax_idx_condition,
    const DataTypes & minmax_columns_types,
    std::optional<PartitionPruner> & partition_pruner,
    const PartitionIdToMaxBlock * max_block_numbers_to_read,
    const Context & query_context,
    PartFilterCounters & counters) const
{
    /// const_cast to add UUIDs to context. Bad practice.
    Context & non_const_context = const_cast<Context &>(query_context);

    /// select_parts prepares the parts that have to be read for the query,
    /// and returns false if duplicate part UUIDs were encountered.
    auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool
    {
        auto ignored_part_uuids = non_const_context.getIgnoredPartUUIDs();
        std::unordered_set<UUID> temp_part_uuids;

        auto prev_parts = selected_parts;
        selected_parts.clear();

        for (const auto & part : prev_parts)
        {
            if (part_values.find(part->name) == part_values.end())
                continue;

            if (part->isEmpty())
                continue;

            if (max_block_numbers_to_read)
            {
                auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id);
                if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second)
                    continue;
            }

            /// Skip the part if its uuid is meant to be excluded
            if (part->uuid != UUIDHelpers::Nil && ignored_part_uuids->has(part->uuid))
                continue;

            size_t num_granules = part->getMarksCount();
            if (num_granules && part->index_granularity.hasFinalMark())
                --num_granules;

            counters.num_initial_selected_parts += 1;
            counters.num_initial_selected_granules += num_granules;

            if (minmax_idx_condition
                && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, minmax_columns_types)
                    .can_be_true)
                continue;

            counters.num_parts_after_minmax += 1;
            counters.num_granules_after_minmax += num_granules;

            if (partition_pruner)
            {
                if (partition_pruner->canBePruned(part))
                    continue;
            }

            counters.num_parts_after_partition_pruner += 1;
            counters.num_granules_after_partition_pruner += num_granules;

            /// populate UUIDs and exclude ignored parts if enabled
            if (part->uuid != UUIDHelpers::Nil)
            {
                auto result = temp_part_uuids.insert(part->uuid);
                if (!result.second)
                    throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR);
            }

            selected_parts.push_back(part);
        }

        if (!temp_part_uuids.empty())
        {
            auto duplicates = non_const_context.getPartUUIDs()->add(std::vector<UUID>{temp_part_uuids.begin(), temp_part_uuids.end()});
            if (!duplicates.empty())
            {
                /// On a local replica with prefer_localhost_replica=1, if any duplicates appeared
                /// during the first pass, add them to the exclusion set, so they will be skipped
                /// on the second pass.
                non_const_context.getIgnoredPartUUIDs()->add(duplicates);
                return false;
            }
        }

        return true;
    };

    /// Process parts that have to be read for a query.
    auto needs_retry = !select_parts(parts);

    /// If any duplicate part UUIDs were met during the first step, try to ignore them on a second pass.
    if (needs_retry)
    {
        LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them");

        counters = PartFilterCounters();

        /// If the second attempt doesn't help either, throw an exception.
        if (!select_parts(parts))
            throw Exception("Found duplicate UUIDs while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS);
    }
}
|
2019-01-07 12:51:14 +00:00
|
|
|
|
2014-03-13 12:48:07 +00:00
|
|
|
}
|