ClickHouse/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h

214 lines
8.4 KiB
C++
Raw Normal View History

2014-03-13 12:48:07 +00:00
#pragma once
#include <Core/QueryProcessingStage.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/RangesInDataPart.h>
2021-02-05 09:54:34 +00:00
#include <Storages/MergeTree/PartitionPruner.h>
2021-04-08 08:19:04 +00:00
#include <Processors/QueryPlan/ReadFromMergeTree.h>
2015-04-12 04:39:20 +00:00
2014-03-13 12:48:07 +00:00
namespace DB
{
/// Forward declaration: within this header KeyCondition is only named in function signatures.
class KeyCondition;
Squashed commit of the following: commit e712f469a55ff34ad34b482b15cc4153b7ad7233 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:59:13 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2a002823084e3a79bffcc17d479620a68eb0644b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:58:30 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 9e06f407c8ee781ed8ddf98bdfcc31846bf2a0fe Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:55:14 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 9581620f1e839f456fa7894aa1f996d5162ac6cd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:54:22 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2a8564c68cb6cc3649fafaf401256d43c9a2e777 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:47:34 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit cf60632d78ec656be3304ef4565e859bb6ce80ba Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:40:09 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit ee3d1dc6e0c4ca60e3ac1e0c30d4b3ed1e66eca0 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:22:49 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 65592ef7116a90104fcd524b53ef8b7cf22640f2 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:18:17 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 37972c257320d3b7e7b294e0fdeffff218647bfd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:17:06 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit dd909d149974ce5bed2456de1261aa5a368fd3ff Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:16:28 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 3cf43266ca7e30adf01212b1a739ba5fe43639fd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:15:42 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 6731a3df96d1609286e2536b6432916af7743f0f Author: Alexey Milovidov 
<milovidov@yandex-team.ru> Date: Sat Jan 14 11:13:35 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 1b5727e0d56415b7add4cb76110105358663602c Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:11:18 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit bbcf726a55685b8e72f5b40ba0bf1904bd1c0407 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:09:04 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit c03b477d5e2e65014e8906ecfa2efb67ee295af1 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:06:30 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2986e2fb0466bc18d73693dcdded28fccc0dc66b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:05:44 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 5d6cdef13d2e02bd5c4954983334e9162ab2635b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:04:53 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit f2b819b25ce8b2ccdcb201eefb03e1e6f5aab590 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:01:47 2017 +0300 Less dependencies [#CLICKHOUSE-2]
2017-01-14 09:00:19 +00:00
struct MergeTreeDataSelectSamplingData
{
2021-05-23 23:56:03 +00:00
bool use_sampling = false;
2021-05-25 16:34:43 +00:00
bool read_nothing = false;
2021-05-27 13:40:33 +00:00
Float64 used_sample_factor = 1.0;
std::shared_ptr<ASTFunction> filter_function;
ActionsDAGPtr filter_expression;
};
2021-05-27 13:40:33 +00:00
/// Map from String to Int64; presumably partition ID -> maximum block number (inferred from the alias name) — TODO confirm.
using PartitionIdToMaxBlock = std::unordered_map<String, Int64>;
2014-03-13 12:48:07 +00:00
2017-04-16 15:00:33 +00:00
/** Executes SELECT queries on data from the merge tree.
2014-03-13 12:48:07 +00:00
*/
class MergeTreeDataSelectExecutor
{
public:
2019-10-01 16:50:08 +00:00
explicit MergeTreeDataSelectExecutor(const MergeTreeData & data_);
2014-03-13 12:48:07 +00:00
2017-04-16 15:00:33 +00:00
/** When reading, selects a set of parts that covers the desired range of the index.
2018-10-10 16:20:15 +00:00
* max_blocks_number_to_read - if not nullptr, do not read all the parts whose right border is greater than max_block in partition.
*/
2018-10-23 08:19:47 +00:00
QueryPlanPtr read(
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
ContextPtr context,
2019-02-10 16:55:12 +00:00
UInt64 max_block_size,
2017-06-02 15:54:39 +00:00
unsigned num_streams,
QueryProcessingStage::Enum processed_stage,
2021-05-27 16:53:58 +00:00
std::shared_ptr<PartitionIdToMaxBlock> max_block_numbers_to_read = nullptr) const;
2014-03-13 12:48:07 +00:00
2021-05-28 17:16:09 +00:00
/// The same as read, but with specified set of parts.
QueryPlanPtr readFromParts(
2021-05-27 13:40:33 +00:00
MergeTreeData::DataPartsVector parts,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
ContextPtr context,
2021-05-28 17:16:09 +00:00
UInt64 max_block_size,
2021-05-27 13:40:33 +00:00
unsigned num_streams,
2021-05-27 16:53:58 +00:00
std::shared_ptr<PartitionIdToMaxBlock> max_block_numbers_to_read = nullptr) const;
2021-05-27 13:40:33 +00:00
2021-05-28 17:16:09 +00:00
/// Get an estimation for the number of marks we are going to read.
/// Reads nothing. Secondary indexes are not used.
/// This method is used to select best projection for table.
size_t estimateNumMarksToRead(
MergeTreeData::DataPartsVector parts,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
ContextPtr context,
unsigned num_streams,
2021-05-27 16:53:58 +00:00
std::shared_ptr<PartitionIdToMaxBlock> max_block_numbers_to_read = nullptr) const;
2014-03-13 12:48:07 +00:00
private:
2018-10-17 03:13:00 +00:00
const MergeTreeData & data;
2020-05-30 21:57:37 +00:00
Poco::Logger * log;
2014-03-13 12:48:07 +00:00
2017-04-16 15:00:33 +00:00
/// Get the approximate value (bottom estimate - only by full marks) of the number of rows falling under the index.
2021-05-25 16:34:43 +00:00
static size_t getApproximateTotalRowsToRead(
const MergeTreeData::DataPartsVector & parts,
2020-06-17 12:39:20 +00:00
const StorageMetadataPtr & metadata_snapshot,
const KeyCondition & key_condition,
2021-05-25 16:34:43 +00:00
const Settings & settings,
Poco::Logger * log);
2014-03-13 12:48:07 +00:00
2020-07-20 15:09:00 +00:00
static MarkRanges markRangesFromPKRange(
2019-03-25 13:55:24 +00:00
const MergeTreeData::DataPartPtr & part,
2020-06-17 12:39:20 +00:00
const StorageMetadataPtr & metadata_snapshot,
const KeyCondition & key_condition,
2020-07-20 15:09:00 +00:00
const Settings & settings,
Poco::Logger * log);
Data Skipping Indices (#4143) * made index parser * added index parsing * some fixes * added index interface and factory * fixed compilation * ptrs * added indexParts * indextypes * index condition * IndexCondition * added indexes in selectexecutor * fix * changed comment * fix * added granularity * comments * fix * fix * added writing indexes * removed indexpart class * fix * added setSkipIndexes * add rw for MergeTreeIndexes * fixes * upd error * fix * fix * reading * test index * fixed nullptr error * fixed * fix * unique names * asts -> exprlist * minmax index * fix * fixed select * fixed merging * fixed mutation * working minmax * removed test index * fixed style * added indexes to checkDataPart * added tests for minmax index * fixed constructor * fix style * fixed includes * fixed setSkipIndexes * added indexes meta to zookeeper * added parsing * removed throw * alter cmds parse * fix * added alter * fix * alters fix * fix alters * fix "after" * fixed alter * alter fix + test * fixes * upd setSkipIndexes * fixed alter bug with drop all indices * fix metadata editing * new test and repl fix * rm test files * fixed repl alter * fix * fix * indices * MTReadStream * upd test for bug * fix * added useful parsers and ast classes * fix * fix comments * replaced columns * fix * fixed parsing * fixed printing * fix err * basic IndicesDescription * go to IndicesDescr * moved indices * go to indicesDescr * fix test minmax_index* * fixed MT alter * fixed bug with replMT indices storing in zk * rename * refactoring * docs ru * docs ru * docs en * refactor * rename tests * fix docs * refactoring * fix * fix * fix * fixed style * unique idx * unique * fix * better minmax calculation * upd * added getBlock * unique_condition * added termForAST * unique * fixed not * uniqueCondition::mayBeTrueOnGranule * fix * fixed bug with double column * is always true * fix * key set * spaces * test * tests * fix * unique * fix * fix * fixed bug with duplicate column * removed unused data 
* fix * fixes * __bitSwapLastTwo * fix
2019-02-05 14:50:25 +00:00
2020-07-20 15:09:00 +00:00
static MarkRanges filterMarksUsingIndex(
2020-05-28 13:45:08 +00:00
MergeTreeIndexPtr index_helper,
2019-06-19 15:30:48 +00:00
MergeTreeIndexConditionPtr condition,
Data Skipping Indices (#4143) * made index parser * added index parsing * some fixes * added index interface and factory * fixed compilation * ptrs * added indexParts * indextypes * index condition * IndexCondition * added indexes in selectexecutor * fix * changed comment * fix * added granularity * comments * fix * fix * added writing indexes * removed indexpart class * fix * added setSkipIndexes * add rw for MergeTreeIndexes * fixes * upd error * fix * fix * reading * test index * fixed nullptr error * fixed * fix * unique names * asts -> exprlist * minmax index * fix * fixed select * fixed merging * fixed mutation * working minmax * removed test index * fixed style * added indexes to checkDataPart * added tests for minmax index * fixed constructor * fix style * fixed includes * fixed setSkipIndexes * added indexes meta to zookeeper * added parsing * removed throw * alter cmds parse * fix * added alter * fix * alters fix * fix alters * fix "after" * fixed alter * alter fix + test * fixes * upd setSkipIndexes * fixed alter bug with drop all indices * fix metadata editing * new test and repl fix * rm test files * fixed repl alter * fix * fix * indices * MTReadStream * upd test for bug * fix * added useful parsers and ast classes * fix * fix comments * replaced columns * fix * fixed parsing * fixed printing * fix err * basic IndicesDescription * go to IndicesDescr * moved indices * go to indicesDescr * fix test minmax_index* * fixed MT alter * fixed bug with replMT indices storing in zk * rename * refactoring * docs ru * docs ru * docs en * refactor * rename tests * fix docs * refactoring * fix * fix * fix * fixed style * unique idx * unique * fix * better minmax calculation * upd * added getBlock * unique_condition * added termForAST * unique * fixed not * uniqueCondition::mayBeTrueOnGranule * fix * fixed bug with double column * is always true * fix * key set * spaces * test * tests * fix * unique * fix * fix * fixed bug with duplicate column * removed unused data 
* fix * fixes * __bitSwapLastTwo * fix
2019-02-05 14:50:25 +00:00
MergeTreeData::DataPartPtr part,
const MarkRanges & ranges,
const Settings & settings,
2020-07-20 15:09:00 +00:00
const MergeTreeReaderSettings & reader_settings,
size_t & total_granules,
size_t & granules_dropped,
2020-07-20 15:09:00 +00:00
Poco::Logger * log);
2021-02-05 09:54:34 +00:00
struct PartFilterCounters
{
size_t num_initial_selected_parts = 0;
size_t num_initial_selected_granules = 0;
size_t num_parts_after_minmax = 0;
size_t num_granules_after_minmax = 0;
size_t num_parts_after_partition_pruner = 0;
size_t num_granules_after_partition_pruner = 0;
};
2021-02-05 09:54:34 +00:00
/// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`,
/// as well as `max_block_number_to_read`.
2021-03-02 16:13:36 +00:00
static void selectPartsToRead(
2021-02-05 09:54:34 +00:00
MergeTreeData::DataPartsVector & parts,
2021-05-25 16:34:43 +00:00
const std::optional<std::unordered_set<String>> & part_values,
2021-02-05 09:54:34 +00:00
const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types,
2021-02-05 09:54:34 +00:00
std::optional<PartitionPruner> & partition_pruner,
const PartitionIdToMaxBlock * max_block_numbers_to_read,
PartFilterCounters & counters);
2021-02-05 09:54:34 +00:00
/// Same as previous but also skip parts uuids if any to the query context, or skip parts which uuids marked as excluded.
2021-05-25 16:34:43 +00:00
static void selectPartsToReadWithUUIDFilter(
2021-02-05 09:54:34 +00:00
MergeTreeData::DataPartsVector & parts,
2021-05-25 16:34:43 +00:00
const std::optional<std::unordered_set<String>> & part_values,
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
2021-02-05 09:54:34 +00:00
const std::optional<KeyCondition> & minmax_idx_condition,
const DataTypes & minmax_columns_types,
2021-02-05 09:54:34 +00:00
std::optional<PartitionPruner> & partition_pruner,
const PartitionIdToMaxBlock * max_block_numbers_to_read,
ContextPtr query_context,
2021-05-25 16:34:43 +00:00
PartFilterCounters & counters,
Poco::Logger * log);
2021-05-27 13:40:33 +00:00
public:
2021-05-28 17:16:09 +00:00
/// For given number rows and bytes, get the number of marks to read.
/// It is a minimal number of marks which contain so many rows and bytes.
2021-05-28 09:41:07 +00:00
static size_t roundRowsOrBytesToMarks(
size_t rows_setting,
size_t bytes_setting,
size_t rows_granularity,
size_t bytes_granularity);
2021-05-28 17:16:09 +00:00
/// The same as roundRowsOrBytesToMarks, but return no more than max_marks.
2021-05-28 09:41:07 +00:00
static size_t minMarksForConcurrentRead(
size_t rows_setting,
size_t bytes_setting,
size_t rows_granularity,
size_t bytes_granularity,
size_t max_marks);
2021-05-28 17:16:09 +00:00
/// If possible, filter using expression on virtual columns.
/// Example: SELECT count() FROM table WHERE _part = 'part_name'
/// If expression found, return a set with allowed part names (std::nullopt otherwise).
2021-05-27 13:40:33 +00:00
static std::optional<std::unordered_set<String>> filterPartsByVirtualColumns(
const MergeTreeData & data,
2021-05-28 17:16:09 +00:00
const MergeTreeData::DataPartsVector & parts,
2021-05-27 13:40:33 +00:00
const ASTPtr & query,
ContextPtr context);
2021-05-28 17:16:09 +00:00
/// Filter parts using minmax index and partition key.
2021-05-27 13:40:33 +00:00
static void filterPartsByPartition(
2021-05-28 17:16:09 +00:00
MergeTreeData::DataPartsVector & parts,
const std::optional<std::unordered_set<String>> & part_values,
2021-05-27 13:40:33 +00:00
const StorageMetadataPtr & metadata_snapshot,
const MergeTreeData & data,
const SelectQueryInfo & query_info,
2021-05-27 16:53:58 +00:00
const ContextPtr & context,
2021-05-27 13:40:33 +00:00
const PartitionIdToMaxBlock * max_block_numbers_to_read,
Poco::Logger * log,
ReadFromMergeTree::IndexStats & index_stats);
2021-05-28 17:16:09 +00:00
/// Filter parts using primary key and secondary indexes.
/// For every part, select mark ranges to read.
2021-07-13 02:02:51 +00:00
/// If not 'ignore_size_limits' it will throw exception if the amount of data exceed the limits from settings.
2021-05-27 13:40:33 +00:00
static RangesInDataParts filterPartsByPrimaryKeyAndSkipIndexes(
MergeTreeData::DataPartsVector && parts,
2021-05-26 18:14:43 +00:00
StorageMetadataPtr metadata_snapshot,
2021-05-27 13:40:33 +00:00
const SelectQueryInfo & query_info,
2021-05-27 16:53:58 +00:00
const ContextPtr & context,
2021-05-28 17:16:09 +00:00
const KeyCondition & key_condition,
2021-05-26 18:14:43 +00:00
const MergeTreeReaderSettings & reader_settings,
Poco::Logger * log,
size_t num_streams,
2021-05-27 13:40:33 +00:00
ReadFromMergeTree::IndexStats & index_stats,
bool use_skip_indexes,
bool ignore_size_limits);
2021-05-26 18:14:43 +00:00
2021-05-28 17:16:09 +00:00
/// Create expression for sampling.
/// Also, calculate _sample_factor if needed.
/// Also, update key condition with selected sampling range.
2021-05-25 16:34:43 +00:00
static MergeTreeDataSelectSamplingData getSampling(
const ASTSelectQuery & select,
2021-05-28 17:16:09 +00:00
NamesAndTypesList available_real_columns,
const MergeTreeData::DataPartsVector & parts,
2021-05-25 16:34:43 +00:00
KeyCondition & key_condition,
const MergeTreeData & data,
2021-05-28 17:16:09 +00:00
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
2021-05-25 16:34:43 +00:00
bool sample_factor_column_queried,
2021-05-28 17:16:09 +00:00
Poco::Logger * log);
2021-05-26 18:14:43 +00:00
2021-05-28 17:16:09 +00:00
/// Check query limits: max_partitions_to_read, max_concurrent_queries.
/// Also, return QueryIdHolder. If not null, we should keep it until query finishes.
static std::shared_ptr<QueryIdHolder> checkLimits(
2021-05-28 09:41:07 +00:00
const MergeTreeData & data,
const RangesInDataParts & parts_with_ranges,
const ContextPtr & context);
2014-03-13 12:48:07 +00:00
};
}