ClickHouse/src/Storages/StorageDistributed.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

287 lines
12 KiB
C++
Raw Normal View History

2012-05-21 20:38:34 +00:00
#pragma once
#include <Storages/IStorage.h>
#include <Storages/IStorageCluster.h>
#include <Storages/Distributed/DistributedAsyncInsertDirectoryQueue.h>
#include <Storages/Distributed/DistributedSettings.h>
#include <Storages/getStructureOfRemoteTable.h>
#include <Common/SimpleIncrement.h>
#include <Client/ConnectionPool.h>
#include <Client/ConnectionPoolWithFailover.h>
#include <Common/ActionBlocker.h>
#include <Interpreters/Cluster.h>
2012-05-21 20:38:34 +00:00
#include <pcg_random.hpp>
2012-05-21 20:38:34 +00:00
namespace DB
{
struct Settings;
2016-12-08 02:49:04 +00:00
class Context;
2020-07-22 17:21:08 +00:00
class IVolume;
using VolumePtr = std::shared_ptr<IVolume>;
class IDisk;
using DiskPtr = std::shared_ptr<IDisk>;
class ExpressionActions;
using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
2016-12-08 02:49:04 +00:00
struct TreeRewriterResult;
using TreeRewriterResultPtr = std::shared_ptr<const TreeRewriterResult>;
2017-04-16 15:00:33 +00:00
/** A distributed table that resides on multiple servers.
* Uses data from the specified database and tables on each server.
2012-05-30 04:45:49 +00:00
*
2017-04-16 15:00:33 +00:00
* You can pass one address, not several.
* In this case, the table can be considered remote, rather than distributed.
2012-05-21 20:38:34 +00:00
*/
class StorageDistributed final : public IStorage, WithContext
2012-05-21 20:38:34 +00:00
{
2021-07-23 14:25:35 +00:00
friend class DistributedSink;
friend class DistributedAsyncInsertBatch;
friend class DistributedAsyncInsertDirectoryQueue;
friend class StorageSystemDistributionQueue;
2012-05-21 20:38:34 +00:00
public:
/// Constructor overload that takes the remote table as explicit names
/// (`remote_database_`, `remote_table_`) and also accepts a table `comment`.
/// `owned_cluster_` and `remote_table_function_ptr_` are optional and default to empty.
/// NOTE(review): `comment` is the only parameter here without the trailing underscore.
StorageDistributed(
const StorageID & id_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
const String & comment,
const String & remote_database_,
const String & remote_table_,
const String & cluster_name_,
ContextPtr context_,
const ASTPtr & sharding_key_,
const String & storage_policy_name_,
const String & relative_data_path_,
const DistributedSettings & distributed_settings_,
bool attach_,
ClusterPtr owned_cluster_ = {},
ASTPtr remote_table_function_ptr_ = {});
/// Constructor overload that takes `remote_table_function_ptr_` rather than
/// remote database/table names; it has no `comment` parameter.
/// Only `owned_cluster_` is optional (defaults to empty).
/// NOTE(review): the flag is spelled `attach` here but `attach_` in the other overload.
StorageDistributed(
const StorageID & id_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
ASTPtr remote_table_function_ptr_,
const String & cluster_name_,
ContextPtr context_,
const ASTPtr & sharding_key_,
const String & storage_policy_name_,
const String & relative_data_path_,
const DistributedSettings & distributed_settings_,
bool attach,
ClusterPtr owned_cluster_ = {});
~StorageDistributed() override;
/// Returns the constant name "Distributed".
std::string getName() const override { return "Distributed"; }
/// Capability flags: each of the following unconditionally returns true.
bool supportsSampling() const override { return true; }
bool supportsFinal() const override { return true; }
bool supportsPrewhere() const override { return true; }
bool supportsSubcolumns() const override { return true; }
bool supportsDynamicSubcolumns() const override { return true; }
StoragePolicyPtr getStoragePolicy() const override;
/// Do not apply the move-to-PREWHERE optimization for distributed tables,
/// because we can't be sure that the underlying table supports PREWHERE.
bool canMoveConditionsToPrewhere() const override { return false; }
/// Unconditionally returns true.
bool isRemote() const override { return true; }
2022-03-14 17:29:18 +00:00
/// Snapshot data for StorageDistributed. It contains the descriptions
/// of columns of type Object for each shard, as they were at the moment
/// the query started.
struct SnapshotData : public StorageSnapshot::Data
{
/// Per-shard column descriptions (see the type name ColumnsDescriptionByShardNum).
ColumnsDescriptionByShardNum objects_by_shard;
};
StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override;
StorageSnapshotPtr getStorageSnapshotForQuery(
const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr query_context) const override;
QueryProcessingStage::Enum
getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override;
2020-09-25 13:19:26 +00:00
void read(
QueryPlan & query_plan,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
2020-09-25 13:03:12 +00:00
size_t /*max_block_size*/,
size_t /*num_streams*/) override;
bool supportsParallelInsert() const override { return true; }
std::optional<UInt64> totalBytes(const Settings &) const override;
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override;
2022-05-20 19:49:31 +00:00
std::optional<QueryPipeline> distributedWrite(const ASTInsertQuery & query, ContextPtr context) override;
2018-06-09 15:48:22 +00:00
/// Removes temporary data in local filesystem.
void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) override;
2019-08-27 20:43:08 +00:00
2020-04-07 14:05:51 +00:00
void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override;
2018-04-21 00:35:20 +00:00
void checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const override;
2019-12-26 18:17:05 +00:00
2017-04-16 15:00:33 +00:00
/// Columns must be added to and deleted from the sub-tables manually;
/// the structure of the sub-tables is not checked.
2021-10-25 17:49:49 +00:00
void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & table_lock_holder) override;
void initializeFromDisk();
2023-11-06 14:40:01 +00:00
void shutdown(bool is_drop) override;
void flushAndPrepareForShutdown() override;
void drop() override;
2021-09-03 17:04:30 +00:00
/// Reports whether `data_volume` is set (non-null).
bool storesDataOnDisk() const override
{
    return static_cast<bool>(data_volume);
}
Strings getDataPaths() const override;
ActionLock getActionLock(StorageActionBlockType type) override;
NamesAndTypesList getVirtuals() const override;
/// Used by InterpreterInsertQuery.
/// Both name getters return a copy of the stored string (return type is by value).
std::string getRemoteDatabaseName() const { return remote_database; }
std::string getRemoteTableName() const { return remote_table; }
ClusterPtr getCluster() const;
/// Used by InterpreterSystemQuery
void flushClusterNodesAllData(ContextPtr context);
/// Used by ClusterCopier
size_t getShardCount() const;
/// NOTE(review): takes the set of newly added disk names; what the bool result means is not visible here — confirm in the .cpp.
bool initializeDiskOnConfigChange(const std::set<String> & new_added_disks) override;
private:
void renameOnDisk(const String & new_path_to_table_data);
/// Read-only accessors: each returns a const reference to the corresponding member
/// (`sharding_key_expr`, `sharding_key_column_name`), so no copy is made.
const ExpressionActionsPtr & getShardingKeyExpr() const { return sharding_key_expr; }
const String & getShardingKeyColumnName() const { return sharding_key_column_name; }
2020-07-23 14:10:48 +00:00
const String & getRelativeDataPath() const { return relative_data_path; }
/// Create directory queues for each existing subdirectory
void initializeDirectoryQueuesForDisk(const DiskPtr & disk);
/// Get directory queue thread and connection pool created by disk and subdirectory name
///
Rename directory monitor concept into background INSERT (#55978) * Limit log frequence for "Skipping send data over distributed table" message After SYSTEM STOP DISTRIBUTED SENDS it will constantly print this message. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename directory monitor concept into async INSERT Rename the following query settings (with preserving backward compatiblity, by keeping old name as an alias): - distributed_directory_monitor_sleep_time_ms -> distributed_async_insert_sleep_time_ms - distributed_directory_monitor_max_sleep_time_ms -> distributed_async_insert_max_sleep_time_ms - distributed_directory_monitor_batch -> distributed_async_insert_batch_inserts - distributed_directory_monitor_split_batch_on_failure -> distributed_async_insert_split_batch_on_failure Rename the following table settings (with preserving backward compatiblity, by keeping old name as an alias): - monitor_batch_inserts -> async_insert_batch - monitor_split_batch_on_failure -> async_insert_split_batch_on_failure - directory_monitor_sleep_time_ms -> async_insert_sleep_time_ms - directory_monitor_max_sleep_time_ms -> async_insert_max_sleep_time_ms And also update all the references: $ gg -e directory_monitor_ -e monitor_ tests docs | cut -d: -f1 | sort -u | xargs sed -e 's/distributed_directory_monitor_sleep_time_ms/distributed_async_insert_sleep_time_ms/g' -e 's/distributed_directory_monitor_max_sleep_time_ms/distributed_async_insert_max_sleep_time_ms/g' -e 's/distributed_directory_monitor_batch_inserts/distributed_async_insert_batch/g' -e 's/distributed_directory_monitor_split_batch_on_failure/distributed_async_insert_split_batch_on_failure/g' -e 's/monitor_batch_inserts/async_insert_batch/g' -e 's/monitor_split_batch_on_failure/async_insert_split_batch_on_failure/g' -e 's/monitor_sleep_time_ms/async_insert_sleep_time_ms/g' -e 's/monitor_max_sleep_time_ms/async_insert_max_sleep_time_ms/g' -i Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename 
async_insert for Distributed into background_insert This will avoid amigibuity between general async INSERT's and INSERT into Distributed, which are indeed background, so new term express it even better. Mostly done with: $ git di HEAD^ --name-only | xargs sed -i -e 's/distributed_async_insert/distributed_background_insert/g' -e 's/async_insert_batch/background_insert_batch/g' -e 's/async_insert_split_batch_on_failure/background_insert_split_batch_on_failure/g' -e 's/async_insert_sleep_time_ms/background_insert_sleep_time_ms/g' -e 's/async_insert_max_sleep_time_ms/background_insert_max_sleep_time_ms/g' Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Mark 02417_opentelemetry_insert_on_distributed_table as long CI: https://s3.amazonaws.com/clickhouse-test-reports/55978/7a6abb03a0b507e29e999cb7e04f246a119c6f28/stateless_tests_flaky_check__asan_.html Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> --------- Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2023-11-01 14:09:39 +00:00
/// Used for the INSERT into Distributed in case of distributed_foreground_insert==1, from DistributedSink.
DistributedAsyncInsertDirectoryQueue & getDirectoryQueue(const DiskPtr & disk, const std::string & name);
/// Parse the address corresponding to the directory name of the directory queue
Cluster::Addresses parseAddresses(const std::string & name) const;
/// Return the list of metrics for all created directory queues
/// (note that directory queues are created lazily, i.e. not until at least one INSERT has been executed)
///
/// Used by StorageSystemDistributionQueue
std::vector<DistributedAsyncInsertDirectoryQueue::Status> getDirectoryQueueStatuses() const;
static IColumn::Selector createSelector(ClusterPtr cluster, const ColumnWithTypeAndName & result);
/// Apply the following settings:
/// - optimize_skip_unused_shards
/// - force_optimize_skip_unused_shards
2023-09-09 12:51:34 +00:00
ClusterPtr getOptimizedCluster(
ContextPtr local_context,
const StorageSnapshotPtr & storage_snapshot,
2023-10-04 09:01:29 +00:00
const SelectQueryInfo & query_info,
2023-09-09 12:51:34 +00:00
const TreeRewriterResultPtr & syntax_analyzer_result) const;
2021-07-24 01:52:18 +00:00
2023-09-09 12:51:34 +00:00
ClusterPtr skipUnusedShards(
ClusterPtr cluster,
2023-10-04 09:01:29 +00:00
const SelectQueryInfo & query_info,
2023-09-09 12:51:34 +00:00
const TreeRewriterResultPtr & syntax_analyzer_result,
const StorageSnapshotPtr & storage_snapshot,
ContextPtr context) const;
ClusterPtr skipUnusedShardsWithAnalyzer(
ClusterPtr cluster, const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr context) const;
/// This method returns optimal query processing stage.
///
/// Here is the list of stages (from the less optimal to more optimal):
/// - WithMergeableState
/// - WithMergeableStateAfterAggregation
/// - WithMergeableStateAfterAggregationAndLimit
/// - Complete
///
2022-04-17 23:02:49 +00:00
/// Some simple queries without GROUP BY/DISTINCT can use more optimal stage.
///
/// Also in case of optimize_distributed_group_by_sharding_key=1 the queries
/// with GROUP BY/DISTINCT sharding_key can also use more optimal stage.
/// (see also optimize_skip_unused_shards/allow_nondeterministic_optimize_skip_unused_shards)
///
/// @return QueryProcessingStage or empty std::optional
/// (in this case regular WithMergeableState should be used)
std::optional<QueryProcessingStage::Enum> getOptimizedQueryProcessingStage(const SelectQueryInfo & query_info, const Settings & settings) const;
std::optional<QueryProcessingStage::Enum> getOptimizedQueryProcessingStageAnalyzer(const SelectQueryInfo & query_info, const Settings & settings) const;
size_t getRandomShardIndex(const Cluster::ShardsInfo & shards);
Improvements for `parallel_distributed_insert_select` (and related) (#34728) * Add a warning if parallel_distributed_insert_select was ignored Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Respect max_distributed_depth for parallel_distributed_insert_select Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Print warning for non applied parallel_distributed_insert_select only for initial query Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Remove Cluster::getHashOfAddresses() Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Forbid parallel_distributed_insert_select for remote()/cluster() with different addresses Before it uses empty cluster name (getClusterName()) which is not correct, compare all addresses instead. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Fix max_distributed_depth check max_distributed_depth=1 must mean not more then one distributed query, not two, since max_distributed_depth=0 means no limit, and distribute_depth is 0 for the first query. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Fix INSERT INTO remote()/cluster() with parallel_distributed_insert_select Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Add a test for parallel_distributed_insert_select with cluster()/remote() Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Return <remote> instead of empty cluster name in Distributed engine Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Make user with sharding_key and w/o in remote()/cluster() identical Before with sharding_key the user was "default", while w/o it it was empty. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-03-08 14:24:39 +00:00
/// Returns `cluster_name`, or the placeholder "<remote>" when `cluster_name` is empty.
std::string getClusterName() const
{
    if (cluster_name.empty())
        return "<remote>";
    return cluster_name;
}
const DistributedSettings & getDistributedSettingsRef() const { return distributed_settings; }
void delayInsertOrThrowIfNeeded() const;
std::optional<QueryPipeline> distributedWriteFromClusterStorage(const IStorageCluster & src_storage_cluster, const ASTInsertQuery & query, ContextPtr context) const;
std::optional<QueryPipeline> distributedWriteBetweenDistributedTables(const StorageDistributed & src_distributed, const ASTInsertQuery & query, ContextPtr context) const;
2012-05-21 20:38:34 +00:00
String remote_database;
String remote_table;
ASTPtr remote_table_function_ptr;
2020-05-30 21:57:37 +00:00
Poco::Logger * log;
/// Used to implement TableFunctionRemote.
std::shared_ptr<Cluster> owned_cluster;
/// Is empty if this storage implements TableFunctionRemote.
const String cluster_name;
bool has_sharding_key;
bool sharding_key_is_deterministic = false;
ExpressionActionsPtr sharding_key_expr;
String sharding_key_column_name;
/// Used for global monotonic ordering of files to send.
SimpleIncrement file_names_increment;
Rename directory monitor concept into background INSERT (#55978) * Limit log frequence for "Skipping send data over distributed table" message After SYSTEM STOP DISTRIBUTED SENDS it will constantly print this message. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename directory monitor concept into async INSERT Rename the following query settings (with preserving backward compatiblity, by keeping old name as an alias): - distributed_directory_monitor_sleep_time_ms -> distributed_async_insert_sleep_time_ms - distributed_directory_monitor_max_sleep_time_ms -> distributed_async_insert_max_sleep_time_ms - distributed_directory_monitor_batch -> distributed_async_insert_batch_inserts - distributed_directory_monitor_split_batch_on_failure -> distributed_async_insert_split_batch_on_failure Rename the following table settings (with preserving backward compatiblity, by keeping old name as an alias): - monitor_batch_inserts -> async_insert_batch - monitor_split_batch_on_failure -> async_insert_split_batch_on_failure - directory_monitor_sleep_time_ms -> async_insert_sleep_time_ms - directory_monitor_max_sleep_time_ms -> async_insert_max_sleep_time_ms And also update all the references: $ gg -e directory_monitor_ -e monitor_ tests docs | cut -d: -f1 | sort -u | xargs sed -e 's/distributed_directory_monitor_sleep_time_ms/distributed_async_insert_sleep_time_ms/g' -e 's/distributed_directory_monitor_max_sleep_time_ms/distributed_async_insert_max_sleep_time_ms/g' -e 's/distributed_directory_monitor_batch_inserts/distributed_async_insert_batch/g' -e 's/distributed_directory_monitor_split_batch_on_failure/distributed_async_insert_split_batch_on_failure/g' -e 's/monitor_batch_inserts/async_insert_batch/g' -e 's/monitor_split_batch_on_failure/async_insert_split_batch_on_failure/g' -e 's/monitor_sleep_time_ms/async_insert_sleep_time_ms/g' -e 's/monitor_max_sleep_time_ms/async_insert_max_sleep_time_ms/g' -i Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename 
async_insert for Distributed into background_insert This will avoid amigibuity between general async INSERT's and INSERT into Distributed, which are indeed background, so new term express it even better. Mostly done with: $ git di HEAD^ --name-only | xargs sed -i -e 's/distributed_async_insert/distributed_background_insert/g' -e 's/async_insert_batch/background_insert_batch/g' -e 's/async_insert_split_batch_on_failure/background_insert_split_batch_on_failure/g' -e 's/async_insert_sleep_time_ms/background_insert_sleep_time_ms/g' -e 's/async_insert_max_sleep_time_ms/background_insert_max_sleep_time_ms/g' Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Mark 02417_opentelemetry_insert_on_distributed_table as long CI: https://s3.amazonaws.com/clickhouse-test-reports/55978/7a6abb03a0b507e29e999cb7e04f246a119c6f28/stateless_tests_flaky_check__asan_.html Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> --------- Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2023-11-01 14:09:39 +00:00
ActionBlocker async_insert_blocker;
String relative_data_path;
2020-07-23 14:10:48 +00:00
/// Can be empty if relative_data_path is empty. In this case, a directory for the data to be sent is not created.
2020-07-23 14:10:48 +00:00
StoragePolicyPtr storage_policy;
/// The main volume to store data.
/// Storage policy may have several configured volumes, but second and other volumes are used for parts movement in MergeTree engine.
/// For Distributed engine such configuration doesn't make sense and only the first (main) volume will be used to store data.
/// Other volumes will be ignored. It's needed to allow using the same multi-volume policy both for Distributed and other engines.
VolumePtr data_volume;
DistributedSettings distributed_settings;
struct ClusterNodeData
{
Rename directory monitor concept into background INSERT (#55978) * Limit log frequence for "Skipping send data over distributed table" message After SYSTEM STOP DISTRIBUTED SENDS it will constantly print this message. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename directory monitor concept into async INSERT Rename the following query settings (with preserving backward compatiblity, by keeping old name as an alias): - distributed_directory_monitor_sleep_time_ms -> distributed_async_insert_sleep_time_ms - distributed_directory_monitor_max_sleep_time_ms -> distributed_async_insert_max_sleep_time_ms - distributed_directory_monitor_batch -> distributed_async_insert_batch_inserts - distributed_directory_monitor_split_batch_on_failure -> distributed_async_insert_split_batch_on_failure Rename the following table settings (with preserving backward compatiblity, by keeping old name as an alias): - monitor_batch_inserts -> async_insert_batch - monitor_split_batch_on_failure -> async_insert_split_batch_on_failure - directory_monitor_sleep_time_ms -> async_insert_sleep_time_ms - directory_monitor_max_sleep_time_ms -> async_insert_max_sleep_time_ms And also update all the references: $ gg -e directory_monitor_ -e monitor_ tests docs | cut -d: -f1 | sort -u | xargs sed -e 's/distributed_directory_monitor_sleep_time_ms/distributed_async_insert_sleep_time_ms/g' -e 's/distributed_directory_monitor_max_sleep_time_ms/distributed_async_insert_max_sleep_time_ms/g' -e 's/distributed_directory_monitor_batch_inserts/distributed_async_insert_batch/g' -e 's/distributed_directory_monitor_split_batch_on_failure/distributed_async_insert_split_batch_on_failure/g' -e 's/monitor_batch_inserts/async_insert_batch/g' -e 's/monitor_split_batch_on_failure/async_insert_split_batch_on_failure/g' -e 's/monitor_sleep_time_ms/async_insert_sleep_time_ms/g' -e 's/monitor_max_sleep_time_ms/async_insert_max_sleep_time_ms/g' -i Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Rename 
async_insert for Distributed into background_insert This will avoid amigibuity between general async INSERT's and INSERT into Distributed, which are indeed background, so new term express it even better. Mostly done with: $ git di HEAD^ --name-only | xargs sed -i -e 's/distributed_async_insert/distributed_background_insert/g' -e 's/async_insert_batch/background_insert_batch/g' -e 's/async_insert_split_batch_on_failure/background_insert_split_batch_on_failure/g' -e 's/async_insert_sleep_time_ms/background_insert_sleep_time_ms/g' -e 's/async_insert_max_sleep_time_ms/background_insert_max_sleep_time_ms/g' Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> * Mark 02417_opentelemetry_insert_on_distributed_table as long CI: https://s3.amazonaws.com/clickhouse-test-reports/55978/7a6abb03a0b507e29e999cb7e04f246a119c6f28/stateless_tests_flaky_check__asan_.html Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> --------- Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2023-11-01 14:09:39 +00:00
std::shared_ptr<DistributedAsyncInsertDirectoryQueue> directory_queue;
2020-06-03 00:10:39 +00:00
ConnectionPoolPtr connection_pool;
Cluster::Addresses addresses;
size_t clusters_version;
};
std::unordered_map<std::string, ClusterNodeData> cluster_nodes_data;
mutable std::mutex cluster_nodes_mutex;
// For random shard index generation
mutable std::mutex rng_mutex;
pcg64 rng;
2012-05-21 20:38:34 +00:00
};
}