2020-04-15 20:28:05 +00:00
# include <Core/Defines.h>
2022-10-03 21:30:50 +00:00
# include <ranges>
2021-06-20 08:24:43 +00:00
# include "Common/hex.h"
2019-05-03 02:00:57 +00:00
# include <Common/Macros.h>
# include <Common/StringUtils/StringUtils.h>
2019-11-27 09:39:44 +00:00
# include <Common/ZooKeeper/KeeperException.h>
# include <Common/ZooKeeper/Types.h>
# include <Common/escapeForFileName.h>
# include <Common/formatReadable.h>
# include <Common/thread_local_rng.h>
# include <Common/typeid_cast.h>
2022-04-21 19:19:13 +00:00
# include <Storages/MergeTree/DataPartStorageOnDisk.h>
2017-04-01 09:19:00 +00:00
2022-06-02 16:09:40 +00:00
# include <Disks/ObjectStorages/IMetadataStorage.h>
2022-01-30 19:49:48 +00:00
# include <base/sort.h>
2018-12-25 23:15:28 +00:00
# include <Storages/AlterCommands.h>
2018-12-25 23:18:07 +00:00
# include <Storages/PartitionCommands.h>
2017-04-01 09:19:00 +00:00
# include <Storages/ColumnsDescription.h>
# include <Storages/StorageReplicatedMergeTree.h>
2019-10-10 16:30:30 +00:00
# include <Storages/MergeTree/IMergeTreeDataPart.h>
2018-12-11 13:30:20 +00:00
# include <Storages/MergeTree/MergeList.h>
2021-09-06 12:01:16 +00:00
# include <Storages/MergeTree/MergeTreeBackgroundExecutor.h>
2021-06-29 15:14:44 +00:00
# include <Storages/MergeTree/MergedBlockOutputStream.h>
2020-11-24 14:24:48 +00:00
# include <Storages/MergeTree/PinnedPartUUIDs.h>
2022-08-19 08:17:02 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeAttachThread.h>
2018-11-01 13:30:38 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
2021-07-26 16:48:25 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeSink.h>
2017-04-01 09:19:00 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeQuorumEntry.h>
2018-04-19 10:33:16 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h>
2017-04-01 09:19:00 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
2018-09-19 11:08:04 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
2018-12-11 13:30:20 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
2021-09-16 21:19:58 +00:00
# include <Storages/MergeTree/MergeFromLogEntryTask.h>
# include <Storages/MergeTree/MutateFromLogEntryTask.h>
2019-05-03 02:00:57 +00:00
# include <Storages/VirtualColumnUtils.h>
2021-04-04 09:23:40 +00:00
# include <Storages/MergeTree/MergeTreeReaderCompact.h>
2021-12-07 16:55:55 +00:00
# include <Storages/MergeTree/LeaderElection.h>
2022-01-17 11:52:51 +00:00
# include <Storages/MergeTree/ZeroCopyLock.h>
2022-06-24 19:29:38 +00:00
# include <Storages/MergeTree/extractZkPathFromCreateQuery.h>
2022-06-08 12:09:59 +00:00
# include <Storages/Freeze.h>
2017-04-01 09:19:00 +00:00
2020-11-24 10:24:39 +00:00
# include <Databases/DatabaseOnDisk.h>
2022-07-29 16:33:16 +00:00
# include <Databases/DatabaseReplicated.h>
2017-04-01 09:19:00 +00:00
# include <Parsers/formatAST.h>
2022-01-10 19:01:41 +00:00
# include <Parsers/parseQuery.h>
2018-10-13 20:58:04 +00:00
# include <Parsers/ASTDropQuery.h>
2021-11-26 17:21:54 +00:00
# include <Parsers/ASTFunction.h>
2017-06-22 15:01:08 +00:00
# include <Parsers/ASTOptimizeQuery.h>
2022-03-31 09:50:07 +00:00
# include <Parsers/ASTPartition.h>
2017-09-06 20:34:26 +00:00
# include <Parsers/ASTLiteral.h>
2019-05-03 02:00:57 +00:00
# include <Parsers/queryToString.h>
2019-07-03 13:17:19 +00:00
# include <Parsers/ASTCheckQuery.h>
2022-01-10 19:01:41 +00:00
# include <Parsers/ExpressionListParsers.h>
2017-04-01 09:19:00 +00:00
2021-09-08 18:29:38 +00:00
# include <Processors/QueryPlan/QueryPlan.h>
2021-12-09 10:39:28 +00:00
# include <Processors/Sources/RemoteSource.h>
2021-03-04 17:38:12 +00:00
# include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
# include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
2021-12-09 10:39:28 +00:00
# include <Processors/QueryPlan/ReadFromPreparedSource.h>
2021-03-04 17:38:12 +00:00
2017-04-01 09:19:00 +00:00
# include <IO/ReadBufferFromString.h>
# include <IO/Operators.h>
2017-12-27 17:58:52 +00:00
# include <IO/ConnectionTimeouts.h>
2020-12-10 22:05:02 +00:00
# include <IO/ConnectionTimeoutsContext.h>
2022-04-27 14:52:03 +00:00
# include <Disks/createVolume.h>
2017-04-01 09:19:00 +00:00
# include <Interpreters/InterpreterAlterQuery.h>
# include <Interpreters/PartLog.h>
2020-05-20 20:16:32 +00:00
# include <Interpreters/Context.h>
2020-11-20 16:06:27 +00:00
# include <Interpreters/DDLTask.h>
2021-04-07 13:52:11 +00:00
# include <Interpreters/InterserverCredentials.h>
2021-12-09 10:39:28 +00:00
# include <Interpreters/SelectQueryOptions.h>
# include <Interpreters/InterpreterSelectQuery.h>
2017-04-01 09:19:00 +00:00
2022-05-29 19:53:56 +00:00
# include <Backups/BackupEntriesCollector.h>
2022-05-31 09:33:23 +00:00
# include <Backups/IBackup.h>
2022-05-23 12:05:35 +00:00
# include <Backups/IBackupCoordination.h>
# include <Backups/IBackupEntry.h>
2022-04-19 18:15:27 +00:00
# include <Backups/IRestoreCoordination.h>
2022-05-31 09:33:23 +00:00
# include <Backups/RestorerFromBackup.h>
2022-04-13 13:26:17 +00:00
2015-04-16 06:12:35 +00:00
# include <Poco/DirectoryIterator.h>
2014-03-21 13:42:14 +00:00
2021-10-02 07:13:14 +00:00
# include <base/scope_guard.h>
2022-04-27 15:05:45 +00:00
# include <Common/scope_guard_safe.h>
2016-08-10 07:20:21 +00:00
2021-11-18 18:07:35 +00:00
# include <boost/algorithm/string/join.hpp>
# include <boost/algorithm/string/replace.hpp>
2022-04-15 16:36:23 +00:00
# include <boost/algorithm/string.hpp>
2021-11-18 18:07:35 +00:00
2021-06-20 08:24:43 +00:00
# include <algorithm>
2016-01-28 01:00:27 +00:00
# include <ctime>
2021-06-20 08:24:43 +00:00
# include <filesystem>
# include <iterator>
# include <numeric>
2016-01-28 01:00:27 +00:00
# include <thread>
# include <future>
2022-04-10 22:44:30 +00:00
2021-05-08 10:59:55 +00:00
namespace fs = std : : filesystem ;
2016-10-24 02:02:37 +00:00
namespace ProfileEvents
{
extern const Event ReplicatedPartFailedFetches ;
extern const Event ReplicatedPartFetchesOfMerged ;
extern const Event ObsoleteReplicatedParts ;
extern const Event ReplicatedPartFetches ;
2020-06-12 20:38:43 +00:00
extern const Event CreatedLogEntryForMerge ;
extern const Event NotCreatedLogEntryForMerge ;
extern const Event CreatedLogEntryForMutation ;
extern const Event NotCreatedLogEntryForMutation ;
2018-04-06 16:06:07 +00:00
}
2020-10-26 11:02:47 +00:00
namespace CurrentMetrics
{
extern const Metric BackgroundFetchesPoolTask ;
}
2018-04-06 16:06:07 +00:00
2014-03-21 13:42:14 +00:00
namespace DB
{
2016-01-11 21:46:36 +00:00
namespace ErrorCodes
{
2020-02-25 18:02:41 +00:00
extern const int CANNOT_READ_ALL_DATA ;
extern const int NOT_IMPLEMENTED ;
2016-01-11 21:46:36 +00:00
extern const int NO_ZOOKEEPER ;
extern const int INCORRECT_DATA ;
extern const int INCOMPATIBLE_COLUMNS ;
extern const int REPLICA_IS_ALREADY_EXIST ;
extern const int NO_REPLICA_HAS_PART ;
extern const int LOGICAL_ERROR ;
extern const int TOO_MANY_UNEXPECTED_DATA_PARTS ;
extern const int ABORTED ;
extern const int REPLICA_IS_NOT_IN_QUORUM ;
extern const int TABLE_IS_READ_ONLY ;
2022-10-05 18:32:43 +00:00
extern const int TABLE_IS_DROPPED ;
2016-01-11 21:46:36 +00:00
extern const int NOT_FOUND_NODE ;
extern const int NO_ACTIVE_REPLICAS ;
2020-06-19 14:18:58 +00:00
extern const int NOT_A_LEADER ;
2016-01-11 21:46:36 +00:00
extern const int TABLE_WAS_NOT_DROPPED ;
extern const int PARTITION_ALREADY_EXISTS ;
2018-03-09 23:23:15 +00:00
extern const int TOO_MANY_RETRIES_TO_FETCH_PARTS ;
2016-01-11 21:46:36 +00:00
extern const int RECEIVED_ERROR_FROM_REMOTE_IO_SERVER ;
extern const int PARTITION_DOESNT_EXIST ;
2016-04-09 05:43:55 +00:00
extern const int UNFINISHED ;
2017-04-06 13:03:23 +00:00
extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS ;
2017-11-20 19:33:12 +00:00
extern const int PART_IS_TEMPORARILY_LOCKED ;
2018-01-12 17:30:21 +00:00
extern const int CANNOT_ASSIGN_OPTIMIZE ;
2018-08-20 13:31:24 +00:00
extern const int ALL_REPLICAS_LOST ;
2018-08-23 13:55:59 +00:00
extern const int REPLICA_STATUS_CHANGED ;
2020-02-13 14:48:38 +00:00
extern const int CANNOT_ASSIGN_ALTER ;
2020-08-28 00:53:22 +00:00
extern const int DIRECTORY_ALREADY_EXISTS ;
2020-08-27 14:19:18 +00:00
extern const int ILLEGAL_TYPE_OF_ARGUMENT ;
2020-10-16 11:58:47 +00:00
extern const int UNKNOWN_POLICY ;
2020-11-03 09:24:10 +00:00
extern const int NO_SUCH_DATA_PART ;
2020-12-25 13:38:04 +00:00
extern const int INTERSERVER_SCHEME_DOESNT_MATCH ;
2021-04-14 09:11:59 +00:00
extern const int DUPLICATE_DATA_PART ;
2020-11-24 14:24:48 +00:00
extern const int BAD_ARGUMENTS ;
2021-06-20 08:24:43 +00:00
extern const int CONCURRENT_ACCESS_NOT_SUPPORTED ;
2021-08-18 09:49:22 +00:00
extern const int CHECKSUM_DOESNT_MATCH ;
2022-08-12 09:32:13 +00:00
extern const int NOT_INITIALIZED ;
2018-05-21 13:49:54 +00:00
}
namespace ActionLocks
{
extern const StorageActionBlockType PartsMerge ;
extern const StorageActionBlockType PartsFetch ;
extern const StorageActionBlockType PartsSend ;
extern const StorageActionBlockType ReplicationQueue ;
2019-08-01 15:36:12 +00:00
extern const StorageActionBlockType PartsTTLMerge ;
2019-09-03 14:50:49 +00:00
extern const StorageActionBlockType PartsMove ;
2016-01-11 21:46:36 +00:00
}
2014-04-03 11:48:28 +00:00
2020-04-27 16:19:04 +00:00
static const auto QUEUE_UPDATE_ERROR_SLEEP_MS = 1 * 1000 ;
static const auto MUTATIONS_FINALIZING_SLEEP_MS = 1 * 1000 ;
static const auto MUTATIONS_FINALIZING_IDLE_SLEEP_MS = 5 * 1000 ;
2014-04-03 11:48:28 +00:00
2020-11-16 08:27:33 +00:00
void StorageReplicatedMergeTree : : setZooKeeper ( )
2016-01-17 08:12:48 +00:00
{
2021-02-27 08:07:14 +00:00
/// Every ReplicatedMergeTree table is using only one ZooKeeper session.
/// But if several ReplicatedMergeTree tables are using different
/// ZooKeeper sessions, some queries like ATTACH PARTITION FROM may have
/// strange effects. So we always use only one session for all tables.
/// (excluding auxiliary zookeepers)
2019-01-02 06:44:36 +00:00
std : : lock_guard lock ( current_zookeeper_mutex ) ;
2020-11-16 08:27:33 +00:00
if ( zookeeper_name = = default_zookeeper_name )
{
2021-04-10 23:33:54 +00:00
current_zookeeper = getContext ( ) - > getZooKeeper ( ) ;
2020-11-16 08:27:33 +00:00
}
else
{
2021-04-10 23:33:54 +00:00
current_zookeeper = getContext ( ) - > getAuxiliaryZooKeeper ( zookeeper_name ) ;
2020-11-16 08:27:33 +00:00
}
2016-01-17 08:12:48 +00:00
}
2019-10-28 17:27:43 +00:00
zkutil::ZooKeeperPtr StorageReplicatedMergeTree::tryGetZooKeeper() const
{
    /// Returns the cached ZooKeeper session pointer; may be nullptr if no
    /// session has been established yet. Never throws.
    std::lock_guard zookeeper_lock(current_zookeeper_mutex);
    auto session_snapshot = current_zookeeper;
    return session_snapshot;
}
2019-10-28 17:27:43 +00:00
zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const
{
    /// Same as tryGetZooKeeper(), but throws when there is no session.
    if (auto zookeeper = tryGetZooKeeper())
        return zookeeper;
    throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
}
2022-02-03 10:10:05 +00:00
zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeperAndAssertNotReadonly() const
{
    /// There's a short period of time after connection loss when a new session is created,
    /// but the replication queue is not reinitialized yet. We must ensure that the table
    /// is not readonly anymore before using the new ZooKeeper session to write something
    /// (except maybe GET_PART) into the replication log.
    auto zookeeper = getZooKeeper();
    assertNotReadonly();
    return zookeeper;
}
2021-05-13 11:29:59 +00:00
static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id)
{
    /// NOTE We don't have special log entry type for MOVE PARTITION/ATTACH PARTITION FROM,
    /// so we use REPLACE_RANGE with dummy range of one block, which means "attach, not replace".
    /// It's safe to fill drop range for MOVE PARTITION/ATTACH PARTITION FROM with zeros,
    /// because drop range for REPLACE PARTITION must contain at least 2 blocks,
    /// so we can distinguish dummy drop range from any real or virtual part.
    /// But we should never construct such part name, even for virtual part,
    /// because it can be confused with real part <partition>_0_0_0.
    /// TODO get rid of this.
    MergeTreePartInfo dummy_range;
    dummy_range.partition_id = partition_id;
    dummy_range.min_block = dummy_range.max_block = 0;
    dummy_range.level = 0;
    dummy_range.mutation = 0;
    return dummy_range;
}
2014-03-21 13:42:14 +00:00
/// Constructs a replicated MergeTree storage.
/// On CREATE (attach == false): registers the table in ZooKeeper (creating it if this is the
/// first replica), validates that the local data directory is empty, and rolls back the data
/// directory on any failure.
/// On ATTACH (attach == true): defers ZooKeeper initialization to a background attach thread;
/// the table stays readonly until that thread finishes.
StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    const String & zookeeper_path_,
    const String & replica_name_,
    bool attach,
    const StorageID & table_id_,
    const String & relative_data_path_,
    const StorageInMemoryMetadata & metadata_,
    ContextMutablePtr context_,
    const String & date_column_name,
    const MergingParams & merging_params_,
    std::unique_ptr<MergeTreeSettings> settings_,
    bool has_force_restore_data_flag,
    RenamingRestrictions renaming_restrictions_)
    : MergeTreeData(table_id_,
        relative_data_path_,
        metadata_,
        context_,
        date_column_name,
        merging_params_,
        std::move(settings_),
        true,                       /// require_part_metadata
        attach,
        [this] (const std::string & name) { enqueuePartForCheck(name); })
    , zookeeper_name(zkutil::extractZooKeeperName(zookeeper_path_))
    , zookeeper_path(zkutil::extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log))
    , replica_name(replica_name_)
    , replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_)
    , reader(*this)
    , writer(*this)
    , merger_mutator(*this, getContext()->getMergeMutateExecutor()->getMaxTasksCount())
    , merge_strategy_picker(*this)
    , queue(*this, merge_strategy_picker)
    , fetcher(*this)
    , cleanup_thread(*this)
    , part_check_thread(*this)
    , restarting_thread(*this)
    , part_moves_between_shards_orchestrator(*this)
    , renaming_restrictions(renaming_restrictions_)
    , replicated_fetches_pool_size(getContext()->getFetchesExecutor()->getMaxTasksCount())
    , replicated_fetches_throttler(std::make_shared<Throttler>(getSettings()->max_replicated_fetches_network_bandwidth, getContext()->getReplicatedFetchesThrottler()))
    , replicated_sends_throttler(std::make_shared<Throttler>(getSettings()->max_replicated_sends_network_bandwidth, getContext()->getReplicatedSendsThrottler()))
{
    /// We create and deactivate all tasks for consistency.
    /// They all will be scheduled and activated by the restarting thread.
    queue_updating_task = getContext()->getSchedulePool().createTask(
        getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::queueUpdatingTask)", [this]{ queueUpdatingTask(); });

    queue_updating_task->deactivate();

    mutations_updating_task = getContext()->getSchedulePool().createTask(
        getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsUpdatingTask)", [this]{ mutationsUpdatingTask(); });

    mutations_updating_task->deactivate();

    merge_selecting_task = getContext()->getSchedulePool().createTask(
        getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mergeSelectingTask)", [this] { mergeSelectingTask(); });

    /// Will be activated if we will achieve leader state.
    merge_selecting_task->deactivate();

    mutations_finalizing_task = getContext()->getSchedulePool().createTask(
        getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::mutationsFinalizingTask)", [this] { mutationsFinalizingTask(); });

    /// This task can be scheduled by different parts of code even when storage is readonly.
    /// This can lead to redundant exceptions during startup.
    /// Will be activated by restarting thread.
    mutations_finalizing_task->deactivate();

    bool has_zookeeper = getContext()->hasZooKeeper() || getContext()->hasAuxiliaryZooKeeper(zookeeper_name);
    if (has_zookeeper)
    {
        /// It's possible for getZooKeeper() to timeout if zookeeper host(s) can't
        /// be reached. In such cases Poco::Exception is thrown after a connection
        /// timeout - refer to src/Common/ZooKeeper/ZooKeeperImpl.cpp:866 for more info.
        ///
        /// Side effect of this is that the CreateQuery gets interrupted and it exits.
        /// But the data Directories for the tables being created aren't cleaned up.
        /// This unclean state will hinder table creation on any retries and will
        /// complain that the Directory for table already exists.
        ///
        /// To achieve a clean state on failed table creations, catch this error and
        /// call dropIfEmpty() method only if the operation isn't ATTACH then proceed
        /// throwing the exception. Without this, the Directory for the tables need
        /// to be manually deleted before retrying the CreateQuery.
        try
        {
            if (zookeeper_name == default_zookeeper_name)
            {
                current_zookeeper = getContext()->getZooKeeper();
            }
            else
            {
                current_zookeeper = getContext()->getAuxiliaryZooKeeper(zookeeper_name);
            }
        }
        catch (...)
        {
            if (!attach)
            {
                /// CREATE failed before anything was written: remove the empty data dir.
                dropIfEmpty();
                throw;
            }
            else
            {
                /// On ATTACH we tolerate an unreachable ZooKeeper: table starts readonly.
                current_zookeeper = nullptr;
            }
        }
    }

    bool skip_sanity_checks = false;
    /// It does not make sense for CREATE query
    if (attach)
    {
        if (current_zookeeper && current_zookeeper->exists(replica_path + "/host"))
        {
            /// Check it earlier if we can (we don't want incompatible version to start).
            /// If "/host" doesn't exist, then replica is probably dropped and there's nothing to check.
            ReplicatedMergeTreeAttachThread::checkHasReplicaMetadataInZooKeeper(current_zookeeper, replica_path);
        }

        if (current_zookeeper && current_zookeeper->exists(replica_path + "/flags/force_restore_data"))
        {
            skip_sanity_checks = true;
            current_zookeeper->remove(replica_path + "/flags/force_restore_data");

            LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag {}/flags/force_restore_data).", replica_path);
        }
        else if (has_force_restore_data_flag)
        {
            skip_sanity_checks = true;

            LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag force_restore_data).");
        }
    }

    loadDataParts(skip_sanity_checks);

    if (!current_zookeeper)
    {
        if (!attach)
        {
            dropIfEmpty();
            throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
        }

        /// nullopt means "unknown": we could not reach ZooKeeper to find out.
        has_metadata_in_zookeeper = std::nullopt;

        if (!has_zookeeper)
        {
            /// Do not activate the replica. It will be readonly.
            LOG_ERROR(log, "No ZooKeeper defined: table will stay in readonly mode.");
            return;
        }
    }

    if (attach)
    {
        /// The attach thread will complete ZooKeeper-side initialization asynchronously.
        LOG_INFO(log, "Table will be in readonly mode until initialization is finished");
        attach_thread.emplace(*this);
        attach_thread->setSkipSanityChecks(skip_sanity_checks);
        return;
    }

    auto metadata_snapshot = getInMemoryMetadataPtr();

    /// May it be ZK lost not the whole root, so the upper check passed, but only the /replicas/replica
    /// folder.
    has_metadata_in_zookeeper = true;

    if (!getDataPartsForInternalUsage().empty())
        throw Exception("Data directory for table already contains data parts"
            " - probably it was unclean DROP table or manual intervention."
            " You must either clear directory by hand or use ATTACH TABLE"
            " instead of CREATE TABLE if you need to use that parts.", ErrorCodes::INCORRECT_DATA);

    try
    {
        bool is_first_replica = createTableIfNotExists(metadata_snapshot);

        try
        {
            /// NOTE If it's the first replica, these requests to ZooKeeper look redundant, we already know everything.

            /// We have to check granularity on other replicas. If it's fixed we
            /// must create our new replica with fixed granularity and store this
            /// information in /replica/metadata.
            other_replicas_fixed_granularity = checkFixedGranularityInZookeeper();

            checkTableStructure(zookeeper_path, metadata_snapshot);

            Coordination::Stat metadata_stat;
            current_zookeeper->get(zookeeper_path + "/metadata", &metadata_stat);
            metadata_version = metadata_stat.version;
        }
        catch (Coordination::Exception & e)
        {
            if (!is_first_replica && e.code == Coordination::Error::ZNONODE)
                throw Exception("Table " + zookeeper_path + " was suddenly removed.", ErrorCodes::ALL_REPLICAS_LOST);
            else
                throw;
        }

        if (!is_first_replica)
            createReplica(metadata_snapshot);
    }
    catch (...)
    {
        /// If replica was not created, rollback creation of data directory.
        dropIfEmpty();
        throw;
    }

    createNewZooKeeperNodes();
    syncPinnedPartUUIDs();

    if (!has_metadata_in_zookeeper.has_value() || *has_metadata_in_zookeeper)
        createTableSharedID();

    initialization_done = true;
}
2022-05-29 19:53:56 +00:00
String StorageReplicatedMergeTree::getDefaultZooKeeperPath(const Poco::Util::AbstractConfiguration & config)
{
    /// ZooKeeper path template used when CREATE TABLE does not specify one explicitly.
    static constexpr auto fallback_path = "/clickhouse/tables/{uuid}/{shard}";
    return config.getString("default_replica_path", fallback_path);
}
String StorageReplicatedMergeTree::getDefaultReplicaName(const Poco::Util::AbstractConfiguration & config)
{
    /// Replica name template used when CREATE TABLE does not specify one explicitly.
    static constexpr auto fallback_name = "{replica}";
    return config.getString("default_replica_name", fallback_name);
}
2021-06-28 17:02:22 +00:00
bool StorageReplicatedMergeTree : : checkFixedGranularityInZookeeper ( )
2019-08-12 13:30:29 +00:00
{
auto zookeeper = getZooKeeper ( ) ;
String metadata_str = zookeeper - > get ( zookeeper_path + " /metadata " ) ;
auto metadata_from_zk = ReplicatedMergeTreeTableMetadata : : parse ( metadata_str ) ;
return metadata_from_zk . index_granularity_bytes = = 0 ;
2014-03-21 13:42:14 +00:00
}
2014-10-17 01:05:51 +00:00
2019-12-19 15:27:56 +00:00
/// Blocks until the given mutation has been applied on each of the given replicas.
/// Inactive or removed replicas are tolerated (the wait for them is skipped and an
/// UNFINISHED exception listing them is thrown at the end); a killed/removed mutation,
/// a table shutdown, or a locally observed mutation failure also raise exceptions.
void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas(
    const Strings & replicas, const String & mutation_id) const
{
    if (replicas.empty())
        return;

    /// Current replica must always be present in the list as the first element because we use local mutation status
    /// to check for mutation errors. So if it is not there, just add it.
    const Strings * all_required_replicas = &replicas;
    Strings extended_list_of_replicas;
    if (replicas.front() != replica_name)
    {
        extended_list_of_replicas.push_back(replica_name);
        extended_list_of_replicas.insert(extended_list_of_replicas.end(), replicas.begin(), replicas.end());
        all_required_replicas = &extended_list_of_replicas;
    }

    std::set<String> inactive_replicas;
    for (const String & replica : *all_required_replicas)
    {
        LOG_DEBUG(log, "Waiting for {} to apply mutation {}", replica, mutation_id);
        /// Signalled by ZooKeeper watches set below; lets us sleep instead of busy-polling.
        zkutil::EventPtr wait_event = std::make_shared<Poco::Event>();

        while (!partial_shutdown_called)
        {
            /// Mutation maybe killed or whole replica was deleted.
            /// Wait event will unblock at this moment.
            Coordination::Stat exists_stat;
            if (!getZooKeeper()->exists(fs::path(zookeeper_path) / "mutations" / mutation_id, &exists_stat, wait_event))
            {
                throw Exception(ErrorCodes::UNFINISHED, "Mutation {} was killed, manually removed or table was dropped", mutation_id);
            }

            auto zookeeper = getZooKeeper();
            /// Replica could be inactive.
            if (!zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active"))
            {
                LOG_WARNING(log, "Replica {} is not active during mutation. Mutation will be done asynchronously when replica becomes active.", replica);

                inactive_replicas.emplace(replica);
                break;
            }

            String mutation_pointer = fs::path(zookeeper_path) / "replicas" / replica / "mutation_pointer";
            std::string mutation_pointer_value;
            /// Replica could be removed
            if (!zookeeper->tryGet(mutation_pointer, mutation_pointer_value, nullptr, wait_event))
            {
                LOG_WARNING(log, "Replica {} was removed", replica);
                break;
            }
            else if (mutation_pointer_value >= mutation_id) /// Maybe we already processed more fresh mutation
                break;                                      /// (numbers like 0000000000 and 0000000001)

            /// Replica can become inactive, so wait with timeout and recheck it
            if (wait_event->tryWait(1000))
                continue;

            /// Here we check mutation for errors on local replica. If they happen on this replica
            /// they will happen on each replica, so we can check only in-memory info.
            auto mutation_status = queue.getIncompleteMutationsStatus(mutation_id);
            /// If mutation status is empty, than local replica may just not loaded it into memory.
            if (mutation_status && !mutation_status->latest_fail_reason.empty())
                break;
        }

        /// This replica inactive, don't check anything
        /// NOTE(review): this `break` leaves the whole replica loop (remaining replicas are not
        /// waited for), while the later identical check uses `continue` — presumably intentional,
        /// since the final UNFINISHED exception reports inactive replicas anyway; verify upstream.
        if (!inactive_replicas.empty() && inactive_replicas.contains(replica))
            break;

        /// It maybe already removed from zk, but local in-memory mutations
        /// state was not updated.
        if (!getZooKeeper()->exists(fs::path(zookeeper_path) / "mutations" / mutation_id))
        {
            throw Exception(ErrorCodes::UNFINISHED, "Mutation {} was killed, manually removed or table was dropped", mutation_id);
        }

        if (partial_shutdown_called)
            throw Exception("Mutation is not finished because table shutdown was called. It will be done after table restart.",
                ErrorCodes::UNFINISHED);

        /// Replica inactive, don't check mutation status
        if (!inactive_replicas.empty() && inactive_replicas.contains(replica))
            continue;

        /// At least we have our current mutation
        std::set<String> mutation_ids;
        mutation_ids.insert(mutation_id);

        /// Here we check mutation for errors or kill on local replica. If they happen on this replica
        /// they will happen on each replica, so we can check only in-memory info.
        auto mutation_status = queue.getIncompleteMutationsStatus(mutation_id, &mutation_ids);
        checkMutationStatus(mutation_status, mutation_ids);
    }

    if (!inactive_replicas.empty())
    {
        throw Exception(ErrorCodes::UNFINISHED,
            "Mutation is not finished because some replicas are inactive right now: {}. Mutation will be done asynchronously",
            boost::algorithm::join(inactive_replicas, ", "));
    }
}
2015-11-09 20:30:54 +00:00
/// Create auxiliary znodes under the table's ZooKeeper path that might be missing,
/// e.g. after upgrading from a server version that did not create them.
/// Every creation tolerates ZNODEEXISTS, so this is idempotent and safe to call on each startup.
/// NOTE: requests are issued asynchronously but in order, so parent nodes (e.g. /quorum)
/// are created before their children (e.g. /quorum/last_part).
void StorageReplicatedMergeTree::createNewZooKeeperNodes()
{
    auto zookeeper = getZooKeeper();

    std::vector<zkutil::ZooKeeper::FutureCreate> futures;

    /// These 4 nodes used to be created in createNewZookeeperNodes() and they were moved to createTable()
    /// This means that if the first replica creating the table metadata has an older version of CH (22.3 or previous)
    /// there will be a time between its calls to `createTable` and `createNewZookeeperNodes` where the nodes won't exists
    /// and that will cause issues in newer replicas
    /// See https://github.com/ClickHouse/ClickHouse/issues/38600 for example
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum", String(), zkutil::CreateMode::Persistent));
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/last_part", String(), zkutil::CreateMode::Persistent));
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/failed_parts", String(), zkutil::CreateMode::Persistent));
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/mutations", String(), zkutil::CreateMode::Persistent));

    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/parallel", String(), zkutil::CreateMode::Persistent));

    /// Nodes for remote fs zero-copy replication
    const auto settings = getSettings();
    if (settings->allow_remote_fs_zero_copy_replication)
    {
        futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_s3", String(), zkutil::CreateMode::Persistent));
        futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_s3/shared", String(), zkutil::CreateMode::Persistent));
        futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_hdfs", String(), zkutil::CreateMode::Persistent));
        futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_hdfs/shared", String(), zkutil::CreateMode::Persistent));
    }

    /// Part movement.
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/part_moves_shard", String(), zkutil::CreateMode::Persistent));
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/pinned_part_uuids", getPinnedPartUUIDs()->toString(), zkutil::CreateMode::Persistent));
    /// For ALTER PARTITION with multi-leaders
    futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/alter_partition_version", String(), zkutil::CreateMode::Persistent));

    /// Wait for all requests; an already-existing node is fine, any other error is fatal.
    for (auto & future : futures)
    {
        auto res = future.get();
        if (res.error != Coordination::Error::ZOK && res.error != Coordination::Error::ZNODEEXISTS)
            throw Coordination::Exception(fmt::format("Failed to create new nodes at {}", zookeeper_path), res.error);
    }
}
2020-06-16 16:55:04 +00:00
/// Create the table's znode subtree and this (first) replica in ZooKeeper, atomically.
/// Returns false if the table already exists (caller should then add a replica via createReplica()),
/// true if this call created the table. Retries on concurrent creation/removal; throws after 1000 attempts.
bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr & metadata_snapshot)
{
    auto zookeeper = getZooKeeper();
    zookeeper->createAncestors(zookeeper_path);

    for (size_t i = 0; i < 1000; ++i)
    {
        /// Invariant: "replicas" does not exist if there is no table or if there are leftovers from incompletely dropped table.
        if (zookeeper->exists(zookeeper_path + "/replicas"))
        {
            LOG_DEBUG(log, "This table {} is already created, will add new replica", zookeeper_path);
            return false;
        }

        /// There are leftovers from incompletely dropped table.
        if (zookeeper->exists(zookeeper_path + "/dropped"))
        {
            /// This condition may happen when the previous drop attempt was not completed
            /// or when table is dropped by another replica right now.
            /// This is Ok because another replica is definitely going to drop the table.

            LOG_WARNING(log, "Removing leftovers from table {} (this might take several minutes)", zookeeper_path);
            /// The ephemeral lock prevents a concurrent creator from deleting /dropped out from under us.
            String drop_lock_path = zookeeper_path + "/dropped/lock";
            Coordination::Error code = zookeeper->tryCreate(drop_lock_path, "", zkutil::CreateMode::Ephemeral);

            if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS)
            {
                LOG_WARNING(log, "The leftovers from table {} were removed by another replica", zookeeper_path);
            }
            else if (code != Coordination::Error::ZOK)
            {
                throw Coordination::Exception(code, drop_lock_path);
            }
            else
            {
                /// We took the lock: finish removing the old table's nodes ourselves.
                auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper);
                if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, log))
                {
                    /// Someone is recursively removing table right now, we cannot create new table until old one is removed
                    continue;
                }
            }
        }

        LOG_DEBUG(log, "Creating table {}", zookeeper_path);

        /// We write metadata of table so that the replicas can check table parameters with them.
        String metadata_str = ReplicatedMergeTreeTableMetadata(*this, metadata_snapshot).toString();

        /// All table-level and first-replica nodes are created in one transaction, so a concurrent
        /// creator conflicts on the root node and exactly one of us wins.
        Coordination::Requests ops;
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata_str,
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", metadata_snapshot->getColumns().toString(),
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "",
            zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility.
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "last added replica: " + replica_name,
            zkutil::CreateMode::Persistent));

        /// The following 4 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes()
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum/last_part", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum/failed_parts", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/mutations", "",
            zkutil::CreateMode::Persistent));

        /// And create first replica atomically. See also "createReplica" method that is used to create not the first replicas.
        ops.emplace_back(zkutil::makeCreateRequest(replica_path, "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/host", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_pointer", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/queue", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/parts", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/flags", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/is_lost", "0",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata", metadata_str,
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/columns", metadata_snapshot->getColumns().toString(),
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", std::to_string(metadata_version),
            zkutil::CreateMode::Persistent));

        /// The following 3 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes()
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/min_unprocessed_insert_time", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/max_processed_insert_time", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/mutation_pointer", "",
            zkutil::CreateMode::Persistent));

        Coordination::Responses responses;
        auto code = zookeeper->tryMulti(ops, responses);

        if (code == Coordination::Error::ZNODEEXISTS)
        {
            LOG_WARNING(log, "It looks like the table {} was created by another server at the same moment, will retry", zookeeper_path);
            continue;
        }
        else if (code != Coordination::Error::ZOK)
        {
            zkutil::KeeperMultiException::check(code, ops, responses);
        }

        return true;
    }

    /// Do not use LOGICAL_ERROR code, because it may happen if user has specified wrong zookeeper_path
    throw Exception("Cannot create table, because it is created concurrently every time "
                    "or because of wrong zookeeper_path "
                    "or because of logical error", ErrorCodes::REPLICA_IS_ALREADY_EXIST);
}
2020-06-16 16:55:04 +00:00
/// Register this replica (replica_path) under an already existing table in ZooKeeper.
/// All replica znodes are created in a single transaction guarded by the version of /replicas,
/// so replicas created concurrently are detected (ZBADVERSION) and the operation is retried.
/// Throws REPLICA_IS_ALREADY_EXIST or ALL_REPLICAS_LOST on unrecoverable conflicts.
void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metadata_snapshot)
{
    auto zookeeper = getZooKeeper();

    LOG_DEBUG(log, "Creating replica {}", replica_path);

    Coordination::Error code;

    do
    {
        Coordination::Stat replicas_stat;
        String replicas_value;

        if (!zookeeper->tryGet(zookeeper_path + "/replicas", replicas_value, &replicas_stat))
            throw Exception(ErrorCodes::ALL_REPLICAS_LOST,
                "Cannot create a replica of the table {}, because the last replica of the table was dropped right now",
                zookeeper_path);

        /// It is not the first replica, we will mark it as "lost", to immediately repair (clone) from existing replica.
        /// By the way, it's possible that the replica will be first, if all previous replicas were removed concurrently.
        const String is_lost_value = replicas_stat.numChildren ? "1" : "0";

        Coordination::Requests ops;
        ops.emplace_back(zkutil::makeCreateRequest(replica_path, "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/host", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_pointer", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/queue", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/parts", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/flags", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/is_lost", is_lost_value,
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata", ReplicatedMergeTreeTableMetadata(*this, metadata_snapshot).toString(),
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/columns", metadata_snapshot->getColumns().toString(),
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", std::to_string(metadata_version),
            zkutil::CreateMode::Persistent));

        /// The following 3 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes()
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/min_unprocessed_insert_time", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/max_processed_insert_time", "",
            zkutil::CreateMode::Persistent));
        ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/mutation_pointer", "",
            zkutil::CreateMode::Persistent));

        /// Check version of /replicas to see if there are any replicas created at the same moment of time.
        ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/replicas", "last added replica: " + replica_name, replicas_stat.version));

        Coordination::Responses responses;
        code = zookeeper->tryMulti(ops, responses);

        switch (code)
        {
            case Coordination::Error::ZNODEEXISTS:
                throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} already exists", replica_path);
            case Coordination::Error::ZBADVERSION:
                /// /replicas changed between our read and the transaction: loop and retry.
                LOG_ERROR(log, "Retrying createReplica(), because some other replicas were created at the same time");
                break;
            case Coordination::Error::ZNONODE:
                throw Exception(ErrorCodes::ALL_REPLICAS_LOST, "Table {} was suddenly removed", zookeeper_path);
            default:
                zkutil::KeeperMultiException::check(code, ops, responses);
        }
    } while (code == Coordination::Error::ZBADVERSION);
}
2022-09-11 11:37:39 +00:00
zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeperIfTableShutDown() const
{
    /// Obtain a session straight from the global context. This works even when the table is
    /// shut down and the cached session is not being maintained.
    const bool is_default_cluster = (zookeeper_name == default_zookeeper_name);
    zkutil::ZooKeeperPtr session = is_default_cluster
        ? getContext()->getZooKeeper()
        : getContext()->getAuxiliaryZooKeeper(zookeeper_name);

    /// Synchronize on the table's path so subsequent reads observe the latest committed state.
    session->sync(zookeeper_path);
    return session;
}
2020-06-11 03:24:52 +00:00
void StorageReplicatedMergeTree : : drop ( )
{
2020-06-23 12:01:51 +00:00
/// There is also the case when user has configured ClickHouse to wrong ZooKeeper cluster
/// or metadata of staled replica were removed manually,
2020-06-14 01:23:53 +00:00
/// in this case, has_metadata_in_zookeeper = false, and we also permit to drop the table.
2022-01-20 18:55:59 +00:00
bool maybe_has_metadata_in_zookeeper = ! has_metadata_in_zookeeper . has_value ( ) | | * has_metadata_in_zookeeper ;
if ( maybe_has_metadata_in_zookeeper )
2020-06-11 03:24:52 +00:00
{
2020-09-17 11:27:17 +00:00
/// Table can be shut down, restarting thread is not active
2021-02-24 08:33:39 +00:00
/// and calling StorageReplicatedMergeTree::getZooKeeper()/getAuxiliaryZooKeeper() won't suffice.
2022-09-11 11:37:39 +00:00
zkutil : : ZooKeeperPtr zookeeper = getZooKeeperIfTableShutDown ( ) ;
2020-06-11 03:24:52 +00:00
2020-06-14 01:23:53 +00:00
/// If probably there is metadata in ZooKeeper, we don't allow to drop the table.
2021-02-15 12:04:30 +00:00
if ( ! zookeeper )
2020-06-11 03:24:52 +00:00
throw Exception ( " Can't drop readonly replicated table (need to drop data in ZooKeeper as well) " , ErrorCodes::TABLE_IS_READ_ONLY) ;
2022-04-05 15:36:53 +00:00
dropReplica ( zookeeper , zookeeper_path , replica_name , log , getSettings ( ) ) ;
2020-06-11 03:24:52 +00:00
}
dropAllData ( ) ;
2014-03-22 14:44:44 +00:00
}
2020-06-11 03:24:52 +00:00
2022-04-05 15:36:53 +00:00
/// Remove one replica's metadata from ZooKeeper. If it turns out to be the last replica,
/// also remove the whole table subtree (taking the /dropped lock first).
/// `table_settings` may be null; it only affects whether /parts is treated as flat.
void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica,
                                             Poco::Logger * logger, MergeTreeSettingsPtr table_settings)
{
    if (zookeeper->expired())
        throw Exception("Table was not dropped because ZooKeeper session has expired.", ErrorCodes::TABLE_WAS_NOT_DROPPED);

    auto remote_replica_path = zookeeper_path + "/replicas/" + replica;

    LOG_INFO(logger, "Removing replica {}, marking it as lost", remote_replica_path);
    /// Mark itself lost before removing, because the following recursive removal may fail
    /// and partially dropped replica may be considered as alive one (until someone will mark it lost)
    zookeeper->trySet(remote_replica_path + "/is_lost", "1");

    /// NOTE: we should check for remote_replica_path existence,
    /// since otherwise DROP REPLICA will fail if the replica had been already removed.
    if (!zookeeper->exists(remote_replica_path))
    {
        LOG_INFO(logger, "Removing replica {} does not exist", remote_replica_path);
        return;
    }

    {
        /// Remove "host" node first to mark replica as dropped (the choice is arbitrary,
        /// it could be any node without children that exists since ancient server versions and not re-created on startup)
        [[maybe_unused]] auto code = zookeeper->tryRemove(fs::path(remote_replica_path) / "host");
        assert(code == Coordination::Error::ZOK || code == Coordination::Error::ZNONODE);

        /// Then try to remove paths that are known to be flat (all children are leafs)
        Strings flat_nodes = {"flags", "queue"};
        /// With minimalistic part headers, /parts entries have no children, so it is flat too.
        if (table_settings && table_settings->use_minimalistic_part_header_in_zookeeper)
            flat_nodes.emplace_back("parts");
        for (const auto & node : flat_nodes)
        {
            bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(remote_replica_path) / node, /* probably flat */ true);
            if (!removed_quickly)
                LOG_WARNING(logger, "Failed to quickly remove node '{}' and its children, fell back to recursive removal (replica: {})",
                            node, remote_replica_path);
        }

        /// Then try to remove nodes that are known to have no children (and should always exist)
        Coordination::Requests ops;
        for (const auto & node : flat_nodes)
            ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/" + node, -1));

        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/columns", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/is_lost", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/log_pointer", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/max_processed_insert_time", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/min_unprocessed_insert_time", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/metadata", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/metadata_version", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/mutation_pointer", -1));
        Coordination::Responses res;
        code = zookeeper->tryMulti(ops, res);
        if (code != Coordination::Error::ZOK)
            LOG_WARNING(logger, "Cannot quickly remove nodes without children: {} (replica: {}). Will remove recursively.",
                        Coordination::errorMessage(code), remote_replica_path);

        /// And finally remove everything else recursively
        zookeeper->tryRemoveRecursive(remote_replica_path);
    }

    /// It may left some garbage if replica_path subtree are concurrently modified
    if (zookeeper->exists(remote_replica_path))
        LOG_ERROR(logger, "Replica was not completely removed from ZooKeeper, {} still exists and may contain some garbage.", remote_replica_path);

    /// Check that `zookeeper_path` exists: it could have been deleted by another replica after execution of previous line.
    Strings replicas;
    if (Coordination::Error::ZOK != zookeeper->tryGetChildren(zookeeper_path + "/replicas", replicas) || !replicas.empty())
        return;

    LOG_INFO(logger, "{} is the last replica, will remove table", remote_replica_path);

    /** At this moment, another replica can be created and we cannot remove the table.
      * Try to remove /replicas node first. If we successfully removed it,
      * it guarantees that we are the only replica that proceed to remove the table
      * and no new replicas can be created after that moment (it requires the existence of /replicas node).
      * and table cannot be recreated with new /replicas node on another servers while we are removing data,
      * because table creation is executed in single transaction that will conflict with remaining nodes.
      */

    /// Node /dropped works like a lock that protects from concurrent removal of old table and creation of new table.
    /// But recursive removal may fail in the middle of operation leaving some garbage in zookeeper_path, so
    /// we remove it on table creation if there is /dropped node. Creating thread may remove /dropped node created by
    /// removing thread, and it causes race condition if removing thread is not finished yet.
    /// To avoid this we also create ephemeral child before starting recursive removal.
    /// (The existence of child node does not allow to remove parent node).
    Coordination::Requests ops;
    Coordination::Responses responses;
    String drop_lock_path = zookeeper_path + "/dropped/lock";
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/replicas", -1));
    ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent));
    ops.emplace_back(zkutil::makeCreateRequest(drop_lock_path, "", zkutil::CreateMode::Ephemeral));
    Coordination::Error code = zookeeper->tryMulti(ops, responses);

    if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS)
    {
        LOG_WARNING(logger, "Table {} is already started to be removing by another replica right now", remote_replica_path);
    }
    else if (code == Coordination::Error::ZNOTEMPTY)
    {
        LOG_WARNING(logger, "Another replica was suddenly created, will keep the table {}", remote_replica_path);
    }
    else if (code != Coordination::Error::ZOK)
    {
        zkutil::KeeperMultiException::check(code, ops, responses);
    }
    else
    {
        /// We won the race: hold the ephemeral lock and remove the whole table subtree.
        auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper);
        LOG_INFO(logger, "Removing table {} (this might take several minutes)", zookeeper_path);
        removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, logger);
    }
}
2020-06-11 03:24:52 +00:00
2021-04-19 08:21:42 +00:00
/// Remove the whole table subtree from ZooKeeper. The caller must hold the ephemeral
/// /dropped/lock node (passed as metadata_drop_lock). Returns true if the table was
/// completely removed, false if leftovers remain or another replica is removing it.
bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper,
        const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger)
{
    bool completely_removed = false;

    /// NOTE /block_numbers/ actually is not flat, because /block_numbers/<partition_id>/ may have ephemeral children,
    /// but we assume that all ephemeral block locks are already removed when table is being dropped.
    static constexpr std::array flat_nodes = {"block_numbers", "blocks", "leader_election", "log", "mutations", "pinned_part_uuids"};

    /// First try to remove paths that are known to be flat
    for (const auto * node : flat_nodes)
    {
        bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true);
        if (!removed_quickly)
            LOG_WARNING(logger, "Failed to quickly remove node '{}' and its children, fell back to recursive removal (table: {})",
                        node, zookeeper_path);
    }

    /// Then try to remove nodes that are known to have no children (and should always exist)
    Coordination::Requests ops;
    for (const auto * node : flat_nodes)
        ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/" + node, -1));
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/alter_partition_version", -1));
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/columns", -1));
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/metadata", -1));
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/table_shared_id", -1));
    Coordination::Responses res;
    auto code = zookeeper->tryMulti(ops, res);
    if (code != Coordination::Error::ZOK)
        LOG_WARNING(logger, "Cannot quickly remove nodes without children: {} (table: {}). Will remove recursively.",
                    Coordination::errorMessage(code), zookeeper_path);

    Strings children;
    code = zookeeper->tryGetChildren(zookeeper_path, children);
    if (code == Coordination::Error::ZNONODE)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug");

    /// Remove everything except /dropped: it goes away last, together with the root node,
    /// in a single transaction below.
    for (const auto & child : children)
    {
        if (child != "dropped")
            zookeeper->tryRemoveRecursive(fs::path(zookeeper_path) / child);
    }

    /// Atomically remove the lock, the /dropped marker and the root node.
    ops.clear();
    Coordination::Responses responses;
    ops.emplace_back(zkutil::makeRemoveRequest(metadata_drop_lock->getPath(), -1));
    ops.emplace_back(zkutil::makeRemoveRequest(fs::path(zookeeper_path) / "dropped", -1));
    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1));
    code = zookeeper->tryMulti(ops, responses);

    if (code == Coordination::Error::ZNONODE)
    {
        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug");
    }
    else if (code == Coordination::Error::ZNOTEMPTY)
    {
        LOG_ERROR(logger, "Table was not completely removed from ZooKeeper, {} still exists and may contain some garbage,"
            " but someone is removing it right now.", zookeeper_path);
    }
    else if (code != Coordination::Error::ZOK)
    {
        /// It is still possible that ZooKeeper session is expired or server is killed in the middle of the delete operation.
        zkutil::KeeperMultiException::check(code, ops, responses);
    }
    else
    {
        /// The lock node was removed inside our own transaction — tell the holder not to remove it again.
        metadata_drop_lock->setAlreadyRemoved();
        completely_removed = true;
        LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path);
    }

    return completely_removed;
}
2014-03-21 19:17:59 +00:00
2014-10-18 17:37:55 +00:00
2020-08-15 12:30:17 +00:00
/** Verify that list of columns and table storage_settings_ptr match those specified in ZK (/metadata).
* If not , throw an exception .
*/
2020-06-16 16:55:04 +00:00
void StorageReplicatedMergeTree : : checkTableStructure ( const String & zookeeper_prefix , const StorageMetadataPtr & metadata_snapshot )
2014-03-22 14:44:44 +00:00
{
2014-12-12 20:50:32 +00:00
auto zookeeper = getZooKeeper ( ) ;
2017-04-01 07:20:54 +00:00
2020-06-16 16:55:04 +00:00
ReplicatedMergeTreeTableMetadata old_metadata ( * this , metadata_snapshot ) ;
2018-11-02 11:53:05 +00:00
2018-11-01 13:30:38 +00:00
Coordination : : Stat metadata_stat ;
2021-05-08 10:59:55 +00:00
String metadata_str = zookeeper - > get ( fs : : path ( zookeeper_prefix ) / " metadata " , & metadata_stat ) ;
2018-11-02 15:39:19 +00:00
auto metadata_from_zk = ReplicatedMergeTreeTableMetadata : : parse ( metadata_str ) ;
2021-04-10 23:33:54 +00:00
old_metadata . checkEquals ( metadata_from_zk , metadata_snapshot - > getColumns ( ) , getContext ( ) ) ;
2017-04-01 07:20:54 +00:00
2018-11-01 13:30:38 +00:00
Coordination : : Stat columns_stat ;
2021-05-08 10:59:55 +00:00
auto columns_from_zk = ColumnsDescription : : parse ( zookeeper - > get ( fs : : path ( zookeeper_prefix ) / " columns " , & columns_stat ) ) ;
2017-04-01 07:20:54 +00:00
2020-06-17 16:39:58 +00:00
const ColumnsDescription & old_columns = metadata_snapshot - > getColumns ( ) ;
2020-02-14 10:17:04 +00:00
if ( columns_from_zk ! = old_columns )
2014-03-22 14:44:44 +00:00
{
2021-09-21 19:39:30 +00:00
throw Exception ( ErrorCodes : : INCOMPATIBLE_COLUMNS ,
" Table columns structure in ZooKeeper is different from local table structure. Local columns: \n "
" {} \n Zookeeper columns: \n {} " , old_columns . toString ( ) , columns_from_zk . toString ( ) ) ;
2014-03-22 14:44:44 +00:00
}
}
2014-03-21 19:17:59 +00:00
2022-07-29 16:33:16 +00:00
void StorageReplicatedMergeTree : : setTableStructure ( const StorageID & table_id , const ContextPtr & local_context ,
2020-10-15 13:02:39 +00:00
ColumnsDescription new_columns , const ReplicatedMergeTreeTableMetadata : : Diff & metadata_diff )
2018-11-02 11:53:05 +00:00
{
2020-06-17 10:34:23 +00:00
StorageInMemoryMetadata old_metadata = getInMemoryMetadata ( ) ;
2022-07-29 16:33:16 +00:00
StorageInMemoryMetadata new_metadata = metadata_diff . getNewMetadata ( new_columns , local_context , old_metadata ) ;
2021-03-15 08:22:15 +00:00
2020-06-27 20:13:16 +00:00
/// Even if the primary/sorting/partition keys didn't change we must reinitialize it
/// because primary/partition key column types might have changed.
2020-06-18 17:09:06 +00:00
checkTTLExpressions ( new_metadata , old_metadata ) ;
2020-06-17 10:34:23 +00:00
setProperties ( new_metadata , old_metadata ) ;
2020-06-18 17:03:42 +00:00
2022-07-29 16:33:16 +00:00
DatabaseCatalog : : instance ( ) . getDatabase ( table_id . database_name ) - > alterTable ( local_context , table_id , new_metadata ) ;
2018-11-02 11:53:05 +00:00
}
2017-03-12 19:18:07 +00:00
/** If necessary, restore a part, replica itself adds a record for its receipt.
* What time should I put for this entry in the queue ? Time is taken into account when calculating lag of replica .
* For these purposes , it makes sense to use creation time of missing part
* ( that is , in calculating lag , it will be taken into account how old is the part we need to recover ) .
2015-09-20 05:50:15 +00:00
*/
static time_t tryGetPartCreateTime ( zkutil : : ZooKeeperPtr & zookeeper , const String & replica_path , const String & part_name )
{
time_t res = 0 ;
2017-03-13 18:01:46 +00:00
/// We get creation time of part, if it still exists (was not merged, for example).
2018-08-25 01:58:14 +00:00
Coordination : : Stat stat ;
2015-09-20 05:50:15 +00:00
String unused ;
2021-05-08 10:59:55 +00:00
if ( zookeeper - > tryGet ( fs : : path ( replica_path ) / " parts " / part_name , unused , & stat ) )
2015-09-20 05:50:15 +00:00
res = stat . ctime / 1000 ;
return res ;
}
2014-07-10 08:40:59 +00:00
void StorageReplicatedMergeTree : : checkParts ( bool skip_sanity_checks )
2014-04-02 07:59:43 +00:00
{
2014-12-12 20:50:32 +00:00
auto zookeeper = getZooKeeper ( ) ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
Strings expected_parts_vec = zookeeper - > getChildren ( fs : : path ( replica_path ) / " parts " ) ;
2017-04-01 07:20:54 +00:00
2017-03-12 19:18:07 +00:00
/// Parts in ZK.
2014-04-02 07:59:43 +00:00
NameSet expected_parts ( expected_parts_vec . begin ( ) , expected_parts_vec . end ( ) ) ;
2017-04-01 07:20:54 +00:00
2021-12-30 14:27:22 +00:00
/// There are no PreActive parts at startup.
auto parts = getDataParts ( { MergeTreeDataPartState : : Active , MergeTreeDataPartState : : Outdated } ) ;
2017-04-01 07:20:54 +00:00
2019-03-14 14:46:28 +00:00
/** Local parts that are not in ZK.
* In very rare cases they may cover missing parts
* and someone may think that pushing them to zookeeper is good idea .
* But actually we can ' t precisely determine that ALL missing parts
* covered by this unexpected part . So missing parts will be downloaded .
*/
2019-05-03 02:00:57 +00:00
DataParts unexpected_parts ;
2017-04-01 07:20:54 +00:00
2022-06-04 11:33:29 +00:00
/// Intersection of local parts and expected parts
2022-06-03 14:32:02 +00:00
ActiveDataPartSet local_expected_parts_set ( format_version ) ;
2019-03-14 18:22:04 +00:00
/// Collect unexpected parts
2019-03-14 14:46:28 +00:00
for ( const auto & part : parts )
2022-06-03 14:32:02 +00:00
{
if ( expected_parts . contains ( part - > name ) )
local_expected_parts_set . add ( part - > name ) ;
else
2019-03-14 14:46:28 +00:00
unexpected_parts . insert ( part ) ; /// this parts we will place to detached with ignored_ prefix
2022-06-03 14:32:02 +00:00
}
2017-04-01 07:20:54 +00:00
2019-03-14 18:22:04 +00:00
/// Which parts should be taken from other replicas.
Strings parts_to_fetch ;
for ( const String & missing_name : expected_parts )
2019-05-03 02:00:57 +00:00
if ( ! getActiveContainingPart ( missing_name ) )
2019-03-14 18:22:04 +00:00
parts_to_fetch . push_back ( missing_name ) ;
2018-02-21 17:06:29 +00:00
2017-03-13 18:01:46 +00:00
/** To check the adequacy, for the parts that are in the FS, but not in ZK, we will only consider not the most recent parts.
* Because unexpected new parts usually arise only because they did not have time to enroll in ZK with a rough restart of the server .
* It also occurs from deduplicated parts that did not have time to retire .
2015-09-24 01:17:10 +00:00
*/
size_t unexpected_parts_nonnew = 0 ;
2018-02-21 17:06:29 +00:00
UInt64 unexpected_parts_nonnew_rows = 0 ;
UInt64 unexpected_parts_rows = 0 ;
2021-06-20 08:24:43 +00:00
2022-06-03 14:32:02 +00:00
Strings covered_unexpected_parts ;
Strings uncovered_unexpected_parts ;
UInt64 uncovered_unexpected_parts_rows = 0 ;
2015-09-24 01:17:10 +00:00
for ( const auto & part : unexpected_parts )
2018-02-21 17:06:29 +00:00
{
2022-06-03 14:32:02 +00:00
unexpected_parts_rows + = part - > rows_count ;
/// This part may be covered by some expected part that is active and present locally
/// Probably we just did not remove this part from disk before restart (but removed from ZooKeeper)
String covering_local_part = local_expected_parts_set . getContainingPart ( part - > name ) ;
if ( ! covering_local_part . empty ( ) )
{
covered_unexpected_parts . push_back ( part - > name ) ;
continue ;
}
/// Part is unexpected and we don't have covering part: it's suspicious
uncovered_unexpected_parts . push_back ( part - > name ) ;
uncovered_unexpected_parts_rows + = part - > rows_count ;
2017-08-14 18:16:11 +00:00
if ( part - > info . level > 0 )
2018-02-21 17:06:29 +00:00
{
2015-09-24 01:17:10 +00:00
+ + unexpected_parts_nonnew ;
2018-02-21 17:06:29 +00:00
unexpected_parts_nonnew_rows + = part - > rows_count ;
}
}
2021-06-20 08:24:43 +00:00
const UInt64 parts_to_fetch_blocks = std : : accumulate ( parts_to_fetch . cbegin ( ) , parts_to_fetch . cend ( ) , 0 ,
[ & ] ( UInt64 acc , const String & part_name )
{
2021-08-24 12:57:49 +00:00
if ( const auto part_info = MergeTreePartInfo : : tryParsePartName ( part_name , format_version ) )
return acc + part_info - > getBlocksCount ( ) ;
2018-02-21 17:06:29 +00:00
2021-06-20 08:24:43 +00:00
LOG_ERROR ( log , " Unexpected part name: {} " , part_name ) ;
return acc ;
} ) ;
2018-02-21 17:06:29 +00:00
2017-06-21 19:07:08 +00:00
/** We can automatically synchronize data,
* if the ratio of the total number of errors to the total number of parts ( minimum - on the local filesystem or in ZK )
* is no more than some threshold ( for example 50 % ) .
2015-09-24 01:17:10 +00:00
*
2017-06-21 19:07:08 +00:00
* A large ratio of mismatches in the data on the filesystem and the expected data
* may indicate a configuration error ( the server accidentally connected as a replica not from right shard ) .
2017-03-13 18:01:46 +00:00
* In this case , the protection mechanism does not allow the server to start .
2014-10-15 19:42:32 +00:00
*/
2017-04-01 07:20:54 +00:00
2018-02-21 17:06:29 +00:00
UInt64 total_rows_on_filesystem = 0 ;
for ( const auto & part : parts )
total_rows_on_filesystem + = part - > rows_count ;
2019-08-26 18:08:58 +00:00
const auto storage_settings_ptr = getSettings ( ) ;
2022-06-03 14:32:02 +00:00
bool insane = uncovered_unexpected_parts_rows > total_rows_on_filesystem * storage_settings_ptr - > replicated_max_ratio_of_wrong_parts ;
2017-04-01 07:20:54 +00:00
2020-11-10 18:22:26 +00:00
constexpr const char * sanity_report_fmt = " The local set of parts of table {} doesn't look like the set of parts in ZooKeeper: "
" {} rows of {} total rows in filesystem are suspicious. "
2022-06-03 14:32:02 +00:00
" There are {} uncovered unexpected parts with {} rows ({} of them is not just-written with {} rows), "
" {} missing parts (with {} blocks), {} covered unexpected parts (with {} rows). " ;
constexpr const char * sanity_report_debug_fmt = " Uncovered unexpected parts: {}. Missing parts: {}. Covered unexpected parts: {}. Expected parts: {}. " ;
2020-11-10 18:22:26 +00:00
2015-09-24 01:17:10 +00:00
if ( insane & & ! skip_sanity_checks )
2018-02-21 17:06:29 +00:00
{
2022-06-03 14:32:02 +00:00
LOG_DEBUG ( log , sanity_report_debug_fmt , fmt : : join ( uncovered_unexpected_parts , " , " ) , fmt : : join ( parts_to_fetch , " , " ) ,
fmt : : join ( covered_unexpected_parts , " , " ) , fmt : : join ( expected_parts , " , " ) ) ;
2020-11-10 18:22:26 +00:00
throw Exception ( ErrorCodes : : TOO_MANY_UNEXPECTED_DATA_PARTS , sanity_report_fmt , getStorageID ( ) . getNameForLogs ( ) ,
2022-06-03 14:32:02 +00:00
formatReadableQuantity ( uncovered_unexpected_parts_rows ) , formatReadableQuantity ( total_rows_on_filesystem ) ,
uncovered_unexpected_parts . size ( ) , uncovered_unexpected_parts_rows , unexpected_parts_nonnew , unexpected_parts_nonnew_rows ,
parts_to_fetch . size ( ) , parts_to_fetch_blocks , covered_unexpected_parts . size ( ) , unexpected_parts_rows - uncovered_unexpected_parts_rows ) ;
2018-02-21 17:06:29 +00:00
}
2017-04-01 07:20:54 +00:00
2022-06-03 14:32:02 +00:00
if ( unexpected_parts_nonnew_rows > 0 | | uncovered_unexpected_parts_rows > 0 )
2020-11-10 18:22:26 +00:00
{
2022-06-03 14:32:02 +00:00
LOG_DEBUG ( log , sanity_report_debug_fmt , fmt : : join ( uncovered_unexpected_parts , " , " ) , fmt : : join ( parts_to_fetch , " , " ) ,
fmt : : join ( covered_unexpected_parts , " , " ) , fmt : : join ( expected_parts , " , " ) ) ;
Use fmt::runtime() for LOG_* for non constexpr
Here is oneliner:
$ gg 'LOG_\(DEBUG\|TRACE\|INFO\|TEST\|WARNING\|ERROR\|FATAL\)([^,]*, [a-zA-Z]' -- :*.cpp :*.h | cut -d: -f1 | sort -u | xargs -r sed -E -i 's#(LOG_[A-Z]*)\(([^,]*), ([A-Za-z][^,)]*)#\1(\2, fmt::runtime(\3)#'
Note, that I tried to do this with coccinelle (tool for semantic
patchin), but it cannot parse C++:
$ cat fmt.cocci
@@
expression log;
expression var;
@@
-LOG_DEBUG(log, var)
+LOG_DEBUG(log, fmt::runtime(var))
I've also tried to use some macros/templates magic to do this implicitly
in logger_useful.h, but I failed to do so, and apparently it is not
possible for now.
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
v2: manual fixes
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-02-01 09:10:27 +00:00
LOG_WARNING ( log , fmt : : runtime ( sanity_report_fmt ) , getStorageID ( ) . getNameForLogs ( ) ,
2022-06-03 14:32:02 +00:00
formatReadableQuantity ( uncovered_unexpected_parts_rows ) , formatReadableQuantity ( total_rows_on_filesystem ) ,
uncovered_unexpected_parts . size ( ) , uncovered_unexpected_parts_rows , unexpected_parts_nonnew , unexpected_parts_nonnew_rows ,
parts_to_fetch . size ( ) , parts_to_fetch_blocks , covered_unexpected_parts . size ( ) , unexpected_parts_rows - uncovered_unexpected_parts_rows ) ;
2020-11-10 18:22:26 +00:00
}
2017-04-01 07:20:54 +00:00
2018-12-11 13:30:20 +00:00
/// Add to the queue jobs to pick up the missing parts from other replicas and remove from ZK the information that we have them.
2021-10-18 20:16:02 +00:00
queue . setBrokenPartsToEnqueueFetchesOnLoading ( std : : move ( parts_to_fetch ) ) ;
2018-12-11 13:30:20 +00:00
2017-03-13 18:01:46 +00:00
/// Remove extra local parts.
2019-05-03 02:00:57 +00:00
for ( const DataPartPtr & part : unexpected_parts )
2014-04-02 07:59:43 +00:00
{
2020-05-23 22:24:01 +00:00
LOG_ERROR ( log , " Renaming unexpected part {} to ignored_{} " , part - > name , part - > name ) ;
2022-09-30 17:01:06 +00:00
forcefullyMovePartToDetachedAndRemoveFromMemory ( part , " ignored " , true ) ;
2014-04-02 07:59:43 +00:00
}
}
2014-03-21 19:17:59 +00:00
2014-10-18 17:37:55 +00:00
2020-11-24 14:24:48 +00:00
void StorageReplicatedMergeTree : : syncPinnedPartUUIDs ( )
{
auto zookeeper = getZooKeeper ( ) ;
Coordination : : Stat stat ;
String s = zookeeper - > get ( zookeeper_path + " /pinned_part_uuids " , & stat ) ;
std : : lock_guard lock ( pinned_part_uuids_mutex ) ;
/// Unsure whether or not this can be called concurrently.
if ( pinned_part_uuids - > stat . version < stat . version )
{
auto new_pinned_part_uuids = std : : make_shared < PinnedPartUUIDs > ( ) ;
new_pinned_part_uuids - > fromString ( s ) ;
new_pinned_part_uuids - > stat = stat ;
pinned_part_uuids = new_pinned_part_uuids ;
}
}
2018-03-21 23:30:20 +00:00
/// Verify that the local part's columns/checksums agree with the copies already registered
/// by other replicas in ZooKeeper, and append to `ops` the requests that register the part
/// for this replica (unless it is already registered).
///
/// `part_name` defaults to `part->name`; `absent_replicas_paths`, if non-null, is filled
/// with the ZK part paths of replicas that do not have the part (so the caller can later
/// assert the part did not appear there). Throws if another replica holds a conflicting
/// columns hash or mismatching checksums.
void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil::ZooKeeperPtr & zookeeper,
    const DataPartPtr & part, Coordination::Requests & ops, String part_name, NameSet * absent_replicas_paths)
{
    if (part_name.empty())
        part_name = part->name;

    /// Compact (hash-based) representation of the local part's columns and checksums.
    auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
        part->getColumns(), part->checksums);

    Strings replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");
    /// Shuffle so that the comparison load is spread evenly across replicas.
    std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);
    bool has_been_already_added = false;

    for (const String & replica : replicas)
    {
        String current_part_path = fs::path(zookeeper_path) / "replicas" / replica / "parts" / part_name;

        String part_zk_str;
        if (!zookeeper->tryGet(current_part_path, part_zk_str))
        {
            /// This replica does not have the part; remember the path for the caller's later check.
            if (absent_replicas_paths)
                absent_replicas_paths->emplace(current_part_path);

            continue;
        }

        ReplicatedMergeTreePartHeader replica_part_header;
        if (part_zk_str.empty())
        {
            /// Old (non-minimalistic) format: columns and checksums live in child znodes.
            String columns_str;
            String checksums_str;

            if (zookeeper->tryGet(fs::path(current_part_path) / "columns", columns_str) &&
                zookeeper->tryGet(fs::path(current_part_path) / "checksums", checksums_str))
            {
                replica_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(columns_str, checksums_str);
            }
            else
            {
                /// Children missing but the parent still exists: inconsistent state.
                if (zookeeper->exists(current_part_path))
                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} has empty header and does not have columns and checksums. "
                                    "Looks like a bug.", current_part_path);

                /// The part node was removed concurrently (e.g. merged away) — nothing to compare against.
                LOG_INFO(log, "Not checking checksums of part {} with replica {} because part was removed from ZooKeeper", part_name, replica);
                continue;
            }
        }
        else
        {
            /// Minimalistic format: the whole header is serialized into the part znode itself.
            replica_part_header = ReplicatedMergeTreePartHeader::fromString(part_zk_str);
        }

        if (replica_part_header.getColumnsHash() != local_part_header.getColumnsHash())
        {
            /// Currently there are two (known) cases when it may happen:
            ///  - KILL MUTATION query had removed mutation before all replicas have executed assigned MUTATE_PART entries.
            ///    Some replicas may skip this mutation and update part version without actually applying any changes.
            ///    It leads to mismatching checksum if changes were applied on other replicas.
            ///  - ALTER_METADATA and MERGE_PARTS were reordered on some replicas.
            ///    It may lead to different number of columns in merged parts on these replicas.
            throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Part {} from {} has different columns hash "
                            "(it may rarely happen on race condition with KILL MUTATION or ALTER COLUMN).", part_name, replica);
        }

        /// Throws on mismatch; `true` enables the strict comparison mode.
        replica_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);

        if (replica == replica_name)
            has_been_already_added = true;

        /// If we verify checksums in "sequential manner" (i.e. recheck absence of checksums on other replicas when commit)
        /// then it is enough to verify checksums on at least one replica since checksums on other replicas must be the same.
        if (absent_replicas_paths)
        {
            absent_replicas_paths->clear();
            break;
        }
    }

    if (!has_been_already_added)
    {
        const auto storage_settings_ptr = getSettings();
        String part_path = fs::path(replica_path) / "parts" / part_name;

        if (storage_settings_ptr->use_minimalistic_part_header_in_zookeeper)
        {
            /// Single znode containing the serialized header.
            ops.emplace_back(zkutil::makeCreateRequest(
                part_path, local_part_header.toString(), zkutil::CreateMode::Persistent));
        }
        else
        {
            /// Legacy layout: empty parent znode with `columns` and `checksums` children.
            ops.emplace_back(zkutil::makeCreateRequest(
                part_path, "", zkutil::CreateMode::Persistent));
            ops.emplace_back(zkutil::makeCreateRequest(
                fs::path(part_path) / "columns", part->getColumns().toString(), zkutil::CreateMode::Persistent));
            ops.emplace_back(zkutil::makeCreateRequest(
                fs::path(part_path) / "checksums", getChecksumsForZooKeeper(part->checksums), zkutil::CreateMode::Persistent));
        }
    }
    else
    {
        LOG_WARNING(log, "checkPartAndAddToZooKeeper: node {} already exists. Will not commit any nodes.",
                    (fs::path(replica_path) / "parts" / part_name).string());
    }
}
2017-04-01 07:20:54 +00:00
2019-05-03 02:00:57 +00:00
/// Verify the part's checksums against other replicas and atomically commit both the
/// ZooKeeper registration and the in-memory `transaction`. Retries from scratch if a
/// skipped replica concurrently registered the same part (its checksums must be rechecked).
MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAndCommit(Transaction & transaction,
    const DataPartPtr & part, std::optional<MergeTreeData::HardlinkedFiles> hardlinked_files)
{
    auto zookeeper = getZooKeeper();

    while (true)
    {
        Coordination::Requests ops;
        NameSet absent_part_paths_on_replicas;

        lockSharedData(*part, false, hardlinked_files);

        /// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`.
        checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas);

        /// Do not commit if the part is obsolete, we have just briefly checked its checksums
        if (transaction.isEmpty())
            return {};

        /// Will check that the part did not suddenly appear on skipped replicas
        if (!absent_part_paths_on_replicas.empty())
        {
            Coordination::Requests new_ops;
            for (const String & part_path : absent_part_paths_on_replicas)
            {
                /// NOTE Create request may fail with ZNONODE if replica is being dropped, we will throw an exception
                /// Create-then-remove acts as an existence assertion: it fails with ZNODEEXISTS
                /// if the part appeared on that replica in the meantime.
                new_ops.emplace_back(zkutil::makeCreateRequest(part_path, "", zkutil::CreateMode::Persistent));
                new_ops.emplace_back(zkutil::makeRemoveRequest(part_path, -1));
            }

            /// Add check ops at the beginning
            new_ops.insert(new_ops.end(), ops.begin(), ops.end());
            ops = std::move(new_ops);
        }

        Coordination::Responses responses;
        Coordination::Error e = zookeeper->tryMulti(ops, responses);
        if (e == Coordination::Error::ZOK)
            return transaction.commit();

        if (e == Coordination::Error::ZNODEEXISTS)
        {
            /// Two ops per absent replica were prepended above; a failure within that prefix
            /// means the part appeared on a skipped replica — recheck its checksums and retry.
            size_t num_check_ops = 2 * absent_part_paths_on_replicas.size();
            size_t failed_op_index = zkutil::getFailedOpIndex(e, responses);
            if (failed_op_index < num_check_ops)
            {
                LOG_INFO(log, "The part {} on a replica suddenly appeared, will recheck checksums", ops[failed_op_index]->getPath());
                continue;
            }
        }

        throw zkutil::KeeperException(e);
    }
}
2018-05-21 13:49:54 +00:00
String StorageReplicatedMergeTree::getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const
{
    /// Serialize checksums for storage in ZooKeeper; the setting selects the compact
    /// (minimalistic) representation versus the full one.
    const bool use_minimalistic = getSettings()->use_minimalistic_checksums_in_zookeeper;
    return MinimalisticDataPartChecksums::getSerializedString(checksums, use_minimalistic);
}
2021-02-16 13:19:21 +00:00
/// For an ATTACH_PART log entry, scan the `detached/` directories of all disks for a part
/// from the same partition whose total checksum matches the entry, and load it if found.
/// Returns the loaded part, or an empty pointer if no valid local candidate exists
/// (in which case the caller falls back to fetching from another replica).
MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFoundValidPart(const LogEntry & entry) const
{
    const MergeTreePartInfo actual_part_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version);
    const String part_new_name = actual_part_info.getPartName();

    for (const DiskPtr & disk : getStoragePolicy()->getDisks())
    {
        for (const auto it = disk->iterateDirectory(fs::path(relative_data_path) / "detached/"); it->isValid(); it->next())
        {
            /// Skip entries that are not valid part names or belong to a different partition.
            const auto part_info = MergeTreePartInfo::tryParsePartName(it->name(), format_version);

            if (!part_info || part_info->partition_id != actual_part_info.partition_id)
                continue;

            const String part_old_name = part_info->getPartName();

            /// Wrap the single disk in a volume so the part storage can be constructed.
            const VolumePtr volume = std::make_shared<SingleDiskVolume>("volume_" + part_old_name, disk);

            auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
                volume,
                fs::path(relative_data_path) / "detached",
                part_old_name);

            /// actual_part_info is more recent than part_info so we use it
            MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, data_part_storage);

            try
            {
                part->loadColumnsChecksumsIndexes(true, true);
            }
            catch (const Exception &)
            {
                /// This method throws if the part data is corrupted or partly missing. In this case, we simply don't
                /// process the part.
                continue;
            }

            /// Only accept the candidate if its on-disk checksum matches the log entry exactly.
            if (entry.part_checksum == part->checksums.getTotalChecksumHex())
            {
                part->modification_time = data_part_storage->getLastModified().epochTime();
                return part;
            }
        }
    }

    return {};
}
2018-05-23 14:33:55 +00:00
/// Execute one replication-queue log entry. Returns true when the entry is done
/// (including the "nothing to do" cases); may throw or return false to signal that
/// the entry should be retried later.
bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
{
    if (entry.type == LogEntry::DROP_RANGE)
    {
        executeDropRange(entry);
        return true;
    }

    if (entry.type == LogEntry::REPLACE_RANGE)
    {
        executeReplaceRange(entry);
        return true;
    }

    const bool is_get_or_attach = entry.type == LogEntry::GET_PART || entry.type == LogEntry::ATTACH_PART;

    if (is_get_or_attach || entry.type == LogEntry::MERGE_PARTS || entry.type == LogEntry::MUTATE_PART)
    {
        /// If we already have this part or a part covering it, we do not need to do anything.
        /// The part may be still in the PreActive -> Active transition so we first search
        /// among PreActive parts to definitely find the desired part if it exists.
        DataPartPtr existing_part = getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreActive});

        if (!existing_part)
            existing_part = getActiveContainingPart(entry.new_part_name);

        /// Even if the part is local, it (in exceptional cases) may not be in ZooKeeper. Let's check that it is there.
        if (existing_part && getZooKeeper()->exists(fs::path(replica_path) / "parts" / existing_part->name))
        {
            /// Don't spam the log when a replica replays GET/ATTACH entries it produced itself.
            if (!is_get_or_attach || entry.source_replica != replica_name)
                LOG_DEBUG(log, "Skipping action for part {} because part {} already exists.",
                    entry.new_part_name, existing_part->name);

            return true;
        }
    }

    if (entry.type == LogEntry::ATTACH_PART)
    {
        /// Prefer reusing a matching part from the local `detached/` directory over fetching.
        if (MutableDataPartPtr part = attachPartHelperFoundValidPart(entry); part)
        {
            LOG_TRACE(log, "Found valid local part for {}, preparing the transaction", part->name);

            Transaction transaction(*this, NO_TRANSACTION_RAW);

            part->version.setCreationTID(Tx::PrehistoricTID, nullptr);
            renameTempPartAndReplace(part, transaction);
            checkPartChecksumsAndCommit(transaction, part);

            writePartLog(PartLogElement::Type::NEW_PART, {}, 0 /** log entry is fake so we don't measure the time */,
                part->name, part, {} /** log entry is fake so there are no initial parts */, nullptr);

            return true;
        }

        LOG_TRACE(log, "Didn't find valid local part for {} ({}), will fetch it from other replica",
            entry.new_part_name,
            entry.actual_new_part_name);
    }

    if (is_get_or_attach && entry.source_replica == replica_name)
        LOG_WARNING(log, "Part {} from own log doesn't exist.", entry.new_part_name);

    /// Perhaps we don't need this part, because during write with quorum, the quorum has failed
    /// (see below about `/quorum/failed_parts`).
    if (entry.quorum && getZooKeeper()->exists(fs::path(zookeeper_path) / "quorum" / "failed_parts" / entry.new_part_name))
    {
        LOG_DEBUG(log, "Skipping action for part {} because quorum for that part was failed.", entry.new_part_name);
        return true;    /// NOTE Deletion from `virtual_parts` is not done, but it is only necessary for merge.
    }

    switch (entry.type)
    {
        case LogEntry::ATTACH_PART:
            /// We surely don't have this part locally as we've checked it before, so download it.
            [[fallthrough]];
        case LogEntry::GET_PART:
            return executeFetch(entry);
        case LogEntry::MERGE_PARTS:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Merge has to be executed by another function");
        case LogEntry::MUTATE_PART:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation has to be executed by another function");
        case LogEntry::ALTER_METADATA:
            return executeMetadataAlter(entry);
        case LogEntry::SYNC_PINNED_PART_UUIDS:
            syncPinnedPartUUIDs();
            return true;
        case LogEntry::CLONE_PART_FROM_SHARD:
            executeClonePartFromShard(entry);
            return true;
        default:
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast<int>(entry.type));
    }
}
2017-04-01 07:20:54 +00:00
2022-03-19 16:31:33 +00:00
bool StorageReplicatedMergeTree : : executeFetch ( LogEntry & entry , bool need_to_check_missing_part )
2018-01-23 22:56:46 +00:00
{
2020-02-17 16:33:05 +00:00
/// Looking for covering part. After that entry.actual_new_part_name may be filled.
2018-01-23 22:56:46 +00:00
String replica = findReplicaHavingCoveringPart ( entry , true ) ;
2019-08-26 18:08:58 +00:00
const auto storage_settings_ptr = getSettings ( ) ;
2020-06-26 11:30:23 +00:00
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
try
{
if ( replica . empty ( ) )
2016-11-16 06:08:29 +00:00
{
2018-01-23 22:56:46 +00:00
/** If a part is to be written with a quorum and the quorum is not reached yet,
* then ( due to the fact that a part is impossible to download right now ) ,
* the quorum entry should be considered unsuccessful .
* TODO Complex code , extract separately .
*/
if ( entry . quorum )
2014-04-08 17:45:21 +00:00
{
2018-01-23 22:56:46 +00:00
if ( entry . type ! = LogEntry : : GET_PART )
throw Exception ( " Logical error: log entry with quorum but type is not GET_PART " , ErrorCodes : : LOGICAL_ERROR ) ;
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " No active replica has part {} which needs to be written with quorum. Will try to mark that quorum as failed. " , entry . new_part_name ) ;
2018-01-23 22:56:46 +00:00
/** Atomically:
* - if replicas do not become active ;
* - if there is a ` quorum ` node with this part ;
* - delete ` quorum ` node ;
* - add a part to the list ` quorum / failed_parts ` ;
* - if the part is not already removed from the list for deduplication ` blocks / block_num ` , then delete it ;
*
* If something changes , then we will nothing - we ' ll get here again next time .
2015-09-20 11:02:59 +00:00
*/
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
/** We collect the `host` node versions from the replicas.
* When the replica becomes active , it changes the value of host in the same transaction ( with the creation of ` is_active ` ) .
* This will ensure that the replicas do not become active .
*/
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
auto zookeeper = getZooKeeper ( ) ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
Strings replicas = zookeeper - > getChildren ( fs : : path ( zookeeper_path ) / " replicas " ) ;
2017-04-01 07:20:54 +00:00
2018-08-25 01:58:14 +00:00
Coordination : : Requests ops ;
2017-04-01 07:20:54 +00:00
2020-03-09 01:22:33 +00:00
for ( const auto & path_part : replicas )
2018-01-23 22:56:46 +00:00
{
2018-08-25 01:58:14 +00:00
Coordination : : Stat stat ;
2021-05-08 10:59:55 +00:00
String path = fs : : path ( zookeeper_path ) / " replicas " / path_part / " host " ;
2018-01-23 22:56:46 +00:00
zookeeper - > get ( path , & stat ) ;
2018-03-24 00:45:04 +00:00
ops . emplace_back ( zkutil : : makeCheckRequest ( path , stat . version ) ) ;
2018-01-23 22:56:46 +00:00
}
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
/// We verify that while we were collecting versions, the replica with the necessary part did not come alive.
replica = findReplicaHavingPart ( entry . new_part_name , true ) ;
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
/// Also during this time a completely new replica could be created.
/// But if a part does not appear on the old, then it can not be on the new one either.
if ( replica . empty ( ) )
{
2018-08-25 01:58:14 +00:00
Coordination : : Stat quorum_stat ;
2021-05-08 10:59:55 +00:00
const String quorum_unparallel_path = fs : : path ( zookeeper_path ) / " quorum " / " status " ;
const String quorum_parallel_path = fs : : path ( zookeeper_path ) / " quorum " / " parallel " / entry . new_part_name ;
2020-10-06 21:49:48 +00:00
String quorum_str , quorum_path ;
2018-01-23 22:56:46 +00:00
ReplicatedMergeTreeQuorumEntry quorum_entry ;
2020-10-06 21:49:48 +00:00
if ( zookeeper - > tryGet ( quorum_unparallel_path , quorum_str , & quorum_stat ) )
quorum_path = quorum_unparallel_path ;
2020-10-06 22:36:42 +00:00
else
{
2020-10-06 21:49:48 +00:00
quorum_str = zookeeper - > get ( quorum_parallel_path , & quorum_stat ) ;
quorum_path = quorum_parallel_path ;
}
2018-01-23 22:56:46 +00:00
quorum_entry . fromString ( quorum_str ) ;
if ( quorum_entry . part_name = = entry . new_part_name )
2015-09-20 11:02:59 +00:00
{
2018-03-24 00:45:04 +00:00
ops . emplace_back ( zkutil : : makeRemoveRequest ( quorum_path , quorum_stat . version ) ) ;
2019-05-03 02:00:57 +00:00
auto part_info = MergeTreePartInfo : : fromPartName ( entry . new_part_name , format_version ) ;
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
if ( part_info . min_block ! = part_info . max_block )
throw Exception ( " Logical error: log entry with quorum for part covering more than one block number " ,
ErrorCodes : : LOGICAL_ERROR ) ;
2017-04-01 07:20:54 +00:00
2018-03-24 00:45:04 +00:00
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( zookeeper_path ) / " quorum " / " failed_parts " / entry . new_part_name ,
2018-01-23 22:56:46 +00:00
" " ,
zkutil : : CreateMode : : Persistent ) ) ;
/// Deleting from `blocks`.
2021-05-08 10:59:55 +00:00
if ( ! entry . block_id . empty ( ) & & zookeeper - > exists ( fs : : path ( zookeeper_path ) / " blocks " / entry . block_id ) )
ops . emplace_back ( zkutil : : makeRemoveRequest ( fs : : path ( zookeeper_path ) / " blocks " / entry . block_id , - 1 ) ) ;
2018-01-23 22:56:46 +00:00
2018-08-25 01:58:14 +00:00
Coordination : : Responses responses ;
2018-03-25 00:15:52 +00:00
auto code = zookeeper - > tryMulti ( ops , responses ) ;
2018-01-23 22:56:46 +00:00
2020-06-12 15:09:12 +00:00
if ( code = = Coordination : : Error : : ZOK )
2015-09-20 11:02:59 +00:00
{
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " Marked quorum for part {} as failed. " , entry . new_part_name ) ;
2021-05-30 21:29:37 +00:00
queue . removeFailedQuorumPart ( part_info ) ;
2018-05-10 15:01:10 +00:00
return true ;
2015-09-20 11:02:59 +00:00
}
2020-06-12 15:09:12 +00:00
else if ( code = = Coordination : : Error : : ZBADVERSION | | code = = Coordination : : Error : : ZNONODE | | code = = Coordination : : Error : : ZNODEEXISTS )
2015-09-20 11:02:59 +00:00
{
2021-05-08 10:59:55 +00:00
LOG_DEBUG ( log , " State was changed or isn't expected when trying to mark quorum for part {} as failed. Code: {} " ,
entry . new_part_name , Coordination : : errorMessage ( code ) ) ;
2015-09-20 11:02:59 +00:00
}
2018-01-23 22:56:46 +00:00
else
2018-08-25 01:58:14 +00:00
throw Coordination : : Exception ( code ) ;
2018-01-23 22:56:46 +00:00
}
else
{
2021-05-30 21:29:37 +00:00
LOG_WARNING ( log , " No active replica has part {}, "
" but that part needs quorum and /quorum/status contains entry about another part {}. "
" It means that part was successfully written to {} replicas, but then all of them goes offline. "
" Or it is a bug. " , entry . new_part_name , quorum_entry . part_name , entry . quorum ) ;
2015-09-20 11:02:59 +00:00
}
}
2014-04-08 17:45:21 +00:00
}
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
if ( replica . empty ( ) )
2017-04-06 13:03:23 +00:00
{
2018-01-23 22:56:46 +00:00
ProfileEvents : : increment ( ProfileEvents : : ReplicatedPartFailedFetches ) ;
2022-03-19 16:31:33 +00:00
if ( ! need_to_check_missing_part )
return false ;
2018-01-23 22:56:46 +00:00
throw Exception ( " No active replica has part " + entry . new_part_name + " or covering part " , ErrorCodes : : NO_REPLICA_HAS_PART ) ;
2017-04-06 13:03:23 +00:00
}
2018-01-23 22:56:46 +00:00
}
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
try
{
2020-02-13 20:09:48 +00:00
String part_name = entry . actual_new_part_name . empty ( ) ? entry . new_part_name : entry . actual_new_part_name ;
2021-07-05 19:58:36 +00:00
if ( ! entry . actual_new_part_name . empty ( ) )
LOG_DEBUG ( log , " Will fetch part {} instead of {} " , entry . actual_new_part_name , entry . new_part_name ) ;
2021-05-08 10:59:55 +00:00
if ( ! fetchPart ( part_name , metadata_snapshot , fs : : path ( zookeeper_path ) / " replicas " / replica , false , entry . quorum ) )
2018-01-23 22:56:46 +00:00
return false ;
2014-04-07 15:45:46 +00:00
}
2018-01-23 22:56:46 +00:00
catch ( Exception & e )
2014-04-07 15:45:46 +00:00
{
2018-01-23 22:56:46 +00:00
/// No stacktrace, just log message
if ( e . code ( ) = = ErrorCodes : : RECEIVED_ERROR_TOO_MANY_REQUESTS )
e . addMessage ( " Too busy replica. Will try later. " ) ;
throw ;
}
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
if ( entry . type = = LogEntry : : MERGE_PARTS )
ProfileEvents : : increment ( ProfileEvents : : ReplicatedPartFetchesOfMerged ) ;
}
catch ( . . . )
{
2020-04-20 04:19:50 +00:00
/** If we can not download the part we need for some merge, it's better not to try to get other parts for this merge,
2018-01-23 22:56:46 +00:00
* but try to get already merged part . To do this , move the action to get the remaining parts
* for this merge at the end of the queue .
*/
try
{
auto parts_for_merge = queue . moveSiblingPartsForMergeToEndOfQueue ( entry . new_part_name ) ;
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
if ( ! parts_for_merge . empty ( ) & & replica . empty ( ) )
2014-04-07 15:45:46 +00:00
{
2022-04-13 19:33:55 +00:00
LOG_INFO ( log , " No active replica has part {}. Will fetch merged part instead. " , entry . new_part_name ) ;
2021-08-18 09:49:22 +00:00
/// We should enqueue it for check, because merged part may never appear if source part is lost
enqueuePartForCheck ( entry . new_part_name ) ;
2018-01-23 22:56:46 +00:00
return false ;
2014-04-07 15:45:46 +00:00
}
2017-04-01 07:20:54 +00:00
2018-01-23 22:56:46 +00:00
/** If no active replica has a part, and there is no merge in the queue with its participation,
* check to see if any ( active or inactive ) replica has such a part or covering it .
*/
if ( replica . empty ( ) )
enqueuePartForCheck ( entry . new_part_name ) ;
2014-04-07 15:45:46 +00:00
}
2018-01-23 22:56:46 +00:00
catch ( . . . )
{
2018-04-06 21:46:57 +00:00
tryLogCurrentException ( log , __PRETTY_FUNCTION__ ) ;
2018-01-23 22:56:46 +00:00
}
throw ;
2014-04-07 15:45:46 +00:00
}
2017-04-01 07:20:54 +00:00
2014-07-18 15:41:04 +00:00
return true ;
2014-04-03 11:48:28 +00:00
}
2014-10-18 17:37:55 +00:00
2022-10-22 22:51:59 +00:00
bool StorageReplicatedMergeTree : : executeFetchShared (
2021-03-04 23:10:20 +00:00
const String & source_replica ,
const String & new_part_name ,
const DiskPtr & disk ,
const String & path )
2021-01-14 16:26:56 +00:00
{
2021-03-04 23:10:20 +00:00
if ( source_replica . empty ( ) )
2021-01-14 16:26:56 +00:00
{
2021-07-05 03:32:56 +00:00
LOG_INFO ( log , " No active replica has part {} on shared storage. " , new_part_name ) ;
2022-10-22 22:51:59 +00:00
return false ;
2021-01-14 16:26:56 +00:00
}
const auto storage_settings_ptr = getSettings ( ) ;
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
try
{
2022-04-22 16:58:09 +00:00
return fetchExistsPart ( new_part_name , metadata_snapshot , fs : : path ( zookeeper_path ) / " replicas " / source_replica , disk , path ) ;
2021-01-14 16:26:56 +00:00
}
catch ( Exception & e )
{
if ( e . code ( ) = = ErrorCodes : : RECEIVED_ERROR_TOO_MANY_REQUESTS )
e . addMessage ( " Too busy replica. Will try later. " ) ;
tryLogCurrentException ( log , __PRETTY_FUNCTION__ ) ;
throw ;
}
}
2018-05-23 14:33:55 +00:00
void StorageReplicatedMergeTree : : executeDropRange ( const LogEntry & entry )
2014-08-05 13:49:44 +00:00
{
2021-10-27 12:59:26 +00:00
LOG_TRACE ( log , " Executing DROP_RANGE {} " , entry . new_part_name ) ;
2019-05-03 02:00:57 +00:00
auto drop_range_info = MergeTreePartInfo : : fromPartName ( entry . new_part_name , format_version ) ;
2021-06-24 14:07:43 +00:00
getContext ( ) - > getMergeList ( ) . cancelInPartition ( getStorageID ( ) , drop_range_info . partition_id , drop_range_info . max_block ) ;
2018-08-06 12:59:51 +00:00
queue . removePartProducingOpsInRange ( getZooKeeper ( ) , drop_range_info , entry ) ;
2022-07-18 21:37:07 +00:00
part_check_thread . cancelRemovedPartsCheck ( drop_range_info ) ;
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/// Delete the parts contained in the range to be deleted.
/// It's important that no old parts remain (after the merge), because otherwise,
/// after adding a new replica, this new replica downloads them, but does not delete them.
/// And, if you do not, the parts will come to life after the server is restarted.
2017-11-20 19:33:12 +00:00
/// Therefore, we use all data parts.
2020-06-26 11:30:23 +00:00
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
2019-05-03 02:00:57 +00:00
DataPartsVector parts_to_remove ;
2014-08-05 13:49:44 +00:00
{
2019-05-03 02:00:57 +00:00
auto data_parts_lock = lockParts ( ) ;
2022-05-04 14:22:06 +00:00
parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper ( NO_TRANSACTION_RAW , drop_range_info , data_parts_lock ) ;
2021-06-23 20:57:49 +00:00
if ( parts_to_remove . empty ( ) )
2021-09-10 14:32:45 +00:00
{
if ( ! drop_range_info . isFakeDropRangePart ( ) )
LOG_INFO ( log , " Log entry {} tried to drop single part {}, but part does not exist " , entry . znode_name , entry . new_part_name ) ;
2021-06-23 20:57:49 +00:00
return ;
2021-09-10 14:32:45 +00:00
}
2018-05-21 13:49:54 +00:00
}
2017-04-01 07:20:54 +00:00
2021-06-23 20:57:49 +00:00
if ( entry . detach )
LOG_DEBUG ( log , " Detaching parts. " ) ;
else
LOG_DEBUG ( log , " Removing parts. " ) ;
2018-05-21 13:49:54 +00:00
if ( entry . detach )
{
/// If DETACH clone parts to detached/ directory
for ( const auto & part : parts_to_remove )
{
2022-05-03 19:32:24 +00:00
LOG_INFO ( log , " Detaching {} " , part - > data_part_storage - > getPartDirectory ( ) ) ;
2020-06-26 11:30:23 +00:00
part - > makeCloneInDetached ( " " , metadata_snapshot ) ;
2018-05-21 13:49:54 +00:00
}
}
2017-04-01 07:20:54 +00:00
2018-05-21 13:49:54 +00:00
/// Forcibly remove parts from ZooKeeper
2021-03-05 09:50:26 +00:00
removePartsFromZooKeeperWithRetries ( parts_to_remove ) ;
2017-04-01 07:20:54 +00:00
2020-05-23 20:37:37 +00:00
if ( entry . detach )
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " Detached {} parts inside {}. " , parts_to_remove . size ( ) , entry . new_part_name ) ;
2020-05-23 20:37:37 +00:00
else
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " Removed {} parts inside {}. " , parts_to_remove . size ( ) , entry . new_part_name ) ;
2017-04-01 07:20:54 +00:00
2018-05-21 13:49:54 +00:00
/// We want to remove dropped parts from disk as soon as possible
/// To be removed a partition should have zero refcount, therefore call the cleanup thread at exit
parts_to_remove . clear ( ) ;
2018-07-30 17:34:55 +00:00
cleanup_thread . wakeup ( ) ;
2014-08-08 08:28:13 +00:00
}
2014-10-18 17:37:55 +00:00
2018-05-23 14:33:55 +00:00
/// Execute a REPLACE_RANGE log entry: replace all parts in a partition range with a set of
/// parts taken from another table (REPLACE PARTITION / MOVE PARTITION / ATTACH PARTITION FROM).
/// Parts are cloned locally from the source table when possible, otherwise fetched from
/// remote replicas; the result is committed atomically (single ZooKeeper multi-op) and only
/// then the old parts in the range are removed.
/// Returns true on success; throws if some required part cannot be obtained anywhere.
bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
{
    Stopwatch watch;
    auto & entry_replace = *entry.replace_range_entry;
    LOG_DEBUG(log, "Executing log entry {} to replace parts range {} with {} parts from {}.{}",
        entry.znode_name, entry_replace.drop_range_part_name, entry_replace.new_part_names.size(),
        entry_replace.from_database, entry_replace.from_table);

    auto metadata_snapshot = getInMemoryMetadataPtr();
    auto storage_settings_ptr = getSettings();

    MergeTreePartInfo drop_range = MergeTreePartInfo::fromPartName(entry_replace.drop_range_part_name, format_version);
    /// Range with only one block has special meaning: it's ATTACH PARTITION or MOVE PARTITION, so there is no drop range
    bool replace = !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range);

    if (replace)
    {
        /// Cancel merges/mutations and queued operations producing parts inside the range being replaced.
        getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range.partition_id, drop_range.max_block);
        queue.removePartProducingOpsInRange(getZooKeeper(), drop_range, entry);
        part_check_thread.cancelRemovedPartsCheck(drop_range);
    }
    else
    {
        /// ATTACH/MOVE: nothing is dropped, reset the range to empty.
        drop_range = {};
    }

    /// Everything we need to know to produce one destination part: its source part in the
    /// other table, its destination name/info, the expected checksum, and (later) either a
    /// local clone or a replica to fetch it from.
    struct PartDescription
    {
        PartDescription(
            size_t index_,
            const String & src_part_name_,
            const String & new_part_name_,
            const String & checksum_hex_,
            MergeTreeDataFormatVersion format_version)
            : index(index_)
            , src_part_name(src_part_name_)
            , src_part_info(MergeTreePartInfo::fromPartName(src_part_name_, format_version))
            , new_part_name(new_part_name_)
            , new_part_info(MergeTreePartInfo::fromPartName(new_part_name_, format_version))
            , checksum_hex(checksum_hex_)
        {
        }

        size_t index; /// in log entry arrays
        String src_part_name;
        MergeTreePartInfo src_part_info;
        String new_part_name;
        MergeTreePartInfo new_part_info;
        String checksum_hex;

        /// Part which will be committed
        MutableDataPartPtr res_part;

        /// We could find a covering part
        MergeTreePartInfo found_new_part_info;
        String found_new_part_name;

        /// Hold pointer to part in source table if will clone it from local table
        DataPartPtr src_table_part;

        /// A replica that will be used to fetch part
        String replica;

        /// Files hardlinked from the source part when cloning (needed for zero-copy bookkeeping).
        MergeTreeData::HardlinkedFiles hardlinked_files;

        /// Keeps the temporary (not yet committed) part directory alive until commit.
        scope_guard temporary_part_lock;
    };

    using PartDescriptionPtr = std::shared_ptr<PartDescription>;
    using PartDescriptions = std::vector<PartDescriptionPtr>;

    PartDescriptions all_parts;
    PartDescriptions parts_to_add;
    DataPartsVector parts_to_remove;

    auto table_lock_holder_dst_table = lockForShare(
        RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
    auto dst_metadata_snapshot = getInMemoryMetadataPtr();

    for (size_t i = 0; i < entry_replace.new_part_names.size(); ++i)
    {
        all_parts.emplace_back(std::make_shared<PartDescription>(i,
            entry_replace.src_part_names.at(i),
            entry_replace.new_part_names.at(i),
            entry_replace.part_names_checksums.at(i),
            format_version));
    }

    /// What parts we should add? Or we have already added all required parts (we are a replica-initializer).
    {
        auto data_parts_lock = lockParts();

        for (const PartDescriptionPtr & part_desc : all_parts)
        {
            /// Skip parts that are already covered by an active part.
            if (!getActiveContainingPart(part_desc->new_part_info, MergeTreeDataPartState::Active, data_parts_lock))
                parts_to_add.emplace_back(part_desc);
        }

        if (parts_to_add.empty() && replace)
        {
            /// New parts are all present already — just drop the old range.
            parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(NO_TRANSACTION_RAW, drop_range, data_parts_lock);
            String parts_to_remove_str;
            for (const auto & part : parts_to_remove)
            {
                parts_to_remove_str += part->name;
                parts_to_remove_str += " ";
            }
            LOG_TRACE(log, "Replacing {} parts {}with empty set", parts_to_remove.size(), parts_to_remove_str);
        }
    }

    if (parts_to_add.empty())
    {
        LOG_INFO(log, "All parts from REPLACE PARTITION command have been already attached");
        removePartsFromZooKeeperWithRetries(parts_to_remove);
        return true;
    }

    if (parts_to_add.size() < all_parts.size())
    {
        LOG_WARNING(log, "Some (but not all) parts from REPLACE PARTITION command already exist. REPLACE PARTITION will not be atomic.");
    }

    StoragePtr source_table;
    TableLockHolder table_lock_holder_src_table;
    StorageID source_table_id{entry_replace.from_database, entry_replace.from_table};

    /// Try to resolve the source table locally and mark which parts can be cloned from it
    /// (same checksum, not avoided due to zero-copy). Returns the number of clonable parts;
    /// returns 0 when the source table is unusable and everything must be fetched.
    auto clone_data_parts_from_source_table = [&]() -> size_t
    {
        source_table = DatabaseCatalog::instance().tryGetTable(source_table_id, getContext());
        if (!source_table)
        {
            LOG_DEBUG(log, "Can't use {} as source table for REPLACE PARTITION command. It does not exist.", source_table_id.getNameForLogs());
            return 0;
        }

        auto src_metadata_snapshot = source_table->getInMemoryMetadataPtr();
        MergeTreeData * src_data = nullptr;
        try
        {
            src_data = &checkStructureAndGetMergeTreeData(source_table, src_metadata_snapshot, dst_metadata_snapshot);
        }
        catch (Exception &)
        {
            /// Structure mismatch is not fatal: fall back to fetching everything from replicas.
            LOG_INFO(log, "Can't use {} as source table for REPLACE PARTITION command. Will fetch all parts. Reason: {}", source_table_id.getNameForLogs(), getCurrentExceptionMessage(false));
            return 0;
        }

        table_lock_holder_src_table = source_table->lockForShare(
            RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);

        DataPartStates valid_states{
            MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated};

        size_t num_clonable_parts = 0;
        for (PartDescriptionPtr & part_desc : parts_to_add)
        {
            auto src_part = src_data->getPartIfExists(part_desc->src_part_info, valid_states);
            if (!src_part)
            {
                LOG_DEBUG(log, "There is no part {} in {}", part_desc->src_part_name, source_table_id.getNameForLogs());
                continue;
            }

            /// With zero-copy replication a local hardlink-clone would break shared-blob accounting,
            /// so prefer a fetch instead.
            bool avoid_copy_local_part = storage_settings_ptr->allow_remote_fs_zero_copy_replication && src_part->isStoredOnRemoteDiskWithZeroCopySupport();
            if (avoid_copy_local_part)
            {
                LOG_DEBUG(log, "Avoid copy local part {} from table {} because of zero-copy replication", part_desc->src_part_name, source_table_id.getNameForLogs());
                continue;
            }

            String checksum_hex = src_part->checksums.getTotalChecksumHex();

            if (checksum_hex != part_desc->checksum_hex)
            {
                LOG_DEBUG(log, "Part {} of {} has inappropriate checksum", part_desc->src_part_name, source_table_id.getNameForLogs());
                /// TODO: check version
                continue;
            }

            part_desc->found_new_part_name = part_desc->new_part_name;
            part_desc->found_new_part_info = part_desc->new_part_info;
            part_desc->src_table_part = src_part;

            ++num_clonable_parts;
        }

        return num_clonable_parts;
    };

    size_t num_clonable_parts = clone_data_parts_from_source_table();
    LOG_DEBUG(log, "Found {} parts that could be cloned (of {} required parts)", num_clonable_parts, parts_to_add.size());

    /// For the non-clonable parts, find replicas that have the exact part or a covering one.
    ActiveDataPartSet adding_parts_active_set(format_version);
    std::unordered_map<String, PartDescriptionPtr> part_name_to_desc;

    for (PartDescriptionPtr & part_desc : parts_to_add)
    {
        if (part_desc->src_table_part)
        {
            /// It is clonable part
            adding_parts_active_set.add(part_desc->new_part_name);
            part_name_to_desc.emplace(part_desc->new_part_name, part_desc);
            continue;
        }

        /// Firstly, try find exact part to produce more accurate part set
        String replica = findReplicaHavingPart(part_desc->new_part_name, true);
        String found_part_name;
        /// TODO: check version

        if (replica.empty())
        {
            LOG_DEBUG(log, "Part {} is not found on remote replicas", part_desc->new_part_name);

            /// Fallback to covering part
            replica = findReplicaHavingCoveringPart(part_desc->new_part_name, true, found_part_name);

            if (replica.empty())
            {
                /// It is not fail, since adjacent parts could cover current part
                LOG_DEBUG(log, "Parts covering {} are not found on remote replicas", part_desc->new_part_name);
                continue;
            }
        }
        else
        {
            found_part_name = part_desc->new_part_name;
        }

        part_desc->found_new_part_name = found_part_name;
        part_desc->found_new_part_info = MergeTreePartInfo::fromPartName(found_part_name, format_version);
        part_desc->replica = replica;

        adding_parts_active_set.add(part_desc->found_new_part_name);
        part_name_to_desc.emplace(part_desc->found_new_part_name, part_desc);
    }

    /// Check that we could cover whole range
    for (PartDescriptionPtr & part_desc : parts_to_add)
    {
        if (adding_parts_active_set.getContainingPart(part_desc->new_part_info).empty())
        {
            /// We should enqueue missing part for check, so it will be replaced with empty one (if needed)
            /// and we will be able to execute this REPLACE_RANGE.
            /// However, it's quite dangerous, because part may appear in source table.
            /// So we enqueue it for check only if no replicas of source table have part either.
            bool need_check = true;
            if (auto * replicated_src_table = typeid_cast<StorageReplicatedMergeTree *>(source_table.get()))
            {
                String src_replica = replicated_src_table->findReplicaHavingPart(part_desc->src_part_name, false);
                if (!src_replica.empty())
                {
                    LOG_DEBUG(log, "Found part {} on replica {} of source table, will not check part {} required for {}",
                              part_desc->src_part_name, src_replica, part_desc->new_part_name, entry.znode_name);
                    need_check = false;
                }
            }

            if (need_check)
            {
                LOG_DEBUG(log, "Will check part {} required for {}, because no replicas have it (including replicas of source table)",
                          part_desc->new_part_name, entry.znode_name);
                enqueuePartForCheck(part_desc->new_part_name);
            }

            throw Exception(ErrorCodes::NO_REPLICA_HAS_PART,
                            "Not found part {} (or part covering it) neither source table neither remote replicas",
                            part_desc->new_part_name);
        }
    }

    /// Filter covered parts
    PartDescriptions final_parts;
    Strings final_part_names;
    {
        final_part_names = adding_parts_active_set.getParts();

        for (const String & final_part_name : final_part_names)
        {
            auto part_desc = part_name_to_desc[final_part_name];
            if (!part_desc)
                throw Exception("There is no final part " + final_part_name + ". This is a bug", ErrorCodes::LOGICAL_ERROR);

            final_parts.emplace_back(part_desc);

            /// Sanity check: the surviving parts must be pairwise disjoint
            /// (getParts() returns them ordered, so comparing neighbours suffices).
            if (final_parts.size() > 1)
            {
                auto & prev = *final_parts[final_parts.size() - 2];
                auto & curr = *final_parts[final_parts.size() - 1];

                if (!prev.found_new_part_info.isDisjoint(curr.found_new_part_info))
                {
                    throw Exception("Intersected final parts detected: " + prev.found_new_part_name
                        + " and " + curr.found_new_part_name + ". It should be investigated.", ErrorCodes::LOGICAL_ERROR);
                }
            }
        }
    }

    static const String TMP_PREFIX = "tmp_replace_from_";

    std::vector<MergeTreeData::HardlinkedFiles> hardlinked_files_for_parts;

    /// Materialize one part: clone from the local source table if possible,
    /// otherwise fetch from the chosen replica.
    auto obtain_part = [&](PartDescriptionPtr & part_desc)
    {
        if (part_desc->src_table_part)
        {
            /// Re-check the checksum: the source part could have been mutated since we selected it.
            if (part_desc->checksum_hex != part_desc->src_table_part->checksums.getTotalChecksumHex())
                throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED);

            auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk(
                part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files, false, {});
            part_desc->res_part = std::move(res_part);
            part_desc->temporary_part_lock = std::move(temporary_part_lock);
        }
        else if (!part_desc->replica.empty())
        {
            String source_replica_path = fs::path(zookeeper_path) / "replicas" / part_desc->replica;
            ReplicatedMergeTreeAddress address(getZooKeeper()->get(fs::path(source_replica_path) / "host"));
            auto timeouts = getFetchPartHTTPTimeouts(getContext());

            auto credentials = getContext()->getInterserverCredentials();
            String interserver_scheme = getContext()->getInterserverScheme();

            if (interserver_scheme != address.scheme)
                throw Exception("Interserver schemas are different '" + interserver_scheme + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::LOGICAL_ERROR);

            part_desc->res_part = fetcher.fetchSelectedPart(
                metadata_snapshot, getContext(), part_desc->found_new_part_name, source_replica_path,
                address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(),
                interserver_scheme, replicated_fetches_throttler, false, TMP_PREFIX + "fetch_");

            /// TODO: check columns_version of fetched part
            ProfileEvents::increment(ProfileEvents::ReplicatedPartFetches);
        }
        else
            throw Exception("There is no receipt to produce part " + part_desc->new_part_name + ". This is bug", ErrorCodes::LOGICAL_ERROR);
    };

    /// Download or clone parts
    /// TODO: make it in parallel
    for (PartDescriptionPtr & part_desc : final_parts)
        obtain_part(part_desc);

    MutableDataPartsVector res_parts;
    for (PartDescriptionPtr & part_desc : final_parts)
        res_parts.emplace_back(part_desc->res_part);

    try
    {
        /// Commit parts
        auto zookeeper = getZooKeeper();
        Transaction transaction(*this, NO_TRANSACTION_RAW);

        Coordination::Requests ops;
        for (PartDescriptionPtr & part_desc : final_parts)
        {
            renameTempPartAndReplace(part_desc->res_part, transaction);
            getCommitPartOps(ops, part_desc->res_part);

            /// Register shared (zero-copy) ownership of the hardlinked files before the ZK commit.
            lockSharedData(*part_desc->res_part, false, part_desc->hardlinked_files);
        }


        if (!ops.empty())
            zookeeper->multi(ops);

        {
            auto data_parts_lock = lockParts();
            transaction.commit(&data_parts_lock);
            if (replace)
            {
                /// Remove the old parts only after the new set is committed, under the same parts lock.
                parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(NO_TRANSACTION_RAW, drop_range, data_parts_lock);
                String parts_to_remove_str;
                for (const auto & part : parts_to_remove)
                {
                    parts_to_remove_str += part->name;
                    parts_to_remove_str += " ";
                }
                LOG_TRACE(log, "Replacing {} parts {}with {} parts {}", parts_to_remove.size(), parts_to_remove_str,
                          final_parts.size(), boost::algorithm::join(final_part_names, ", "));
            }
        }

        PartLog::addNewParts(getContext(), res_parts, watch.elapsed());
    }
    catch (...)
    {
        PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
        /// Roll back shared-data locks taken above; temporary part dirs are cleaned via temporary_part_lock.
        for (const auto & res_part : res_parts)
            unlockSharedData(*res_part);

        throw;
    }

    removePartsFromZooKeeperWithRetries(parts_to_remove);
    res_parts.clear();
    parts_to_remove.clear();
    cleanup_thread.wakeup();

    return true;
}
2020-11-24 14:24:48 +00:00
/// Execute a CLONE_PART_FROM_SHARD log entry: fetch the part named in `entry`
/// from a randomly chosen active replica of the source shard and place it into
/// the local `detached/` directory (it is NOT attached to the working set here).
/// Throws NO_REPLICA_HAS_PART if the source shard has no active replica.
void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entry)
{
    auto zookeeper = getZooKeeper();
    Strings replicas = zookeeper->getChildren(entry.source_shard + "/replicas");
    /// Shuffle to spread fetch load across the source shard's replicas.
    std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);
    String replica;
    for (const String & candidate : replicas)
    {
        if (zookeeper->exists(entry.source_shard + "/replicas/" + candidate + "/is_active"))
        {
            replica = candidate;
            break;
        }
    }

    if (replica.empty())
        throw Exception(ErrorCodes::NO_REPLICA_HAS_PART, "Not found active replica on shard {} to clone part {}", entry.source_shard, entry.new_part_name);

    LOG_INFO(log, "Will clone part from shard {} and replica {}", entry.source_shard, replica);

    MutableDataPartPtr part;
    {
        auto metadata_snapshot = getInMemoryMetadataPtr();
        String source_replica_path = entry.source_shard + "/replicas/" + replica;
        ReplicatedMergeTreeAddress address(getZooKeeper()->get(source_replica_path + "/host"));
        auto timeouts = ConnectionTimeouts::getHTTPTimeouts(getContext());
        auto credentials = getContext()->getInterserverCredentials();
        String interserver_scheme = getContext()->getInterserverScheme();

        /// Captures by value so the lambda stays valid even if context changes.
        auto get_part = [&, address, timeouts, credentials, interserver_scheme]()
        {
            if (interserver_scheme != address.scheme)
                throw Exception("Interserver schemes are different: '" + interserver_scheme
                    + "' != '" + address.scheme + "', can't fetch part from " + address.host,
                    ErrorCodes::LOGICAL_ERROR);

            return fetcher.fetchSelectedPart(
                metadata_snapshot, getContext(), entry.new_part_name, source_replica_path,
                address.host, address.replication_port,
                timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme,
                replicated_fetches_throttler, true);
        };

        part = get_part();

        // The fetched part is valuable and should not be cleaned like a temp part.
        part->is_temp = false;
        part->renameTo("detached/" + entry.new_part_name, true);

        LOG_INFO(log, "Cloned part {} to detached directory", part->name);
    }
}
2018-08-27 13:51:22 +00:00
/// Repair this (lost) replica by copying state from `source_replica`:
///  1. atomically copy the source's /log_pointer and snapshot its /queue (retrying
///     until a consistent view is obtained),
///  2. clone table metadata if our metadata_version lags the source,
///  3. enqueue GET_PART/ATTACH_PART entries for every active part the source has
///     that we don't, and drop local parts the source does not have,
///  4. copy the source's remaining queue entries into our queue.
/// `source_is_lost_stat` is the version of the source's /is_lost node observed by
/// the caller; it is re-checked to detect the source becoming lost mid-clone.
/// Throws REPLICA_STATUS_CHANGED if the source replica changes state under us.
void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper)
{
    String source_path = fs::path(zookeeper_path) / "replicas" / source_replica;

    /// The order of the following three actions is important.

    Strings source_queue_names;
    /// We are trying to get consistent /log_pointer and /queue state. Otherwise
    /// we can possibly duplicate entries in queue of cloned replica.
    while (true)
    {
        Coordination::Stat log_pointer_stat;
        String raw_log_pointer = zookeeper->get(fs::path(source_path) / "log_pointer", &log_pointer_stat);

        Coordination::Requests ops;
        ops.push_back(zkutil::makeSetRequest(fs::path(replica_path) / "log_pointer", raw_log_pointer, -1));

        /// For support old versions CH.
        if (source_is_lost_stat.version == -1)
        {
            /// We check that it was not suddenly upgraded to new version.
            /// Otherwise it can be upgraded and instantly become lost, but we cannot notice that.
            ops.push_back(zkutil::makeCreateRequest(fs::path(source_path) / "is_lost", "0", zkutil::CreateMode::Persistent));
            ops.push_back(zkutil::makeRemoveRequest(fs::path(source_path) / "is_lost", -1));
        }
        else /// The replica we clone should not suddenly become lost.
            ops.push_back(zkutil::makeCheckRequest(fs::path(source_path) / "is_lost", source_is_lost_stat.version));

        Coordination::Responses responses;

        /// Let's remember the queue of the reference/master replica.
        source_queue_names = zookeeper->getChildren(fs::path(source_path) / "queue");

        /// Check that log pointer of source replica didn't changed while we read queue entries
        ops.push_back(zkutil::makeCheckRequest(fs::path(source_path) / "log_pointer", log_pointer_stat.version));

        auto rc = zookeeper->tryMulti(ops, responses);

        if (rc == Coordination::Error::ZOK)
        {
            break;
        }
        else if (rc == Coordination::Error::ZNODEEXISTS)
        {
            throw Exception(
                "Can not clone replica, because the " + source_replica + " updated to new ClickHouse version",
                ErrorCodes::REPLICA_STATUS_CHANGED);
        }
        else if (responses[1]->error == Coordination::Error::ZBADVERSION)
        {
            /// If is_lost node version changed than source replica also lost,
            /// so we cannot clone from it.
            throw Exception(
                "Can not clone replica, because the " + source_replica + " became lost", ErrorCodes::REPLICA_STATUS_CHANGED);
        }
        else if (responses.back()->error == Coordination::Error::ZBADVERSION)
        {
            /// If source replica's log_pointer changed than we probably read
            /// stale state of /queue and have to try one more time.
            LOG_WARNING(log, "Log pointer of source replica {} changed while we loading queue nodes. Will retry.", source_replica);
            continue;
        }
        else
        {
            zkutil::KeeperMultiException::check(rc, ops, responses);
        }
    }

    ::sort(source_queue_names.begin(), source_queue_names.end());

    /// Raw znode content plus (optionally) its parsed form for one queue entry.
    struct QueueEntryInfo
    {
        String data = {};
        Coordination::Stat stat = {};
        LogEntryPtr parsed_entry = {};   /// null if the entry could not be parsed (e.g. newer server version)
    };

    /// We got log pointer and list of queue entries of source replica.
    /// At first we will get queue entries and then we will get list of active parts of source replica
    /// to enqueue fetches for missing parts. If source replica executes and removes some entry concurrently
    /// we will see produced part (or covering part) in replicas/source/parts and will enqueue fetch.
    /// We will try to parse queue entries before copying them
    /// to avoid creation of excessive and duplicating entries in our queue.
    /// See also removePartAndEnqueueFetch(...)
    std::vector<QueueEntryInfo> source_queue;
    ActiveDataPartSet get_part_set{format_version};
    ActiveDataPartSet drop_range_set{format_version};
    std::unordered_set<String> exact_part_names;

    {
        /// Fetch all queue entry bodies asynchronously, then classify each one.
        std::vector<zkutil::ZooKeeper::FutureGet> queue_get_futures;
        queue_get_futures.reserve(source_queue_names.size());
        for (const String & entry_name : source_queue_names)
            queue_get_futures.push_back(zookeeper->asyncTryGet(fs::path(source_path) / "queue" / entry_name));

        source_queue.reserve(source_queue_names.size());
        for (size_t i = 0; i < source_queue_names.size(); ++i)
        {
            auto res = queue_get_futures[i].get();
            /// It's ok if entry is already executed and removed: we also will get source parts set.
            if (res.error == Coordination::Error::ZNONODE)
                continue;

            assert(res.error == Coordination::Error::ZOK);
            source_queue.emplace_back();
            auto & info = source_queue.back();
            info.data = std::move(res.data);
            info.stat = std::move(res.stat);
            try
            {
                info.parsed_entry = LogEntry::parse(info.data, info.stat);
            }
            catch (...)
            {
                tryLogCurrentException(log, "Cannot parse source queue entry " + source_queue_names[i]);
            }

            /// It may be ok if source replica has newer version. We will copy entry as is.
            if (!info.parsed_entry)
                continue;

            info.parsed_entry->znode_name = source_queue_names[i];

            if (info.parsed_entry->type == LogEntry::DROP_RANGE)
            {
                drop_range_set.add(info.parsed_entry->new_part_name);
            }
            else if (info.parsed_entry->type == LogEntry::GET_PART)
            {
                /// GET_PART entries already covered by a DROP_RANGE are pointless to track.
                String maybe_covering_drop_range = drop_range_set.getContainingPart(info.parsed_entry->new_part_name);
                if (maybe_covering_drop_range.empty())
                    get_part_set.add(info.parsed_entry->new_part_name);
            }
            else
            {
                /// We should keep local parts if they present in the queue of source replica.
                /// There's a chance that we are the only replica that has these parts.
                Strings entry_virtual_parts = info.parsed_entry->getVirtualPartNames(format_version);
                std::move(entry_virtual_parts.begin(), entry_virtual_parts.end(), std::inserter(exact_part_names, exact_part_names.end()));
            }
        }
    }

    /// We should do it after copying queue, because some ALTER_METADATA entries can be lost otherwise.
    cloneMetadataIfNeeded(source_replica, source_path, zookeeper);

    /// Add to the queue jobs to receive all the active parts that the reference/master replica has.
    Strings source_replica_parts = zookeeper->getChildren(fs::path(source_path) / "parts");
    for (const auto & active_part : source_replica_parts)
        get_part_set.add(active_part);

    Strings active_parts = get_part_set.getParts();

    /// Remove local parts if source replica does not have them, because such parts will never be fetched by other replicas.
    Strings local_parts_in_zk = zookeeper->getChildren(fs::path(replica_path) / "parts");
    Strings parts_to_remove_from_zk;

    for (const auto & part : local_parts_in_zk)
    {
        /// We look for exact match (and not for any covering part)
        /// because our part might be dropped and covering part might be merged though gap.
        /// (avoid resurrection of data that was removed a long time ago)
        if (get_part_set.getContainingPart(part) == part)
            continue;

        if (exact_part_names.contains(part))
            continue;

        parts_to_remove_from_zk.emplace_back(part);
        LOG_WARNING(log, "Source replica does not have part {}. Removing it from ZooKeeper.", part);
    }

    {
        /// Check "is_lost" version after retrieving queue and parts.
        /// If version has changed, then replica most likely has been dropped and parts set is inconsistent,
        /// so throw exception and retry cloning.
        Coordination::Stat is_lost_stat_new;
        zookeeper->get(fs::path(source_path) / "is_lost", &is_lost_stat_new);
        if (is_lost_stat_new.version != source_is_lost_stat.version)
            throw Exception(ErrorCodes::REPLICA_STATUS_CHANGED, "Cannot clone {}, because it suddenly become lost "
                            "or removed broken part from ZooKeeper", source_replica);
    }

    removePartsFromZooKeeperWithRetries(parts_to_remove_from_zk);

    auto local_active_parts = getDataPartsForInternalUsage();

    DataPartsVector parts_to_remove_from_working_set;

    for (const auto & part : local_active_parts)
    {
        /// Same exact-match policy as for the ZooKeeper part list above.
        if (get_part_set.getContainingPart(part->name) == part->name)
            continue;

        if (exact_part_names.contains(part->name))
            continue;

        parts_to_remove_from_working_set.emplace_back(part);
        LOG_WARNING(log, "Source replica does not have part {}. Removing it from working set.", part->name);
    }

    if (getSettings()->detach_old_local_parts_when_cloning_replica)
    {
        auto metadata_snapshot = getInMemoryMetadataPtr();

        for (const auto & part : parts_to_remove_from_working_set)
        {
            LOG_INFO(log, "Detaching {}", part->data_part_storage->getPartDirectory());
            part->makeCloneInDetached("clone", metadata_snapshot);
        }
    }

    removePartsFromWorkingSet(NO_TRANSACTION_RAW, parts_to_remove_from_working_set, true);

    std::unordered_set<String> created_get_parts;

    /// Avoid creation of GET_PART entries which covered by another GET_PART or DROP_RANGE
    /// and creation of multiple entries with the same new_part_name.
    auto should_ignore_log_entry = [&drop_range_set, &get_part_set, this] (std::unordered_set<String> & created_gets,
        const String & part_name, const String & log_msg_context) -> bool
    {
        /// We should not create entries covered by DROP_RANGE, because we will remove them anyway (kind of optimization).
        String covering_drop_range = drop_range_set.getContainingPart(part_name);
        if (!covering_drop_range.empty())
        {
            LOG_TRACE(log, "{} {}: it's covered by DROP_RANGE {}", log_msg_context, part_name, covering_drop_range);
            return true;
        }

        /// We should not create entries covered by GET_PART,
        /// because GET_PART entry has no source parts and we can execute it only by fetching.
        /// Parts covered by GET_PART are useless and may cause replication to stuck if covered part is lost.
        String covering_get_part_entry = get_part_set.getContainingPart(part_name);

        if (covering_get_part_entry.empty())
            return false;

        if (covering_get_part_entry != part_name)
        {
            LOG_TRACE(log, "{} {}: it's covered by GET_PART {}", log_msg_context, part_name, covering_get_part_entry);
            return true;
        }

        /// NOTE: It does not completely avoids duplication of GET_PART entries,
        /// because it's possible that source replica has executed some GET_PART after we copied it's queue,
        /// but before we copied its active parts set. In this case we will GET_PART entry in our queue
        /// and later will pull the original GET_PART from replication log.
        /// It should not cause any issues, but it does not allow to get rid of duplicated entries and add an assertion.
        if (created_gets.contains(part_name))
        {
            /// NOTE It would be better to copy log entry instead of creating GET_PART
            /// if there are GET_PART and log entry of other type with the same new_part_name.
            /// But it's a bit harder to implement, because it requires full-fledged virtual_parts set.
            LOG_TRACE(log, "{} {}: GET_PART for it is already created", log_msg_context, part_name);
            return true;
        }

        return false;
    };

    for (const String & name : active_parts)
    {
        if (should_ignore_log_entry(created_get_parts, name, "Not fetching"))
            continue;

        LogEntry log_entry;

        if (are_restoring_replica)
        {
            LOG_DEBUG(log, "Obtaining checksum for path {}", name);

            // The part we want to fetch is probably present in detached/ folder.
            // However, we need to get part's checksum to check if it's not corrupt.
            log_entry.type = LogEntry::ATTACH_PART;

            MinimalisticDataPartChecksums desired_checksums;

            const fs::path part_path = fs::path(source_path) / "parts" / name;

            const String part_znode = zookeeper->get(part_path);

            /// Checksums may be stored either inline in the part znode (newer format)
            /// or in a separate "checksums" child (older format).
            if (!part_znode.empty())
                desired_checksums = ReplicatedMergeTreePartHeader::fromString(part_znode).getChecksums();
            else
            {
                String desired_checksums_str = zookeeper->get(part_path / "checksums");
                desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str);
            }

            const auto [lo, hi] = desired_checksums.hash_of_all_files;
            log_entry.part_checksum = getHexUIntUppercase(hi) + getHexUIntUppercase(lo);
        }
        else
        {
            log_entry.type = LogEntry::GET_PART;
        }

        log_entry.source_replica = "";
        log_entry.new_part_name = name;
        log_entry.create_time = tryGetPartCreateTime(zookeeper, source_path, name);

        LOG_TEST(log, "Enqueueing {} for fetch", name);
        zookeeper->create(fs::path(replica_path) / "queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential);
        created_get_parts.insert(name);
    }

    size_t total_parts_to_fetch = created_get_parts.size();
    LOG_DEBUG(log, "Queued {} parts to be fetched, {} parts ignored", total_parts_to_fetch, active_parts.size() - total_parts_to_fetch);

    /// Add content of the reference/master replica queue to the queue.
    size_t total_entries_to_copy = 0;
    for (const auto & entry_info : source_queue)
    {
        assert(!entry_info.data.empty());
        if (entry_info.parsed_entry && !entry_info.parsed_entry->new_part_name.empty())
        {
            const String & part_name = entry_info.parsed_entry->new_part_name;
            const String & entry_name = entry_info.parsed_entry->znode_name;
            const auto & entry_type = entry_info.parsed_entry->type;

            if (should_ignore_log_entry(created_get_parts, part_name, fmt::format("Not copying {} {}", entry_name, entry_type)))
                continue;

            if (entry_info.parsed_entry->type == LogEntry::GET_PART)
                created_get_parts.insert(part_name);
        }

        LOG_TEST(log, "Copying entry {}", entry_info.data);
        zookeeper->create(fs::path(replica_path) / "queue/queue-", entry_info.data, zkutil::CreateMode::PersistentSequential);
        ++total_entries_to_copy;
    }

    LOG_DEBUG(log, "Copied {} queue entries, {} entries ignored", total_entries_to_copy, source_queue.size() - total_entries_to_copy);
}
2021-05-07 17:09:39 +00:00
/// If the source replica's /metadata_version differs from ours, read a consistent
/// snapshot of its /metadata and /columns (via an optimistic version-check retry
/// loop) and prepend a synthetic ALTER_METADATA entry to our queue so our metadata
/// catches up. No-op when versions match or the source is too old to have
/// /metadata_version at all.
void StorageReplicatedMergeTree::cloneMetadataIfNeeded(const String & source_replica, const String & source_path, zkutil::ZooKeeperPtr & zookeeper)
{
    String source_metadata_version_str;
    bool metadata_version_exists = zookeeper->tryGet(source_path + "/metadata_version", source_metadata_version_str);
    if (!metadata_version_exists)
    {
        /// For compatibility with version older than 20.3
        /// TODO fix tests and delete it
        LOG_WARNING(log, "Node {} does not exist. "
                    "Most likely it's because too old version of ClickHouse is running on replica {}. "
                    "Will not check metadata consistency",
                    source_path + "/metadata_version", source_replica);
        return;
    }

    Int32 source_metadata_version = parse<Int32>(source_metadata_version_str);
    if (metadata_version == source_metadata_version)
        return;

    /// Our metadata it not up to date with source replica metadata.
    /// Metadata is updated by ALTER_METADATA entries, but some entries are probably cleaned up from the log.
    /// It's also possible that some newer ALTER_METADATA entries are present in source_queue list,
    /// and source replica are executing such entry right now (or had executed recently).
    /// More than that, /metadata_version update is not atomic with /columns and /metadata update...
    /// Fortunately, ALTER_METADATA seems to be idempotent,
    /// and older entries of such type can be replaced with newer entries.
    /// Let's try to get consistent values of source replica's /columns and /metadata
    /// and prepend dummy ALTER_METADATA to our replication queue.
    /// It should not break anything if source_queue already contains ALTER_METADATA entry
    /// with greater or equal metadata_version, but it will update our metadata
    /// if all such entries were cleaned up from the log and source_queue.

    LOG_WARNING(log, "Metadata version ({}) on replica is not up to date with metadata ({}) on source replica {}",
                metadata_version, source_metadata_version, source_replica);

    String source_metadata;
    String source_columns;
    while (true)
    {
        Coordination::Stat metadata_stat;
        Coordination::Stat columns_stat;
        source_metadata = zookeeper->get(source_path + "/metadata", &metadata_stat);
        source_columns = zookeeper->get(source_path + "/columns", &columns_stat);

        /// Re-check the versions we just read: if neither node changed in between,
        /// the two values form a consistent snapshot; otherwise retry.
        Coordination::Requests ops;
        Coordination::Responses responses;
        ops.emplace_back(zkutil::makeCheckRequest(source_path + "/metadata", metadata_stat.version));
        ops.emplace_back(zkutil::makeCheckRequest(source_path + "/columns", columns_stat.version));

        Coordination::Error code = zookeeper->tryMulti(ops, responses);
        if (code == Coordination::Error::ZOK)
            break;
        else if (code == Coordination::Error::ZBADVERSION)
            LOG_WARNING(log, "Metadata of replica {} was changed", source_path);
        else
            zkutil::KeeperMultiException::check(code, ops, responses);
    }

    ReplicatedMergeTreeLogEntryData dummy_alter;
    dummy_alter.type = LogEntry::ALTER_METADATA;
    dummy_alter.source_replica = source_replica;
    dummy_alter.metadata_str = source_metadata;
    dummy_alter.columns_str = source_columns;
    dummy_alter.alter_version = source_metadata_version;
    dummy_alter.create_time = time(nullptr);

    zookeeper->create(replica_path + "/queue/queue-", dummy_alter.toString(), zkutil::CreateMode::PersistentSequential);

    /// We don't need to do anything with mutation_pointer, because mutation log cleanup process is different from
    /// replication log cleanup. A mutation is removed from ZooKeeper only if all replicas had executed the mutation,
    /// so all mutations which are greater or equal to our mutation pointer are still present in ZooKeeper.
}
2018-08-22 14:01:54 +00:00
/// If this replica is marked lost (/is_lost == "1"), pick the healthiest source
/// replica (not lost, minimal approximate replication lag), wipe our queue and
/// clone from it via cloneReplica(), then clear the lost flag.
/// Returns early when the replica is not lost; creates /is_lost for replicas
/// made by pre-18.12 servers. Throws ALL_REPLICAS_LOST if no source is usable.
void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zookeeper)
{
    Coordination::Stat is_lost_stat;
    bool is_new_replica = true;
    String res;

    if (zookeeper->tryGet(fs::path(replica_path) / "is_lost", res, &is_lost_stat))
    {
        if (res == "0")
            return;
        /// A nonzero znode version means /is_lost has been modified before,
        /// i.e. this replica existed previously and was lost (not freshly created).
        if (is_lost_stat.version)
            is_new_replica = false;
    }
    else
    {
        /// Replica was created by old version of CH, so me must create "/is_lost".
        /// Note that in old version of CH there was no "lost" replicas possible.
        /// TODO is_lost node should always exist since v18.12, maybe we can replace `tryGet` with `get` and remove old code?
        zookeeper->create(fs::path(replica_path) / "is_lost", "0", zkutil::CreateMode::Persistent);
        return;
    }

    /// is_lost is "1": it means that we are in repair mode.
    /// Try choose source replica to clone.
    /// Source replica must not be lost and should have minimal queue size and maximal log pointer.
    Strings replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");
    std::vector<zkutil::ZooKeeper::FutureGet> futures;
    for (const String & source_replica_name : replicas)
    {
        /// Do not clone from myself.
        if (source_replica_name == replica_name)
            continue;

        String source_replica_path = fs::path(zookeeper_path) / "replicas" / source_replica_name;

        /// Obviously the following get operations are not atomic, but it's ok to choose good enough replica, not the best one.
        /// NOTE: We may count some entries twice if log_pointer is moved.
        /// Three futures per candidate, consumed in the same order below.
        futures.emplace_back(zookeeper->asyncTryGet(fs::path(source_replica_path) / "is_lost"));
        futures.emplace_back(zookeeper->asyncTryGet(fs::path(source_replica_path) / "log_pointer"));
        futures.emplace_back(zookeeper->asyncTryGet(fs::path(source_replica_path) / "queue"));
    }

    /// Wait for results before getting log entries
    for (auto & future : futures)
        future.wait();

    Strings log_entries = zookeeper->getChildren(fs::path(zookeeper_path) / "log");
    size_t max_log_entry = 0;
    if (!log_entries.empty())
    {
        String last_entry = *std::max_element(log_entries.begin(), log_entries.end());
        max_log_entry = parse<UInt64>(last_entry.substr(strlen("log-")));
    }
    /// log_pointer can point to future entry, which was not created yet
    ++max_log_entry;

    size_t min_replication_lag = std::numeric_limits<size_t>::max();
    String source_replica;
    Coordination::Stat source_is_lost_stat;
    size_t future_num = 0;

    for (const String & source_replica_name : replicas)
    {
        if (source_replica_name == replica_name)
            continue;

        /// Must match the order futures were pushed above: is_lost, log_pointer, queue.
        auto get_is_lost = futures[future_num++].get();
        auto get_log_pointer = futures[future_num++].get();
        auto get_queue = futures[future_num++].get();

        if (get_is_lost.error != Coordination::Error::ZOK)
        {
            LOG_INFO(log, "Not cloning {}, cannot get '/is_lost': {}", source_replica_name, Coordination::errorMessage(get_is_lost.error));
            continue;
        }
        else if (get_is_lost.data != "0")
        {
            LOG_INFO(log, "Not cloning {}, it's lost", source_replica_name);
            continue;
        }

        if (get_log_pointer.error != Coordination::Error::ZOK)
        {
            LOG_INFO(log, "Not cloning {}, cannot get '/log_pointer': {}", source_replica_name, Coordination::errorMessage(get_log_pointer.error));
            continue;
        }
        if (get_queue.error != Coordination::Error::ZOK)
        {
            LOG_INFO(log, "Not cloning {}, cannot get '/queue': {}", source_replica_name, Coordination::errorMessage(get_queue.error));
            continue;
        }

        /// Replica is not lost and we can clone it. Let's calculate approx replication lag.
        size_t source_log_pointer = get_log_pointer.data.empty() ? 0 : parse<UInt64>(get_log_pointer.data);
        assert(source_log_pointer <= max_log_entry);
        size_t replica_queue_lag = max_log_entry - source_log_pointer;
        size_t replica_queue_size = get_queue.stat.numChildren;
        size_t replication_lag = replica_queue_lag + replica_queue_size;
        LOG_INFO(log, "Replica {} has log pointer '{}', approximate {} queue lag and {} queue size",
                 source_replica_name, get_log_pointer.data, replica_queue_lag, replica_queue_size);
        if (replication_lag < min_replication_lag)
        {
            source_replica = source_replica_name;
            source_is_lost_stat = get_is_lost.stat;
            min_replication_lag = replication_lag;
        }
    }

    if (source_replica.empty())
        throw Exception("All replicas are lost", ErrorCodes::ALL_REPLICAS_LOST);

    if (is_new_replica)
        LOG_INFO(log, "Will mimic {}", source_replica);
    else
        LOG_WARNING(log, "Will mimic {}", source_replica);

    /// Clear obsolete queue that we no longer need.
    zookeeper->removeChildren(fs::path(replica_path) / "queue");
    queue.clear();

    /// Will do repair from the selected replica.
    cloneReplica(source_replica, source_is_lost_stat, zookeeper);
    /// If repair fails to whatever reason, the exception is thrown, is_lost will remain "1" and the replica will be repaired later.

    /// If replica is repaired successfully, we remove is_lost flag.
    zookeeper->set(fs::path(replica_path) / "is_lost", "0");
}
2021-08-09 12:58:23 +00:00
/// Thread-safe accessor for the message of the last exception thrown by the
/// queue-updating task; returns an empty string if none has been recorded.
String StorageReplicatedMergeTree::getLastQueueUpdateException() const
{
    /// Take a copy under the lock: the background task may overwrite the field concurrently.
    std::lock_guard guard(last_queue_update_exception_lock);
    String message = last_queue_update_exception;
    return message;
}
2018-08-07 15:21:42 +00:00
2018-05-31 13:05:05 +00:00
/// Background task: pull new entries from the shared replication log into this
/// replica's queue. On failure it records the exception message (readable via
/// getLastQueueUpdateException), wakes the restarting thread on session expiry,
/// and otherwise reschedules itself after an error backoff.
void StorageReplicatedMergeTree::queueUpdatingTask()
{
    /// queue_update_in_progress stays true across failed attempts, so the
    /// start timestamp reflects the beginning of the whole (possibly retried) update.
    if (!queue_update_in_progress)
    {
        last_queue_update_start_time.store(time(nullptr));
        queue_update_in_progress = true;
    }
    try
    {
        queue.pullLogsToQueue(getZooKeeperAndAssertNotReadonly(), queue_updating_task->getWatchCallback(), ReplicatedMergeTreeQueue::UPDATE);
        last_queue_update_finish_time.store(time(nullptr));
        queue_update_in_progress = false;
    }
    catch (const Coordination::Exception & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        /// Remember the error message for introspection (system tables / status queries).
        std::lock_guard lock(last_queue_update_exception_lock);
        last_queue_update_exception = getCurrentExceptionMessage(false);

        if (e.code == Coordination::Error::ZSESSIONEXPIRED)
        {
            /// Session is gone: restarting thread will re-establish it; no point rescheduling ourselves.
            restarting_thread.wakeup();
            return;
        }

        queue_updating_task->scheduleAfter(QUEUE_UPDATE_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        std::lock_guard lock(last_queue_update_exception_lock);
        last_queue_update_exception = getCurrentExceptionMessage(false);

        queue_updating_task->scheduleAfter(QUEUE_UPDATE_ERROR_SLEEP_MS);
    }
}
2014-03-21 19:17:59 +00:00
2014-10-18 17:37:55 +00:00
2018-05-31 13:05:05 +00:00
/// Background task: refresh the set of known mutations from ZooKeeper.
/// On failure it reschedules itself, except on session expiration where
/// retrying with the same session is pointless.
void StorageReplicatedMergeTree::mutationsUpdatingTask()
{
    try
    {
        queue.updateMutations(getZooKeeper(), mutations_updating_task->getWatchCallback());
    }
    catch (const Coordination::Exception & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        /// Session expiration is handled elsewhere (restarting thread); do not reschedule.
        if (e.code == Coordination::Error::ZSESSIONEXPIRED)
            return;

        mutations_updating_task->scheduleAfter(QUEUE_UPDATE_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
        mutations_updating_task->scheduleAfter(QUEUE_UPDATE_ERROR_SLEEP_MS);
    }
}
2020-10-23 08:54:00 +00:00
/// Pick the next replication-queue entry that can be processed right now.
/// Returns nullptr when nothing is selectable (or when selection itself failed).
ReplicatedMergeTreeQueue::SelectedEntryPtr StorageReplicatedMergeTree::selectQueueEntry()
{
    /// While this object is alive, the chosen queue element is marked as running.
    ReplicatedMergeTreeQueue::SelectedEntryPtr entry_to_run;

    try
    {
        entry_to_run = queue.selectEntryToProcess(merger_mutator, *this);
    }
    catch (...)
    {
        /// Selection errors are logged and swallowed; the caller treats a
        /// null result the same as "nothing to do".
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
    }

    return entry_to_run;
}
2017-04-01 07:20:54 +00:00
2021-09-16 21:19:58 +00:00
2020-10-23 08:54:00 +00:00
/// Execute a previously selected replication-queue entry.
/// Known, expected failure modes are logged at INFO level instead of Error;
/// every exception is rethrown so that queue.processEntry records it on the entry.
bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry)
{
    LogEntryPtr & entry = selected_entry->log_entry;
    return queue.processEntry([this] { return getZooKeeper(); }, entry, [&](LogEntryPtr & entry_to_process)
    {
        try
        {
            return executeLogEntry(*entry_to_process);
        }
        catch (const Exception & e)
        {
            if (e.code() == ErrorCodes::NO_REPLICA_HAS_PART)
            {
                /// If no one has the right part, probably not all replicas work; We will not write to log with Error level.
                /// fmt::runtime(): the message is not a compile-time format string.
                LOG_INFO(log, fmt::runtime(e.displayText()));
            }
            else if (e.code() == ErrorCodes::ABORTED)
            {
                /// Interrupted merge or downloading a part is not an error.
                LOG_INFO(log, fmt::runtime(e.message()));
            }
            else if (e.code() == ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
            {
                /// Part cannot be added temporarily; wake the cleanup thread so the
                /// lock holder (e.g. leftover data) gets cleaned up sooner.
                LOG_INFO(log, fmt::runtime(e.displayText()));
                cleanup_thread.wakeup();
            }
            else
                tryLogCurrentException(log, __PRETTY_FUNCTION__);

            /** This exception will be written to the queue element, and it can be looked up using `system.replication_queue` table.
              * The thread that performs this action will sleep a few seconds after the exception.
              * See `queue.processEntry` function.
              */
            throw;
        }
        catch (...)
        {
            tryLogCurrentException(log, __PRETTY_FUNCTION__);
            throw;
        }
    });
}
2021-09-08 00:21:21 +00:00
/// Select the next queue entry and hand it to the appropriate background pool.
/// Returns true if a job was scheduled, false if there was nothing to do.
bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assignee)
{
    /// If replication queue is stopped exit immediately as we successfully executed the task
    if (queue.actions_blocker.isCancelled())
        return false;

    /// This object will mark the element of the queue as running.
    ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry = selectQueueEntry();

    if (!selected_entry)
        return false;

    auto job_type = selected_entry->log_entry->type;

    /// Depending on entry type execute in fetches (small) pool or big merge_mutate pool
    if (job_type == LogEntry::GET_PART)
    {
        assignee.scheduleFetchTask(std::make_shared<ExecutableLambdaAdapter>(
            [this, selected_entry] () mutable
            {
                return processQueueEntry(selected_entry);
            }, common_assignee_trigger, getStorageID()));
        return true;
    }
    else if (job_type == LogEntry::MERGE_PARTS)
    {
        /// Merges get a dedicated resumable task object in the merge/mutate pool.
        auto task = std::make_shared<MergeFromLogEntryTask>(selected_entry, *this, common_assignee_trigger);
        assignee.scheduleMergeMutateTask(task);
        return true;
    }
    else if (job_type == LogEntry::MUTATE_PART)
    {
        auto task = std::make_shared<MutateFromLogEntryTask>(selected_entry, *this, common_assignee_trigger);
        assignee.scheduleMergeMutateTask(task);
        return true;
    }
    else
    {
        /// All remaining entry types run as a generic lambda job in the common pool.
        assignee.scheduleCommonTask(std::make_shared<ExecutableLambdaAdapter>(
            [this, selected_entry] () mutable
            {
                return processQueueEntry(selected_entry);
            }, common_assignee_trigger, getStorageID()), /* need_trigger */ true);
        return true;
    }
}
2019-08-19 17:59:16 +00:00
2020-10-26 11:02:47 +00:00
bool StorageReplicatedMergeTree : : canExecuteFetch ( const ReplicatedMergeTreeLogEntry & entry , String & disable_reason ) const
2019-09-05 13:12:29 +00:00
{
2020-10-26 11:02:47 +00:00
if ( fetcher . blocker . isCancelled ( ) )
2019-08-21 16:02:13 +00:00
{
2020-10-26 11:02:47 +00:00
disable_reason = fmt : : format ( " Not executing fetch of part {} because replicated fetches are cancelled now. " , entry . new_part_name ) ;
return false ;
2019-09-02 11:35:53 +00:00
}
2020-10-26 11:02:47 +00:00
size_t busy_threads_in_pool = CurrentMetrics : : values [ CurrentMetrics : : BackgroundFetchesPoolTask ] . load ( std : : memory_order_relaxed ) ;
if ( busy_threads_in_pool > = replicated_fetches_pool_size )
2019-08-19 17:59:16 +00:00
{
2020-10-26 11:02:47 +00:00
disable_reason = fmt : : format ( " Not executing fetch of part {} because {} fetches already executing, max {}. " , entry . new_part_name , busy_threads_in_pool , replicated_fetches_pool_size ) ;
return false ;
2019-08-19 17:59:16 +00:00
}
2020-10-26 11:02:47 +00:00
2021-05-27 12:54:47 +00:00
if ( replicated_fetches_throttler - > isThrottling ( ) )
{
disable_reason = fmt : : format ( " Not executing fetch of part {} because fetches have already throttled by network settings "
" <max_replicated_fetches_network_bandwidth> or <max_replicated_fetches_network_bandwidth_for_server>. " , entry . new_part_name ) ;
return false ;
}
2020-10-26 11:02:47 +00:00
return true ;
2019-08-19 17:59:16 +00:00
}
2014-10-18 17:37:55 +00:00
2019-09-05 13:12:29 +00:00
/// A part counts as assigned to a background operation when the replication queue
/// treats it as a "virtual" part — presumably one that some queued entry will
/// produce or consume (see ReplicatedMergeTreeQueue::isVirtualPart for exact semantics).
bool StorageReplicatedMergeTree::partIsAssignedToBackgroundOperation(const DataPartPtr & part) const
{
    return queue.isVirtualPart(part);
}
2019-08-20 09:59:19 +00:00
2018-05-31 13:05:05 +00:00
/// Leader-only background task: select parts to merge or mutate and publish the
/// corresponding entry to the shared replication log in ZooKeeper.
/// Reschedules itself: immediately after a successful assignment, with a delay otherwise.
void StorageReplicatedMergeTree::mergeSelectingTask()
{
    if (!is_leader)
        return;

    const auto storage_settings_ptr = getSettings();
    const bool deduplicate = false; /// TODO: read deduplicate option from table config
    const Names deduplicate_by_columns = {};
    CreateMergeEntryResult create_result = CreateMergeEntryResult::Other;

    try
    {
        /// We must select parts for merge under merge_selecting_mutex because other threads
        /// (OPTIMIZE queries) can assign new merges.
        std::lock_guard merge_selecting_lock(merge_selecting_mutex);

        auto zookeeper = getZooKeeperAndAssertNotReadonly();

        ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper);

        /// If many merges is already queued, then will queue only small enough merges.
        /// Otherwise merge queue could be filled with only large merges,
        /// and in the same time, many small parts could be created and won't be merged.

        auto merges_and_mutations_queued = queue.countMergesAndPartMutations();
        size_t merges_and_mutations_sum = merges_and_mutations_queued.merges + merges_and_mutations_queued.mutations;
        if (merges_and_mutations_sum >= storage_settings_ptr->max_replicated_merges_in_queue)
        {
            LOG_TRACE(log, "Number of queued merges ({}) and part mutations ({}) "
                "is greater than max_replicated_merges_in_queue ({}), so won't select new parts to merge or mutate.",
                merges_and_mutations_queued.merges,
                merges_and_mutations_queued.mutations,
                storage_settings_ptr->max_replicated_merges_in_queue);
        }
        else
        {
            /// Size limits shrink as the queue fills up (see getMaxSourcePartsSizeForMerge).
            UInt64 max_source_parts_size_for_merge = merger_mutator.getMaxSourcePartsSizeForMerge(
                storage_settings_ptr->max_replicated_merges_in_queue, merges_and_mutations_sum);

            UInt64 max_source_part_size_for_mutation = merger_mutator.getMaxSourcePartSizeForMutation();

            /// TTL merges are additionally capped both per-queue and per-pool.
            bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < storage_settings_ptr->max_replicated_merges_with_ttl_in_queue &&
                getTotalMergesWithTTLInMergeList() < storage_settings_ptr->max_number_of_merges_with_ttl_in_pool;

            auto future_merged_part = std::make_shared<FutureMergedMutatedPart>();
            if (storage_settings.get()->assign_part_uuids)
                future_merged_part->uuid = UUIDHelpers::generateV4();

            /// First preference: a merge, if one can be selected within the size budget.
            if (max_source_parts_size_for_merge > 0 &&
                merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, merge_pred, merge_with_ttl_allowed, NO_TRANSACTION_PTR, nullptr) == SelectPartsDecision::SELECTED)
            {
                create_result = createLogEntryToMergeParts(
                    zookeeper,
                    future_merged_part->parts,
                    future_merged_part->name,
                    future_merged_part->uuid,
                    future_merged_part->type,
                    deduplicate,
                    deduplicate_by_columns,
                    nullptr,
                    merge_pred.getVersion(),
                    future_merged_part->merge_type);
            }
            /// If there are many mutations in queue, it may happen, that we cannot enqueue enough merges to merge all new parts
            else if (max_source_part_size_for_mutation > 0 && queue.countMutations() > 0
                     && merges_and_mutations_queued.mutations < storage_settings_ptr->max_replicated_mutations_in_queue)
            {
                /// Choose a part to mutate.
                DataPartsVector data_parts = getDataPartsVectorForInternalUsage();
                for (const auto & part : data_parts)
                {
                    if (part->getBytesOnDisk() > max_source_part_size_for_mutation)
                        continue;

                    std::optional<std::pair<Int64, int>> desired_mutation_version = merge_pred.getDesiredMutationVersion(part);
                    if (!desired_mutation_version)
                        continue;

                    create_result = createLogEntryToMutatePart(
                        *part,
                        future_merged_part->uuid,
                        desired_mutation_version->first,
                        desired_mutation_version->second,
                        merge_pred.getVersion());

                    /// Stop after the first successful assignment (or a log-version conflict,
                    /// which means the log changed and we should re-select from scratch).
                    if (create_result == CreateMergeEntryResult::Ok ||
                        create_result == CreateMergeEntryResult::LogUpdated)
                        break;
                }
            }
        }
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
    }

    /// Leadership may have been lost while we were selecting.
    if (!is_leader)
        return;

    if (create_result != CreateMergeEntryResult::Ok
        && create_result != CreateMergeEntryResult::LogUpdated)
    {
        /// Nothing was assigned: back off before trying again.
        merge_selecting_task->scheduleAfter(storage_settings_ptr->merge_selecting_sleep_ms);
    }
    else
    {
        /// Something was assigned: try to assign more right away.
        merge_selecting_task->schedule();
    }
}
2014-04-04 10:37:33 +00:00
2018-06-21 13:27:36 +00:00
void StorageReplicatedMergeTree : : mutationsFinalizingTask ( )
{
bool needs_reschedule = false ;
try
{
2022-02-03 10:10:05 +00:00
needs_reschedule = queue . tryFinalizeMutations ( getZooKeeperAndAssertNotReadonly ( ) ) ;
2018-06-21 13:27:36 +00:00
}
catch ( . . . )
{
tryLogCurrentException ( log , __PRETTY_FUNCTION__ ) ;
needs_reschedule = true ;
}
if ( needs_reschedule )
2020-04-27 16:19:04 +00:00
{
2018-06-21 13:27:36 +00:00
mutations_finalizing_task - > scheduleAfter ( MUTATIONS_FINALIZING_SLEEP_MS ) ;
2020-04-27 16:19:04 +00:00
}
else
{
/// Even if no mutations seems to be done or appeared we are trying to
/// finalize them in background because manual control the launch of
/// this function is error prone. This can lead to mutations that
/// processed all the parts but have is_done=0 state for a long time. Or
/// killed mutations, which are also considered as undone.
mutations_finalizing_task - > scheduleAfter ( MUTATIONS_FINALIZING_IDLE_SLEEP_MS ) ;
}
2018-06-21 13:27:36 +00:00
}
2020-06-12 18:24:32 +00:00
/// Publish a MERGE_PARTS entry to the shared replication log.
/// The entry creation is paired with a version check on the "log" node so that a
/// stale merge predicate cannot assign a merge based on an outdated log state.
/// Returns MissingPart / LogUpdated / Ok accordingly.
StorageReplicatedMergeTree::CreateMergeEntryResult StorageReplicatedMergeTree::createLogEntryToMergeParts(
    zkutil::ZooKeeperPtr & zookeeper,
    const DataPartsVector & parts,
    const String & merged_name,
    const UUID & merged_part_uuid,
    const MergeTreeDataPartType & merged_part_type,
    bool deduplicate,
    const Names & deduplicate_by_columns,
    ReplicatedMergeTreeLogEntryData * out_log_entry,
    int32_t log_version,
    MergeType merge_type)
{
    /// Check all source parts in ZooKeeper with a single batched exists() call.
    Strings exists_paths;
    exists_paths.reserve(parts.size());
    for (const auto & part : parts)
        exists_paths.emplace_back(fs::path(replica_path) / "parts" / part->name);

    auto exists_results = zookeeper->exists(exists_paths);
    bool all_in_zk = true;
    for (size_t i = 0; i < parts.size(); ++i)
    {
        /// If there is no information about part in ZK, we will not merge it.
        if (exists_results[i].error == Coordination::Error::ZNONODE)
        {
            all_in_zk = false;

            const auto & part = parts[i];
            /// A freshly written part may simply not be committed to ZK yet;
            /// only old parts missing from ZK are suspicious and get checked.
            if (part->modification_time + MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER < time(nullptr))
            {
                LOG_WARNING(log, "Part {} (that was selected for merge) with age {} seconds exists locally but not in ZooKeeper. Won't do merge with that part and will check it.", part->name, (time(nullptr) - part->modification_time));
                enqueuePartForCheck(part->name);
            }
        }
    }

    if (!all_in_zk)
        return CreateMergeEntryResult::MissingPart;

    ReplicatedMergeTreeLogEntryData entry;
    entry.type = LogEntry::MERGE_PARTS;
    entry.source_replica = replica_name;
    entry.new_part_name = merged_name;
    entry.new_part_uuid = merged_part_uuid;
    entry.new_part_type = merged_part_type;
    entry.merge_type = merge_type;
    entry.deduplicate = deduplicate;
    entry.deduplicate_by_columns = deduplicate_by_columns;
    entry.create_time = time(nullptr);

    for (const auto & part : parts)
        entry.source_parts.push_back(part->name);

    /// Atomically: create the log entry AND bump/check the log version.
    Coordination::Requests ops;
    Coordination::Responses responses;
    ops.emplace_back(zkutil::makeCreateRequest(
        fs::path(zookeeper_path) / "log/log-", entry.toString(),
        zkutil::CreateMode::PersistentSequential));
    ops.emplace_back(zkutil::makeSetRequest(
        fs::path(zookeeper_path) / "log", "", log_version)); /// Check and update version.

    Coordination::Error code = zookeeper->tryMulti(ops, responses);

    if (code == Coordination::Error::ZOK)
    {
        String path_created = dynamic_cast<const Coordination::CreateResponse &>(*responses.front()).path_created;
        entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1);

        ProfileEvents::increment(ProfileEvents::CreatedLogEntryForMerge);
        LOG_TRACE(log, "Created log entry {} for merge {}", path_created, merged_name);
    }
    else if (code == Coordination::Error::ZBADVERSION)
    {
        /// Someone else advanced the log since our predicate snapshot; caller must re-select.
        ProfileEvents::increment(ProfileEvents::NotCreatedLogEntryForMerge);
        LOG_TRACE(log, "Log entry is not created for merge {} because log was updated", merged_name);
        return CreateMergeEntryResult::LogUpdated;
    }
    else
    {
        zkutil::KeeperMultiException::check(code, ops, responses);
    }

    if (out_log_entry)
        *out_log_entry = entry;

    return CreateMergeEntryResult::Ok;
}
2020-06-12 18:24:32 +00:00
/// Publish a MUTATE_PART entry to the shared replication log, guarded by the same
/// log-version check as createLogEntryToMergeParts.
/// Returns MissingPart / LogUpdated / Ok.
StorageReplicatedMergeTree::CreateMergeEntryResult StorageReplicatedMergeTree::createLogEntryToMutatePart(
    const IMergeTreeDataPart & part, const UUID & new_part_uuid, Int64 mutation_version, int32_t alter_version, int32_t log_version)
{
    auto zookeeper = getZooKeeper();

    /// If there is no information about part in ZK, we will not mutate it.
    if (!zookeeper->exists(fs::path(replica_path) / "parts" / part.name))
    {
        /// Only an old part missing from ZK is suspicious; a new one may just
        /// not be committed yet.
        if (part.modification_time + MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER < time(nullptr))
        {
            LOG_WARNING(log, "Part {} (that was selected for mutation) with age {} seconds exists locally but not in ZooKeeper. "
                "Won't mutate that part and will check it.", part.name, (time(nullptr) - part.modification_time));
            enqueuePartForCheck(part.name);
        }

        return CreateMergeEntryResult::MissingPart;
    }

    /// The mutated part keeps the same block range; only the mutation number changes.
    MergeTreePartInfo new_part_info = part.info;
    new_part_info.mutation = mutation_version;

    String new_part_name = part.getNewName(new_part_info);

    ReplicatedMergeTreeLogEntryData entry;
    entry.type = LogEntry::MUTATE_PART;
    entry.source_replica = replica_name;
    entry.source_parts.push_back(part.name);
    entry.new_part_name = new_part_name;
    entry.new_part_uuid = new_part_uuid;
    entry.create_time = time(nullptr);
    entry.alter_version = alter_version;

    /// Atomically: create the log entry AND bump/check the log version.
    Coordination::Requests ops;
    Coordination::Responses responses;
    ops.emplace_back(zkutil::makeCreateRequest(
        fs::path(zookeeper_path) / "log/log-", entry.toString(),
        zkutil::CreateMode::PersistentSequential));
    ops.emplace_back(zkutil::makeSetRequest(
        fs::path(zookeeper_path) / "log", "", log_version)); /// Check and update version.

    Coordination::Error code = zookeeper->tryMulti(ops, responses);

    if (code == Coordination::Error::ZBADVERSION)
    {
        /// The log advanced since our predicate snapshot; caller must re-select.
        ProfileEvents::increment(ProfileEvents::NotCreatedLogEntryForMutation);
        LOG_TRACE(log, "Log entry is not created for mutation {} because log was updated", new_part_name);
        return CreateMergeEntryResult::LogUpdated;
    }

    zkutil::KeeperMultiException::check(code, ops, responses);

    ProfileEvents::increment(ProfileEvents::CreatedLogEntryForMutation);
    LOG_TRACE(log, "Created log entry for mutation {}", new_part_name);
    return CreateMergeEntryResult::Ok;
}
2018-12-11 13:30:20 +00:00
void StorageReplicatedMergeTree : : removePartFromZooKeeper ( const String & part_name , Coordination : : Requests & ops , bool has_children )
2015-09-20 11:02:59 +00:00
{
2021-05-08 10:59:55 +00:00
String part_path = fs : : path ( replica_path ) / " parts " / part_name ;
2015-09-20 11:02:59 +00:00
2018-12-11 13:30:20 +00:00
if ( has_children )
{
2021-05-08 10:59:55 +00:00
ops . emplace_back ( zkutil : : makeRemoveRequest ( fs : : path ( part_path ) / " checksums " , - 1 ) ) ;
ops . emplace_back ( zkutil : : makeRemoveRequest ( fs : : path ( part_path ) / " columns " , - 1 ) ) ;
2018-12-11 13:30:20 +00:00
}
2018-03-24 00:45:04 +00:00
ops . emplace_back ( zkutil : : makeRemoveRequest ( part_path , - 1 ) ) ;
2015-09-20 11:02:59 +00:00
}
2021-01-27 10:07:18 +00:00
void StorageReplicatedMergeTree : : removePartFromZooKeeper ( const String & part_name )
{
auto zookeeper = getZooKeeper ( ) ;
2021-05-08 10:59:55 +00:00
String part_path = fs : : path ( replica_path ) / " parts " / part_name ;
2021-01-27 10:07:18 +00:00
Coordination : : Stat stat ;
/// Part doesn't exist, nothing to remove
if ( ! zookeeper - > exists ( part_path , & stat ) )
return ;
Coordination : : Requests ops ;
removePartFromZooKeeper ( part_name , ops , stat . numChildren > 0 ) ;
zookeeper - > multi ( ops ) ;
}
2015-09-20 11:02:59 +00:00
2014-07-22 13:49:52 +00:00
/// Handle a broken part: detach covered parts for safety, drop queue entries that
/// would produce parts inside its range, remove the part's znode, and enqueue a
/// GET_PART entry so it is fetched again from another replica.
void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_name)
{
    auto zookeeper = getZooKeeper();

    /// We don't know exactly what happened to broken part
    /// and we are going to remove all covered log entries.
    /// It's quite dangerous, so clone covered parts to detached.
    auto broken_part_info = MergeTreePartInfo::fromPartName(part_name, format_version);

    auto partition_range = getVisibleDataPartsVectorInPartition(getContext(), broken_part_info.partition_id);
    for (const auto & part : partition_range)
    {
        if (!broken_part_info.contains(part->info))
            continue;

        /// Broken part itself either already moved to detached or does not exist.
        assert(broken_part_info != part->info);
        part->makeCloneInDetached("covered-by-broken", getInMemoryMetadataPtr());
    }

    /// It's possible that queue contains entries covered by part_name.
    /// For example, we had GET_PART all_1_42_5 and MUTATE_PART all_1_42_5_63,
    /// then all_1_42_5_63 was executed by fetching, but part was written to disk incorrectly.
    /// In this case we have to remove it as broken and create GET_PART all_1_42_5_63 to fetch it again,
    /// but GET_PART all_1_42_5 may be still in the queue.
    /// We should remove all covered entries before creating GET_PART entry, because:
    /// 1. In the situation described above, we do not know how to merge/mutate all_1_42_5_63 from all_1_42_5,
    ///    so GET_PART all_1_42_5 (and all source parts) is useless. The only thing we can do is to fetch all_1_42_5_63.
    /// 2. If all_1_42_5_63 is lost, then replication may stuck waiting for all_1_42_5_63 to appear,
    ///    because we may have some covered parts (more precisely, parts with the same min and max blocks)
    queue.removePartProducingOpsInRange(zookeeper, broken_part_info, {});

    String part_path = fs::path(replica_path) / "parts" / part_name;

    Coordination::Requests ops;
    time_t part_create_time = 0;
    Coordination::Stat stat;
    if (zookeeper->exists(part_path, &stat))
    {
        /// Update version of /is_lost node to avoid race condition with cloneReplica(...).
        /// cloneReplica(...) expects that if some entry was executed, then its new_part_name is added to /parts,
        /// but we are going to remove it from /parts and add to queue again.
        Coordination::Stat is_lost_stat;
        String is_lost_value = zookeeper->get(replica_path + "/is_lost", &is_lost_stat);
        assert(is_lost_value == "0");
        ops.emplace_back(zkutil::makeSetRequest(replica_path + "/is_lost", is_lost_value, is_lost_stat.version));

        part_create_time = stat.ctime / 1000;
        removePartFromZooKeeper(part_name, ops, stat.numChildren > 0);
    }

    /// GET_PART with empty source_replica means "fetch from whoever has it".
    LogEntryPtr log_entry = std::make_shared<LogEntry>();
    log_entry->type = LogEntry::GET_PART;
    log_entry->create_time = part_create_time;
    log_entry->source_replica = "";
    log_entry->new_part_name = part_name;

    ops.emplace_back(zkutil::makeCreateRequest(
        fs::path(replica_path) / "queue/queue-", log_entry->toString(),
        zkutil::CreateMode::PersistentSequential));

    /// Part removal and queue insertion happen atomically in one multi-op.
    auto results = zookeeper->multi(ops);

    String path_created = dynamic_cast<const Coordination::CreateResponse &>(*results.back()).path_created;
    log_entry->znode_name = path_created.substr(path_created.find_last_of('/') + 1);
    queue.insert(zookeeper, log_entry);
}
2014-10-18 17:37:55 +00:00
2021-12-07 16:55:55 +00:00
void StorageReplicatedMergeTree : : startBeingLeader ( )
2014-04-04 10:37:33 +00:00
{
2022-08-19 08:49:51 +00:00
auto zookeeper = getZooKeeper ( ) ;
2022-08-24 17:44:14 +00:00
2021-12-07 16:55:55 +00:00
if ( ! getSettings ( ) - > replicated_can_become_leader )
2018-04-06 16:06:07 +00:00
{
2021-12-07 16:55:55 +00:00
LOG_INFO ( log , " Will not enter leader election because replicated_can_become_leader=0 " ) ;
return ;
}
2018-04-06 16:06:07 +00:00
2022-08-12 09:32:13 +00:00
zkutil : : checkNoOldLeaders ( log , * zookeeper , fs : : path ( zookeeper_path ) / " leader_election " ) ;
2016-10-14 11:47:11 +00:00
2021-12-07 16:55:55 +00:00
LOG_INFO ( log , " Became leader " ) ;
is_leader = true ;
2018-04-06 16:06:07 +00:00
}
2021-12-07 16:55:55 +00:00
void StorageReplicatedMergeTree : : stopBeingLeader ( )
2018-04-06 16:06:07 +00:00
{
2021-12-07 16:55:55 +00:00
if ( ! is_leader )
2016-10-14 09:59:10 +00:00
return ;
2021-12-07 16:55:55 +00:00
LOG_INFO ( log , " Stopped being leader " ) ;
is_leader = false ;
2014-04-04 10:37:33 +00:00
}
2021-04-10 23:33:54 +00:00
ConnectionTimeouts StorageReplicatedMergeTree::getFetchPartHTTPTimeouts(ContextPtr local_context)
{
    /// Start from the context-level HTTP timeouts and override each one only
    /// if the corresponding replicated-fetches setting was explicitly changed.
    auto timeouts = ConnectionTimeouts::getHTTPTimeouts(local_context);
    auto settings = getSettings();

    const auto override_if_changed = [](auto & target, const auto & setting)
    {
        if (setting.changed)
            target = setting;
    };

    override_if_changed(timeouts.connection_timeout, settings->replicated_fetches_http_connection_timeout);
    override_if_changed(timeouts.send_timeout, settings->replicated_fetches_http_send_timeout);
    override_if_changed(timeouts.receive_timeout, settings->replicated_fetches_http_receive_timeout);

    return timeouts;
}
2020-09-18 10:57:33 +00:00
bool StorageReplicatedMergeTree : : checkReplicaHavePart ( const String & replica , const String & part_name )
{
auto zookeeper = getZooKeeper ( ) ;
2021-05-08 10:59:55 +00:00
return zookeeper - > exists ( fs : : path ( zookeeper_path ) / " replicas " / replica / " parts " / part_name ) ;
2020-09-18 10:57:33 +00:00
}
2014-10-18 17:37:55 +00:00
2014-04-08 17:45:21 +00:00
String StorageReplicatedMergeTree::findReplicaHavingPart(const String & part_name, bool active)
{
    auto zookeeper = getZooKeeper();
    Strings replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");

    /// Select replicas in uniformly random order to spread fetch load.
    std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);

    LOG_TRACE(log, "Candidate replicas: {}", replicas.size());

    for (const String & candidate : replicas)
    {
        /// We aren't interested in ourself.
        if (candidate == replica_name)
            continue;

        LOG_TRACE(log, "Candidate replica: {}", candidate);

        if (!checkReplicaHavePart(candidate, part_name))
            continue;

        /// If requested, additionally require the replica to be active right now.
        if (active && !zookeeper->exists(fs::path(zookeeper_path) / "replicas" / candidate / "is_active"))
            continue;

        /// Obviously, replica could become inactive or even vanish after return from this method.
        return candidate;
    }

    return {};
}
2018-05-23 14:33:55 +00:00
String StorageReplicatedMergeTree::findReplicaHavingCoveringPart(LogEntry & entry, bool active)
{
    auto zookeeper = getZooKeeper();
    Strings replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");

    /// Select replicas in uniformly random order.
    std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);

    /// True when `candidate` is the target part itself or a part that covers it.
    auto covers_or_equals = [this](const String & candidate, const String & target)
    {
        return candidate == target || MergeTreePartInfo::contains(candidate, target, format_version);
    };

    for (const String & replica : replicas)
    {
        if (replica == replica_name)
            continue;

        if (active && !zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active"))
            continue;

        /// Among this replica's parts, find the largest one covering entry.new_part_name.
        String largest_part_found;
        Strings parts = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas" / replica / "parts");
        for (const String & part_on_replica : parts)
        {
            if (!covers_or_equals(part_on_replica, entry.new_part_name))
                continue;

            if (largest_part_found.empty()
                || MergeTreePartInfo::contains(part_on_replica, largest_part_found, format_version))
            {
                largest_part_found = part_on_replica;
            }
        }

        if (largest_part_found.empty())
            continue;

        /// Make a check in case if selected part differs from source part.
        if (largest_part_found != entry.new_part_name)
        {
            String reject_reason;
            if (!queue.addFuturePartIfNotCoveredByThem(largest_part_found, entry, reject_reason))
            {
                LOG_INFO(log, "Will not fetch part {} covering {}. {}", largest_part_found, entry.new_part_name, reject_reason);
                return {};
            }
        }

        return replica;
    }

    return {};
}
2014-10-18 17:37:55 +00:00
2018-05-21 13:49:54 +00:00
String StorageReplicatedMergeTree::findReplicaHavingCoveringPart(
    const String & part_name, bool active, String & found_part_name)
{
    auto zookeeper = getZooKeeper();
    Strings replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");

    /// Select replicas in uniformly random order.
    std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);

    /// Track the largest covering part seen across all replicas, and where it lives.
    String best_part;
    String best_replica;

    for (const String & replica : replicas)
    {
        if (replica == replica_name)
            continue;

        if (active && !zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active"))
            continue;

        Strings parts = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas" / replica / "parts");
        for (const String & part_on_replica : parts)
        {
            const bool covers_target = part_on_replica == part_name
                || MergeTreePartInfo::contains(part_on_replica, part_name, format_version);
            if (!covers_target)
                continue;

            const bool is_larger = best_part.empty()
                || MergeTreePartInfo::contains(part_on_replica, best_part, format_version);
            if (is_larger)
            {
                best_part = part_on_replica;
                best_replica = replica;
            }
        }
    }

    found_part_name = best_part;
    return best_replica;
}
2017-03-12 19:18:07 +00:00
/** If a quorum is tracked for a part, update information about it in ZK.
  *
  * Registers this replica in the quorum entry for `part_name` and, once the required
  * number of replicas have confirmed, removes the quorum node (and, for non-parallel
  * quorum, records the part as the last quorum part of its partition).
  * All ZK updates are version-checked (optimistic CAS) and retried on ZBADVERSION.
  */
void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_parallel)
{
    auto zookeeper = getZooKeeper();

    /// Information on which replicas a part has been added, if the quorum has not yet been reached.
    /// Non-parallel quorum inserts share a single /quorum/status node;
    /// parallel quorum inserts get a per-part node under /quorum/parallel.
    String quorum_status_path = fs::path(zookeeper_path) / "quorum" / "status";
    if (is_parallel)
        quorum_status_path = fs::path(zookeeper_path) / "quorum" / "parallel" / part_name;
    /// The name of the previous part for which the quorum was reached.
    const String quorum_last_part_path = fs::path(zookeeper_path) / "quorum" / "last_part";

    String value;
    Coordination::Stat stat;

    /// If there is no node, then all quorum INSERTs have already reached the quorum, and nothing is needed.
    /// The loop re-reads the node and retries whenever a version conflict is detected below.
    while (zookeeper->tryGet(quorum_status_path, value, &stat))
    {
        ReplicatedMergeTreeQuorumEntry quorum_entry(value);
        if (quorum_entry.part_name != part_name)
        {
            LOG_TRACE(log, "Quorum {}, already achieved for part {} current part {}",
                quorum_status_path, part_name, quorum_entry.part_name);
            /// The quorum has already been achieved. Moreover, another INSERT with a quorum has already started.
            break;
        }

        quorum_entry.replicas.insert(replica_name);

        if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas)
        {
            /// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum.
            LOG_TRACE(log, "Got {} (of {} required) replicas confirmed quorum {}, going to remove node",
                quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path);

            Coordination::Requests ops;
            Coordination::Responses responses;

            if (!is_parallel)
            {
                Coordination::Stat added_parts_stat;
                String old_added_parts = zookeeper->get(quorum_last_part_path, &added_parts_stat);

                ReplicatedMergeTreeQuorumAddedParts parts_with_quorum(format_version);

                if (!old_added_parts.empty())
                    parts_with_quorum.fromString(old_added_parts);

                auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
                /// We store one last part which reached quorum for each partition.
                parts_with_quorum.added_parts[part_info.partition_id] = part_name;

                String new_added_parts = parts_with_quorum.toString();

                /// Remove the status node and update last_part atomically; both requests
                /// carry the versions read above, so a concurrent change fails the multi-op.
                ops.emplace_back(zkutil::makeRemoveRequest(quorum_status_path, stat.version));
                ops.emplace_back(zkutil::makeSetRequest(quorum_last_part_path, new_added_parts, added_parts_stat.version));
            }
            else
                ops.emplace_back(zkutil::makeRemoveRequest(quorum_status_path, stat.version));

            auto code = zookeeper->tryMulti(ops, responses);

            if (code == Coordination::Error::ZOK)
            {
                break;
            }
            else if (code == Coordination::Error::ZNONODE)
            {
                /// The quorum has already been achieved.
                break;
            }
            else if (code == Coordination::Error::ZBADVERSION)
            {
                /// Node was updated meanwhile. We must re-read it and repeat all the actions.
                continue;
            }
            else
                throw Coordination::Exception(code, quorum_status_path);
        }
        else
        {
            LOG_TRACE(log, "Quorum {} still not satisfied (have only {} of {} replicas), updating node",
                quorum_status_path, quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas);
            /// We update the node, registering there one more replica.
            auto code = zookeeper->trySet(quorum_status_path, quorum_entry.toString(), stat.version);

            if (code == Coordination::Error::ZOK)
            {
                break;
            }
            else if (code == Coordination::Error::ZNONODE)
            {
                /// The quorum has already been achieved.
                break;
            }
            else if (code == Coordination::Error::ZBADVERSION)
            {
                /// Node was updated meanwhile. We must re-read it and repeat all the actions.
                continue;
            }
            else
                throw Coordination::Exception(code, quorum_status_path);
        }
    }
}
2020-04-20 10:56:59 +00:00
/// Remove the "last part with quorum" record for the given partition from ZooKeeper.
/// Uses a version-checked set (optimistic CAS) and retries on concurrent modification.
void StorageReplicatedMergeTree::cleanLastPartNode(const String & partition_id)
{
    auto zookeeper = getZooKeeper();
    /// The name of the previous part for which the quorum was reached.
    const String quorum_last_part_path = fs::path(zookeeper_path) / "quorum" / "last_part";
    /// Delete information from "last_part" node.
    while (true)
    {
        Coordination::Stat added_parts_stat;
        String old_added_parts = zookeeper->get(quorum_last_part_path, &added_parts_stat);
        ReplicatedMergeTreeQuorumAddedParts parts_with_quorum(format_version);
        if (!old_added_parts.empty())
            parts_with_quorum.fromString(old_added_parts);
        /// Delete information about particular partition.
        if (!parts_with_quorum.added_parts.contains(partition_id))
        {
            /// There is no information about interested part.
            break;
        }
        parts_with_quorum.added_parts.erase(partition_id);
        String new_added_parts = parts_with_quorum.toString();
        /// Write back with the version we read, so we notice concurrent updates.
        auto code = zookeeper->trySet(quorum_last_part_path, new_added_parts, added_parts_stat.version);
        if (code == Coordination::Error::ZOK)
        {
            break;
        }
        else if (code == Coordination::Error::ZNONODE)
        {
            /// Node is deleted. It is impossible, but it is Ok.
            break;
        }
        else if (code == Coordination::Error::ZBADVERSION)
        {
            /// Node was updated meanwhile. We must re-read it and repeat all the actions.
            continue;
        }
        else
            throw Coordination::Exception(code, quorum_last_part_path);
    }
}
2020-11-03 09:24:10 +00:00
bool StorageReplicatedMergeTree : : partIsInsertingWithParallelQuorum ( const MergeTreePartInfo & part_info ) const
2020-10-01 10:38:50 +00:00
{
auto zookeeper = getZooKeeper ( ) ;
2021-05-08 10:59:55 +00:00
return zookeeper - > exists ( fs : : path ( zookeeper_path ) / " quorum " / " parallel " / part_info . getPartName ( ) ) ;
2020-11-03 09:24:10 +00:00
}
2020-10-01 10:38:50 +00:00
2021-03-10 13:27:08 +00:00
2020-11-03 09:24:10 +00:00
bool StorageReplicatedMergeTree : : partIsLastQuorumPart ( const MergeTreePartInfo & part_info ) const
{
auto zookeeper = getZooKeeper ( ) ;
2020-10-01 10:38:50 +00:00
2021-05-08 10:59:55 +00:00
const String parts_with_quorum_path = fs : : path ( zookeeper_path ) / " quorum " / " last_part " ;
2020-10-01 10:38:50 +00:00
2020-11-03 09:24:10 +00:00
String parts_with_quorum_str = zookeeper - > get ( parts_with_quorum_path ) ;
2020-10-01 10:38:50 +00:00
2020-11-03 09:24:10 +00:00
if ( parts_with_quorum_str . empty ( ) )
return false ;
2020-10-01 10:38:50 +00:00
2020-11-03 09:24:10 +00:00
ReplicatedMergeTreeQuorumAddedParts parts_with_quorum ( format_version ) ;
parts_with_quorum . fromString ( parts_with_quorum_str ) ;
2020-10-01 10:38:50 +00:00
2020-11-03 09:24:10 +00:00
auto partition_it = parts_with_quorum . added_parts . find ( part_info . partition_id ) ;
if ( partition_it = = parts_with_quorum . added_parts . end ( ) )
return false ;
2020-10-01 10:38:50 +00:00
2020-11-03 09:24:10 +00:00
return partition_it - > second = = part_info . getPartName ( ) ;
2020-10-01 10:38:50 +00:00
}
2021-03-10 13:27:08 +00:00
2020-06-26 11:30:23 +00:00
/// Fetch a data part from another replica (or clone an identical local part instead).
///
/// part_name            - name of the part to obtain.
/// metadata_snapshot    - table metadata to validate/build the part against.
/// source_replica_path  - ZK path of the replica to fetch from.
/// to_detached          - if true, place the part into the 'detached' directory
///                        instead of committing it to the working set.
/// quorum               - if non-zero, update quorum bookkeeping after commit.
/// zookeeper_           - optional pre-acquired ZK session (falls back to getZooKeeper()).
/// try_fetch_shared     - forwarded to the fetcher; presumably enables zero-copy
///                        fetch of shared data — TODO confirm against Fetcher docs.
///
/// Returns false if the fetch was skipped (stale local part pending cleanup, the same
/// part is already being fetched, or the destination directory already exists);
/// returns true on success. Rethrows other errors.
bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot,
    const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared)
{
    auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper();
    const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);

    if (!to_detached)
    {
        /// A leftover Outdated/Deleting copy of this very part blocks the fetch;
        /// trigger cleanup and let the caller retry later.
        if (auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Outdated, MergeTreeDataPartState::Deleting}))
        {
            LOG_DEBUG(log, "Part {} should be deleted after previous attempt before fetch", part->name);
            /// Force immediate parts cleanup to delete the part that was left from the previous fetch attempt.
            cleanup_thread.wakeup();
            return false;
        }
    }

    {
        /// Deduplicate concurrent fetches of the same part.
        std::lock_guard lock(currently_fetching_parts_mutex);
        if (!currently_fetching_parts.insert(part_name).second)
        {
            LOG_DEBUG(log, "Part {} is already fetching right now", part_name);
            return false;
        }
    }

    /// Unregister the in-flight fetch on every exit path (success or exception).
    SCOPE_EXIT_MEMORY
    ({
        std::lock_guard lock(currently_fetching_parts_mutex);
        currently_fetching_parts.erase(part_name);
    });

    LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);

    auto settings_ptr = getSettings();
    TableLockHolder table_lock_holder;
    if (!to_detached)
        table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, settings_ptr->lock_acquire_timeout_for_background_operations);

    /// Logging
    Stopwatch stopwatch;
    MutableDataPartPtr part;
    DataPartsVector replaced_parts;

    /// Write a DOWNLOAD_PART entry to the part log with the given status.
    auto write_part_log = [&] (const ExecutionStatus & execution_status)
    {
        writePartLog(
            PartLogElement::DOWNLOAD_PART, execution_status, stopwatch.elapsed(),
            part_name, part, replaced_parts, nullptr);
    };

    DataPartPtr part_to_clone;
    {
        /// If the desired part is a result of a part mutation, try to find the source part and compare
        /// its checksums to the checksums of the desired part. If they match, we can just clone the local part.
        /// If we have the source part, its part_info will contain covered_part_info.
        auto covered_part_info = part_info;
        covered_part_info.mutation = 0;
        auto source_part = getActiveContainingPart(covered_part_info);

        /// Fetch for zero-copy replication is cheap and straightforward, so we don't use local clone here
        if (source_part && (!settings_ptr->allow_remote_fs_zero_copy_replication || !source_part->data_part_storage->supportZeroCopyReplication()))
        {
            auto source_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
                source_part->getColumns(), source_part->checksums);

            String part_path = fs::path(source_replica_path) / "parts" / part_name;
            String part_znode = zookeeper->get(part_path);

            /// The desired part's header either lives in the part znode itself or in
            /// separate "columns"/"checksums" child nodes (older metadata layout).
            std::optional<ReplicatedMergeTreePartHeader> desired_part_header;
            if (!part_znode.empty())
            {
                desired_part_header = ReplicatedMergeTreePartHeader::fromString(part_znode);
            }
            else
            {
                String columns_str;
                String checksums_str;
                if (zookeeper->tryGet(fs::path(part_path) / "columns", columns_str) &&
                    zookeeper->tryGet(fs::path(part_path) / "checksums", checksums_str))
                {
                    desired_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(columns_str, checksums_str);
                }
                else
                {
                    LOG_INFO(log, "Not checking checksums of part {} with replica {} because part was removed from ZooKeeper", part_name, source_replica_path);
                }
            }

            /// Checking both checksums and columns hash. For example we can have empty part
            /// with same checksums but different columns. And we attaching it exception will
            /// be thrown.
            if (desired_part_header
                && source_part_header.getColumnsHash() == desired_part_header->getColumnsHash()
                && source_part_header.getChecksums() == desired_part_header->getChecksums())
            {
                LOG_TRACE(log, "Found local part {} with the same checksums and columns hash as {}", source_part->name, part_name);
                part_to_clone = source_part;
            }
        }
    }

    ReplicatedMergeTreeAddress address;
    ConnectionTimeouts timeouts;
    String interserver_scheme;
    InterserverCredentialsPtr credentials;
    std::optional<CurrentlySubmergingEmergingTagger> tagger_ptr;
    std::function<MutableDataPartPtr()> get_part;
    MergeTreeData::HardlinkedFiles hardlinked_files;
    scope_guard part_to_clone_lock;

    if (part_to_clone)
    {
        /// Local clone path: hardlink the identical local part instead of downloading.
        get_part = [&, part_to_clone]()
        {
            auto [cloned_part, lock] = cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, false, {});
            part_to_clone_lock = std::move(lock);
            return cloned_part;
        };
    }
    else
    {
        /// Remote fetch path: resolve the source replica's interserver address.
        address.fromString(zookeeper->get(fs::path(source_replica_path) / "host"));
        timeouts = getFetchPartHTTPTimeouts(getContext());

        credentials = getContext()->getInterserverCredentials();
        interserver_scheme = getContext()->getInterserverScheme();

        get_part = [&, address, timeouts, credentials, interserver_scheme]()
        {
            if (interserver_scheme != address.scheme)
                throw Exception("Interserver schemes are different: '" + interserver_scheme
                    + "' != '" + address.scheme + "', can't fetch part from " + address.host,
                    ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH);

            return fetcher.fetchSelectedPart(
                metadata_snapshot,
                getContext(),
                part_name,
                source_replica_path,
                address.host,
                address.replication_port,
                timeouts,
                credentials->getUser(),
                credentials->getPassword(),
                interserver_scheme,
                replicated_fetches_throttler,
                to_detached,
                "",
                &tagger_ptr,
                try_fetch_shared);
        };
    }

    try
    {
        part = get_part();

        if (!to_detached)
        {
            /// Commit the fetched part into the working set; obsolete covered parts
            /// are returned in replaced_parts.
            Transaction transaction(*this, NO_TRANSACTION_RAW);
            renameTempPartAndReplace(part, transaction);

            replaced_parts = checkPartChecksumsAndCommit(transaction, part, hardlinked_files);

            /** If a quorum is tracked for this part, you must update it.
              * If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method.
              */
            if (quorum)
            {
                /// Check if this quorum insert is parallel or not
                if (zookeeper->exists(fs::path(zookeeper_path) / "quorum" / "parallel" / part_name))
                    updateQuorum(part_name, true);
                else if (zookeeper->exists(fs::path(zookeeper_path) / "quorum" / "status"))
                    updateQuorum(part_name, false);
            }

            /// merged parts that are still inserted with quorum. if it only contains one block, it hasn't been merged before
            if (part_info.level != 0 || part_info.mutation != 0)
            {
                Strings quorum_parts = zookeeper->getChildren(fs::path(zookeeper_path) / "quorum" / "parallel");
                for (const String & quorum_part : quorum_parts)
                {
                    auto quorum_part_info = MergeTreePartInfo::fromPartName(quorum_part, format_version);
                    if (part_info.contains(quorum_part_info))
                        updateQuorum(quorum_part, true);
                }
            }

            merge_selecting_task->schedule();

            for (const auto & replaced_part : replaced_parts)
            {
                LOG_DEBUG(log, "Part {} is rendered obsolete by fetching part {}", replaced_part->name, part_name);
                ProfileEvents::increment(ProfileEvents::ObsoleteReplicatedParts);
            }

            write_part_log({});
        }
        else
        {
            // The fetched part is valuable and should not be cleaned like a temp part.
            part->is_temp = false;
            part->renameTo(fs::path("detached") / part_name, true);
        }
    }
    catch (const Exception & e)
    {
        /// The same part is being written right now (but probably it's not committed yet).
        /// We will check the need for fetch later.
        if (e.code() == ErrorCodes::DIRECTORY_ALREADY_EXISTS)
        {
            LOG_TRACE(log, "Not fetching part: {}", e.message());
            return false;
        }
        throw;
    }
    catch (...)
    {
        /// Record the failure in the part log before propagating.
        if (!to_detached)
            write_part_log(ExecutionStatus::fromCurrentException());
        throw;
    }

    ProfileEvents::increment(ProfileEvents::ReplicatedPartFetches);

    if (part_to_clone)
        LOG_DEBUG(log, "Cloned part {} from {}{}", part_name, part_to_clone->name, to_detached ? " (to 'detached' directory)" : "");
    else
        LOG_DEBUG(log, "Fetched part {} from {}{}", part_name, source_replica_path, to_detached ? " (to 'detached' directory)" : "");

    return true;
}
2014-03-21 19:17:59 +00:00
2014-10-18 17:37:55 +00:00
2022-10-22 22:51:59 +00:00
bool StorageReplicatedMergeTree : : fetchExistsPart (
2022-06-26 16:43:28 +00:00
const String & part_name ,
const StorageMetadataPtr & metadata_snapshot ,
const String & source_replica_path ,
DiskPtr replaced_disk ,
String replaced_part_path )
2021-03-09 17:49:50 +00:00
{
auto zookeeper = getZooKeeper ( ) ;
const auto part_info = MergeTreePartInfo : : fromPartName ( part_name , format_version ) ;
2022-08-12 11:03:57 +00:00
if ( auto part = getPartIfExists ( part_info , { MergeTreeDataPartState : : Outdated , MergeTreeDataPartState : : Deleting } ) )
2021-03-09 17:49:50 +00:00
{
LOG_DEBUG ( log , " Part {} should be deleted after previous attempt before fetch " , part - > name ) ;
/// Force immediate parts cleanup to delete the part that was left from the previous fetch attempt.
cleanup_thread . wakeup ( ) ;
2022-10-22 22:51:59 +00:00
return false ;
2021-03-09 17:49:50 +00:00
}
{
std : : lock_guard lock ( currently_fetching_parts_mutex ) ;
if ( ! currently_fetching_parts . insert ( part_name ) . second )
{
LOG_DEBUG ( log , " Part {} is already fetching right now " , part_name ) ;
2022-10-22 22:51:59 +00:00
return false ;
2021-03-09 17:49:50 +00:00
}
}
2021-04-04 09:23:40 +00:00
SCOPE_EXIT_MEMORY
2021-03-09 17:49:50 +00:00
( {
std : : lock_guard lock ( currently_fetching_parts_mutex ) ;
currently_fetching_parts . erase ( part_name ) ;
} ) ;
2022-09-08 14:18:21 +00:00
LOG_DEBUG ( log , " Fetching already known part {} from {} " , part_name , source_replica_path ) ;
2021-03-09 17:49:50 +00:00
TableLockHolder table_lock_holder = lockForShare ( RWLockImpl : : NO_QUERY , getSettings ( ) - > lock_acquire_timeout_for_background_operations ) ;
/// Logging
Stopwatch stopwatch ;
MutableDataPartPtr part ;
DataPartsVector replaced_parts ;
auto write_part_log = [ & ] ( const ExecutionStatus & execution_status )
{
writePartLog (
PartLogElement : : DOWNLOAD_PART , execution_status , stopwatch . elapsed ( ) ,
part_name , part , replaced_parts , nullptr ) ;
} ;
std : : function < MutableDataPartPtr ( ) > get_part ;
2021-05-08 10:59:55 +00:00
ReplicatedMergeTreeAddress address ( zookeeper - > get ( fs : : path ( source_replica_path ) / " host " ) ) ;
2021-04-10 23:33:54 +00:00
auto timeouts = ConnectionTimeouts : : getHTTPTimeouts ( getContext ( ) ) ;
auto credentials = getContext ( ) - > getInterserverCredentials ( ) ;
String interserver_scheme = getContext ( ) - > getInterserverScheme ( ) ;
2021-03-09 17:49:50 +00:00
2021-04-07 13:52:11 +00:00
get_part = [ & , address , timeouts , interserver_scheme , credentials ] ( )
2021-03-10 13:27:08 +00:00
{
if ( interserver_scheme ! = address . scheme )
throw Exception ( " Interserver schemes are different: ' " + interserver_scheme
+ " ' != ' " + address . scheme + " ', can't fetch part from " + address . host ,
ErrorCodes : : INTERSERVER_SCHEME_DOESNT_MATCH ) ;
2021-03-09 17:49:50 +00:00
2022-09-08 14:18:21 +00:00
return fetcher . fetchSelectedPart (
2021-05-21 16:14:01 +00:00
metadata_snapshot , getContext ( ) , part_name , source_replica_path ,
2021-03-10 13:27:08 +00:00
address . host , address . replication_port ,
2021-05-26 20:37:44 +00:00
timeouts , credentials - > getUser ( ) , credentials - > getPassword ( ) ,
interserver_scheme , replicated_fetches_throttler , false , " " , nullptr , true ,
2021-03-10 13:27:08 +00:00
replaced_disk ) ;
} ;
2021-03-09 17:49:50 +00:00
try
{
part = get_part ( ) ;
2022-06-20 18:18:17 +00:00
if ( part - > data_part_storage - > getDiskName ( ) ! = replaced_disk - > getName ( ) )
throw Exception ( " Part " + part - > name + " fetched on wrong disk " + part - > data_part_storage - > getDiskName ( ) , ErrorCodes : : LOGICAL_ERROR ) ;
2022-06-15 16:32:22 +00:00
auto replaced_path = fs : : path ( replaced_part_path ) ;
2022-10-22 22:51:59 +00:00
part - > data_part_storage - > rename ( replaced_path . parent_path ( ) , replaced_path . filename ( ) , nullptr , true , false ) ;
2021-03-09 17:49:50 +00:00
}
catch ( const Exception & e )
{
/// The same part is being written right now (but probably it's not committed yet).
/// We will check the need for fetch later.
if ( e . code ( ) = = ErrorCodes : : DIRECTORY_ALREADY_EXISTS )
2022-08-09 16:44:51 +00:00
{
LOG_TRACE ( log , " Not fetching part: {} " , e . message ( ) ) ;
2022-10-22 22:51:59 +00:00
return false ;
2022-08-09 16:44:51 +00:00
}
2021-03-09 17:49:50 +00:00
throw ;
}
catch ( . . . )
{
write_part_log ( ExecutionStatus : : fromCurrentException ( ) ) ;
throw ;
}
ProfileEvents : : increment ( ProfileEvents : : ReplicatedPartFetches ) ;
LOG_DEBUG ( log , " Fetched part {} from {} " , part_name , source_replica_path ) ;
2022-10-22 22:51:59 +00:00
return true ;
2021-03-09 17:49:50 +00:00
}
2017-06-06 17:06:14 +00:00
void StorageReplicatedMergeTree : : startup ( )
{
2022-08-24 17:44:14 +00:00
if ( attach_thread )
2022-08-12 09:32:13 +00:00
{
2022-08-24 17:44:14 +00:00
attach_thread - > start ( ) ;
attach_thread - > waitFirstTry ( ) ;
return ;
2022-08-12 09:32:13 +00:00
}
2022-08-19 08:49:51 +00:00
startupImpl ( ) ;
2022-08-12 09:32:13 +00:00
}
2022-08-19 08:49:51 +00:00
/// Starts the replication machinery: registers the interserver part-exchange endpoint,
/// begins leader election, launches the restarting thread and subscribes to session
/// expiration events. On any failure, tears everything down and rethrows, so the caller
/// never has to call shutdown() after a failed startup.
void StorageReplicatedMergeTree::startupImpl()
{
    /// Do not start replication if ZooKeeper is not configured or there is no metadata in zookeeper
    if (!has_metadata_in_zookeeper.has_value() || !*has_metadata_in_zookeeper)
        return;

    try
    {
        auto zookeeper = getZooKeeper();

        /// Register the endpoint other replicas use to fetch parts from us.
        /// The atomic exchange publishes it for shutdown() to find; it must not be set twice.
        InterserverIOEndpointPtr data_parts_exchange_ptr = std::make_shared<DataPartsExchange::Service>(*this);
        [[maybe_unused]] auto prev_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, data_parts_exchange_ptr);
        assert(prev_ptr == nullptr);
        getContext()->getInterserverIOHandler().addEndpoint(data_parts_exchange_ptr->getId(replica_path), data_parts_exchange_ptr);

        startBeingLeader();

        /// In this thread replica will be activated.
        restarting_thread.start();

        /// And this is just a callback
        session_expired_callback_handler = EventNotifier::instance().subscribe(Coordination::Error::ZSESSIONEXPIRED, [this]()
        {
            LOG_TEST(log, "Received event for expired session. Waking up restarting thread");
            restarting_thread.start();
        });

        /// Wait while restarting_thread finishing initialization.
        /// NOTE It does not mean that replication is actually started after receiving this event.
        /// It only means that an attempt to startup replication was made.
        /// Table may be still in readonly mode if this attempt failed for any reason.
        startup_event.wait();

        startBackgroundMovesIfNeeded();

        part_moves_between_shards_orchestrator.start();
    }
    catch (...)
    {
        /// Exception safety: failed "startup" does not require a call to "shutdown" from the caller.
        /// And it should be able to safely destroy table after exception in "startup" method.
        /// It means that failed "startup" must not create any background tasks that we will have to wait.
        try
        {
            shutdown();
        }
        catch (...)
        {
            /// shutdown() failing during exception unwinding leaves the table in an
            /// unrecoverable state, so aborting is the only safe option.
            std::terminate();
        }

        /// Note: after failed "startup", the table will be in a state that only allows to destroy the object.
        throw;
    }
}
2021-12-27 15:54:28 +00:00
void StorageReplicatedMergeTree : : flush ( )
{
2021-12-28 22:03:55 +00:00
if ( flush_called . exchange ( true ) )
2021-12-27 15:54:28 +00:00
return ;
flushAllInMemoryPartsIfNeeded ( ) ;
}
2017-06-06 17:06:14 +00:00
2014-03-22 14:44:44 +00:00
/// Stops all background activity of the table. Idempotent (guarded by shutdown_called).
/// The teardown order matters: first cancel work via blockers, then stop the threads
/// that would produce more work, and only then drain the part-exchange endpoint.
void StorageReplicatedMergeTree::shutdown()
{
    if (shutdown_called.exchange(true))
        return;

    session_expired_callback_handler.reset();

    /// Cancel fetches, merges and mutations to force the queue_task to finish ASAP.
    fetcher.blocker.cancelForever();
    merger_mutator.merges_blocker.cancelForever();
    parts_mover.moves_blocker.cancelForever();
    mutations_finalizing_task->deactivate();
    stopBeingLeader();

    if (attach_thread)
        attach_thread->shutdown();

    restarting_thread.shutdown();
    background_operations_assignee.finish();
    part_moves_between_shards_orchestrator.shutdown();

    {
        auto lock = queue.lockQueue();
        /// Cancel logs pulling after background task were cancelled. It's still
        /// required because we can trigger pullLogsToQueue during manual OPTIMIZE,
        /// MUTATE, etc. query.
        queue.pull_log_blocker.cancelForever();
    }

    background_moves_assignee.finish();

    /// Unpublish the endpoint so shutdown() can't race with startupImpl()'s registration.
    auto data_parts_exchange_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, InterserverIOEndpointPtr{});
    if (data_parts_exchange_ptr)
    {
        getContext()->getInterserverIOHandler().removeEndpointIfExists(data_parts_exchange_ptr->getId(replica_path));
        /// Ask all parts exchange handlers to finish asap. New ones will fail to start
        data_parts_exchange_ptr->blocker.cancelForever();
        /// Wait for all of them
        std::lock_guard lock(data_parts_exchange_ptr->rwlock);
    }
}
2014-03-22 14:44:44 +00:00
StorageReplicatedMergeTree::~StorageReplicatedMergeTree()
{
    /// Destructors must not throw: attempt a clean shutdown and log (not rethrow) any failure.
    try
    {
        shutdown();
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
2014-10-18 17:37:55 +00:00
2019-10-28 17:27:43 +00:00
/// Computes, per partition, the maximum block number that may be read under
/// `select_sequential_consistency`, consulting the quorum state in ZooKeeper.
ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMergeTree::getMaxAddedBlocks() const
{
    ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock max_added_blocks;

    for (const auto & data_part : getDataPartsForInternalUsage())
    {
        auto & partition_max = max_added_blocks[data_part->info.partition_id];
        partition_max = std::max(partition_max, data_part->info.max_block);
    }

    auto zookeeper = getZooKeeper();

    /// A pending quorum insert means its blocks are not yet visible for consistent reads.
    const String quorum_status_path = fs::path(zookeeper_path) / "quorum" / "status";

    String quorum_status_str;
    Coordination::Stat stat;
    if (zookeeper->tryGet(quorum_status_path, quorum_status_str, &stat))
    {
        ReplicatedMergeTreeQuorumEntry quorum_entry;
        quorum_entry.fromString(quorum_status_str);
        auto pending_part_info = MergeTreePartInfo::fromPartName(quorum_entry.part_name, format_version);
        max_added_blocks[pending_part_info.partition_id] = pending_part_info.max_block - 1;
    }

    String added_parts_str;
    if (zookeeper->tryGet(fs::path(zookeeper_path) / "quorum" / "last_part", added_parts_str) && !added_parts_str.empty())
    {
        ReplicatedMergeTreeQuorumAddedParts part_with_quorum(format_version);
        part_with_quorum.fromString(added_parts_str);

        /// Every part confirmed by quorum must be present locally, otherwise this
        /// replica cannot serve a sequentially-consistent read.
        for (const auto & added_part : part_with_quorum.added_parts)
        {
            if (!getActiveContainingPart(added_part.second))
                throw Exception(ErrorCodes::REPLICA_IS_NOT_IN_QUORUM,
                    "Replica doesn't have part '{}' which was successfully written to quorum of other replicas. "
                    "Send query to another replica or disable 'select_sequential_consistency' setting", added_part.second);
        }

        for (const auto & max_block : part_with_quorum.getMaxInsertedBlocks())
            max_added_blocks[max_block.first] = max_block.second;
    }

    return max_added_blocks;
}
2020-10-01 17:34:22 +00:00
void StorageReplicatedMergeTree : : read (
QueryPlan & query_plan ,
2014-12-17 11:53:17 +00:00
const Names & column_names ,
2021-07-09 03:15:41 +00:00
const StorageSnapshotPtr & storage_snapshot ,
2020-09-20 17:52:17 +00:00
SelectQueryInfo & query_info ,
2021-04-10 23:33:54 +00:00
ContextPtr local_context ,
2021-02-10 14:12:49 +00:00
QueryProcessingStage : : Enum processed_stage ,
2019-02-18 23:38:44 +00:00
const size_t max_block_size ,
2017-06-02 15:54:39 +00:00
const unsigned num_streams )
2014-03-22 14:44:44 +00:00
{
2021-12-09 10:39:28 +00:00
/// If true, then we will ask initiator if we can read chosen ranges
const bool enable_parallel_reading = local_context - > getClientInfo ( ) . collaborate_with_initiator ;
2022-06-07 18:35:02 +00:00
SCOPE_EXIT ( {
/// Now, copy of parts that is required for the query, stored in the processors,
/// while snapshot_data.parts includes all parts, even one that had been filtered out with partition pruning,
/// reset them to avoid holding them.
auto & snapshot_data = assert_cast < MergeTreeData : : SnapshotData & > ( * storage_snapshot - > data ) ;
snapshot_data . parts = { } ;
} ) ;
2017-05-24 21:38:56 +00:00
/** The `select_sequential_consistency` setting has two meanings:
* 1. To throw an exception if on a replica there are not all parts which have been written down on quorum of remaining replicas .
* 2. Do not read parts that have not yet been written to the quorum of the replicas .
* For this you have to synchronously go to ZooKeeper .
*/
2021-04-10 23:33:54 +00:00
if ( local_context - > getSettingsRef ( ) . select_sequential_consistency )
2014-07-28 09:53:57 +00:00
{
2021-05-27 16:53:58 +00:00
auto max_added_blocks = std : : make_shared < ReplicatedMergeTreeQuorumAddedParts : : PartitionIdToMaxBlock > ( getMaxAddedBlocks ( ) ) ;
2021-02-10 14:12:49 +00:00
if ( auto plan = reader . read (
2021-12-17 16:02:29 +00:00
column_names , storage_snapshot , query_info , local_context ,
2021-12-09 10:39:28 +00:00
max_block_size , num_streams , processed_stage , std : : move ( max_added_blocks ) , enable_parallel_reading ) )
2020-11-10 10:26:26 +00:00
query_plan = std : : move ( * plan ) ;
2020-10-01 17:34:22 +00:00
return ;
2019-10-28 17:27:43 +00:00
}
2018-09-19 14:34:41 +00:00
2021-12-09 10:39:28 +00:00
if ( auto plan = reader . read (
2021-12-17 16:02:29 +00:00
column_names , storage_snapshot , query_info , local_context ,
2021-12-09 10:39:28 +00:00
max_block_size , num_streams , processed_stage , nullptr , enable_parallel_reading ) )
{
2020-11-10 10:26:26 +00:00
query_plan = std : : move ( * plan ) ;
2021-12-09 10:39:28 +00:00
}
2020-10-01 17:34:22 +00:00
}
2020-03-29 08:50:27 +00:00
template < class Func >
2021-12-30 14:27:22 +00:00
void StorageReplicatedMergeTree : : foreachActiveParts ( Func & & func , bool select_sequential_consistency ) const
2019-10-28 17:27:43 +00:00
{
2020-09-30 23:50:58 +00:00
std : : optional < ReplicatedMergeTreeQuorumAddedParts : : PartitionIdToMaxBlock > max_added_blocks = { } ;
2020-10-04 09:42:03 +00:00
/**
* Synchronously go to ZooKeeper when select_sequential_consistency enabled
*/
2020-11-25 13:47:32 +00:00
if ( select_sequential_consistency )
2020-09-30 23:50:58 +00:00
max_added_blocks = getMaxAddedBlocks ( ) ;
2019-10-28 17:27:43 +00:00
auto lock = lockParts ( ) ;
2021-11-17 18:14:14 +00:00
/// TODO Transactions: should we count visible parts only?
2021-12-30 14:27:22 +00:00
for ( const auto & part : getDataPartsStateRange ( DataPartState : : Active ) )
2019-10-28 17:27:43 +00:00
{
if ( part - > isEmpty ( ) )
continue ;
2018-10-18 11:51:40 +00:00
2020-09-30 23:50:58 +00:00
if ( max_added_blocks )
{
auto blocks_iterator = max_added_blocks - > find ( part - > info . partition_id ) ;
if ( blocks_iterator = = max_added_blocks - > end ( ) | | part - > info . max_block > blocks_iterator - > second )
continue ;
}
2018-08-21 14:26:20 +00:00
2020-03-29 08:50:27 +00:00
func ( part ) ;
2014-04-24 10:20:02 +00:00
}
2020-03-29 08:50:27 +00:00
}
2020-11-25 13:47:32 +00:00
std : : optional < UInt64 > StorageReplicatedMergeTree : : totalRows ( const Settings & settings ) const
2020-03-29 08:50:27 +00:00
{
UInt64 res = 0 ;
2021-12-30 14:27:22 +00:00
foreachActiveParts ( [ & res ] ( auto & part ) { res + = part - > rows_count ; } , settings . select_sequential_consistency ) ;
2020-03-29 08:50:27 +00:00
return res ;
}
2021-04-10 23:33:54 +00:00
std : : optional < UInt64 > StorageReplicatedMergeTree : : totalRowsByPartitionPredicate ( const SelectQueryInfo & query_info , ContextPtr local_context ) const
2020-09-21 10:13:01 +00:00
{
2021-03-03 08:36:20 +00:00
DataPartsVector parts ;
2021-12-30 14:27:22 +00:00
foreachActiveParts ( [ & ] ( auto & part ) { parts . push_back ( part ) ; } , local_context - > getSettingsRef ( ) . select_sequential_consistency ) ;
2021-04-10 23:33:54 +00:00
return totalRowsByPartitionPredicateImpl ( query_info , local_context , parts ) ;
2020-09-21 10:13:01 +00:00
}
2020-11-25 13:47:32 +00:00
std : : optional < UInt64 > StorageReplicatedMergeTree : : totalBytes ( const Settings & settings ) const
2020-03-29 08:50:27 +00:00
{
UInt64 res = 0 ;
2021-12-30 14:27:22 +00:00
foreachActiveParts ( [ & res ] ( auto & part ) { res + = part - > getBytesOnDisk ( ) ; } , settings . select_sequential_consistency ) ;
2019-10-28 17:27:43 +00:00
return res ;
2014-03-22 14:44:44 +00:00
}
2014-10-18 17:37:55 +00:00
2016-01-17 08:12:48 +00:00
void StorageReplicatedMergeTree : : assertNotReadonly ( ) const
2014-04-02 10:10:37 +00:00
{
2014-12-11 02:04:13 +00:00
if ( is_readonly )
2022-04-04 22:51:48 +00:00
throw Exception ( ErrorCodes : : TABLE_IS_READ_ONLY , " Table is in readonly mode (replica path: {}) " , replica_path) ;
2016-01-17 08:12:48 +00:00
}
2021-07-23 19:33:59 +00:00
/// Creates the INSERT sink for this table, propagating quorum and deduplication
/// settings from the query context.
SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context)
{
    const auto storage_settings_ptr = getSettings();
    assertNotReadonly();

    const Settings & query_settings = local_context->getSettingsRef();

    /// Deduplication requires both a non-zero deduplication window on the table
    /// and the query-level insert_deduplicate setting.
    const bool dedup_enabled = storage_settings_ptr->replicated_deduplication_window != 0
        && query_settings.insert_deduplicate;

    // TODO: should we also somehow pass list of columns to deduplicate on to the ReplicatedMergeTreeSink?
    const auto quorum_size = query_settings.insert_quorum.valueOr(0);
    const auto quorum_timeout_ms = query_settings.insert_quorum_timeout.totalMilliseconds();

    return std::make_shared<ReplicatedMergeTreeSink>(
        *this,
        metadata_snapshot,
        quorum_size,
        quorum_timeout_ms,
        query_settings.max_partitions_per_insert_block,
        query_settings.insert_quorum_parallel,
        dedup_enabled,
        query_settings.insert_quorum.is_auto,
        local_context);
}
2014-03-22 14:44:44 +00:00
2014-10-18 17:37:55 +00:00
2020-06-12 18:24:32 +00:00
/// Handles OPTIMIZE [PARTITION ...] [FINAL]: assigns one merge per requested partition
/// (or one overall merge), then waits for the created log entries to be processed.
/// Returns false when no merge could be assigned (unless optimize_throw_if_noop throws).
bool StorageReplicatedMergeTree::optimize(
    const ASTPtr &,
    const StorageMetadataPtr &,
    const ASTPtr & partition,
    bool final,
    bool deduplicate,
    const Names & deduplicate_by_columns,
    ContextPtr query_context)
{
    /// NOTE: exclusive lock cannot be used here, since this may lead to deadlock (see comments below),
    /// but it should be safe to use non-exclusive to avoid dropping parts that may be required for processing queue.
    auto table_lock = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);

    assertNotReadonly();

    if (!is_leader)
        throw Exception("OPTIMIZE cannot be done on this replica because it is not a leader", ErrorCodes::NOT_A_LEADER);

    /// Turns a "nothing to do" outcome into either `false` or an exception,
    /// depending on the optimize_throw_if_noop setting.
    auto handle_noop = [&](const String & message)
    {
        if (query_context->getSettingsRef().optimize_throw_if_noop)
            throw Exception(message, ErrorCodes::CANNOT_ASSIGN_OPTIMIZE);
        return false;
    };

    auto zookeeper = getZooKeeperAndAssertNotReadonly();
    const auto storage_settings_ptr = getSettings();
    auto metadata_snapshot = getInMemoryMetadataPtr();
    std::vector<ReplicatedMergeTreeLogEntryData> merge_entries;

    /// Tries to select parts and create a merge log entry for one partition
    /// (empty partition_id means "any partition"). Retries when the replication
    /// log was concurrently updated between selection and entry creation.
    auto try_assign_merge = [&](const String & partition_id) -> bool
    {
        constexpr size_t max_retries = 10;
        size_t try_no = 0;
        for (; try_no < max_retries; ++try_no)
        {
            /// We must select parts for merge under merge_selecting_mutex because other threads
            /// (merge_selecting_thread or OPTIMIZE queries) could assign new merges.
            std::lock_guard merge_selecting_lock(merge_selecting_mutex);
            ReplicatedMergeTreeMergePredicate can_merge = queue.getMergePredicate(zookeeper);

            auto future_merged_part = std::make_shared<FutureMergedMutatedPart>();
            if (storage_settings.get()->assign_part_uuids)
                future_merged_part->uuid = UUIDHelpers::generateV4();

            constexpr const char * unknown_disable_reason = "unknown reason";
            String disable_reason = unknown_disable_reason;
            SelectPartsDecision select_decision = SelectPartsDecision::CANNOT_SELECT;

            if (partition_id.empty())
            {
                select_decision = merger_mutator.selectPartsToMerge(
                    future_merged_part, /* aggressive */ true, storage_settings_ptr->max_bytes_to_merge_at_max_space_in_pool,
                    can_merge, /* merge_with_ttl_allowed */ false, NO_TRANSACTION_PTR, &disable_reason);
            }
            else
            {
                select_decision = merger_mutator.selectAllPartsToMergeWithinPartition(
                    future_merged_part, can_merge, partition_id, final, metadata_snapshot, NO_TRANSACTION_PTR,
                    &disable_reason, query_context->getSettingsRef().optimize_skip_merged_partitions);
            }

            /// If there is nothing to merge then we treat this merge as successful (needed for optimize final optimization)
            if (select_decision == SelectPartsDecision::NOTHING_TO_MERGE)
                return false;

            if (select_decision != SelectPartsDecision::SELECTED)
            {
                constexpr const char * message_fmt = "Cannot select parts for optimization: {}";
                assert(disable_reason != unknown_disable_reason);
                if (!partition_id.empty())
                    disable_reason += fmt::format(" (in partition {})", partition_id);
                String message = fmt::format(message_fmt, disable_reason);
                LOG_INFO(log, fmt::runtime(message));
                return handle_noop(message);
            }

            ReplicatedMergeTreeLogEntryData merge_entry;
            CreateMergeEntryResult create_result = createLogEntryToMergeParts(
                zookeeper, future_merged_part->parts,
                future_merged_part->name, future_merged_part->uuid, future_merged_part->type,
                deduplicate, deduplicate_by_columns,
                &merge_entry, can_merge.getVersion(), future_merged_part->merge_type);

            if (create_result == CreateMergeEntryResult::MissingPart)
            {
                String message = "Can't create merge queue node in ZooKeeper, because some parts are missing";
                LOG_TRACE(log, fmt::runtime(message));
                return handle_noop(message);
            }

            /// The log moved under us; re-select parts with a fresh predicate.
            if (create_result == CreateMergeEntryResult::LogUpdated)
                continue;

            merge_entries.push_back(std::move(merge_entry));
            return true;
        }

        assert(try_no == max_retries);
        String message = fmt::format("Can't create merge queue node in ZooKeeper, because log was updated in every of {} tries", try_no);
        LOG_TRACE(log, fmt::runtime(message));
        return handle_noop(message);
    };

    bool assigned = false;
    if (!partition && final)
    {
        /// OPTIMIZE FINAL without a partition clause: assign a merge in every partition.
        DataPartsVector data_parts = getVisibleDataPartsVector(query_context);
        std::unordered_set<String> partition_ids;

        for (const DataPartPtr & part : data_parts)
            partition_ids.emplace(part->info.partition_id);

        for (const String & partition_id : partition_ids)
        {
            assigned = try_assign_merge(partition_id);
            if (!assigned)
                break;
        }
    }
    else
    {
        String partition_id;
        if (partition)
            partition_id = getPartitionIDFromQuery(partition, query_context);
        assigned = try_assign_merge(partition_id);
    }

    /// Release the shared table lock before waiting for replication, so that a
    /// concurrent DROP TABLE is not blocked for the whole wait — presumably the
    /// deadlock scenario referenced in the NOTE above; confirm against VCS history.
    table_lock.reset();
    for (auto & merge_entry : merge_entries)
        waitForLogEntryToBeProcessedIfNecessary(merge_entry, query_context);

    return assigned;
}
2020-01-30 12:54:52 +00:00
bool StorageReplicatedMergeTree : : executeMetadataAlter ( const StorageReplicatedMergeTree : : LogEntry & entry )
{
2021-04-28 17:49:27 +00:00
if ( entry . alter_version < metadata_version )
{
/// TODO Can we replace it with LOGICAL_ERROR?
2022-09-05 01:50:24 +00:00
/// As for now, it may rarely happen due to reordering of ALTER_METADATA entries in the queue of
2021-05-07 13:39:21 +00:00
/// non-initial replica and also may happen after stale replica recovery.
2021-04-28 17:49:27 +00:00
LOG_WARNING ( log , " Attempt to update metadata of version {} "
" to older version {} when processing log entry {}: {} " ,
metadata_version , entry . alter_version , entry . znode_name , entry . toString ( ) ) ;
return true ;
}
2020-01-30 12:54:52 +00:00
auto zookeeper = getZooKeeper ( ) ;
auto columns_from_entry = ColumnsDescription : : parse ( entry . columns_str ) ;
auto metadata_from_entry = ReplicatedMergeTreeTableMetadata : : parse ( entry . metadata_str ) ;
MergeTreeData : : DataParts parts ;
/// If metadata nodes have changed, we will update table structure locally.
2020-02-13 13:13:23 +00:00
Coordination : : Requests requests ;
2021-05-08 10:59:55 +00:00
requests . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( replica_path ) / " columns " , entry . columns_str , - 1 ) ) ;
requests . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( replica_path ) / " metadata " , entry . metadata_str , - 1 ) ) ;
2020-02-01 12:46:22 +00:00
2022-07-29 16:33:16 +00:00
auto table_id = getStorageID ( ) ;
auto alter_context = getContext ( ) ;
auto database = DatabaseCatalog : : instance ( ) . getDatabase ( table_id . database_name ) ;
bool is_in_replicated_database = database - > getEngineName ( ) = = " Replicated " ;
if ( is_in_replicated_database )
{
auto mutable_alter_context = Context : : createCopy ( getContext ( ) ) ;
const auto * replicated = dynamic_cast < const DatabaseReplicated * > ( database . get ( ) ) ;
mutable_alter_context - > makeQueryContext ( ) ;
auto alter_txn = std : : make_shared < ZooKeeperMetadataTransaction > ( zookeeper , replicated - > getZooKeeperPath ( ) ,
/* is_initial_query */ false , /* task_zk_path */ " " ) ;
mutable_alter_context - > initZooKeeperMetadataTransaction ( alter_txn ) ;
alter_context = mutable_alter_context ;
for ( auto & op : requests )
alter_txn - > addOp ( std : : move ( op ) ) ;
requests . clear ( ) ;
/// Requests will be executed by database in setTableStructure
}
else
{
zookeeper - > multi ( requests ) ;
}
2020-01-30 12:54:52 +00:00
2020-02-13 13:13:23 +00:00
{
2021-10-25 17:49:49 +00:00
auto table_lock_holder = lockForShare ( RWLockImpl : : NO_QUERY , getSettings ( ) - > lock_acquire_timeout_for_background_operations ) ;
auto alter_lock_holder = lockForAlter ( getSettings ( ) - > lock_acquire_timeout_for_background_operations ) ;
2020-02-13 13:13:23 +00:00
LOG_INFO ( log , " Metadata changed in ZooKeeper. Applying changes locally. " ) ;
2020-01-30 12:54:52 +00:00
2022-01-27 08:33:40 +00:00
auto metadata_diff = ReplicatedMergeTreeTableMetadata ( * this , getInMemoryMetadataPtr ( ) ) . checkAndFindDiff ( metadata_from_entry , getInMemoryMetadataPtr ( ) - > getColumns ( ) , getContext ( ) ) ;
2022-07-29 16:33:16 +00:00
setTableStructure ( table_id , alter_context , std : : move ( columns_from_entry ) , metadata_diff ) ;
2020-02-13 13:13:23 +00:00
metadata_version = entry . alter_version ;
2020-01-30 12:54:52 +00:00
2020-05-23 22:24:01 +00:00
LOG_INFO ( log , " Applied changes to the metadata of the table. Current metadata version: {} " , metadata_version ) ;
2020-01-30 12:54:52 +00:00
}
2020-01-31 12:25:31 +00:00
2022-09-13 22:43:59 +00:00
{
/// Reset Object columns, because column of type
/// Object may be added or dropped by alter.
auto parts_lock = lockParts ( ) ;
resetObjectColumnsFromActiveParts ( parts_lock ) ;
}
2020-02-17 16:33:05 +00:00
/// This transaction may not happen, but it's OK, because on the next retry we will eventually create/update this node
2022-07-29 16:33:16 +00:00
/// TODO Maybe do in in one transaction for Replicated database?
2021-05-08 10:59:55 +00:00
zookeeper - > createOrUpdate ( fs : : path ( replica_path ) / " metadata_version " , std : : to_string ( metadata_version ) , zkutil : : CreateMode : : Persistent ) ;
2020-02-14 10:17:04 +00:00
2020-01-30 12:54:52 +00:00
return true ;
}
2014-10-18 17:37:55 +00:00
2020-11-10 10:23:46 +00:00
/// Reserves block numbers in every partition a mutation touches and returns them together
/// with the holder that keeps the ephemeral locks alive. An empty set of affected partitions
/// means "all partitions". Returns an empty holder if allocation fails for the single-partition case.
PartitionBlockNumbersHolder StorageReplicatedMergeTree::allocateBlockNumbersInAffectedPartitions(
    const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const
{
    const std::set<String> affected_partition_ids = getPartitionIdsAffectedByCommands(commands, query_context);

    if (affected_partition_ids.size() == 1)
    {
        /// Fast path: exactly one partition is affected, take a single block-number lock.
        const auto & partition_id = *affected_partition_ids.cbegin();
        auto number_lock = allocateBlockNumber(partition_id, zookeeper);
        if (!number_lock.has_value())
            return {};

        /// Read the number before moving the holder to avoid possible UB due to std::move.
        const auto allocated_number = number_lock->getNumber();
        return {{{partition_id, allocated_number}}, std::move(number_lock)};
    }

    /// TODO: Implement optimal block number acquisition algorithm in multiple (but not all) partitions
    EphemeralLocksInAllPartitions locks_in_all_partitions(
        fs::path(zookeeper_path) / "block_numbers", "block-", fs::path(zookeeper_path) / "temp", *zookeeper);

    PartitionBlockNumbersHolder::BlockNumbersType numbers_by_partition;
    for (const auto & lock : locks_in_all_partitions.getLocks())
        if (affected_partition_ids.empty() || affected_partition_ids.contains(lock.partition_id))
            numbers_by_partition[lock.partition_id] = lock.number;

    return {std::move(numbers_by_partition), std::move(locks_in_all_partitions)};
}
2019-03-05 10:12:20 +00:00
void StorageReplicatedMergeTree : : alter (
2021-10-25 17:49:49 +00:00
const AlterCommands & commands , ContextPtr query_context , AlterLockHolder & table_lock_holder )
2014-07-16 08:58:59 +00:00
{
2016-01-17 08:12:48 +00:00
assertNotReadonly ( ) ;
2019-12-10 20:47:05 +00:00
auto table_id = getStorageID ( ) ;
2019-08-26 14:50:34 +00:00
2020-08-27 13:10:10 +00:00
if ( commands . isSettingsAlter ( ) )
2019-08-06 13:04:29 +00:00
{
2019-08-26 18:08:58 +00:00
/// We don't replicate storage_settings_ptr ALTER. It's local operation.
2019-08-06 16:29:31 +00:00
/// Also we don't upgrade alter lock to table structure lock.
2020-06-08 18:23:26 +00:00
StorageInMemoryMetadata future_metadata = getInMemoryMetadata ( ) ;
2020-08-27 13:10:10 +00:00
commands . apply ( future_metadata , query_context ) ;
2020-02-14 13:17:50 +00:00
2020-09-18 10:57:33 +00:00
merge_strategy_picker . refreshState ( ) ;
2020-06-08 18:23:26 +00:00
changeSettings ( future_metadata . settings_changes , table_lock_holder ) ;
2019-08-27 09:34:53 +00:00
2020-06-08 18:23:26 +00:00
DatabaseCatalog : : instance ( ) . getDatabase ( table_id . database_name ) - > alterTable ( query_context , table_id , future_metadata ) ;
2019-08-06 13:04:29 +00:00
return ;
}
2020-06-12 18:24:32 +00:00
auto ast_to_str = [ ] ( ASTPtr query ) - > String
{
2019-12-27 14:36:59 +00:00
if ( ! query )
return " " ;
return queryToString ( query ) ;
} ;
2022-02-03 10:10:05 +00:00
const auto zookeeper = getZooKeeperAndAssertNotReadonly ( ) ;
2020-01-31 12:25:31 +00:00
2020-02-13 16:16:09 +00:00
std : : optional < ReplicatedMergeTreeLogEntryData > alter_entry ;
2020-01-31 19:30:33 +00:00
std : : optional < String > mutation_znode ;
2014-07-16 08:58:59 +00:00
2020-02-17 12:47:34 +00:00
while ( true )
2014-07-16 08:58:59 +00:00
{
2020-02-13 16:16:09 +00:00
/// Clear nodes from previous iteration
alter_entry . emplace ( ) ;
2020-02-14 10:17:04 +00:00
mutation_znode . reset ( ) ;
2020-02-13 16:16:09 +00:00
2020-06-16 16:55:04 +00:00
auto current_metadata = getInMemoryMetadataPtr ( ) ;
2017-04-01 07:20:54 +00:00
2020-06-16 16:55:04 +00:00
StorageInMemoryMetadata future_metadata = * current_metadata ;
2020-08-27 13:10:10 +00:00
commands . apply ( future_metadata , query_context ) ;
2017-04-01 07:20:54 +00:00
2020-06-16 16:55:04 +00:00
ReplicatedMergeTreeTableMetadata future_metadata_in_zk ( * this , current_metadata ) ;
if ( ast_to_str ( future_metadata . sorting_key . definition_ast ) ! = ast_to_str ( current_metadata - > sorting_key . definition_ast ) )
2020-10-15 13:02:39 +00:00
{
/// We serialize definition_ast as list, because code which apply ALTER (setTableStructure) expect serialized non empty expression
/// list here and we cannot change this representation for compatibility. Also we have preparsed AST `sorting_key.expression_list_ast`
/// in KeyDescription, but it contain version column for VersionedCollapsingMergeTree, which shouldn't be defined as a part of key definition AST.
/// So the best compatible way is just to convert definition_ast to list and serialize it. In all other places key.expression_list_ast should be used.
future_metadata_in_zk . sorting_key = serializeAST ( * extractKeyExpressionList ( future_metadata . sorting_key . definition_ast ) ) ;
}
2019-04-15 09:30:45 +00:00
2020-08-27 13:10:10 +00:00
if ( ast_to_str ( future_metadata . sampling_key . definition_ast ) ! = ast_to_str ( current_metadata - > sampling_key . definition_ast ) )
2020-10-15 13:02:39 +00:00
future_metadata_in_zk . sampling_expression = serializeAST ( * extractKeyExpressionList ( future_metadata . sampling_key . definition_ast ) ) ;
2020-08-27 13:10:10 +00:00
2020-06-27 19:31:11 +00:00
if ( ast_to_str ( future_metadata . partition_key . definition_ast ) ! = ast_to_str ( current_metadata - > partition_key . definition_ast ) )
2020-10-15 13:02:39 +00:00
future_metadata_in_zk . partition_key = serializeAST ( * extractKeyExpressionList ( future_metadata . partition_key . definition_ast ) ) ;
2020-06-27 19:31:11 +00:00
2020-06-16 16:55:04 +00:00
if ( ast_to_str ( future_metadata . table_ttl . definition_ast ) ! = ast_to_str ( current_metadata - > table_ttl . definition_ast ) )
2020-09-20 13:27:33 +00:00
{
if ( future_metadata . table_ttl . definition_ast )
future_metadata_in_zk . ttl_table = serializeAST ( * future_metadata . table_ttl . definition_ast ) ;
else /// TTL was removed
future_metadata_in_zk . ttl_table = " " ;
}
2019-02-05 14:50:25 +00:00
2020-06-01 12:11:23 +00:00
String new_indices_str = future_metadata . secondary_indices . toString ( ) ;
2020-06-16 16:55:04 +00:00
if ( new_indices_str ! = current_metadata - > secondary_indices . toString ( ) )
2020-02-13 16:16:09 +00:00
future_metadata_in_zk . skip_indices = new_indices_str ;
2019-08-14 19:51:03 +00:00
2021-05-16 08:49:38 +00:00
String new_projections_str = future_metadata . projections . toString ( ) ;
if ( new_projections_str ! = current_metadata - > projections . toString ( ) )
future_metadata_in_zk . projections = new_projections_str ;
2020-02-13 16:16:09 +00:00
String new_constraints_str = future_metadata . constraints . toString ( ) ;
2020-06-16 16:55:04 +00:00
if ( new_constraints_str ! = current_metadata - > constraints . toString ( ) )
2020-02-13 16:16:09 +00:00
future_metadata_in_zk . constraints = new_constraints_str ;
2017-04-01 07:20:54 +00:00
2018-11-01 13:30:38 +00:00
Coordination : : Requests ops ;
2020-11-24 10:24:39 +00:00
size_t alter_path_idx = std : : numeric_limits < size_t > : : max ( ) ;
size_t mutation_path_idx = std : : numeric_limits < size_t > : : max ( ) ;
2017-04-01 07:20:54 +00:00
2020-02-13 16:16:09 +00:00
String new_metadata_str = future_metadata_in_zk . toString ( ) ;
2021-05-08 10:59:55 +00:00
ops . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( zookeeper_path ) / " metadata " , new_metadata_str , metadata_version ) ) ;
2019-03-05 10:12:20 +00:00
2020-02-13 16:16:09 +00:00
String new_columns_str = future_metadata . columns . toString ( ) ;
2021-05-08 10:59:55 +00:00
ops . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( zookeeper_path ) / " columns " , new_columns_str , - 1 ) ) ;
2017-04-01 07:20:54 +00:00
2020-06-16 16:55:04 +00:00
if ( ast_to_str ( current_metadata - > settings_changes ) ! = ast_to_str ( future_metadata . settings_changes ) )
2018-11-01 13:30:38 +00:00
{
2020-02-13 16:16:09 +00:00
/// Just change settings
2020-06-16 16:55:04 +00:00
StorageInMemoryMetadata metadata_copy = * current_metadata ;
metadata_copy . settings_changes = future_metadata . settings_changes ;
changeSettings ( metadata_copy . settings_changes , table_lock_holder ) ;
DatabaseCatalog : : instance ( ) . getDatabase ( table_id . database_name ) - > alterTable ( query_context , table_id , metadata_copy ) ;
2018-11-01 13:30:38 +00:00
}
2017-04-01 07:20:54 +00:00
2020-06-27 19:05:00 +00:00
/// We can be sure, that in case of successful commit in zookeeper our
2020-02-13 16:16:09 +00:00
/// version will increments by 1. Because we update with version check.
int new_metadata_version = metadata_version + 1 ;
2017-04-01 07:20:54 +00:00
2020-02-13 16:16:09 +00:00
alter_entry - > type = LogEntry : : ALTER_METADATA ;
alter_entry - > source_replica = replica_name ;
alter_entry - > metadata_str = new_metadata_str ;
alter_entry - > columns_str = new_columns_str ;
alter_entry - > alter_version = new_metadata_version ;
alter_entry - > create_time = time ( nullptr ) ;
2017-04-01 07:20:54 +00:00
2020-08-27 13:10:10 +00:00
auto maybe_mutation_commands = commands . getMutationCommands (
2021-04-10 23:33:54 +00:00
* current_metadata , query_context - > getSettingsRef ( ) . materialize_ttl_after_modify , query_context ) ;
2021-04-28 17:49:27 +00:00
bool have_mutation = ! maybe_mutation_commands . empty ( ) ;
alter_entry - > have_mutation = have_mutation ;
2017-04-01 07:20:54 +00:00
2020-11-24 10:24:39 +00:00
alter_path_idx = ops . size ( ) ;
2020-06-12 18:24:32 +00:00
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( zookeeper_path ) / " log/log- " , alter_entry - > toString ( ) , zkutil : : CreateMode : : PersistentSequential ) ) ;
2018-11-01 13:30:38 +00:00
2020-11-10 10:23:46 +00:00
PartitionBlockNumbersHolder partition_block_numbers_holder ;
2021-04-28 17:49:27 +00:00
if ( have_mutation )
2020-01-31 12:25:31 +00:00
{
2021-05-08 10:59:55 +00:00
const String mutations_path ( fs : : path ( zookeeper_path ) / " mutations " ) ;
2018-11-01 13:30:38 +00:00
2020-02-05 11:18:11 +00:00
ReplicatedMergeTreeMutationEntry mutation_entry ;
2020-02-13 16:16:09 +00:00
mutation_entry . alter_version = new_metadata_version ;
2020-11-10 10:23:46 +00:00
mutation_entry . source_replica = replica_name ;
mutation_entry . commands = std : : move ( maybe_mutation_commands ) ;
2020-02-05 11:18:11 +00:00
Coordination : : Stat mutations_stat ;
zookeeper - > get ( mutations_path , & mutations_stat ) ;
2018-11-01 13:30:38 +00:00
2020-11-10 10:23:46 +00:00
partition_block_numbers_holder =
2021-05-28 13:27:36 +00:00
allocateBlockNumbersInAffectedPartitions ( mutation_entry . commands , query_context , zookeeper ) ;
2018-11-01 13:30:38 +00:00
2020-11-10 10:23:46 +00:00
mutation_entry . block_numbers = partition_block_numbers_holder . getBlockNumbers ( ) ;
2020-02-05 11:18:11 +00:00
mutation_entry . create_time = time ( nullptr ) ;
2014-07-16 08:58:59 +00:00
2020-02-05 11:18:11 +00:00
ops . emplace_back ( zkutil : : makeSetRequest ( mutations_path , String ( ) , mutations_stat . version ) ) ;
2020-11-24 10:24:39 +00:00
mutation_path_idx = ops . size ( ) ;
2020-02-05 11:18:11 +00:00
ops . emplace_back (
2021-05-08 10:59:55 +00:00
zkutil : : makeCreateRequest ( fs : : path ( mutations_path ) / " " , mutation_entry . toString ( ) , zkutil : : CreateMode : : PersistentSequential ) ) ;
2020-01-31 12:25:31 +00:00
}
2014-07-16 08:58:59 +00:00
2021-04-10 23:33:54 +00:00
if ( auto txn = query_context - > getZooKeeperMetadataTransaction ( ) )
2020-11-20 16:06:27 +00:00
{
2021-07-30 16:34:18 +00:00
/// It would be better to clone ops instead of moving, so we could retry on ZBADVERSION,
/// but clone() is not implemented for Coordination::Request.
2021-02-08 19:36:17 +00:00
txn - > moveOpsTo ( ops ) ;
2020-11-24 10:24:39 +00:00
/// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context,
/// so we have to update metadata of DatabaseReplicated here.
2021-05-08 10:59:55 +00:00
String metadata_zk_path = fs : : path ( txn - > getDatabaseZooKeeperPath ( ) ) / " metadata " / escapeForFileName ( table_id . table_name ) ;
2020-11-24 10:24:39 +00:00
auto ast = DatabaseCatalog : : instance ( ) . getDatabase ( table_id . database_name ) - > getCreateTableQuery ( table_id . table_name , query_context ) ;
2021-02-08 19:36:17 +00:00
applyMetadataChangesToCreateQuery ( ast , future_metadata ) ;
2020-11-24 10:24:39 +00:00
ops . emplace_back ( zkutil : : makeSetRequest ( metadata_zk_path , getObjectDefinitionFromCreateQuery ( ast ) , - 1 ) ) ;
2020-11-20 16:06:27 +00:00
}
2020-01-31 19:30:33 +00:00
Coordination : : Responses results ;
2020-06-12 15:09:12 +00:00
Coordination : : Error rc = zookeeper - > tryMulti ( ops , results ) ;
2018-11-01 13:30:38 +00:00
2022-09-05 01:50:24 +00:00
/// For the sake of consistency with mechanics of concurrent background process of assigning parts merge tasks
2020-11-10 10:23:46 +00:00
/// this placeholder must be held up until the moment of committing into ZK of the mutation entry
/// See ReplicatedMergeTreeMergePredicate::canMergeTwoParts() method
partition_block_numbers_holder . reset ( ) ;
2020-06-12 15:09:12 +00:00
if ( rc = = Coordination : : Error : : ZOK )
2020-01-31 19:30:33 +00:00
{
2021-04-28 17:49:27 +00:00
if ( have_mutation )
2014-07-16 08:58:59 +00:00
{
2020-02-17 16:33:05 +00:00
/// ALTER_METADATA record in replication /log
2020-11-24 10:24:39 +00:00
String alter_path = dynamic_cast < const Coordination : : CreateResponse & > ( * results [ alter_path_idx ] ) . path_created ;
2020-02-13 16:16:09 +00:00
alter_entry - > znode_name = alter_path . substr ( alter_path . find_last_of ( ' / ' ) + 1 ) ;
2018-11-01 13:30:38 +00:00
2020-02-17 16:33:05 +00:00
/// ReplicatedMergeTreeMutationEntry record in /mutations
2020-11-24 10:24:39 +00:00
String mutation_path = dynamic_cast < const Coordination : : CreateResponse & > ( * results [ mutation_path_idx ] ) . path_created ;
2020-01-31 19:30:33 +00:00
mutation_znode = mutation_path . substr ( mutation_path . find_last_of ( ' / ' ) + 1 ) ;
2016-04-09 05:43:55 +00:00
}
else
{
2020-02-17 16:33:05 +00:00
/// ALTER_METADATA record in replication /log
2020-11-24 10:24:39 +00:00
String alter_path = dynamic_cast < const Coordination : : CreateResponse & > ( * results [ alter_path_idx ] ) . path_created ;
2020-02-13 16:16:09 +00:00
alter_entry - > znode_name = alter_path . substr ( alter_path . find_last_of ( ' / ' ) + 1 ) ;
2016-04-09 05:43:55 +00:00
}
2020-02-13 14:48:38 +00:00
break ;
2014-07-16 08:58:59 +00:00
}
2020-06-12 15:09:12 +00:00
else if ( rc = = Coordination : : Error : : ZBADVERSION )
2016-04-09 05:43:55 +00:00
{
2020-06-12 15:09:12 +00:00
if ( results [ 0 ] - > error ! = Coordination : : Error : : ZOK )
2021-09-14 11:45:33 +00:00
throw Exception ( " Metadata on replica is not up to date with common metadata in Zookeeper. It means that this replica still not applied some of previous alters. "
2021-09-14 10:57:05 +00:00
" Probably too many alters executing concurrently (highly not recommended). You can retry this error " ,
2020-06-27 19:31:11 +00:00
ErrorCodes : : CANNOT_ASSIGN_ALTER ) ;
2017-04-01 07:20:54 +00:00
2021-07-30 16:34:18 +00:00
/// Cannot retry automatically, because some zookeeper ops were lost on the first attempt. Will retry on DDLWorker-level.
if ( query_context - > getZooKeeperMetadataTransaction ( ) )
throw Exception ( " Cannot execute alter, because mutations version was suddenly changed due to concurrent alter " ,
ErrorCodes : : CANNOT_ASSIGN_ALTER ) ;
2020-02-13 14:48:38 +00:00
continue ;
2020-01-31 19:30:33 +00:00
}
else
{
2020-02-13 14:48:38 +00:00
throw Coordination : : Exception ( " Alter cannot be assigned because of Zookeeper error " , rc ) ;
2020-01-31 19:30:33 +00:00
}
}
2017-04-01 07:20:54 +00:00
2021-10-25 17:49:49 +00:00
table_lock_holder . unlock ( ) ;
2017-04-01 07:20:54 +00:00
2021-08-23 12:57:50 +00:00
LOG_DEBUG ( log , " Updated shared metadata nodes in ZooKeeper. Waiting for replicas to apply changes. " ) ;
waitForLogEntryToBeProcessedIfNecessary ( * alter_entry , query_context , " Some replicas doesn't finish metadata alter: " ) ;
2017-04-01 07:20:54 +00:00
2020-02-14 10:17:04 +00:00
if ( mutation_znode )
2020-02-13 20:09:48 +00:00
{
LOG_DEBUG ( log , " Metadata changes applied. Will wait for data changes. " ) ;
2021-04-10 23:33:54 +00:00
waitMutation ( * mutation_znode , query_context - > getSettingsRef ( ) . replication_alter_partitions_sync ) ;
2020-02-13 20:09:48 +00:00
LOG_DEBUG ( log , " Data changes applied. " ) ;
2014-07-16 08:58:59 +00:00
}
}
2018-05-21 13:49:54 +00:00
/// If new version returns ordinary name, else returns part name containing the first and last month of the month
2020-10-05 13:52:03 +00:00
/// NOTE: use it in pair with getFakePartCoveringAllPartsInPartition(...)
2021-09-16 16:03:31 +00:00
String getPartNamePossiblyFake ( MergeTreeDataFormatVersion format_version , const MergeTreePartInfo & part_info )
2014-08-07 09:23:55 +00:00
{
2017-09-07 16:21:06 +00:00
if ( format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING )
2017-08-25 20:41:45 +00:00
{
/// The date range is all month long.
const auto & lut = DateLUT : : instance ( ) ;
2018-05-21 13:49:54 +00:00
time_t start_time = lut . YYYYMMDDToDate ( parse < UInt32 > ( part_info . partition_id + " 01 " ) ) ;
2021-02-14 11:00:40 +00:00
DayNum left_date = DayNum ( lut . toDayNum ( start_time ) . toUnderType ( ) ) ;
2018-05-25 13:29:15 +00:00
DayNum right_date = DayNum ( static_cast < size_t > ( left_date ) + lut . daysInMonth ( start_time ) - 1 ) ;
2017-08-25 20:41:45 +00:00
return part_info . getPartNameV0 ( left_date , right_date ) ;
}
2014-08-07 09:23:55 +00:00
2018-05-21 13:49:54 +00:00
return part_info . getPartName ( ) ;
}
2014-10-18 17:37:55 +00:00
2021-05-13 14:04:36 +00:00
bool StorageReplicatedMergeTree : : getFakePartCoveringAllPartsInPartition ( const String & partition_id , MergeTreePartInfo & part_info ,
std : : optional < EphemeralLockInZooKeeper > & delimiting_block_lock , bool for_replace_range )
2015-04-21 13:10:08 +00:00
{
2017-03-13 18:01:46 +00:00
/// Even if there is no data in the partition, you still need to mark the range for deletion.
/// - Because before executing DETACH, tasks for downloading parts to this partition can be executed.
2017-06-26 08:54:58 +00:00
Int64 left = 0 ;
2017-04-01 07:20:54 +00:00
2017-08-14 18:16:11 +00:00
/** Let's skip one number in `block_numbers` for the partition being deleted, and we will only delete parts until this number.
2019-05-03 02:00:57 +00:00
* This prohibits merges of deleted parts with the new inserted
2017-03-13 18:01:46 +00:00
* Invariant : merges of deleted parts with other parts do not appear in the log .
* NOTE : If you need to similarly support a ` DROP PART ` request , you will have to think of some new mechanism for it ,
* to guarantee this invariant .
2014-08-07 09:23:55 +00:00
*/
2015-08-17 21:09:36 +00:00
Int64 right ;
2018-05-14 14:51:33 +00:00
Int64 mutation_version ;
2017-04-01 07:20:54 +00:00
2014-08-07 09:23:55 +00:00
{
2017-06-25 00:01:10 +00:00
auto zookeeper = getZooKeeper ( ) ;
2021-05-13 14:04:36 +00:00
delimiting_block_lock = allocateBlockNumber ( partition_id , zookeeper ) ;
right = delimiting_block_lock - > getNumber ( ) ;
2021-05-24 11:23:23 +00:00
/// Make sure we cover all parts in drop range.
/// There might be parts with mutation version greater than current block number
2021-05-24 12:13:42 +00:00
/// if some part mutation has been assigned after block number allocation, but before creation of DROP_RANGE entry.
2021-05-20 06:55:28 +00:00
mutation_version = MergeTreePartInfo : : MAX_BLOCK_NUMBER ;
2014-08-07 09:23:55 +00:00
}
2017-04-01 07:20:54 +00:00
2021-05-13 11:29:59 +00:00
if ( for_replace_range )
2020-10-05 13:52:03 +00:00
{
2021-05-14 12:55:30 +00:00
/// NOTE Do not decrement max block number for REPLACE_RANGE, because there are invariants:
2021-05-13 11:29:59 +00:00
/// - drop range for REPLACE PARTITION must contain at least 2 blocks (1 skipped block and at least 1 real block)
/// - drop range for MOVE PARTITION/ATTACH PARTITION FROM always contains 1 block
2020-10-05 19:16:28 +00:00
2021-05-14 12:55:30 +00:00
/// NOTE UINT_MAX was previously used as max level for REPLACE/MOVE PARTITION (it was incorrect)
part_info = MergeTreePartInfo ( partition_id , left , right , MergeTreePartInfo : : MAX_LEVEL , mutation_version ) ;
2021-05-13 14:04:36 +00:00
return right ! = 0 ;
2020-10-05 13:52:03 +00:00
}
2017-11-15 16:32:47 +00:00
2021-05-13 14:04:36 +00:00
/// Empty partition.
if ( right = = 0 )
return false ;
- - right ;
2019-01-22 19:56:53 +00:00
/// Artificial high level is chosen, to make this part "covering" all parts inside.
2021-05-13 14:04:36 +00:00
part_info = MergeTreePartInfo ( partition_id , left , right , MergeTreePartInfo : : MAX_LEVEL , mutation_version ) ;
2018-05-21 13:49:54 +00:00
return true ;
2017-06-16 16:47:09 +00:00
}
2021-06-20 08:24:43 +00:00
void StorageReplicatedMergeTree : : restoreMetadataInZooKeeper ( )
{
LOG_INFO ( log , " Restoring replica metadata " ) ;
2022-08-16 08:19:02 +00:00
2022-08-24 17:44:14 +00:00
if ( ! initialization_done )
throw Exception ( ErrorCodes : : NOT_INITIALIZED , " Table is not initialized yet " ) ;
2022-08-16 08:19:02 +00:00
2022-01-20 18:55:59 +00:00
if ( ! is_readonly )
throw Exception ( ErrorCodes : : BAD_ARGUMENTS , " Replica must be readonly " ) ;
2021-06-20 08:24:43 +00:00
2022-08-16 08:19:02 +00:00
2022-01-20 18:55:59 +00:00
if ( getZooKeeper ( ) - > exists ( replica_path ) )
throw Exception ( ErrorCodes : : BAD_ARGUMENTS ,
" Replica path is present at {} - nothing to restore. "
2022-01-24 09:12:03 +00:00
" If you are sure that metadata is lost and that replica path contains some garbage, "
2022-01-20 18:55:59 +00:00
" then use SYSTEM DROP REPLICA query first. " , replica_path ) ;
if ( has_metadata_in_zookeeper . has_value ( ) & & * has_metadata_in_zookeeper )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Replica has metadata in ZooKeeper: "
2022-01-24 09:12:08 +00:00
" it's either a bug or it's a result of manual intervention to ZooKeeper " ) ;
2021-06-20 08:24:43 +00:00
if ( are_restoring_replica . exchange ( true ) )
throw Exception ( ErrorCodes : : CONCURRENT_ACCESS_NOT_SUPPORTED , " Replica restoration in progress " ) ;
2022-01-20 18:55:59 +00:00
SCOPE_EXIT ( { are_restoring_replica . store ( false ) ; } ) ;
2021-06-20 08:24:43 +00:00
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
2021-09-06 14:24:03 +00:00
const DataPartsVector all_parts = getAllDataPartsVector ( ) ;
2021-06-20 08:24:43 +00:00
Strings active_parts_names ;
2021-12-30 14:27:22 +00:00
/// Why all parts (not only Active) are moved to detached/:
2021-06-20 08:24:43 +00:00
/// After ZK metadata restoration ZK resets sequential counters (including block number counters), so one may
/// potentially encounter a situation that a part we want to attach already exists.
for ( const auto & part : all_parts )
{
2021-12-30 14:27:22 +00:00
if ( part - > getState ( ) = = DataPartState : : Active )
2021-06-20 08:24:43 +00:00
active_parts_names . push_back ( part - > name ) ;
2022-09-30 17:01:06 +00:00
forcefullyMovePartToDetachedAndRemoveFromMemory ( part ) ;
2021-06-20 08:24:43 +00:00
}
LOG_INFO ( log , " Moved all parts to detached/ " ) ;
const bool is_first_replica = createTableIfNotExists ( metadata_snapshot ) ;
LOG_INFO ( log , " Created initial ZK nodes, replica is first: {} " , is_first_replica ) ;
if ( ! is_first_replica )
createReplica ( metadata_snapshot ) ;
createNewZooKeeperNodes ( ) ;
LOG_INFO ( log , " Created ZK nodes for table " ) ;
has_metadata_in_zookeeper = true ;
if ( is_first_replica )
for ( const String & part_name : active_parts_names )
attachPartition ( std : : make_shared < ASTLiteral > ( part_name ) , metadata_snapshot , true , getContext ( ) ) ;
LOG_INFO ( log , " Attached all partitions, starting table " ) ;
2022-08-19 08:49:51 +00:00
startupImpl ( ) ;
2021-06-20 08:24:43 +00:00
}
2021-05-17 14:26:36 +00:00
void StorageReplicatedMergeTree : : dropPartNoWaitNoThrow ( const String & part_name )
2021-04-20 02:31:08 +00:00
{
assertNotReadonly ( ) ;
if ( ! is_leader )
throw Exception ( " DROP PART cannot be done on this replica because it is not a leader " , ErrorCodes : : NOT_A_LEADER ) ;
2022-02-03 10:10:05 +00:00
zkutil : : ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly ( ) ;
2021-04-20 02:31:08 +00:00
LogEntry entry ;
2021-05-25 17:25:00 +00:00
dropPartImpl ( zookeeper , part_name , entry , /*detach=*/ false , /*throw_if_noop=*/ false ) ;
2021-04-20 02:31:08 +00:00
}
2017-06-16 16:47:09 +00:00
2021-05-17 14:26:36 +00:00
void StorageReplicatedMergeTree : : dropPart ( const String & part_name , bool detach , ContextPtr query_context )
2017-06-16 16:47:09 +00:00
{
assertNotReadonly ( ) ;
2020-06-19 14:18:58 +00:00
if ( ! is_leader )
2021-05-17 14:26:36 +00:00
throw Exception ( " DROP PART cannot be done on this replica because it is not a leader " , ErrorCodes : : NOT_A_LEADER ) ;
2017-06-16 16:47:09 +00:00
2022-02-03 10:10:05 +00:00
zkutil : : ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly ( ) ;
2018-04-21 00:35:20 +00:00
LogEntry entry ;
2020-09-04 15:48:51 +00:00
2021-05-25 17:25:00 +00:00
dropPartImpl ( zookeeper , part_name , entry , detach , /*throw_if_noop=*/ true ) ;
2020-09-04 15:48:51 +00:00
2021-08-23 12:57:50 +00:00
waitForLogEntryToBeProcessedIfNecessary ( entry , query_context ) ;
2021-05-17 14:26:36 +00:00
}
2017-11-15 16:32:47 +00:00
2022-05-06 14:12:31 +00:00
void StorageReplicatedMergeTree : : dropAllPartitionsImpl ( const zkutil : : ZooKeeperPtr & zookeeper , bool detach , ContextPtr query_context )
{
Strings partitions = zookeeper - > getChildren ( fs : : path ( zookeeper_path ) / " block_numbers " ) ;
std : : vector < LogEntryPtr > entries ;
dropAllPartsInPartitions ( * zookeeper , partitions , entries , query_context , detach ) ;
for ( const auto & entry : entries )
{
waitForLogEntryToBeProcessedIfNecessary ( * entry , query_context ) ;
auto drop_range_info = MergeTreePartInfo : : fromPartName ( entry - > new_part_name , format_version ) ;
cleanLastPartNode ( drop_range_info . partition_id ) ;
}
}
2021-05-17 14:26:36 +00:00
void StorageReplicatedMergeTree : : dropPartition ( const ASTPtr & partition , bool detach , ContextPtr query_context )
{
assertNotReadonly ( ) ;
if ( ! is_leader )
throw Exception ( " DROP PARTITION cannot be done on this replica because it is not a leader " , ErrorCodes : : NOT_A_LEADER ) ;
2017-11-15 16:32:47 +00:00
2022-02-03 10:10:05 +00:00
zkutil : : ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly ( ) ;
2020-09-04 15:48:51 +00:00
2022-03-31 09:50:07 +00:00
const auto * partition_ast = partition - > as < ASTPartition > ( ) ;
if ( partition_ast & & partition_ast - > all )
{
2022-05-06 14:12:31 +00:00
dropAllPartitionsImpl ( zookeeper , detach , query_context ) ;
2022-03-31 09:50:07 +00:00
}
else
2017-07-24 09:59:55 +00:00
{
2022-03-31 09:50:07 +00:00
String partition_id = getPartitionIDFromQuery ( partition , query_context ) ;
2022-05-06 14:12:31 +00:00
auto entry = dropAllPartsInPartition ( * zookeeper , partition_id , query_context , detach ) ;
if ( entry )
2022-03-31 09:50:07 +00:00
{
2022-05-06 14:12:31 +00:00
waitForLogEntryToBeProcessedIfNecessary ( * entry , query_context ) ;
2022-03-31 09:50:07 +00:00
cleanLastPartNode ( partition_id ) ;
}
2020-09-04 15:48:51 +00:00
}
2018-04-21 00:35:20 +00:00
}
2017-07-24 09:59:55 +00:00
2017-11-15 16:32:47 +00:00
2020-06-18 10:29:13 +00:00
void StorageReplicatedMergeTree : : truncate (
2021-04-10 23:33:54 +00:00
const ASTPtr & , const StorageMetadataPtr & , ContextPtr query_context , TableExclusiveLockHolder & table_lock )
2018-04-21 00:35:20 +00:00
{
2019-08-27 20:43:08 +00:00
table_lock . release ( ) ; /// Truncate is done asynchronously.
2018-04-21 00:35:20 +00:00
assertNotReadonly ( ) ;
2020-06-19 14:18:58 +00:00
if ( ! is_leader )
throw Exception ( " TRUNCATE cannot be done on this replica because it is not a leader " , ErrorCodes : : NOT_A_LEADER ) ;
2017-04-01 07:20:54 +00:00
2022-02-03 10:10:05 +00:00
zkutil : : ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly ( ) ;
2022-05-06 14:12:31 +00:00
dropAllPartitionsImpl ( zookeeper , /* detach */ false , query_context ) ;
2016-01-28 01:00:27 +00:00
}
2016-01-26 02:00:25 +00:00
2016-01-28 16:06:57 +00:00
2020-07-28 15:10:36 +00:00
/// ATTACH PARTITION / ATTACH PART: loads parts from the `detached/` directory
/// and commits them through the replicated sink (so they are replicated to the
/// other replicas). Returns one result record per attached part.
PartitionCommandsResultInfo StorageReplicatedMergeTree::attachPartition(
    const ASTPtr & partition,
    const StorageMetadataPtr & metadata_snapshot,
    bool attach_part,
    ContextPtr query_context)
{
    /// Allow ATTACH PARTITION on readonly replica when restoring it.
    if (!are_restoring_replica)
        assertNotReadonly();

    PartitionCommandsResultInfo results;
    PartsTemporaryRename renamed_parts(*this, "detached/");
    MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts);

    /// TODO Allow to use quorum here.
    ReplicatedMergeTreeSink sink(*this, metadata_snapshot, 0, 0, 0, false, false, false, query_context,
        /*is_attach*/ true);

    for (size_t pos = 0; pos < loaded_parts.size(); ++pos)
    {
        auto & part = loaded_parts[pos];
        const String name_before_attach = part->name;

        /// May rename the part, so remember the old name beforehand.
        sink.writeExistingPart(part);

        /// Clearing old_name presumably makes the temporary rename permanent
        /// (so it is not rolled back) — see PartsTemporaryRename.
        renamed_parts.old_and_new_names[pos].old_name.clear();

        LOG_DEBUG(log, "Attached part {} as {}", name_before_attach, part->name);

        results.push_back(PartitionCommandResultInfo{
            .partition_id = part->info.partition_id,
            .part_name = part->name,
            .old_part_name = name_before_attach,
        });
    }

    return results;
}
2017-06-25 00:51:51 +00:00
2018-08-03 09:39:01 +00:00
void StorageReplicatedMergeTree : : checkTableCanBeDropped ( ) const
2017-01-23 19:18:25 +00:00
{
2019-12-03 16:25:32 +00:00
auto table_id = getStorageID ( ) ;
2021-04-10 23:33:54 +00:00
getContext ( ) - > checkTableCanBeDropped ( table_id . database_name , table_id . table_name , getTotalActiveSizeInBytes ( ) ) ;
2017-01-23 19:18:25 +00:00
}
2014-10-18 17:37:55 +00:00
2022-04-13 14:51:59 +00:00
void StorageReplicatedMergeTree : : checkTableCanBeRenamed ( const StorageID & new_name ) const
2020-09-26 19:18:28 +00:00
{
2022-04-13 14:51:59 +00:00
if ( renaming_restrictions = = RenamingRestrictions : : ALLOW_ANY )
return ;
if ( renaming_restrictions = = RenamingRestrictions : : DO_NOT_ALLOW )
2022-08-05 19:41:02 +00:00
{
auto old_name = getStorageID ( ) ;
bool is_server_startup = Context : : getGlobalContextInstance ( ) - > getApplicationType ( ) = = Context : : ApplicationType : : SERVER
& & ! Context : : getGlobalContextInstance ( ) - > isServerCompletelyStarted ( ) ;
bool move_to_atomic = old_name . uuid = = UUIDHelpers : : Nil & & new_name . uuid ! = UUIDHelpers : : Nil ;
bool likely_converting_ordinary_to_atomic = is_server_startup & & move_to_atomic ;
if ( likely_converting_ordinary_to_atomic )
{
LOG_INFO ( log , " Table {} should not be renamed, because zookeeper_path contains implicit 'database' or 'table' macro. "
" We cannot rename path in ZooKeeper, so path may become inconsistent with table name. "
" However, we allow renaming while converting Ordinary database to Atomic, because all tables will be renamed back " ,
old_name . getNameForLogs ( ) ) ;
return ;
}
throw Exception (
" Cannot rename Replicated table, because zookeeper_path contains implicit 'database' or 'table' macro. "
" We cannot rename path in ZooKeeper, so path may become inconsistent with table name. If you really want to rename table, "
" you should edit metadata file first and restart server or reattach the table. " ,
ErrorCodes : : NOT_IMPLEMENTED ) ;
}
2022-04-13 14:51:59 +00:00
assert ( renaming_restrictions = = RenamingRestrictions : : ALLOW_PRESERVING_UUID ) ;
if ( ! new_name . hasUUID ( ) & & getStorageID ( ) . hasUUID ( ) )
throw Exception ( " Cannot move Replicated table to Ordinary database, because zookeeper_path contains implicit 'uuid' macro. "
" If you really want to rename table, "
" you should edit metadata file first and restart server or reattach the table. " , ErrorCodes : : NOT_IMPLEMENTED ) ;
2020-09-26 19:18:28 +00:00
}
2020-04-07 14:05:51 +00:00
void StorageReplicatedMergeTree : : rename ( const String & new_path_to_table_data , const StorageID & new_table_id )
2014-07-28 14:33:30 +00:00
{
2022-04-13 14:51:59 +00:00
checkTableCanBeRenamed ( new_table_id ) ;
2020-04-07 14:05:51 +00:00
MergeTreeData : : rename ( new_path_to_table_data , new_table_id ) ;
2014-07-28 14:33:30 +00:00
2018-04-17 17:59:42 +00:00
/// Update table name in zookeeper
2020-06-14 01:23:53 +00:00
if ( ! is_readonly )
{
/// We don't do it for readonly tables, because it will be updated on next table startup.
/// It is also Ok to skip ZK error for the same reason.
try
{
auto zookeeper = getZooKeeper ( ) ;
2021-05-08 10:59:55 +00:00
zookeeper - > set ( fs : : path ( replica_path ) / " host " , getReplicatedMergeTreeAddress ( ) . toString ( ) ) ;
2020-06-14 01:23:53 +00:00
}
catch ( Coordination : : Exception & e )
{
LOG_WARNING ( log , " Cannot update the value of 'host' node (replica address) in ZooKeeper: {} " , e . displayText ( ) ) ;
}
}
2018-04-17 17:59:42 +00:00
2017-03-12 19:18:07 +00:00
/// TODO: You can update names of loggers.
2014-07-28 14:33:30 +00:00
}
2014-10-18 17:37:55 +00:00
2020-11-10 10:23:46 +00:00
bool StorageReplicatedMergeTree : : existsNodeCached ( const std : : string & path ) const
2016-01-24 05:00:24 +00:00
{
{
2019-01-02 06:44:36 +00:00
std : : lock_guard lock ( existing_nodes_cache_mutex ) ;
2022-04-18 10:18:43 +00:00
if ( existing_nodes_cache . contains ( path ) )
2016-01-24 05:00:24 +00:00
return true ;
}
2016-10-24 12:34:08 +00:00
bool res = getZooKeeper ( ) - > exists ( path ) ;
2016-01-24 05:00:24 +00:00
if ( res )
{
2019-01-02 06:44:36 +00:00
std : : lock_guard lock ( existing_nodes_cache_mutex ) ;
2016-01-24 05:00:24 +00:00
existing_nodes_cache . insert ( path ) ;
}
return res ;
}
2018-07-04 16:31:21 +00:00
/// Allocates a new block number for `partition_id` by creating an ephemeral
/// sequential "block-" node under <table>/block_numbers/<partition>.
/// `zookeeper_block_id_path` (optional) participates in insert deduplication.
/// `zookeeper_path_prefix` allows allocating in another table's ZK path
/// (empty means this table). Returns std::nullopt when the lock cannot be
/// taken (e.g. a deduplication block-id node already exists).
std::optional<EphemeralLockInZooKeeper>
StorageReplicatedMergeTree::allocateBlockNumber(
    const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path, const String & zookeeper_path_prefix) const
{
    String zookeeper_table_path;
    if (zookeeper_path_prefix.empty())
        zookeeper_table_path = zookeeper_path;
    else
        zookeeper_table_path = zookeeper_path_prefix;

    String block_numbers_path = fs::path(zookeeper_table_path) / "block_numbers";
    String partition_path = fs::path(block_numbers_path) / partition_id;

    /// Lazily create the per-partition node on first use for this partition.
    if (!existsNodeCached(partition_path))
    {
        Coordination::Requests ops;
        ops.push_back(zkutil::makeCreateRequest(partition_path, "", zkutil::CreateMode::Persistent));
        /// We increment data version of the block_numbers node so that it becomes possible
        /// to check in a ZK transaction that the set of partitions didn't change
        /// (unfortunately there is no CheckChildren op).
        ops.push_back(zkutil::makeSetRequest(block_numbers_path, "", -1));

        Coordination::Responses responses;
        Coordination::Error code = zookeeper->tryMulti(ops, responses);
        /// ZNODEEXISTS is benign: another replica created the node concurrently.
        if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
            zkutil::KeeperMultiException::check(code, ops, responses);
    }

    return createEphemeralLockInZooKeeper(
        fs::path(partition_path) / "block-", fs::path(zookeeper_table_path) / "temp", *zookeeper, zookeeper_block_id_path);
}
2014-10-18 17:37:55 +00:00
2021-08-23 12:57:50 +00:00
/// Waits until every replica of the table has processed `entry`.
/// When `wait_for_inactive_timeout` is 0, inactive replicas are skipped
/// (and reported); a negative value means wait for them without limit.
/// Returns the list of replicas that were not waited for successfully.
Strings StorageReplicatedMergeTree::tryWaitForAllReplicasToProcessLogEntry(
    const String & table_zookeeper_path, const ReplicatedMergeTreeLogEntryData & entry, Int64 wait_for_inactive_timeout)
{
    LOG_DEBUG(log, "Waiting for all replicas to process {}", entry.znode_name);

    auto zookeeper = getZooKeeper();
    Strings replica_names = zookeeper->getChildren(fs::path(table_zookeeper_path) / "replicas");

    const bool wait_for_inactive = wait_for_inactive_timeout != 0;

    Strings unwaited;
    for (const String & replica : replica_names)
    {
        /// Either we wait for inactive replicas too, or the replica must be alive right now.
        bool should_wait = wait_for_inactive
            || zookeeper->exists(fs::path(table_zookeeper_path) / "replicas" / replica / "is_active");

        bool processed = should_wait
            && tryWaitForReplicaToProcessLogEntry(table_zookeeper_path, replica, entry, wait_for_inactive_timeout);

        if (!processed)
            unwaited.push_back(replica);
    }

    LOG_DEBUG(log, "Finished waiting for all replicas to process {}", entry.znode_name);
    return unwaited;
}
2021-08-23 12:57:50 +00:00
void StorageReplicatedMergeTree : : waitForAllReplicasToProcessLogEntry (
const String & table_zookeeper_path , const ReplicatedMergeTreeLogEntryData & entry , Int64 wait_for_inactive_timeout , const String & error_context )
2020-12-04 13:01:12 +00:00
{
2021-08-23 12:57:50 +00:00
Strings unfinished_replicas = tryWaitForAllReplicasToProcessLogEntry ( table_zookeeper_path , entry , wait_for_inactive_timeout ) ;
2021-08-20 12:59:57 +00:00
if ( unfinished_replicas . empty ( ) )
return ;
2014-10-18 17:37:55 +00:00
2021-08-23 12:57:50 +00:00
throw Exception ( ErrorCodes : : UNFINISHED , " {}Timeout exceeded while waiting for replicas {} to process entry {}. "
" Probably some replicas are inactive " , error_context , fmt : : join ( unfinished_replicas , " , " ) , entry . znode_name ) ;
2020-12-04 13:01:12 +00:00
}
2021-08-23 12:57:50 +00:00
void StorageReplicatedMergeTree : : waitForLogEntryToBeProcessedIfNecessary ( const ReplicatedMergeTreeLogEntryData & entry , ContextPtr query_context , const String & error_context )
2020-12-04 13:01:12 +00:00
{
2021-08-23 12:57:50 +00:00
/// If necessary, wait until the operation is performed on itself or on all replicas.
Int64 wait_for_inactive_timeout = query_context - > getSettingsRef ( ) . replication_wait_for_inactive_replica_timeout ;
if ( query_context - > getSettingsRef ( ) . replication_alter_partitions_sync = = 1 )
{
bool finished = tryWaitForReplicaToProcessLogEntry ( zookeeper_path , replica_name , entry , wait_for_inactive_timeout ) ;
if ( ! finished )
{
throw Exception ( ErrorCodes : : UNFINISHED , " {}Log entry {} is not precessed on local replica, "
" most likely because the replica was shut down. " , error_context , entry . znode_name ) ;
}
}
else if ( query_context - > getSettingsRef ( ) . replication_alter_partitions_sync = = 2 )
{
waitForAllReplicasToProcessLogEntry ( zookeeper_path , entry , wait_for_inactive_timeout , error_context ) ;
}
2021-08-20 12:59:57 +00:00
}
2020-12-04 13:01:12 +00:00
2021-08-23 12:57:50 +00:00
bool StorageReplicatedMergeTree : : tryWaitForReplicaToProcessLogEntry (
2021-08-20 12:59:57 +00:00
const String & table_zookeeper_path , const String & replica , const ReplicatedMergeTreeLogEntryData & entry , Int64 wait_for_inactive_timeout )
2014-10-18 17:37:55 +00:00
{
2015-06-04 02:07:30 +00:00
String entry_str = entry . toString ( ) ;
String log_node_name ;
2017-04-01 07:20:54 +00:00
2021-06-07 10:01:57 +00:00
/** Wait for entries from `log` directory (a common log, from where replicas copy entries to their queue) to be processed.
2015-06-04 02:07:30 +00:00
*
2017-03-13 18:01:46 +00:00
* The problem is that the numbers ( ` sequential ` node ) of the queue elements in ` log ` and in ` queue ` do not match .
* ( And the numbers of the same log element for different replicas do not match in the ` queue ` . )
2015-06-04 02:07:30 +00:00
*/
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/** First, you need to wait until replica takes `queue` element from the `log` to its queue,
* if it has not been done already ( see the ` pullLogsToQueue ` function ) .
2015-06-04 02:07:30 +00:00
*
2017-03-13 18:01:46 +00:00
* To do this , check its node ` log_pointer ` - the maximum number of the element taken from ` log ` + 1.
2015-06-04 02:07:30 +00:00
*/
2017-04-01 07:20:54 +00:00
2020-09-09 13:32:50 +00:00
bool waiting_itself = replica = = replica_name ;
2021-08-23 12:57:50 +00:00
/// Do not wait if timeout is zero
bool wait_for_inactive = wait_for_inactive_timeout ! = 0 ;
/// Wait for unlimited time if timeout is negative
bool check_timeout = wait_for_inactive_timeout > 0 ;
2021-08-20 12:59:57 +00:00
Stopwatch time_waiting ;
2020-09-09 13:32:50 +00:00
const auto & stop_waiting = [ & ] ( )
2020-05-12 14:11:09 +00:00
{
2021-08-18 09:49:22 +00:00
bool stop_waiting_itself = waiting_itself & & partial_shutdown_called ;
2021-08-23 12:57:50 +00:00
bool timeout_exceeded = check_timeout & & wait_for_inactive_timeout < time_waiting . elapsedSeconds ( ) ;
2021-08-20 12:59:57 +00:00
bool stop_waiting_inactive = ( ! wait_for_inactive | | timeout_exceeded )
& & ! getZooKeeper ( ) - > exists ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " is_active " ) ;
return is_dropped | | stop_waiting_itself | | stop_waiting_inactive ;
2020-05-12 14:11:09 +00:00
} ;
2021-01-18 12:15:07 +00:00
/// Don't recheck ZooKeeper too often
constexpr auto event_wait_timeout_ms = 3000 ;
2020-05-12 14:11:09 +00:00
2021-09-16 15:38:27 +00:00
LOG_DEBUG ( log , " Waiting for {} to process log entry " , replica ) ;
2021-08-20 12:59:57 +00:00
2021-09-16 15:47:57 +00:00
if ( startsWith ( entry . znode_name , " log- " ) )
2014-10-18 17:37:55 +00:00
{
2021-06-07 10:01:57 +00:00
/// Take the number from the node name `log-xxxxxxxxxx`.
2015-06-04 02:07:30 +00:00
UInt64 log_index = parse < UInt64 > ( entry . znode_name . substr ( entry . znode_name . size ( ) - 10 ) ) ;
log_node_name = entry . znode_name ;
2017-04-01 07:20:54 +00:00
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " Waiting for {} to pull {} to queue " , replica , log_node_name ) ;
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/// Let's wait until entry gets into the replica queue.
2021-08-20 12:59:57 +00:00
bool pulled_to_queue = false ;
2021-09-16 15:38:27 +00:00
do
2015-06-04 02:07:30 +00:00
{
2016-05-28 17:31:50 +00:00
zkutil : : EventPtr event = std : : make_shared < Poco : : Event > ( ) ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
String log_pointer = getZooKeeper ( ) - > get ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " log_pointer " , nullptr , event ) ;
2015-06-04 02:07:30 +00:00
if ( ! log_pointer . empty ( ) & & parse < UInt64 > ( log_pointer ) > log_index )
2021-08-20 12:59:57 +00:00
{
pulled_to_queue = true ;
2015-06-04 02:07:30 +00:00
break ;
2021-08-20 12:59:57 +00:00
}
2017-04-01 07:20:54 +00:00
2021-01-18 12:15:07 +00:00
/// Wait with timeout because we can be already shut down, but not dropped.
/// So log_pointer node will exist, but we will never update it because all background threads already stopped.
/// It can lead to query hung because table drop query can wait for some query (alter, optimize, etc) which called this method,
/// but the query will never finish because the drop already shut down the table.
2021-09-16 15:38:27 +00:00
if ( ! stop_waiting ( ) )
event - > tryWait ( event_wait_timeout_ms ) ;
} while ( ! stop_waiting ( ) ) ;
2021-08-20 12:59:57 +00:00
if ( ! pulled_to_queue )
return false ;
2021-09-16 15:47:57 +00:00
LOG_DEBUG ( log , " Looking for node corresponding to {} in {} queue " , log_node_name , replica ) ;
2014-10-18 17:37:55 +00:00
}
2021-09-16 15:47:57 +00:00
else if ( ! entry . log_entry_id . empty ( ) )
{
/// First pass, check the table log.
/// If found in the log, wait for replica to fetch it to the queue.
/// If not found in the log, it is already in the queue.
LOG_DEBUG ( log , " Looking for log entry with id `{}` in the log " , entry . log_entry_id ) ;
String log_pointer = getZooKeeper ( ) - > get ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " log_pointer " ) ;
Strings log_entries = getZooKeeper ( ) - > getChildren ( fs : : path ( table_zookeeper_path ) / " log " ) ;
UInt64 log_index = 0 ;
bool found = false ;
2017-04-01 07:20:54 +00:00
2021-09-16 15:47:57 +00:00
for ( const String & log_entry_name : log_entries )
{
log_index = parse < UInt64 > ( log_entry_name . substr ( log_entry_name . size ( ) - 10 ) ) ;
if ( ! log_pointer . empty ( ) & & log_index < parse < UInt64 > ( log_pointer ) )
continue ;
String log_entry_str ;
Coordination : : Stat log_entry_stat ;
bool exists = getZooKeeper ( ) - > tryGet ( fs : : path ( table_zookeeper_path ) / " log " / log_entry_name , log_entry_str , & log_entry_stat ) ;
ReplicatedMergeTreeLogEntryData log_entry = * ReplicatedMergeTreeLogEntry : : parse ( log_entry_str , log_entry_stat ) ;
if ( exists & & entry . log_entry_id = = log_entry . log_entry_id )
{
LOG_DEBUG ( log , " Found log entry with id `{}` in the log " , entry . log_entry_id ) ;
found = true ;
log_node_name = log_entry_name ;
break ;
}
}
if ( found )
{
LOG_DEBUG ( log , " Waiting for {} to pull {} to queue " , replica , log_node_name ) ;
/// Let's wait until entry gets into the replica queue.
bool pulled_to_queue = false ;
do
{
zkutil : : EventPtr event = std : : make_shared < Poco : : Event > ( ) ;
log_pointer = getZooKeeper ( ) - > get ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " log_pointer " , nullptr , event ) ;
if ( ! log_pointer . empty ( ) & & parse < UInt64 > ( log_pointer ) > log_index )
{
pulled_to_queue = true ;
break ;
}
2017-04-01 07:20:54 +00:00
2021-09-16 15:47:57 +00:00
/// Wait with timeout because we can be already shut down, but not dropped.
/// So log_pointer node will exist, but we will never update it because all background threads already stopped.
/// It can lead to query hung because table drop query can wait for some query (alter, optimize, etc) which called this method,
/// but the query will never finish because the drop already shut down the table.
if ( ! stop_waiting ( ) )
event - > tryWait ( event_wait_timeout_ms ) ;
} while ( ! stop_waiting ( ) ) ;
if ( ! pulled_to_queue )
return false ;
}
}
else
{
throw Exception ( " Logical error: unexpected name of log node: " + entry . znode_name , ErrorCodes : : LOGICAL_ERROR ) ;
}
2017-04-01 07:20:54 +00:00
2017-03-12 19:18:07 +00:00
/** Second - find the corresponding entry in the queue of the specified replica.
2021-06-07 10:01:57 +00:00
* Its number may not match the ` log ` node . Therefore , we search by comparing the content .
2015-06-04 02:07:30 +00:00
*/
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
Strings queue_entries = getZooKeeper ( ) - > getChildren ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " queue " ) ;
2015-06-04 02:07:30 +00:00
String queue_entry_to_wait_for ;
2017-04-01 07:20:54 +00:00
2014-10-18 17:37:55 +00:00
for ( const String & entry_name : queue_entries )
{
String queue_entry_str ;
2021-09-16 15:47:57 +00:00
Coordination : : Stat queue_entry_stat ;
bool exists = getZooKeeper ( ) - > tryGet ( fs : : path ( table_zookeeper_path ) / " replicas " / replica / " queue " / entry_name , queue_entry_str , & queue_entry_stat ) ;
2015-06-04 02:07:30 +00:00
if ( exists & & queue_entry_str = = entry_str )
2014-08-07 09:23:55 +00:00
{
2015-06-04 02:07:30 +00:00
queue_entry_to_wait_for = entry_name ;
2014-10-18 17:37:55 +00:00
break ;
2014-08-07 09:23:55 +00:00
}
2021-09-16 15:47:57 +00:00
else if ( ! entry . log_entry_id . empty ( ) )
{
/// Check if the id matches rather than just contents. This entry
/// might have been written by different ClickHouse versions and
/// it is hard to guarantee same text representation.
ReplicatedMergeTreeLogEntryData queue_entry = * ReplicatedMergeTreeLogEntry : : parse ( queue_entry_str , queue_entry_stat ) ;
if ( entry . log_entry_id = = queue_entry . log_entry_id )
{
queue_entry_to_wait_for = entry_name ;
break ;
}
}
2014-10-18 17:37:55 +00:00
}
2017-04-01 07:20:54 +00:00
2017-03-12 19:18:07 +00:00
/// While looking for the record, it has already been executed and deleted.
2015-06-04 02:07:30 +00:00
if ( queue_entry_to_wait_for . empty ( ) )
{
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " No corresponding node found. Assuming it has been already processed. Found {} nodes " , queue_entries . size ( ) ) ;
2020-05-12 14:11:09 +00:00
return true ;
2015-06-04 02:07:30 +00:00
}
2017-04-01 07:20:54 +00:00
2020-05-23 22:24:01 +00:00
LOG_DEBUG ( log , " Waiting for {} to disappear from {} queue " , queue_entry_to_wait_for , replica ) ;
2017-04-01 07:20:54 +00:00
2020-05-12 14:11:09 +00:00
/// Third - wait until the entry disappears from the replica queue or replica become inactive.
2021-05-08 10:59:55 +00:00
String path_to_wait_on = fs : : path ( table_zookeeper_path ) / " replicas " / replica / " queue " / queue_entry_to_wait_for ;
2020-05-12 14:11:09 +00:00
2020-09-09 13:32:50 +00:00
return getZooKeeper ( ) - > waitForDisappear ( path_to_wait_on , stop_waiting ) ;
2014-08-07 09:23:55 +00:00
}
2014-10-07 18:44:03 +00:00
/// Fills `res` with the replication status of this table (backs system.replicas).
/// Cheap local fields are always filled. ZooKeeper-derived fields (log_max_index,
/// log_pointer, total/active replicas) are queried only when `with_zk_fields` is
/// set and the ZK session is alive; ZK errors are recorded in
/// res.zookeeper_exception instead of being thrown.
void StorageReplicatedMergeTree::getStatus(Status & res, bool with_zk_fields)
{
    auto zookeeper = tryGetZooKeeper();
    const auto storage_settings_ptr = getSettings();

    res.is_leader = is_leader;
    res.can_become_leader = storage_settings_ptr->replicated_can_become_leader;
    res.is_readonly = is_readonly;
    res.is_session_expired = !zookeeper || zookeeper->expired();

    res.queue = queue.getStatus();
    res.absolute_delay = getAbsoluteDelay(); /// NOTE: may be slightly inconsistent with queue status.
    res.parts_to_check = part_check_thread.size();

    res.zookeeper_path = zookeeper_path;
    res.replica_name = replica_name;
    res.replica_path = replica_path;
    res.columns_version = -1;

    /// Defaults for the ZK-derived fields (used when not requested or on ZK error).
    res.log_max_index = 0;
    res.log_pointer = 0;
    res.total_replicas = 0;
    res.active_replicas = 0;
    res.last_queue_update_exception = getLastQueueUpdateException();

    if (with_zk_fields && !res.is_session_expired)
    {
        try
        {
            auto log_entries = zookeeper->getChildren(fs::path(zookeeper_path) / "log");

            if (!log_entries.empty())
            {
                /// Log node names are "log-" + zero-padded sequential number,
                /// so the lexicographic maximum is the newest entry.
                const String & last_log_entry = *std::max_element(log_entries.begin(), log_entries.end());
                res.log_max_index = parse<UInt64>(last_log_entry.substr(strlen("log-")));
            }

            String log_pointer_str = zookeeper->get(fs::path(replica_path) / "log_pointer");
            res.log_pointer = log_pointer_str.empty() ? 0 : parse<UInt64>(log_pointer_str);

            auto all_replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");
            res.total_replicas = all_replicas.size();
            for (const String & replica : all_replicas)
            {
                /// A replica is active while its ephemeral 'is_active' node exists.
                bool is_replica_active = zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active");
                res.active_replicas += static_cast<UInt8>(is_replica_active);
                res.replica_is_active.emplace(replica, is_replica_active);
            }
        }
        catch (const Coordination::Exception &)
        {
            res.zookeeper_exception = getCurrentExceptionMessage(false);
        }
    }
}
2014-10-09 20:28:33 +00:00
2015-09-24 00:21:02 +00:00
/// Copies the current replication queue entries into `res` and reports this
/// replica's name via `replica_name_` (backs system.replication_queue).
void StorageReplicatedMergeTree::getQueue(LogEntriesData & res, String & replica_name_)
{
    replica_name_ = replica_name;
    queue.getEntries(res);
}
2020-11-24 14:24:48 +00:00
/// Returns the current entries of the part-moves-between-shards orchestrator
/// (backs system.part_moves_between_shards).
std::vector<PartMovesBetweenShardsOrchestrator::Entry> StorageReplicatedMergeTree::getPartMovesBetweenShardsEntries()
{
    return part_moves_between_shards_orchestrator.getEntries();
}
2017-04-17 15:06:12 +00:00
/// Returns this replica's absolute replication delay in seconds:
/// - current time, if the queue has never been updated (no information at all);
/// - age of the oldest unprocessed insert, if any;
/// - time since the last queue-update attempt, if updates are failing/in flight;
/// - 0 when fully up-to-date.
time_t StorageReplicatedMergeTree::getAbsoluteDelay() const
{
    time_t min_unprocessed_insert_time = 0;
    time_t max_processed_insert_time = 0;
    queue.getInsertTimes(min_unprocessed_insert_time, max_processed_insert_time);

    /// Load start time, then finish time to avoid reporting false delay when start time is updated
    /// between loading of two variables.
    time_t queue_update_start_time = last_queue_update_start_time.load();
    time_t queue_update_finish_time = last_queue_update_finish_time.load();

    time_t current_time = time(nullptr);

    if (!queue_update_finish_time)
    {
        /// We have not updated queue even once yet (perhaps replica is readonly).
        /// As we have no info about the current state of replication log, return effectively infinite delay.
        return current_time;
    }
    else if (min_unprocessed_insert_time)
    {
        /// There are some unprocessed insert entries in queue.
        return (current_time > min_unprocessed_insert_time) ? (current_time - min_unprocessed_insert_time) : 0;
    }
    else if (queue_update_start_time > queue_update_finish_time)
    {
        /// Queue is empty, but there are some in-flight or failed queue update attempts
        /// (likely because of problems with connecting to ZooKeeper).
        /// Return the time passed since last attempt.
        return (current_time > queue_update_start_time) ? (current_time - queue_update_start_time) : 0;
    }
    else
    {
        /// Everything is up-to-date.
        return 0;
    }
}
2015-09-24 00:21:02 +00:00
2016-01-17 13:00:42 +00:00
void StorageReplicatedMergeTree : : getReplicaDelays ( time_t & out_absolute_delay , time_t & out_relative_delay )
2015-11-05 20:08:18 +00:00
{
2016-01-17 13:00:42 +00:00
assertNotReadonly ( ) ;
2017-04-17 15:06:12 +00:00
time_t current_time = time ( nullptr ) ;
2016-01-17 13:00:42 +00:00
2017-04-17 15:06:12 +00:00
out_absolute_delay = getAbsoluteDelay ( ) ;
2016-01-17 13:00:42 +00:00
out_relative_delay = 0 ;
2019-08-26 18:08:58 +00:00
const auto storage_settings_ptr = getSettings ( ) ;
2016-01-17 13:00:42 +00:00
2017-03-13 18:01:46 +00:00
/** Relative delay is the maximum difference of absolute delay from any other replica,
* ( if this replica lags behind any other live replica , or zero , otherwise ) .
2017-03-12 19:18:07 +00:00
* Calculated only if the absolute delay is large enough .
2016-01-17 13:00:42 +00:00
*/
2020-06-19 14:18:58 +00:00
if ( out_absolute_delay < static_cast < time_t > ( storage_settings_ptr - > min_relative_delay_to_measure ) )
2016-01-17 13:00:42 +00:00
return ;
auto zookeeper = getZooKeeper ( ) ;
time_t max_replicas_unprocessed_insert_time = 0 ;
2016-01-19 18:49:37 +00:00
bool have_replica_with_nothing_unprocessed = false ;
2021-05-08 10:59:55 +00:00
Strings replicas = zookeeper - > getChildren ( fs : : path ( zookeeper_path ) / " replicas " ) ;
2016-01-17 13:00:42 +00:00
for ( const auto & replica : replicas )
{
if ( replica = = replica_name )
continue ;
2017-03-13 18:01:46 +00:00
/// Skip dead replicas.
2021-05-08 10:59:55 +00:00
if ( ! zookeeper - > exists ( fs : : path ( zookeeper_path ) / " replicas " / replica / " is_active " ) )
2016-04-02 21:22:39 +00:00
continue ;
2016-01-17 13:00:42 +00:00
String value ;
2021-05-08 10:59:55 +00:00
if ( ! zookeeper - > tryGet ( fs : : path ( zookeeper_path ) / " replicas " / replica / " min_unprocessed_insert_time " , value ) )
2016-01-17 13:00:42 +00:00
continue ;
time_t replica_time = value . empty ( ) ? 0 : parse < time_t > ( value ) ;
2016-01-19 18:49:37 +00:00
if ( replica_time = = 0 )
{
2017-03-13 18:01:46 +00:00
/** Note
2017-03-12 19:18:07 +00:00
* The conclusion that the replica does not lag may be incorrect ,
2017-03-13 18:01:46 +00:00
* because the information about ` min_unprocessed_insert_time ` is taken
* only from that part of the log that has been moved to the queue .
2018-05-31 13:05:05 +00:00
* If the replica for some reason has stalled ` queueUpdatingTask ` ,
2017-03-13 18:01:46 +00:00
* then ` min_unprocessed_insert_time ` will be incorrect .
2016-04-02 21:22:39 +00:00
*/
2016-01-19 18:49:37 +00:00
have_replica_with_nothing_unprocessed = true ;
break ;
}
2016-01-17 13:00:42 +00:00
if ( replica_time > max_replicas_unprocessed_insert_time )
max_replicas_unprocessed_insert_time = replica_time ;
}
2015-11-05 20:08:18 +00:00
2016-01-19 18:49:37 +00:00
if ( have_replica_with_nothing_unprocessed )
out_relative_delay = out_absolute_delay ;
2017-04-17 15:06:12 +00:00
else
{
max_replicas_unprocessed_insert_time = std : : min ( current_time , max_replicas_unprocessed_insert_time ) ;
time_t min_replicas_delay = current_time - max_replicas_unprocessed_insert_time ;
if ( out_absolute_delay > min_replicas_delay )
out_relative_delay = out_absolute_delay - min_replicas_delay ;
}
2015-11-05 20:08:18 +00:00
}
2020-07-15 16:37:52 +00:00
void StorageReplicatedMergeTree : : fetchPartition (
const ASTPtr & partition ,
const StorageMetadataPtr & metadata_snapshot ,
const String & from_ ,
2021-04-13 04:40:33 +00:00
bool fetch_part ,
2021-04-10 23:33:54 +00:00
ContextPtr query_context )
2014-10-09 20:28:33 +00:00
{
2021-01-08 05:10:00 +00:00
Macros : : MacroExpansionInfo info ;
2021-05-04 11:29:50 +00:00
info . expand_special_macros_only = false ; //-V1048
2021-01-07 14:13:17 +00:00
info . table_id = getStorageID ( ) ;
info . table_id . uuid = UUIDHelpers : : Nil ;
2021-04-10 23:33:54 +00:00
auto expand_from = query_context - > getMacros ( ) - > expand ( from_ , info ) ;
2021-12-15 11:30:57 +00:00
String auxiliary_zookeeper_name = zkutil : : extractZooKeeperName ( expand_from ) ;
String from = zkutil : : extractZooKeeperPath ( expand_from , /* check_starts_with_slash */ true ) ;
2020-08-27 14:19:18 +00:00
if ( from . empty ( ) )
throw Exception ( " ZooKeeper path should not be empty " , ErrorCodes : : ILLEGAL_TYPE_OF_ARGUMENT ) ;
2020-08-28 11:12:51 +00:00
zkutil : : ZooKeeperPtr zookeeper ;
2020-11-16 12:30:54 +00:00
if ( auxiliary_zookeeper_name ! = default_zookeeper_name )
2021-04-10 23:33:54 +00:00
zookeeper = getContext ( ) - > getAuxiliaryZooKeeper ( auxiliary_zookeeper_name ) ;
2020-08-28 11:12:51 +00:00
else
zookeeper = getZooKeeper ( ) ;
2014-10-09 23:14:06 +00:00
if ( from . back ( ) = = ' / ' )
from . resize ( from . size ( ) - 1 ) ;
2017-04-01 07:20:54 +00:00
2021-04-13 04:40:33 +00:00
if ( fetch_part )
2017-08-14 18:16:11 +00:00
{
2021-04-13 04:40:33 +00:00
String part_name = partition - > as < ASTLiteral & > ( ) . value . safeGet < String > ( ) ;
2021-04-13 09:34:04 +00:00
auto part_path = findReplicaHavingPart ( part_name , from , zookeeper ) ;
2021-04-13 04:40:33 +00:00
2021-04-13 09:34:04 +00:00
if ( part_path . empty ( ) )
2021-04-14 10:55:42 +00:00
throw Exception ( ErrorCodes : : NO_REPLICA_HAS_PART , " Part {} does not exist on any replica " , part_name ) ;
2021-04-13 04:40:33 +00:00
/** Let's check that there is no such part in the `detached` directory (where we will write the downloaded parts).
* Unreliable ( there is a race condition ) - such a part may appear a little later .
*/
2021-04-14 02:05:41 +00:00
if ( checkIfDetachedPartExists ( part_name ) )
2021-04-14 09:54:56 +00:00
throw Exception ( ErrorCodes : : DUPLICATE_DATA_PART , " Detached part " + part_name + " already exists. " ) ;
2021-04-13 04:40:33 +00:00
LOG_INFO ( log , " Will fetch part {} from shard {} (zookeeper '{}') " , part_name , from_ , auxiliary_zookeeper_name ) ;
try
2019-05-12 14:57:23 +00:00
{
2021-04-13 09:34:04 +00:00
/// part name , metadata, part_path , true, 0, zookeeper
2022-04-23 09:58:56 +00:00
if ( ! fetchPart ( part_name , metadata_snapshot , part_path , true , 0 , zookeeper , /* try_fetch_shared = */ false ) )
2021-04-14 09:54:56 +00:00
throw Exception ( ErrorCodes : : UNFINISHED , " Failed to fetch part {} from {} " , part_name , from_ ) ;
2019-05-12 14:57:23 +00:00
}
2021-04-13 04:40:33 +00:00
catch ( const DB : : Exception & e )
{
if ( e . code ( ) ! = ErrorCodes : : RECEIVED_ERROR_FROM_REMOTE_IO_SERVER & & e . code ( ) ! = ErrorCodes : : RECEIVED_ERROR_TOO_MANY_REQUESTS
& & e . code ( ) ! = ErrorCodes : : CANNOT_READ_ALL_DATA )
throw ;
2019-05-12 14:57:23 +00:00
Use fmt::runtime() for LOG_* for non constexpr
Here is oneliner:
$ gg 'LOG_\(DEBUG\|TRACE\|INFO\|TEST\|WARNING\|ERROR\|FATAL\)([^,]*, [a-zA-Z]' -- :*.cpp :*.h | cut -d: -f1 | sort -u | xargs -r sed -E -i 's#(LOG_[A-Z]*)\(([^,]*), ([A-Za-z][^,)]*)#\1(\2, fmt::runtime(\3)#'
Note, that I tried to do this with coccinelle (tool for semantic
patchin), but it cannot parse C++:
$ cat fmt.cocci
@@
expression log;
expression var;
@@
-LOG_DEBUG(log, var)
+LOG_DEBUG(log, fmt::runtime(var))
I've also tried to use some macros/templates magic to do this implicitly
in logger_useful.h, but I failed to do so, and apparently it is not
possible for now.
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
v2: manual fixes
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-02-01 09:10:27 +00:00
LOG_INFO ( log , fmt : : runtime ( e . displayText ( ) ) ) ;
2021-04-13 04:40:33 +00:00
}
return ;
2017-08-14 18:16:11 +00:00
}
2017-04-01 07:20:54 +00:00
2021-04-13 04:40:33 +00:00
String partition_id = getPartitionIDFromQuery ( partition , query_context ) ;
LOG_INFO ( log , " Will fetch partition {} from shard {} (zookeeper '{}') " , partition_id , from_ , auxiliary_zookeeper_name ) ;
/** Let's check that there is no such partition in the `detached` directory (where we will write the downloaded parts).
* Unreliable ( there is a race condition ) - such a partition may appear a little later .
*/
2021-04-14 02:05:41 +00:00
if ( checkIfDetachedPartitionExists ( partition_id ) )
2021-04-13 04:40:33 +00:00
throw Exception ( " Detached partition " + partition_id + " already exists. " , ErrorCodes : : PARTITION_ALREADY_EXISTS ) ;
2016-10-24 12:34:08 +00:00
zkutil : : Strings replicas ;
2014-10-09 20:28:33 +00:00
zkutil : : Strings active_replicas ;
2016-10-24 12:34:08 +00:00
String best_replica ;
2017-04-01 07:20:54 +00:00
2016-10-24 12:34:08 +00:00
{
2017-03-12 19:18:07 +00:00
/// List of replicas of source shard.
2021-05-08 10:59:55 +00:00
replicas = zookeeper - > getChildren ( fs : : path ( from ) / " replicas " ) ;
2017-04-01 07:20:54 +00:00
2017-03-12 19:18:07 +00:00
/// Leave only active replicas.
2016-10-24 12:34:08 +00:00
active_replicas . reserve ( replicas . size ( ) ) ;
2017-04-01 07:20:54 +00:00
2016-10-24 12:34:08 +00:00
for ( const String & replica : replicas )
2021-05-08 10:59:55 +00:00
if ( zookeeper - > exists ( fs : : path ( from ) / " replicas " / replica / " is_active " ) )
2016-10-24 12:34:08 +00:00
active_replicas . push_back ( replica ) ;
2017-04-01 07:20:54 +00:00
2016-10-24 12:34:08 +00:00
if ( active_replicas . empty ( ) )
throw Exception ( " No active replicas for shard " + from , ErrorCodes : : NO_ACTIVE_REPLICAS ) ;
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/** You must select the best (most relevant) replica.
* This is a replica with the maximum ` log_pointer ` , then with the minimum ` queue ` size .
* NOTE This is not exactly the best criteria . It does not make sense to download old partitions ,
* and it would be nice to be able to choose the replica closest by network .
* NOTE Of course , there are data races here . You can solve it by retrying .
2016-10-24 12:34:08 +00:00
*/
Int64 max_log_pointer = - 1 ;
UInt64 min_queue_size = std : : numeric_limits < UInt64 > : : max ( ) ;
2017-04-01 07:20:54 +00:00
2016-10-24 12:34:08 +00:00
for ( const String & replica : active_replicas )
2014-10-10 00:46:37 +00:00
{
2021-05-08 10:59:55 +00:00
String current_replica_path = fs : : path ( from ) / " replicas " / replica ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
String log_pointer_str = zookeeper - > get ( fs : : path ( current_replica_path ) / " log_pointer " ) ;
2016-10-24 12:34:08 +00:00
Int64 log_pointer = log_pointer_str . empty ( ) ? 0 : parse < UInt64 > ( log_pointer_str ) ;
2017-04-01 07:20:54 +00:00
2018-08-25 01:58:14 +00:00
Coordination : : Stat stat ;
2021-05-08 10:59:55 +00:00
zookeeper - > get ( fs : : path ( current_replica_path ) / " queue " , & stat ) ;
2016-10-24 12:34:08 +00:00
size_t queue_size = stat . numChildren ;
2017-04-01 07:20:54 +00:00
2016-10-24 12:34:08 +00:00
if ( log_pointer > max_log_pointer
| | ( log_pointer = = max_log_pointer & & queue_size < min_queue_size ) )
{
max_log_pointer = log_pointer ;
min_queue_size = queue_size ;
best_replica = replica ;
}
2014-10-10 00:46:37 +00:00
}
}
2017-04-01 07:20:54 +00:00
2014-10-10 00:46:37 +00:00
if ( best_replica . empty ( ) )
throw Exception ( " Logical error: cannot choose best replica. " , ErrorCodes : : LOGICAL_ERROR ) ;
2017-04-01 07:20:54 +00:00
2020-05-23 22:24:01 +00:00
LOG_INFO ( log , " Found {} replicas, {} of them are active. Selected {} to fetch from. " , replicas . size ( ) , active_replicas . size ( ) , best_replica ) ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
String best_replica_path = fs : : path ( from ) / " replicas " / best_replica ;
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/// Let's find out which parts are on the best replica.
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/** Trying to download these parts.
* Some of them could be deleted due to the merge .
* In this case , update the information about the available parts and try again .
2014-10-13 17:28:59 +00:00
*/
2017-04-01 07:20:54 +00:00
2014-10-13 17:28:59 +00:00
unsigned try_no = 0 ;
Strings missing_parts ;
do
{
if ( try_no )
2020-05-23 22:24:01 +00:00
LOG_INFO ( log , " Some of parts ({}) are missing. Will try to fetch covering parts. " , missing_parts . size ( ) ) ;
2017-04-01 07:20:54 +00:00
2021-04-10 23:33:54 +00:00
if ( try_no > = query_context - > getSettings ( ) . max_fetch_partition_retries_count )
2018-03-09 23:23:15 +00:00
throw Exception ( " Too many retries to fetch parts from " + best_replica_path , ErrorCodes : : TOO_MANY_RETRIES_TO_FETCH_PARTS ) ;
2017-04-01 07:20:54 +00:00
2021-05-08 10:59:55 +00:00
Strings parts = zookeeper - > getChildren ( fs : : path ( best_replica_path ) / " parts " ) ;
2019-05-03 02:00:57 +00:00
ActiveDataPartSet active_parts_set ( format_version , parts ) ;
2014-10-13 17:28:59 +00:00
Strings parts_to_fetch ;
2017-04-01 07:20:54 +00:00
2014-10-13 17:28:59 +00:00
if ( missing_parts . empty ( ) )
{
parts_to_fetch = active_parts_set . getParts ( ) ;
2017-04-01 07:20:54 +00:00
2017-03-13 18:01:46 +00:00
/// Leaving only the parts of the desired partition.
2014-10-13 17:28:59 +00:00
Strings parts_to_fetch_partition ;
for ( const String & part : parts_to_fetch )
2017-08-14 18:16:11 +00:00
{
2019-05-03 02:00:57 +00:00
if ( MergeTreePartInfo : : fromPartName ( part , format_version ) . partition_id = = partition_id )
2014-10-13 17:28:59 +00:00
parts_to_fetch_partition . push_back ( part ) ;
2017-08-14 18:16:11 +00:00
}
2017-04-01 07:20:54 +00:00
2014-10-13 17:28:59 +00:00
parts_to_fetch = std : : move ( parts_to_fetch_partition ) ;
2017-04-01 07:20:54 +00:00
2014-10-13 18:01:58 +00:00
if ( parts_to_fetch . empty ( ) )
2017-08-14 18:16:11 +00:00
throw Exception ( " Partition " + partition_id + " on " + best_replica_path + " doesn't exist " , ErrorCodes : : PARTITION_DOESNT_EXIST ) ;
2014-10-13 17:28:59 +00:00
}
else
{
for ( const String & missing_part : missing_parts )
{
String containing_part = active_parts_set . getContainingPart ( missing_part ) ;
if ( ! containing_part . empty ( ) )
parts_to_fetch . push_back ( containing_part ) ;
else
2020-05-23 22:24:01 +00:00
LOG_WARNING ( log , " Part {} on replica {} has been vanished. " , missing_part , best_replica_path ) ;
2014-10-13 17:28:59 +00:00
}
}
2017-04-01 07:20:54 +00:00
2020-05-23 22:24:01 +00:00
LOG_INFO ( log , " Parts to fetch: {} " , parts_to_fetch . size ( ) ) ;
2017-04-01 07:20:54 +00:00
2014-10-13 17:28:59 +00:00
missing_parts . clear ( ) ;
for ( const String & part : parts_to_fetch )
{
2020-08-28 00:53:22 +00:00
bool fetched = false ;
2014-10-13 17:28:59 +00:00
try
{
2022-04-23 09:58:56 +00:00
fetched = fetchPart ( part , metadata_snapshot , best_replica_path , true , 0 , zookeeper , /* try_fetch_shared = */ false ) ;
2014-10-13 17:28:59 +00:00
}
catch ( const DB : : Exception & e )
{
2018-08-08 18:01:25 +00:00
if ( e . code ( ) ! = ErrorCodes : : RECEIVED_ERROR_FROM_REMOTE_IO_SERVER & & e . code ( ) ! = ErrorCodes : : RECEIVED_ERROR_TOO_MANY_REQUESTS
& & e . code ( ) ! = ErrorCodes : : CANNOT_READ_ALL_DATA )
2014-10-13 17:28:59 +00:00
throw ;
2017-04-01 07:20:54 +00:00
Use fmt::runtime() for LOG_* for non constexpr
Here is oneliner:
$ gg 'LOG_\(DEBUG\|TRACE\|INFO\|TEST\|WARNING\|ERROR\|FATAL\)([^,]*, [a-zA-Z]' -- :*.cpp :*.h | cut -d: -f1 | sort -u | xargs -r sed -E -i 's#(LOG_[A-Z]*)\(([^,]*), ([A-Za-z][^,)]*)#\1(\2, fmt::runtime(\3)#'
Note, that I tried to do this with coccinelle (tool for semantic
patchin), but it cannot parse C++:
$ cat fmt.cocci
@@
expression log;
expression var;
@@
-LOG_DEBUG(log, var)
+LOG_DEBUG(log, fmt::runtime(var))
I've also tried to use some macros/templates magic to do this implicitly
in logger_useful.h, but I failed to do so, and apparently it is not
possible for now.
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
v2: manual fixes
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-02-01 09:10:27 +00:00
LOG_INFO ( log , fmt : : runtime ( e . displayText ( ) ) ) ;
2014-10-13 17:28:59 +00:00
}
2020-08-28 00:53:22 +00:00
if ( ! fetched )
missing_parts . push_back ( part ) ;
2014-10-13 17:28:59 +00:00
}
2017-04-01 07:20:54 +00:00
2014-10-13 17:28:59 +00:00
+ + try_no ;
} while ( ! missing_parts . empty ( ) ) ;
2014-10-09 20:28:33 +00:00
}
2021-04-10 23:33:54 +00:00
void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context)
{
    /// Overview of the mutation algorithm.
    ///
    /// When the client executes a mutation, this method is called. It acquires block numbers in all
    /// partitions, saves them in the mutation entry and writes the mutation entry to a new ZK node in
    /// the /mutations folder. These block numbers are needed to determine which parts should be mutated and
    /// which shouldn't (parts inserted after the mutation will have the block number higher than the
    /// block number acquired by the mutation in that partition and so will not be mutated).
    /// This block number is called "mutation version" in that partition.
    ///
    /// Mutation versions are acquired atomically in all partitions, so the case when an insert in some
    /// partition has the block number higher than the mutation version but the following insert into another
    /// partition acquires the block number lower than the mutation version in that partition is impossible.
    /// Another important invariant: mutation entries appear in /mutations in the order of their mutation
    /// versions (in any partition). This means that mutations form a sequence and we can execute them in
    /// the order of their mutation versions and not worry that some mutation with the smaller version
    /// will suddenly appear.
    ///
    /// During mutations individual parts are immutable - when we want to change the contents of a part
    /// we prepare the new part and add it to MergeTreeData (the original part gets replaced). The fact that
    /// we have mutated the part is recorded in the part->info.mutation field of MergeTreePartInfo.
    /// The relation with the original part is preserved because the new part covers the same block range
    /// as the original one.
    ///
    /// We then can for each part determine its "mutation version": the version of the last mutation in
    /// the mutation sequence that we regard as already applied to that part. All mutations with the greater
    /// version number will still need to be applied to that part.
    ///
    /// Execution of mutations is done asynchronously. All replicas watch the /mutations directory and
    /// load new mutation entries as they appear (see mutationsUpdatingTask()). Next we need to determine
    /// how to mutate individual parts consistently with part merges. This is done by the leader replica
    /// (see mergeSelectingTask() and class ReplicatedMergeTreeMergePredicate for details). Important
    /// invariants here are that a) all source parts for a single merge must have the same mutation version
    /// and b) any part can be mutated only once or merged only once (e.g. once we have decided to mutate
    /// a part then we need to execute that mutation and can assign merges only to the new part and not to the
    /// original part). Multiple consecutive mutations can be executed at once (without writing the
    /// intermediate result to a part).
    ///
    /// Leader replica records its decisions to the replication log (/log directory in ZK) in the form of
    /// MUTATE_PART entries and all replicas then execute them in the background pool
    /// (see MutateTask class). When a replica encounters a MUTATE_PART command, it is
    /// guaranteed that the corresponding mutation entry is already loaded (when we pull entries from
    /// replication log into the replica queue, we also load mutation entries). Note that just as with merges
    /// the replica can decide not to do the mutation locally and fetch the mutated part from another replica
    /// instead.
    ///
    /// Mutations of individual parts are in fact pretty similar to merges, e.g. their assignment and execution
    /// is governed by the same storage_settings. TODO: support a single "merge-mutation" operation when the data
    /// read from the source parts is first mutated on the fly to some uniform mutation version and then
    /// merged to a resulting part.
    ///
    /// After all needed parts are mutated (i.e. all active parts have the mutation version greater than
    /// the version of this mutation), the mutation is considered done and can be deleted.

    ReplicatedMergeTreeMutationEntry mutation_entry;
    mutation_entry.source_replica = replica_name;
    mutation_entry.commands = commands;

    const String mutations_path = fs::path(zookeeper_path) / "mutations";
    const auto zookeeper = getZooKeeper();

    /// Update the mutations_path node when creating the mutation and check its version to ensure that
    /// nodes for mutations are created in the same order as the corresponding block numbers.
    /// Should work well if the number of concurrent mutation requests is small.
    while (true)
    {
        Coordination::Stat mutations_stat;
        zookeeper->get(mutations_path, &mutations_stat);

        /// Atomically allocate block numbers in every affected partition; the holder keeps the
        /// ephemeral block-number nodes alive until we are done committing the mutation entry.
        PartitionBlockNumbersHolder partition_block_numbers_holder =
            allocateBlockNumbersInAffectedPartitions(mutation_entry.commands, query_context, zookeeper);

        mutation_entry.block_numbers = partition_block_numbers_holder.getBlockNumbers();
        mutation_entry.create_time = time(nullptr);

        /// The following version check guarantees the linearizability property for any pair of mutations:
        /// mutation with higher sequence number is guaranteed to have higher block numbers in every partition
        /// (and thus will be applied strictly according to sequence numbers of mutations)
        Coordination::Requests requests;
        requests.emplace_back(zkutil::makeSetRequest(mutations_path, String(), mutations_stat.version));
        requests.emplace_back(zkutil::makeCreateRequest(
            fs::path(mutations_path) / "", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential));

        /// If this mutation is part of an ON CLUSTER DDL query, append its ZooKeeper ops so the
        /// whole thing commits atomically.
        if (auto txn = query_context->getZooKeeperMetadataTransaction())
            txn->moveOpsTo(requests);

        Coordination::Responses responses;
        Coordination::Error rc = zookeeper->tryMulti(requests, responses);

        /// Release the allocated block numbers only after the multi-op has been attempted:
        /// the entry (if committed) must reference block numbers that were still held at commit time.
        partition_block_numbers_holder.reset();

        if (rc == Coordination::Error::ZOK)
        {
            /// responses[1] corresponds to the sequential create request above.
            const String & path_created =
                dynamic_cast<const Coordination::CreateResponse *>(responses[1].get())->path_created;
            mutation_entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1);
            LOG_TRACE(log, "Created mutation with ID {}", mutation_entry.znode_name);
            break;
        }
        else if (rc == Coordination::Error::ZBADVERSION)
        {
            /// Cannot retry automatically, because some zookeeper ops were lost on the first attempt. Will retry on DDLWorker-level.
            if (query_context->getZooKeeperMetadataTransaction())
                throw Exception("Cannot execute alter, because mutations version was suddenly changed due to concurrent alter",
                    ErrorCodes::CANNOT_ASSIGN_ALTER);
            LOG_TRACE(log, "Version conflict when trying to create a mutation node, retrying...");
            continue;
        }
        else
            throw Coordination::Exception("Unable to create a mutation znode", rc);
    }

    /// Optionally block until the mutation is finished on this replica (mutations_sync == 1)
    /// or on all replicas (mutations_sync == 2).
    waitMutation(mutation_entry.znode_name, query_context->getSettingsRef().mutations_sync);
}
2020-01-31 12:25:31 +00:00
void StorageReplicatedMergeTree : : waitMutation ( const String & znode_name , size_t mutations_sync ) const
2020-01-29 17:28:39 +00:00
{
2020-03-20 02:49:47 +00:00
if ( ! mutations_sync )
return ;
2019-12-19 15:27:56 +00:00
/// we have to wait
2020-03-20 02:49:47 +00:00
auto zookeeper = getZooKeeper ( ) ;
Strings replicas ;
if ( mutations_sync = = 2 ) /// wait for all replicas
2021-09-11 12:16:05 +00:00
{
2021-05-08 10:59:55 +00:00
replicas = zookeeper - > getChildren ( fs : : path ( zookeeper_path ) / " replicas " ) ;
2021-09-11 12:16:05 +00:00
/// This replica should be first, to ensure that the mutation will be loaded into memory
for ( auto it = replicas . begin ( ) ; it ! = replicas . end ( ) ; + + it )
{
if ( * it = = replica_name )
{
2022-08-04 12:06:19 +00:00
std : : iter_swap ( it , replicas . begin ( ) ) ;
2021-09-11 12:16:05 +00:00
break ;
}
}
}
2020-03-20 02:49:47 +00:00
else if ( mutations_sync = = 1 ) /// just wait for ourself
replicas . push_back ( replica_name ) ;
2019-12-19 15:27:56 +00:00
2020-03-20 02:49:47 +00:00
waitMutationToFinishOnReplicas ( replicas , znode_name ) ;
2018-04-19 10:33:16 +00:00
}
2018-06-07 13:28:39 +00:00
/// Returns the status of every mutation known to this replica's queue
/// (forwarded to ReplicatedMergeTreeQueue; used by system.mutations).
std::vector<MergeTreeMutationStatus> StorageReplicatedMergeTree::getMutationsStatus() const
{
    return queue.getMutationsStatus();
}
2019-02-04 13:04:02 +00:00
CancellationCode StorageReplicatedMergeTree::killMutation(const String & mutation_id)
{
    /// Removes the mutation entry from ZooKeeper and the local queue, then cancels
    /// any part mutations of that entry that are already running.
    assertNotReadonly();

    zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly();

    LOG_INFO(log, "Killing mutation {}", mutation_id);

    auto removed_entry = queue.removeMutation(zookeeper, mutation_id);
    if (!removed_entry)
        return CancellationCode::NotFound;

    /// After this point no new part mutations will start and part mutations that still exist
    /// in the queue will be skipped.

    /// Cancel already running part mutations.
    for (const auto & [partition_id, block_number] : removed_entry->block_numbers)
        getContext()->getMergeList().cancelPartMutations(getStorageID(), partition_id, block_number);

    return CancellationCode::CancelSent;
}
2022-07-26 03:50:09 +00:00
/// Whether any part of this table carries a lightweight-delete mask.
/// Relaxed load is sufficient: the flag is advisory and monotonic enough for readers.
bool StorageReplicatedMergeTree::hasLightweightDeletedMask() const
{
    return has_lightweight_delete_parts.load(std::memory_order_relaxed);
}
2017-10-06 11:30:57 +00:00
/// Background cleanup: grabs Outdated parts (moving them to Deleting state), removes them from
/// ZooKeeper and the filesystem, and rolls back to Outdated any part whose removal failed so that
/// a later iteration can retry it.
void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK()
{
    auto table_lock = lockForShare(
        RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
    auto zookeeper = getZooKeeper();

    /// Now these parts are in Deleting state. If we fail to remove some of them we must roll them back to Outdated state.
    /// Otherwise they will not be deleted.
    DataPartsVector parts = grabOldParts();
    if (parts.empty())
        return;

    DataPartsVector parts_to_delete_only_from_filesystem;    // Only duplicates
    DataPartsVector parts_to_delete_completely;              // All parts except duplicates
    DataPartsVector parts_to_retry_deletion;                 // Parts that should be retried due to network problems
    DataPartsVector parts_to_remove_from_filesystem;         // Parts removed from ZK

    /// Duplicates were never registered in ZooKeeper, so they are deleted from disk only.
    for (const auto & part : parts)
    {
        if (!part->is_duplicate)
            parts_to_delete_completely.emplace_back(part);
        else
            parts_to_delete_only_from_filesystem.emplace_back(part);
    }
    parts.clear();

    /// Deletes the given parts from disk; parts that failed to delete are rolled back to Outdated
    /// (so they will be retried by grabOldParts later), the rest are finally removed from memory.
    auto delete_parts_from_fs_and_rollback_in_case_of_error = [this] (const DataPartsVector & parts_to_delete, const String & parts_type)
    {
        NameSet parts_failed_to_delete;
        clearPartsFromFilesystem(parts_to_delete, false, &parts_failed_to_delete);

        DataPartsVector finally_remove_parts;
        if (!parts_failed_to_delete.empty())
        {
            DataPartsVector rollback_parts;
            for (const auto & part : parts_to_delete)
            {
                if (!parts_failed_to_delete.contains(part->name))
                    finally_remove_parts.push_back(part);
                else
                    rollback_parts.push_back(part);
            }
            if (!rollback_parts.empty())
                rollbackDeletingParts(rollback_parts);
        }
        else /// all parts were successfully removed
        {
            finally_remove_parts = parts_to_delete;
        }

        try
        {
            removePartsFinally(finally_remove_parts);
            LOG_DEBUG(log, "Removed {} {} parts", finally_remove_parts.size(), parts_type);
        }
        catch (...)
        {
            tryLogCurrentException(log, "Failed to remove some parts from memory, or write info about them into part log");
        }
    };

    /// Delete duplicate parts from filesystem
    if (!parts_to_delete_only_from_filesystem.empty())
    {
        /// It can happen that some error appear during part removal from FS.
        /// In case of such exception we have to change state of failed parts from Deleting to Outdated.
        /// Otherwise nobody will try to remove them again (see grabOldParts).
        delete_parts_from_fs_and_rollback_in_case_of_error(parts_to_delete_only_from_filesystem, "old duplicate");
    }

    /// Delete normal parts from ZooKeeper
    NameSet part_names_to_retry_deletion;
    try
    {
        Strings part_names_to_delete_completely;
        for (const auto & part : parts_to_delete_completely)
            part_names_to_delete_completely.emplace_back(part->name);

        LOG_DEBUG(log, "Removing {} old parts from ZooKeeper", parts_to_delete_completely.size());
        removePartsFromZooKeeper(zookeeper, part_names_to_delete_completely, &part_names_to_retry_deletion);
    }
    catch (...)
    {
        /// Best effort: parts we could not confirm as removed from ZK are retried below.
        LOG_ERROR(log, "There is a problem with deleting parts from ZooKeeper: {}", getCurrentExceptionMessage(true));
    }

    /// Part names that were reliably deleted from ZooKeeper should be deleted from filesystem
    auto num_reliably_deleted_parts = parts_to_delete_completely.size() - part_names_to_retry_deletion.size();
    LOG_DEBUG(log, "Removed {} old parts from ZooKeeper. Removing them from filesystem.", num_reliably_deleted_parts);

    /// Delete normal parts on two sets
    for (auto & part : parts_to_delete_completely)
    {
        if (!part_names_to_retry_deletion.contains(part->name))
            parts_to_remove_from_filesystem.emplace_back(part);
        else
            parts_to_retry_deletion.emplace_back(part);
    }

    /// Will retry deletion
    if (!parts_to_retry_deletion.empty())
    {
        rollbackDeletingParts(parts_to_retry_deletion);
        LOG_DEBUG(log, "Will retry deletion of {} parts in the next time", parts_to_retry_deletion.size());
    }

    /// Remove parts from filesystem and finally from data_parts
    if (!parts_to_remove_from_filesystem.empty())
    {
        /// It can happen that some error appear during part removal from FS.
        /// In case of such exception we have to change state of failed parts from Deleting to Outdated.
        /// Otherwise nobody will try to remove them again (see grabOldParts).
        delete_parts_from_fs_and_rollback_in_case_of_error(parts_to_remove_from_filesystem, "old");
    }
}
2021-03-05 09:50:26 +00:00
void StorageReplicatedMergeTree : : removePartsFromZooKeeperWithRetries ( DataPartsVector & parts , size_t max_retries )
2018-05-21 13:49:54 +00:00
{
Strings part_names_to_remove ;
for ( const auto & part : parts )
part_names_to_remove . emplace_back ( part - > name ) ;
2021-03-05 09:50:26 +00:00
return removePartsFromZooKeeperWithRetries ( part_names_to_remove , max_retries ) ;
2018-05-21 13:49:54 +00:00
}
2021-03-05 09:50:26 +00:00
void StorageReplicatedMergeTree : : removePartsFromZooKeeperWithRetries ( const Strings & part_names , size_t max_retries )
2018-05-21 13:49:54 +00:00
{
size_t num_tries = 0 ;
2018-12-11 13:30:20 +00:00
bool success = false ;
2018-05-21 13:49:54 +00:00
2018-12-11 13:30:20 +00:00
while ( ! success & & ( max_retries = = 0 | | num_tries < max_retries ) )
2018-05-21 13:49:54 +00:00
{
try
{
2018-12-11 13:30:20 +00:00
+ + num_tries ;
success = true ;
2018-05-21 13:49:54 +00:00
auto zookeeper = getZooKeeper ( ) ;
2022-10-11 09:27:46 +00:00
Strings exists_paths ;
exists_paths . reserve ( part_names . size ( ) ) ;
2018-05-21 13:49:54 +00:00
for ( const String & part_name : part_names )
{
2022-10-11 09:27:46 +00:00
exists_paths . emplace_back ( fs : : path ( replica_path ) / " parts " / part_name ) ;
2018-12-11 13:30:20 +00:00
}
2018-05-21 13:49:54 +00:00
2022-10-11 09:27:46 +00:00
auto exists_results = zookeeper - > exists ( exists_paths ) ;
2018-12-11 13:30:20 +00:00
std : : vector < std : : future < Coordination : : MultiResponse > > remove_futures ;
remove_futures . reserve ( part_names . size ( ) ) ;
for ( size_t i = 0 ; i < part_names . size ( ) ; + + i )
{
2022-10-11 09:27:46 +00:00
Coordination : : ExistsResponse exists_resp = exists_results [ i ] ;
2020-06-12 15:09:12 +00:00
if ( exists_resp . error = = Coordination : : Error : : ZOK )
2018-12-11 13:30:20 +00:00
{
Coordination : : Requests ops ;
removePartFromZooKeeper ( part_names [ i ] , ops , exists_resp . stat . numChildren > 0 ) ;
2021-06-01 07:30:06 +00:00
remove_futures . emplace_back ( zookeeper - > asyncTryMultiNoThrow ( ops ) ) ;
2018-12-11 13:30:20 +00:00
}
2018-05-21 13:49:54 +00:00
}
2018-12-11 13:30:20 +00:00
for ( auto & future : remove_futures )
2018-05-21 13:49:54 +00:00
{
auto response = future . get ( ) ;
2020-06-12 15:09:12 +00:00
if ( response . error = = Coordination : : Error : : ZOK | | response . error = = Coordination : : Error : : ZNONODE )
2018-05-21 13:49:54 +00:00
continue ;
2018-08-25 01:58:14 +00:00
if ( Coordination : : isHardwareError ( response . error ) )
2018-05-21 13:49:54 +00:00
{
2018-12-11 13:30:20 +00:00
success = false ;
2018-05-21 13:49:54 +00:00
continue ;
}
2018-08-25 01:58:14 +00:00
throw Coordination : : Exception ( response . error ) ;
2018-05-21 13:49:54 +00:00
}
}
2018-08-25 01:58:14 +00:00
catch ( Coordination : : Exception & e )
2018-05-21 13:49:54 +00:00
{
2018-12-11 13:30:20 +00:00
success = false ;
2018-05-21 13:49:54 +00:00
2018-08-25 01:58:14 +00:00
if ( Coordination : : isHardwareError ( e . code ) )
2018-05-21 13:49:54 +00:00
tryLogCurrentException ( log , __PRETTY_FUNCTION__ ) ;
else
throw ;
}
2018-12-11 13:30:20 +00:00
if ( ! success & & num_tries < max_retries )
2018-05-21 13:49:54 +00:00
std : : this_thread : : sleep_for ( std : : chrono : : milliseconds ( 1000 ) ) ;
}
2021-03-05 09:50:26 +00:00
if ( ! success )
throw Exception ( ErrorCodes : : UNFINISHED , " Failed to remove parts from ZooKeeper after {} retries " , num_tries ) ;
2018-05-21 13:49:54 +00:00
}
2018-12-11 13:30:20 +00:00
/// Single-pass removal of the given parts from this replica's `parts` node in ZooKeeper.
/// Unlike the WithRetries variant, this does not retry: parts that hit hardware (session)
/// errors are recorded into `parts_should_be_retried` (if provided) for the caller to retry later.
void StorageReplicatedMergeTree::removePartsFromZooKeeper(
    zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names, NameSet * parts_should_be_retried)
{
    Strings exists_paths;
    std::vector<std::future<Coordination::MultiResponse>> remove_futures;
    exists_paths.reserve(part_names.size());
    remove_futures.reserve(part_names.size());
    try
    {
        /// Exception can be thrown from loop
        /// if zk session will be dropped
        for (const String & part_name : part_names)
        {
            exists_paths.emplace_back(fs::path(replica_path) / "parts" / part_name);
        }

        /// Batched existence check before issuing removal ops.
        auto exists_results = zookeeper->exists(exists_paths);

        for (size_t i = 0; i < part_names.size(); ++i)
        {
            auto exists_resp = exists_results[i];
            if (exists_resp.error == Coordination::Error::ZOK)
            {
                Coordination::Requests ops;
                /// numChildren > 0: the part znode still has child nodes that must be removed with it.
                removePartFromZooKeeper(part_names[i], ops, exists_resp.stat.numChildren > 0);
                remove_futures.emplace_back(zookeeper->asyncTryMultiNoThrow(ops));
            }
            else
            {
                LOG_DEBUG(log, "There is no part {} in ZooKeeper, it was only in filesystem", part_names[i]);
                // emplace invalid future so that the total number of futures is the same as part_names.size();
                remove_futures.emplace_back();
            }
        }
    }
    catch (const Coordination::Exception & e)
    {
        /// On a session-level failure nothing below was awaited, so conservatively
        /// mark ALL parts for retry before propagating the exception.
        if (parts_should_be_retried && Coordination::isHardwareError(e.code))
            parts_should_be_retried->insert(part_names.begin(), part_names.end());
        throw;
    }

    /// remove_futures is index-aligned with part_names (invalid futures are placeholders
    /// for parts that were absent in ZooKeeper).
    for (size_t i = 0; i < remove_futures.size(); ++i)
    {
        auto & future = remove_futures[i];
        if (!future.valid())
            continue;

        auto response = future.get();
        if (response.error == Coordination::Error::ZOK)
            continue;
        else if (response.error == Coordination::Error::ZNONODE)
        {
            /// Concurrently removed by someone else — nothing to do.
            LOG_DEBUG(log, "There is no part {} in ZooKeeper, it was only in filesystem", part_names[i]);
            continue;
        }
        else if (Coordination::isHardwareError(response.error))
        {
            /// Transient error for this specific part: let the caller retry it.
            if (parts_should_be_retried)
                parts_should_be_retried->insert(part_names[i]);
            continue;
        }
        else
            /// Unexpected logical error: log and keep going; the part stays in ZooKeeper.
            LOG_WARNING(log, "Cannot remove part {} from ZooKeeper: {}", part_names[i], Coordination::errorMessage(response.error));
    }
}
2020-11-02 17:30:53 +00:00
void StorageReplicatedMergeTree : : getClearBlocksInPartitionOps (
Coordination : : Requests & ops , zkutil : : ZooKeeper & zookeeper , const String & partition_id , Int64 min_block_num , Int64 max_block_num )
2017-11-15 16:32:47 +00:00
{
Strings blocks ;
2021-05-08 10:59:55 +00:00
if ( Coordination : : Error : : ZOK ! = zookeeper . tryGetChildren ( fs : : path ( zookeeper_path ) / " blocks " , blocks ) )
2017-11-15 16:32:47 +00:00
throw Exception ( zookeeper_path + " /blocks doesn't exist " , ErrorCodes : : NOT_FOUND_NODE ) ;
String partition_prefix = partition_id + " _ " ;
2018-08-25 01:58:14 +00:00
zkutil : : AsyncResponses < Coordination : : GetResponse > get_futures ;
2021-08-24 12:57:49 +00:00
2017-11-15 16:32:47 +00:00
for ( const String & block_id : blocks )
{
if ( startsWith ( block_id , partition_prefix ) )
{
2021-05-08 10:59:55 +00:00
String path = fs : : path ( zookeeper_path ) / " blocks " / block_id ;
2017-11-15 16:32:47 +00:00
get_futures . emplace_back ( path , zookeeper . asyncTryGet ( path ) ) ;
}
}
for ( auto & pair : get_futures )
{
const String & path = pair . first ;
2018-03-24 01:00:12 +00:00
auto result = pair . second . get ( ) ;
2017-11-15 16:32:47 +00:00
2020-06-12 15:09:12 +00:00
if ( result . error = = Coordination : : Error : : ZNONODE )
2017-11-15 16:32:47 +00:00
continue ;
2018-03-24 01:00:12 +00:00
ReadBufferFromString buf ( result . data ) ;
2021-08-24 12:57:49 +00:00
const auto part_info = MergeTreePartInfo : : tryParsePartName ( result . data , format_version ) ;
if ( ! part_info | | ( min_block_num < = part_info - > min_block & & part_info - > max_block < = max_block_num ) )
2020-11-02 17:30:53 +00:00
ops . emplace_back ( zkutil : : makeRemoveRequest ( path , - 1 ) ) ;
2017-11-15 16:32:47 +00:00
}
2020-11-02 17:30:53 +00:00
}
2017-11-15 16:32:47 +00:00
2020-11-02 17:30:53 +00:00
void StorageReplicatedMergeTree : : clearBlocksInPartition (
zkutil : : ZooKeeper & zookeeper , const String & partition_id , Int64 min_block_num , Int64 max_block_num )
{
Coordination : : Requests delete_requests ;
getClearBlocksInPartitionOps ( delete_requests , zookeeper , partition_id , min_block_num , max_block_num ) ;
Coordination : : Responses delete_responses ;
auto code = zookeeper . tryMulti ( delete_requests , delete_responses ) ;
if ( code ! = Coordination : : Error : : ZOK )
2017-11-15 16:32:47 +00:00
{
2020-11-02 17:30:53 +00:00
for ( size_t i = 0 ; i < delete_requests . size ( ) ; + + i )
if ( delete_responses [ i ] - > error ! = Coordination : : Error : : ZOK )
LOG_WARNING ( log , " Error while deleting ZooKeeper path `{}`: {}, ignoring. " , delete_requests [ i ] - > getPath ( ) , Coordination : : errorMessage ( delete_responses [ i ] - > error ) ) ;
2017-11-15 16:32:47 +00:00
}
2020-11-02 17:30:53 +00:00
LOG_TRACE ( log , " Deleted {} deduplication block IDs in partition ID {} " , delete_requests . size ( ) , partition_id ) ;
2017-11-15 16:32:47 +00:00
}
2020-06-26 11:30:23 +00:00
/// Implements ALTER TABLE ... REPLACE/ATTACH PARTITION FROM source_table.
/// Clones the source partition's parts into this table, publishes a single REPLACE_RANGE
/// log entry, and (for REPLACE) drops the previously existing parts of the partition.
/// Retries the whole procedure if a concurrent ALTER PARTITION bumps alter_partition_version.
void StorageReplicatedMergeTree::replacePartitionFrom(
    const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr query_context)
{
    /// First argument is true, because we possibly will add new data to current table.
    auto lock1 = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
    auto lock2 = source_table->lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);

    auto storage_settings_ptr = getSettings();

    auto source_metadata_snapshot = source_table->getInMemoryMetadataPtr();
    auto metadata_snapshot = getInMemoryMetadataPtr();

    Stopwatch watch;
    /// Throws if the source table's structure is incompatible with ours.
    MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, metadata_snapshot);
    String partition_id = getPartitionIDFromQuery(partition, query_context);

    /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet.
    DataPartsVector src_all_parts = src_data.getVisibleDataPartsVectorInPartition(query_context, partition_id);

    LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size());

    static const String TMP_PREFIX = "tmp_replace_from_";
    auto zookeeper = getZooKeeper();

    /// Retry if alter_partition_version changes
    for (size_t retry = 0; retry < 1000; ++retry)
    {
        DataPartsVector src_parts;
        MutableDataPartsVector dst_parts;
        /// Keep the temporary-directory locks of cloned parts alive until commit.
        std::vector<scope_guard> dst_parts_locks;
        Strings block_id_paths;
        Strings part_checksums;
        std::vector<EphemeralLockInZooKeeper> ephemeral_locks;
        String alter_partition_version_path = zookeeper_path + "/alter_partition_version";
        Coordination::Stat alter_partition_version_stat;
        /// Remember the version now; the final multi-op checks it to detect concurrent ALTER PARTITION.
        zookeeper->get(alter_partition_version_path, &alter_partition_version_stat);

        /// Firstly, generate last block number and compute drop_range
        /// NOTE: Even if we make ATTACH PARTITION instead of REPLACE PARTITION drop_range will not be empty, it will contain a block.
        /// So, such case has special meaning, if drop_range contains only one block it means that nothing to drop.
        /// TODO why not to add normal DROP_RANGE entry to replication queue if `replace` is true?
        MergeTreePartInfo drop_range;
        std::optional<EphemeralLockInZooKeeper> delimiting_block_lock;
        bool partition_was_empty = !getFakePartCoveringAllPartsInPartition(partition_id, drop_range, delimiting_block_lock, true);
        if (replace && partition_was_empty)
        {
            /// Nothing to drop, will just attach new parts
            LOG_INFO(log, "Partition {} was empty, REPLACE PARTITION will work as ATTACH PARTITION FROM", drop_range.partition_id);
            replace = false;
        }

        if (!replace)
        {
            /// It's ATTACH PARTITION FROM, not REPLACE PARTITION. We have to reset drop range
            drop_range = makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(partition_id);
        }

        assert(replace == !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range));

        String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range);

        std::vector<MergeTreeData::HardlinkedFiles> hardlinked_files_for_parts;

        for (const auto & src_part : src_all_parts)
        {
            /// We also make some kind of deduplication to avoid duplicated parts in case of ATTACH PARTITION
            /// Assume that merges in the partition are quite rare
            /// Save deduplication block ids with special prefix replace_partition

            if (!canReplacePartition(src_part))
                throw Exception(
                    "Cannot replace partition '" + partition_id + "' because part '" + src_part->name + "' has inconsistent granularity with table",
                    ErrorCodes::LOGICAL_ERROR);

            String hash_hex = src_part->checksums.getTotalChecksumHex();

            if (replace)
                LOG_INFO(log, "Trying to replace {} with hash_hex {}", src_part->name, hash_hex);
            else
                LOG_INFO(log, "Trying to attach {} with hash_hex {}", src_part->name, hash_hex);

            /// For ATTACH, a checksum-based block ID deduplicates repeated attaches of the same part.
            String block_id_path = replace ? "" : (fs::path(zookeeper_path) / "blocks" / (partition_id + "_replace_from_" + hash_hex));

            auto lock = allocateBlockNumber(partition_id, zookeeper, block_id_path);
            if (!lock)
            {
                /// Deduplication hit: this exact part content was already attached earlier.
                LOG_INFO(log, "Part {} (hash {}) has been already attached", src_part->name, hash_hex);
                continue;
            }

            UInt64 index = lock->getNumber();
            MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level);

            MergeTreeData::HardlinkedFiles hardlinked_files;

            /// Zero-copy replication cannot share hardlinks with the source table's part — copy instead.
            bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication
                && src_part->isStoredOnRemoteDiskWithZeroCopySupport();

            auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink, {});

            src_parts.emplace_back(src_part);
            dst_parts.emplace_back(dst_part);
            dst_parts_locks.emplace_back(std::move(part_lock));
            ephemeral_locks.emplace_back(std::move(*lock));
            block_id_paths.emplace_back(block_id_path);
            part_checksums.emplace_back(hash_hex);
            hardlinked_files_for_parts.emplace_back(hardlinked_files);
        }

        /// Build the REPLACE_RANGE log entry describing both the drop range and the new parts.
        ReplicatedMergeTreeLogEntryData entry;
        {
            auto src_table_id = src_data.getStorageID();
            entry.type = ReplicatedMergeTreeLogEntryData::REPLACE_RANGE;
            entry.source_replica = replica_name;
            entry.create_time = time(nullptr);
            entry.replace_range_entry = std::make_shared<ReplicatedMergeTreeLogEntryData::ReplaceRangeEntry>();

            auto & entry_replace = *entry.replace_range_entry;
            entry_replace.drop_range_part_name = drop_range_fake_part_name;
            entry_replace.from_database = src_table_id.database_name;
            entry_replace.from_table = src_table_id.table_name;
            for (const auto & part : src_parts)
                entry_replace.src_part_names.emplace_back(part->name);
            for (const auto & part : dst_parts)
                entry_replace.new_part_names.emplace_back(part->name);
            for (const String & checksum : part_checksums)
                entry_replace.part_names_checksums.emplace_back(checksum);
            entry_replace.columns_version = -1;
        }

        /// Remove deduplication block_ids of replacing parts
        if (replace)
            clearBlocksInPartition(*zookeeper, drop_range.partition_id, drop_range.max_block, drop_range.max_block);

        DataPartsVector parts_to_remove;
        Coordination::Responses op_results;

        try
        {
            Coordination::Requests ops;
            for (size_t i = 0; i < dst_parts.size(); ++i)
            {
                getCommitPartOps(ops, dst_parts[i], block_id_paths[i]);
                ephemeral_locks[i].getUnlockOps(ops);
            }

            /// If this ALTER runs inside a distributed DDL transaction, fold its ops in.
            if (auto txn = query_context->getZooKeeperMetadataTransaction())
                txn->moveOpsTo(ops);

            delimiting_block_lock->getUnlockOps(ops);
            /// Check and update version to avoid race with DROP_RANGE
            ops.emplace_back(zkutil::makeSetRequest(alter_partition_version_path, "", alter_partition_version_stat.version));
            /// Just update version, because merges assignment relies on it
            ops.emplace_back(zkutil::makeSetRequest(fs::path(zookeeper_path) / "log", "", -1));
            ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential));

            Transaction transaction(*this, NO_TRANSACTION_RAW);
            {
                auto data_parts_lock = lockParts();

                for (auto & part : dst_parts)
                    renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock);
            }

            /// For zero-copy replication: register shared-data locks for hardlinked files.
            for (size_t i = 0; i < dst_parts.size(); ++i)
                lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]);

            Coordination::Error code = zookeeper->tryMulti(ops, op_results);
            if (code == Coordination::Error::ZOK)
                delimiting_block_lock->assumeUnlocked();
            else if (code == Coordination::Error::ZBADVERSION)
            {
                /// Cannot retry automatically, because some zookeeper ops were lost on the first attempt. Will retry on DDLWorker-level.
                if (query_context->getZooKeeperMetadataTransaction())
                    throw Exception(
                        "Cannot execute alter, because alter partition version was suddenly changed due to concurrent alter",
                        ErrorCodes::CANNOT_ASSIGN_ALTER);
                continue;
            }
            else
                zkutil::KeeperMultiException::check(code, ops, op_results);

            {
                auto data_parts_lock = lockParts();
                /// The ZooKeeper multi succeeded, so the in-memory rename is now made permanent,
                /// and (for REPLACE) the old parts of the range are retired from the working set.
                transaction.commit(&data_parts_lock);
                if (replace)
                    parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper(NO_TRANSACTION_RAW, drop_range, data_parts_lock);
            }

            PartLog::addNewParts(getContext(), dst_parts, watch.elapsed());
        }
        catch (...)
        {
            PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());

            /// Roll back the shared-data locks taken above; the Transaction destructor
            /// rolls back the in-memory renames.
            for (const auto & dst_part : dst_parts)
                unlockSharedData(*dst_part);

            throw;
        }

        /// The sequential create of the log entry is the last request in `ops`.
        String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*op_results.back()).path_created;
        entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);

        for (auto & lock : ephemeral_locks)
            lock.assumeUnlocked();

        /// Forcibly remove replaced parts from ZooKeeper
        removePartsFromZooKeeperWithRetries(parts_to_remove);

        /// Speedup removing of replaced parts from filesystem
        parts_to_remove.clear();
        cleanup_thread.wakeup();

        lock2.reset();
        lock1.reset();

        waitForLogEntryToBeProcessedIfNecessary(entry, query_context);

        return;
    }

    throw Exception(
        ErrorCodes::CANNOT_ASSIGN_ALTER, "Cannot assign ALTER PARTITION, because another ALTER PARTITION query was concurrently executed");
}
2021-04-10 23:33:54 +00:00
void StorageReplicatedMergeTree : : movePartitionToTable ( const StoragePtr & dest_table , const ASTPtr & partition , ContextPtr query_context )
2019-07-26 08:42:17 +00:00
{
2021-04-10 23:33:54 +00:00
auto lock1 = lockForShare ( query_context - > getCurrentQueryId ( ) , query_context - > getSettingsRef ( ) . lock_acquire_timeout ) ;
auto lock2 = dest_table - > lockForShare ( query_context - > getCurrentQueryId ( ) , query_context - > getSettingsRef ( ) . lock_acquire_timeout ) ;
2022-04-21 12:39:12 +00:00
auto storage_settings_ptr = getSettings ( ) ;
2019-09-17 09:00:20 +00:00
2019-08-28 08:24:17 +00:00
auto dest_table_storage = std : : dynamic_pointer_cast < StorageReplicatedMergeTree > ( dest_table ) ;
if ( ! dest_table_storage )
2020-02-21 16:57:40 +00:00
throw Exception ( " Table " + getStorageID ( ) . getNameForLogs ( ) + " supports movePartitionToTable only for ReplicatedMergeTree family of table engines. "
2019-08-28 08:24:17 +00:00
" Got " + dest_table - > getName ( ) , ErrorCodes : : NOT_IMPLEMENTED ) ;
2020-01-08 09:57:27 +00:00
if ( dest_table_storage - > getStoragePolicy ( ) ! = this - > getStoragePolicy ( ) )
2020-01-15 11:55:20 +00:00
throw Exception ( " Destination table " + dest_table_storage - > getStorageID ( ) . getNameForLogs ( ) +
" should have the same storage policy of source table " + getStorageID ( ) . getNameForLogs ( ) + " . " +
getStorageID ( ) . getNameForLogs ( ) + " : " + this - > getStoragePolicy ( ) - > getName ( ) + " , " +
2020-10-16 11:58:47 +00:00
getStorageID ( ) . getNameForLogs ( ) + " : " + dest_table_storage - > getStoragePolicy ( ) - > getName ( ) , ErrorCodes : : UNKNOWN_POLICY ) ;
2019-08-28 08:24:17 +00:00
2020-06-17 10:34:23 +00:00
auto dest_metadata_snapshot = dest_table - > getInMemoryMetadataPtr ( ) ;
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
2019-08-28 08:24:17 +00:00
Stopwatch watch ;
2020-06-17 10:34:23 +00:00
MergeTreeData & src_data = dest_table_storage - > checkStructureAndGetMergeTreeData ( * this , metadata_snapshot , dest_metadata_snapshot ) ;
2020-01-15 11:55:20 +00:00
auto src_data_id = src_data . getStorageID ( ) ;
2020-04-13 15:21:05 +00:00
String partition_id = getPartitionIDFromQuery ( partition , query_context ) ;
2019-08-28 08:24:17 +00:00
2021-08-23 17:50:50 +00:00
/// A range for log entry to remove parts from the source table (myself).
auto zookeeper = getZooKeeper ( ) ;
2021-11-03 07:46:33 +00:00
/// Retry if alter_partition_version changes
for ( size_t retry = 0 ; retry < 1000 ; + + retry )
{
String alter_partition_version_path = zookeeper_path + " /alter_partition_version " ;
Coordination : : Stat alter_partition_version_stat ;
zookeeper - > get ( alter_partition_version_path , & alter_partition_version_stat ) ;
MergeTreePartInfo drop_range ;
std : : optional < EphemeralLockInZooKeeper > delimiting_block_lock ;
getFakePartCoveringAllPartsInPartition ( partition_id , drop_range , delimiting_block_lock , true ) ;
String drop_range_fake_part_name = getPartNamePossiblyFake ( format_version , drop_range ) ;
DataPartPtr covering_part ;
DataPartsVector src_all_parts ;
{
/// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet.
auto parts_lock = src_data . lockParts ( ) ;
src_all_parts = src_data . getActivePartsToReplace ( drop_range , drop_range_fake_part_name , covering_part , parts_lock ) ;
}
if ( covering_part )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Got part {} covering drop range {}, it's a bug " ,
covering_part - > name , drop_range_fake_part_name ) ;
/// After allocating block number for drop_range we must ensure that it does not intersect block numbers
/// allocated by concurrent REPLACE query.
/// We could check it in multi-request atomically with creation of DROP_RANGE entry in source table log,
/// but it's better to check it here and fail as early as possible (before we have done something to destination table).
Coordination : : Error version_check_code = zookeeper - > trySet ( alter_partition_version_path , " " , alter_partition_version_stat . version ) ;
if ( version_check_code ! = Coordination : : Error : : ZOK )
throw Exception ( ErrorCodes : : CANNOT_ASSIGN_ALTER , " Cannot DROP PARTITION in {} after copying partition to {}, "
" because another ALTER PARTITION query was concurrently executed " ,
getStorageID ( ) . getFullTableName ( ) , dest_table_storage - > getStorageID ( ) . getFullTableName ( ) ) ;
DataPartsVector src_parts ;
MutableDataPartsVector dst_parts ;
Strings block_id_paths ;
Strings part_checksums ;
std : : vector < EphemeralLockInZooKeeper > ephemeral_locks ;
LOG_DEBUG ( log , " Cloning {} parts " , src_all_parts . size ( ) ) ;
static const String TMP_PREFIX = " tmp_move_from_ " ;
/// Clone parts into destination table.
String dest_alter_partition_version_path = dest_table_storage - > zookeeper_path + " /alter_partition_version " ;
Coordination : : Stat dest_alter_partition_version_stat ;
zookeeper - > get ( dest_alter_partition_version_path , & dest_alter_partition_version_stat ) ;
2022-04-19 13:53:10 +00:00
std : : vector < MergeTreeData : : HardlinkedFiles > hardlinked_files_for_parts ;
2022-08-09 16:44:51 +00:00
std : : vector < scope_guard > temporary_parts_locks ;
2022-04-19 13:53:10 +00:00
2021-11-03 07:46:33 +00:00
for ( const auto & src_part : src_all_parts )
{
if ( ! dest_table_storage - > canReplacePartition ( src_part ) )
throw Exception (
" Cannot move partition ' " + partition_id + " ' because part ' " + src_part - > name + " ' has inconsistent granularity with table " ,
ErrorCodes : : LOGICAL_ERROR ) ;
String hash_hex = src_part - > checksums . getTotalChecksumHex ( ) ;
String block_id_path ;
auto lock = dest_table_storage - > allocateBlockNumber ( partition_id , zookeeper , block_id_path ) ;
if ( ! lock )
{
LOG_INFO ( log , " Part {} (hash {}) has been already attached " , src_part - > name , hash_hex ) ;
continue ;
}
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
UInt64 index = lock - > getNumber ( ) ;
MergeTreePartInfo dst_part_info ( partition_id , index , index , src_part - > info . level ) ;
2022-04-19 13:53:10 +00:00
MergeTreeData : : HardlinkedFiles hardlinked_files ;
2022-04-21 12:39:12 +00:00
bool copy_instead_of_hardlink = storage_settings_ptr - > allow_remote_fs_zero_copy_replication
& & src_part - > isStoredOnRemoteDiskWithZeroCopySupport ( ) ;
2022-09-27 13:23:02 +00:00
auto [ dst_part , dst_part_lock ] = dest_table_storage - > cloneAndLoadDataPartOnSameDisk ( src_part , TMP_PREFIX , dst_part_info , dest_metadata_snapshot , NO_TRANSACTION_PTR , & hardlinked_files , copy_instead_of_hardlink , { } ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
src_parts . emplace_back ( src_part ) ;
dst_parts . emplace_back ( dst_part ) ;
2022-08-09 16:44:51 +00:00
temporary_parts_locks . emplace_back ( std : : move ( dst_part_lock ) ) ;
2021-11-03 07:46:33 +00:00
ephemeral_locks . emplace_back ( std : : move ( * lock ) ) ;
block_id_paths . emplace_back ( block_id_path ) ;
part_checksums . emplace_back ( hash_hex ) ;
2022-04-19 13:53:10 +00:00
hardlinked_files_for_parts . emplace_back ( hardlinked_files ) ;
2021-11-03 07:46:33 +00:00
}
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
ReplicatedMergeTreeLogEntryData entry_delete ;
2019-08-28 08:24:17 +00:00
{
2021-11-03 07:46:33 +00:00
entry_delete . type = LogEntry : : DROP_RANGE ;
entry_delete . source_replica = replica_name ;
entry_delete . new_part_name = drop_range_fake_part_name ;
entry_delete . detach = false ; //-V1048
entry_delete . create_time = time ( nullptr ) ;
2019-08-28 08:24:17 +00:00
}
2021-11-03 07:46:33 +00:00
ReplicatedMergeTreeLogEntryData entry ;
{
MergeTreePartInfo drop_range_dest = makeDummyDropRangeForMovePartitionOrAttachPartitionFrom ( partition_id ) ;
2019-08-30 04:24:05 +00:00
2021-11-03 07:46:33 +00:00
entry . type = ReplicatedMergeTreeLogEntryData : : REPLACE_RANGE ;
entry . source_replica = dest_table_storage - > replica_name ;
entry . create_time = time ( nullptr ) ;
entry . replace_range_entry = std : : make_shared < ReplicatedMergeTreeLogEntryData : : ReplaceRangeEntry > ( ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
auto & entry_replace = * entry . replace_range_entry ;
entry_replace . drop_range_part_name = getPartNamePossiblyFake ( format_version , drop_range_dest ) ;
entry_replace . from_database = src_data_id . database_name ;
entry_replace . from_table = src_data_id . table_name ;
for ( const auto & part : src_parts )
entry_replace . src_part_names . emplace_back ( part - > name ) ;
for ( const auto & part : dst_parts )
entry_replace . new_part_names . emplace_back ( part - > name ) ;
for ( const String & checksum : part_checksums )
entry_replace . part_names_checksums . emplace_back ( checksum ) ;
entry_replace . columns_version = - 1 ;
}
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
clearBlocksInPartition ( * zookeeper , drop_range . partition_id , drop_range . max_block , drop_range . max_block ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
DataPartsVector parts_to_remove ;
Coordination : : Responses op_results ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
try
2019-08-28 08:24:17 +00:00
{
2021-11-03 07:46:33 +00:00
Coordination : : Requests ops ;
for ( size_t i = 0 ; i < dst_parts . size ( ) ; + + i )
2019-08-28 08:24:17 +00:00
{
2021-11-03 07:46:33 +00:00
dest_table_storage - > getCommitPartOps ( ops , dst_parts [ i ] , block_id_paths [ i ] ) ;
ephemeral_locks [ i ] . getUnlockOps ( ops ) ;
2019-08-28 08:24:17 +00:00
}
2021-11-03 07:46:33 +00:00
/// Check and update version to avoid race with DROP_RANGE
ops . emplace_back ( zkutil : : makeSetRequest ( dest_alter_partition_version_path , " " , dest_alter_partition_version_stat . version ) ) ;
/// Just update version, because merges assignment relies on it
ops . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( dest_table_storage - > zookeeper_path ) / " log " , " " , - 1 ) ) ;
ops . emplace_back ( zkutil : : makeCreateRequest ( fs : : path ( dest_table_storage - > zookeeper_path ) / " log/log- " ,
entry . toString ( ) , zkutil : : CreateMode : : PersistentSequential ) ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
{
2022-03-18 11:01:26 +00:00
Transaction transaction ( * dest_table_storage , NO_TRANSACTION_RAW ) ;
2019-10-04 04:37:19 +00:00
2021-11-03 07:46:33 +00:00
auto src_data_parts_lock = lockParts ( ) ;
auto dest_data_parts_lock = dest_table_storage - > lockParts ( ) ;
2019-08-28 08:24:17 +00:00
2022-06-24 15:19:59 +00:00
for ( auto & part : dst_parts )
2022-10-22 22:51:59 +00:00
dest_table_storage - > renameTempPartAndReplaceUnlocked ( part , transaction , dest_data_parts_lock ) ;
2019-08-30 04:24:05 +00:00
2022-04-19 13:53:10 +00:00
for ( size_t i = 0 ; i < dst_parts . size ( ) ; + + i )
2022-06-30 20:51:27 +00:00
dest_table_storage - > lockSharedData ( * dst_parts [ i ] , false , hardlinked_files_for_parts [ i ] ) ;
2021-11-03 07:46:33 +00:00
Coordination : : Error code = zookeeper - > tryMulti ( ops , op_results ) ;
if ( code = = Coordination : : Error : : ZBADVERSION )
continue ;
else
zkutil : : KeeperMultiException : : check ( code , ops , op_results ) ;
2019-10-03 10:52:32 +00:00
2022-06-24 15:19:59 +00:00
parts_to_remove = removePartsInRangeFromWorkingSetAndGetPartsToRemoveFromZooKeeper ( NO_TRANSACTION_RAW , drop_range , src_data_parts_lock ) ;
transaction . commit ( & src_data_parts_lock ) ;
2021-11-03 07:46:33 +00:00
}
2019-10-03 10:52:32 +00:00
2021-11-03 07:46:33 +00:00
PartLog : : addNewParts ( getContext ( ) , dst_parts , watch . elapsed ( ) ) ;
}
catch ( . . . )
{
PartLog : : addNewParts ( getContext ( ) , dst_parts , watch . elapsed ( ) , ExecutionStatus : : fromCurrentException ( ) ) ;
2022-04-19 13:53:10 +00:00
for ( const auto & dst_part : dst_parts )
2022-04-22 13:20:02 +00:00
dest_table_storage - > unlockSharedData ( * dst_part ) ;
2022-04-19 13:53:10 +00:00
2021-11-03 07:46:33 +00:00
throw ;
2019-08-28 08:24:17 +00:00
}
2021-11-03 07:46:33 +00:00
String log_znode_path = dynamic_cast < const Coordination : : CreateResponse & > ( * op_results . back ( ) ) . path_created ;
entry . znode_name = log_znode_path . substr ( log_znode_path . find_last_of ( ' / ' ) + 1 ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
for ( auto & lock : ephemeral_locks )
lock . assumeUnlocked ( ) ;
2019-08-28 08:24:17 +00:00
2022-03-21 11:09:00 +00:00
removePartsFromZooKeeperWithRetries ( parts_to_remove ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
parts_to_remove . clear ( ) ;
cleanup_thread . wakeup ( ) ;
lock2 . reset ( ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
dest_table_storage - > waitForLogEntryToBeProcessedIfNecessary ( entry , query_context ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
/// Create DROP_RANGE for the source table
Coordination : : Requests ops_src ;
ops_src . emplace_back ( zkutil : : makeCreateRequest (
fs : : path ( zookeeper_path ) / " log/log- " , entry_delete . toString ( ) , zkutil : : CreateMode : : PersistentSequential ) ) ;
/// Just update version, because merges assignment relies on it
ops_src . emplace_back ( zkutil : : makeSetRequest ( fs : : path ( zookeeper_path ) / " log " , " " , - 1 ) ) ;
delimiting_block_lock - > getUnlockOps ( ops_src ) ;
2019-08-28 08:24:17 +00:00
2021-11-03 07:46:33 +00:00
op_results = zookeeper - > multi ( ops_src ) ;
2019-08-30 04:24:05 +00:00
2021-11-03 07:46:33 +00:00
log_znode_path = dynamic_cast < const Coordination : : CreateResponse & > ( * op_results . front ( ) ) . path_created ;
entry_delete . znode_name = log_znode_path . substr ( log_znode_path . find_last_of ( ' / ' ) + 1 ) ;
2019-08-30 04:24:05 +00:00
2021-11-03 07:46:33 +00:00
lock1 . reset ( ) ;
waitForLogEntryToBeProcessedIfNecessary ( entry_delete , query_context ) ;
2019-08-30 04:35:56 +00:00
2021-11-03 07:46:33 +00:00
/// Cleaning possibly stored information about parts from /quorum/last_part node in ZooKeeper.
cleanLastPartNode ( partition_id ) ;
2019-07-26 08:42:17 +00:00
2021-11-03 07:46:33 +00:00
return ;
}
2020-04-16 18:47:20 +00:00
2021-11-03 07:46:33 +00:00
throw Exception ( ErrorCodes : : CANNOT_ASSIGN_ALTER , " Cannot assign ALTER PARTITION, because another ALTER PARTITION query was concurrently executed " ) ;
2019-08-30 04:35:56 +00:00
}
2019-08-30 04:24:05 +00:00
2020-11-24 14:24:48 +00:00
void StorageReplicatedMergeTree::movePartitionToShard(
    const ASTPtr & partition, bool move_part, const String & to, ContextPtr /*query_context*/)
{
    /// Queue a task that moves a single part to another shard (replica set) identified by
    /// its ZooKeeper path `to`. This is a lightweight operation that only optimistically
    /// checks if it could succeed and queues tasks; the actual data movement is driven
    /// asynchronously by PartMovesBetweenShardsOrchestrator.

    if (!move_part)
        throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED);

    if (zkutil::normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == zkutil::normalizeZooKeeperPath(to, /* check_starts_with_slash */ true))
        throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS);

    auto zookeeper = getZooKeeperAndAssertNotReadonly();

    /// For MOVE PART the partition AST literal is the part name itself.
    String part_name = partition->as<ASTLiteral &>().value.safeGet<String>();
    auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);

    auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active});
    if (!part)
        throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally", part_name);

    /// The part uuid is what the orchestrator pins and tracks; without it the move cannot be expressed.
    if (part->uuid == UUIDHelpers::Nil)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} does not have an uuid assigned and it can't be moved between shards", part_name);

    ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper);

    /// The following block is pretty much copy & paste from StorageReplicatedMergeTree::dropPart to avoid conflicts while this is WIP.
    /// Extract it to a common method and re-use it before merging.
    {
        if (partIsLastQuorumPart(part->info))
        {
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} is last inserted part with quorum in partition. Would not be able to drop", part_name);
        }

        /// canMergeSinglePart is overlapping with dropPart, let's try to use the same code.
        String out_reason;
        if (!merge_pred.canMergeSinglePart(part, &out_reason))
            throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part is busy, reason: " + out_reason);
    }

    {
        /// Optimistic check that for compatible destination table structure.
        checkTableStructure(to, getInMemoryMetadataPtr());
    }

    /// Pin the part uuid on both source and destination shard so that neither side
    /// deduplicates/merges it away while the move task is in flight.
    PinnedPartUUIDs src_pins;
    PinnedPartUUIDs dst_pins;

    {
        String s = zookeeper->get(zookeeper_path + "/pinned_part_uuids", &src_pins.stat);
        src_pins.fromString(s);
    }

    {
        String s = zookeeper->get(to + "/pinned_part_uuids", &dst_pins.stat);
        dst_pins.fromString(s);
    }

    if (src_pins.part_uuids.contains(part->uuid) || dst_pins.part_uuids.contains(part->uuid))
        throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part {} has it's uuid ({}) already pinned.", part_name, toString(part->uuid));

    src_pins.part_uuids.insert(part->uuid);
    dst_pins.part_uuids.insert(part->uuid);

    PartMovesBetweenShardsOrchestrator::Entry part_move_entry;
    part_move_entry.state = PartMovesBetweenShardsOrchestrator::EntryState::SYNC_SOURCE;
    part_move_entry.create_time = std::time(nullptr);
    part_move_entry.update_time = part_move_entry.create_time;
    part_move_entry.task_uuid = UUIDHelpers::generateV4();
    part_move_entry.part_name = part->name;
    part_move_entry.part_uuid = part->uuid;
    part_move_entry.to_shard = to;

    /// Commit both pin updates and the task creation atomically; the version checks on
    /// the pinned_part_uuids nodes guard against concurrent pin modifications.
    Coordination::Requests ops;
    ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/log", merge_pred.getVersion())); /// Make sure no new events were added to the log.
    ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/pinned_part_uuids", src_pins.toString(), src_pins.stat.version));
    ops.emplace_back(zkutil::makeSetRequest(to + "/pinned_part_uuids", dst_pins.toString(), dst_pins.stat.version));
    ops.emplace_back(zkutil::makeCreateRequest(
        part_moves_between_shards_orchestrator.entries_znode_path + "/task-",
        part_move_entry.toString(),
        zkutil::CreateMode::PersistentSequential));

    Coordination::Responses responses;
    Coordination::Error rc = zookeeper->tryMulti(ops, responses);
    zkutil::KeeperMultiException::check(rc, ops, responses);

    /// The task create request is the last op, so its response is at the back.
    String task_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
    LOG_DEBUG(log, "Created task for part movement between shards at {}", task_znode_path);

    /// TODO(nv): Nice to have support for `replication_alter_partitions_sync`.
    ///     For now use the system.part_moves_between_shards table for status.
}
2020-11-24 14:24:48 +00:00
2021-09-16 16:03:31 +00:00
CancellationCode StorageReplicatedMergeTree::killPartMoveToShard(const UUID & task_uuid)
{
    /// Cancel a pending part-move-between-shards task by its uuid;
    /// the orchestrator owns the task state and performs the actual cancellation.
    return part_moves_between_shards_orchestrator.killPartMoveToShard(task_uuid);
}
2018-05-21 13:49:54 +00:00
void StorageReplicatedMergeTree : : getCommitPartOps (
2018-08-25 01:58:14 +00:00
Coordination : : Requests & ops ,
2019-05-03 02:00:57 +00:00
MutableDataPartPtr & part ,
2018-05-21 13:49:54 +00:00
const String & block_id_path ) const
{
const String & part_name = part - > name ;
2019-08-26 18:08:58 +00:00
const auto storage_settings_ptr = getSettings ( ) ;
2018-05-21 13:49:54 +00:00
if ( ! block_id_path . empty ( ) )
{
/// Make final duplicate check and commit block_id
ops . emplace_back (
zkutil : : makeCreateRequest (
block_id_path ,
part_name , /// We will be able to know original part number for duplicate blocks, if we want.
zkutil : : CreateMode : : Persistent ) ) ;
}
2019-05-03 02:00:57 +00:00
/// Information about the part, in the replica
2019-08-26 18:08:58 +00:00
if ( storage_settings_ptr - > use_minimalistic_part_header_in_zookeeper )
2018-12-11 13:30:20 +00:00
{
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( replica_path ) / " parts " / part - > name ,
2020-01-16 16:15:01 +00:00
ReplicatedMergeTreePartHeader : : fromColumnsAndChecksums ( part - > getColumns ( ) , part - > checksums ) . toString ( ) ,
2018-12-11 13:30:20 +00:00
zkutil : : CreateMode : : Persistent ) ) ;
}
else
{
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( replica_path ) / " parts " / part - > name ,
2018-12-11 13:30:20 +00:00
" " ,
zkutil : : CreateMode : : Persistent ) ) ;
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( replica_path ) / " parts " / part - > name / " columns " ,
2020-01-16 16:15:01 +00:00
part - > getColumns ( ) . toString ( ) ,
2018-12-11 13:30:20 +00:00
zkutil : : CreateMode : : Persistent ) ) ;
ops . emplace_back ( zkutil : : makeCreateRequest (
2021-05-08 10:59:55 +00:00
fs : : path ( replica_path ) / " parts " / part - > name / " checksums " ,
2018-12-11 13:30:20 +00:00
getChecksumsForZooKeeper ( part - > checksums ) ,
zkutil : : CreateMode : : Persistent ) ) ;
}
}
2018-04-17 17:59:42 +00:00
ReplicatedMergeTreeAddress StorageReplicatedMergeTree::getReplicatedMergeTreeAddress() const
{
    /// Build the address record describing how other replicas can reach this one.
    const auto context_ptr = getContext();
    const auto interserver_address = context_ptr->getInterserverIOAddress();
    const auto storage_id = getStorageID();

    ReplicatedMergeTreeAddress address;
    address.host = interserver_address.first;
    address.replication_port = interserver_address.second;
    address.queries_port = context_ptr->getTCPPort();
    address.database = storage_id.database_name;
    address.table = storage_id.table_name;
    address.scheme = context_ptr->getInterserverScheme();
    return address;
}
2018-05-28 15:37:30 +00:00
ActionLock StorageReplicatedMergeTree::getActionLock(StorageActionBlockType action_type)
{
    /// Return a lock that blocks the given kind of background action for as long
    /// as the returned ActionLock object is alive. Unknown action types yield an
    /// empty (no-op) lock.
    if (action_type == ActionLocks::PartsMerge)
        return merger_mutator.merges_blocker.cancel();

    if (action_type == ActionLocks::PartsTTLMerge)
        return merger_mutator.ttl_merges_blocker.cancel();

    if (action_type == ActionLocks::PartsFetch)
        return fetcher.blocker.cancel();

    if (action_type == ActionLocks::PartsSend)
    {
        /// NOTE(review): loaded via std::atomic_load — presumably the endpoint can be
        /// replaced/reset concurrently; confirm against shutdown/startup code.
        auto data_parts_exchange_ptr = std::atomic_load(&data_parts_exchange_endpoint);
        return data_parts_exchange_ptr ? data_parts_exchange_ptr->blocker.cancel() : ActionLock();
    }

    if (action_type == ActionLocks::ReplicationQueue)
        return queue.actions_blocker.cancel();

    if (action_type == ActionLocks::PartsMove)
        return parts_mover.moves_blocker.cancel();

    return {};
}
2020-10-15 16:10:22 +00:00
void StorageReplicatedMergeTree : : onActionLockRemove ( StorageActionBlockType action_type )
{
if ( action_type = = ActionLocks : : PartsMerge | | action_type = = ActionLocks : : PartsTTLMerge
| | action_type = = ActionLocks : : PartsFetch | | action_type = = ActionLocks : : PartsSend
| | action_type = = ActionLocks : : ReplicationQueue )
2021-09-08 00:21:21 +00:00
background_operations_assignee . trigger ( ) ;
2020-10-15 16:10:22 +00:00
else if ( action_type = = ActionLocks : : PartsMove )
2021-09-08 00:21:21 +00:00
background_moves_assignee . trigger ( ) ;
2020-10-15 16:10:22 +00:00
}
2018-06-09 15:48:22 +00:00
2018-05-21 13:49:54 +00:00
bool StorageReplicatedMergeTree::waitForShrinkingQueueSize(size_t queue_size, UInt64 max_wait_milliseconds)
{
    /// Block until the replication queue contains at most `queue_size` entries.
    /// Returns true on success, false once `max_wait_milliseconds` elapses
    /// (a value of 0 means wait indefinitely). Throws ABORTED if the table is
    /// shut down while waiting.
    Stopwatch watch;

    /// Let's fetch new log entries firstly
    queue.pullLogsToQueue(getZooKeeperAndAssertNotReadonly(), {}, ReplicatedMergeTreeQueue::SYNC);

    /// This is significant, because the execution of this task could be delayed at BackgroundPool.
    /// And we force it to be executed.
    background_operations_assignee.trigger();

    Poco::Event target_size_event;
    /// NOTE(review): the callback presumably fires on queue size changes — confirm
    /// against ReplicatedMergeTreeQueue::addSubscriber.
    auto callback = [&target_size_event, queue_size] (size_t new_queue_size)
    {
        if (new_queue_size <= queue_size)
            target_size_event.set();
    };
    /// Keep the subscription handle alive for the whole wait; it unsubscribes on destruction.
    const auto handler = queue.addSubscriber(std::move(callback));

    /// Poll with a short timeout so deadline and shutdown are checked regularly.
    while (!target_size_event.tryWait(50))
    {
        if (max_wait_milliseconds && watch.elapsedMilliseconds() > max_wait_milliseconds)
            return false;

        if (partial_shutdown_called)
            throw Exception("Shutdown is called for table", ErrorCodes::ABORTED);
    }

    return true;
}
2017-11-15 16:32:47 +00:00
2021-05-27 23:10:44 +00:00
bool StorageReplicatedMergeTree::dropPartImpl(
    zkutil::ZooKeeperPtr & zookeeper, String part_name, LogEntry & entry, bool detach, bool throw_if_noop)
{
    /// Commit a DROP_RANGE log entry covering exactly one part.
    /// On success fills `entry` (including znode_name of the created log record) and returns true.
    /// On any precondition failure: throws when `throw_if_noop`, otherwise returns false.
    /// Retries internally when the ZooKeeper multi-op loses a race.
    LOG_TRACE(log, "Will try to insert a log entry to DROP_RANGE for part {}", part_name);

    auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);

    while (true)
    {
        ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper);

        auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Active});

        if (!part)
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally, won't try to drop it.", part_name);
            return false;
        }

        /// A DROP_RANGE covering this part is already queued — nothing more to do.
        if (merge_pred.hasDropRange(part->info))
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Already has DROP RANGE for part {} in queue.", part_name);
            return false;
        }

        /// There isn't a lot we can do otherwise. Can't cancel merges because it is possible that a replica already
        /// finished the merge.
        String out_reason;
        if (!merge_pred.canMergeSinglePart(part, &out_reason))
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, out_reason);
            return false;
        }

        if (merge_pred.partParticipatesInReplaceRange(part, &out_reason))
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, out_reason);
            return false;
        }

        if (partIsLastQuorumPart(part->info))
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} is last inserted part with quorum in partition. Cannot drop", part_name);
            return false;
        }

        if (partIsInsertingWithParallelQuorum(part->info))
        {
            if (throw_if_noop)
                throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} is inserting with parallel quorum. Cannot drop", part_name);
            return false;
        }

        Coordination::Requests ops;
        /// Remove deduplication block ids covering the dropped part's block range.
        getClearBlocksInPartitionOps(ops, *zookeeper, part_info.partition_id, part_info.min_block, part_info.max_block);
        size_t clear_block_ops_size = ops.size();

        /// If `part_name` is result of a recent merge and source parts are still available then
        /// DROP_RANGE with detach will move this part together with source parts to `detached/` dir.
        entry.type = LogEntry::DROP_RANGE;
        entry.source_replica = replica_name;
        /// We don't set fake drop level (999999999) for the single part DROP_RANGE.
        /// First of all we don't guarantee anything other than the part will not be
        /// active after DROP PART, but covering part (without data of dropped part) can exist.
        /// If we add part with 9999999 level than we can break invariant in virtual_parts of
        /// the queue.
        entry.new_part_name = getPartNamePossiblyFake(format_version, part->info);
        entry.detach = detach;
        entry.create_time = time(nullptr);

        ops.emplace_back(zkutil::makeCheckRequest(fs::path(zookeeper_path) / "log", merge_pred.getVersion())); /// Make sure no new events were added to the log.
        ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential));
        /// Just update version, because merges assignment relies on it
        ops.emplace_back(zkutil::makeSetRequest(fs::path(zookeeper_path) / "log", "", -1));
        Coordination::Responses responses;
        Coordination::Error rc = zookeeper->tryMulti(ops, responses);

        if (rc == Coordination::Error::ZBADVERSION)
        {
            LOG_TRACE(log, "A new log entry appeared while trying to commit DROP RANGE. Retry.");
            continue;
        }
        else if (rc == Coordination::Error::ZNONODE)
        {
            LOG_TRACE(log, "Other replica already removing same part {} or part deduplication node was removed by background thread. Retry.", part_name);
            continue;
        }
        else
            zkutil::KeeperMultiException::check(rc, ops, responses);

        /// The log-entry create request sits right after the block-clearing ops and
        /// the version check, hence index clear_block_ops_size + 1.
        String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses[clear_block_ops_size + 1]).path_created;
        entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);

        return true;
    }
}
2018-06-09 14:24:50 +00:00
2022-05-06 14:12:31 +00:00
bool StorageReplicatedMergeTree::addOpsToDropAllPartsInPartition(
    zkutil::ZooKeeper & zookeeper, const String & partition_id, bool detach,
    Coordination::Requests & ops, std::vector<LogEntryPtr> & entries,
    std::vector<EphemeralLockInZooKeeper> & delimiting_block_locks,
    std::vector<size_t> & log_entry_ops_idx)
{
    /// Prepare (but do not commit) the ZooKeeper requests needed to drop/detach all
    /// parts of one partition: a DROP_RANGE log entry plus the unlock ops of the
    /// delimiting block number. Appends to all four output collections in lockstep.
    /// Returns false (adding nothing) when the partition is empty.
    MergeTreePartInfo drop_range_info;

    /// It would prevent other replicas from assigning merges which intersect locked block number.
    std::optional<EphemeralLockInZooKeeper> delimiting_block_lock;

    if (!getFakePartCoveringAllPartsInPartition(partition_id, drop_range_info, delimiting_block_lock))
    {
        LOG_INFO(log, "Will not drop partition {}, it is empty.", partition_id);
        return false;
    }

    /// Drop deduplication block ids for the whole range being removed.
    clearBlocksInPartition(zookeeper, partition_id, drop_range_info.min_block, drop_range_info.max_block);

    String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range_info);

    LOG_DEBUG(log, "Disabled merges covered by range {}", drop_range_fake_part_name);

    /// Finally, having achieved the necessary invariants, you can put an entry in the log.
    auto entry = std::make_shared<LogEntry>();
    entry->type = LogEntry::DROP_RANGE;
    entry->source_replica = replica_name;
    entry->new_part_name = drop_range_fake_part_name;
    entry->detach = detach;
    entry->create_time = time(nullptr);

    /// Record the index of this entry's create request so the caller can later map
    /// the multi-op response back to the created log znode.
    log_entry_ops_idx.push_back(ops.size());
    ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry->toString(),
                                               zkutil::CreateMode::PersistentSequential));
    delimiting_block_lock->getUnlockOps(ops);
    delimiting_block_locks.push_back(std::move(*delimiting_block_lock));
    entries.push_back(std::move(entry));
    return true;
}
2018-04-21 00:35:20 +00:00
2022-05-06 14:12:31 +00:00
void StorageReplicatedMergeTree::dropAllPartsInPartitions(
    zkutil::ZooKeeper & zookeeper, const Strings & partition_ids, std::vector<LogEntryPtr> & entries, ContextPtr query_context, bool detach)
{
    /// Atomically commit DROP_RANGE log entries for every non-empty partition in
    /// `partition_ids`, filling `entries` with one log entry per created DROP_RANGE.
    /// Throws CANNOT_ASSIGN_ALTER when the alter_partition_version race cannot be resolved.
    entries.reserve(partition_ids.size());

    /// Retry if alter_partition_version changes
    for (size_t retry = 0; retry < 1000; ++retry)
    {
        entries.clear();
        String alter_partition_version_path = zookeeper_path + "/alter_partition_version";
        Coordination::Stat alter_partition_version_stat;
        zookeeper.get(alter_partition_version_path, &alter_partition_version_stat);

        Coordination::Requests ops;
        std::vector<EphemeralLockInZooKeeper> delimiting_block_locks;
        std::vector<size_t> log_entry_ops_idx;
        ops.reserve(partition_ids.size() * 2);
        delimiting_block_locks.reserve(partition_ids.size());
        log_entry_ops_idx.reserve(partition_ids.size());
        for (const auto & partition_id : partition_ids)
            addOpsToDropAllPartsInPartition(zookeeper, partition_id, detach, ops, entries, delimiting_block_locks, log_entry_ops_idx);

        /// Check and update version to avoid race with REPLACE_RANGE.
        /// Otherwise new parts covered by drop_range_info may appear after execution of current DROP_RANGE entry
        /// as a result of execution of concurrently created REPLACE_RANGE entry.
        ops.emplace_back(zkutil::makeSetRequest(alter_partition_version_path, "", alter_partition_version_stat.version));

        /// Just update version, because merges assignment relies on it
        ops.emplace_back(zkutil::makeSetRequest(fs::path(zookeeper_path) / "log", "", -1));

        if (auto txn = query_context->getZooKeeperMetadataTransaction())
            txn->moveOpsTo(ops);

        Coordination::Responses responses;
        Coordination::Error code = zookeeper.tryMulti(ops, responses);

        if (code == Coordination::Error::ZOK)
        {
            /// The multi-op succeeded, so the ephemeral block locks were released as part of it.
            for (auto & lock : delimiting_block_locks)
                lock.assumeUnlocked();
        }
        else if (code == Coordination::Error::ZBADVERSION)
        {
            /// Cannot retry automatically, because some zookeeper ops were lost on the first attempt. Will retry on DDLWorker-level.
            if (query_context->getZooKeeperMetadataTransaction())
                throw Exception(
                    "Cannot execute alter, because alter partition version was suddenly changed due to concurrent alter",
                    ErrorCodes::CANNOT_ASSIGN_ALTER);
            continue;
        }
        else
            zkutil::KeeperMultiException::check(code, ops, responses);

        /// log_entry_ops_idx[i] is the position of entry i's create request within `ops`,
        /// recorded by addOpsToDropAllPartsInPartition.
        assert(entries.size() == log_entry_ops_idx.size());
        for (size_t i = 0; i < entries.size(); ++i)
        {
            String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses[log_entry_ops_idx[i]]).path_created;
            entries[i]->znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);

            auto drop_range_info = MergeTreePartInfo::fromPartName(entries[i]->new_part_name, format_version);
            /// Cancel in-flight merges inside the dropped range.
            getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range_info.partition_id, drop_range_info.max_block);
        }

        return;
    }
    throw Exception(ErrorCodes::CANNOT_ASSIGN_ALTER,
                    "Cannot assign ALTER PARTITION because another ALTER PARTITION query was concurrently executed");
}
StorageReplicatedMergeTree::LogEntryPtr StorageReplicatedMergeTree::dropAllPartsInPartition(
    zkutil::ZooKeeper & zookeeper, const String & partition_id, ContextPtr query_context, bool detach)
{
    /// Single-partition convenience wrapper over dropAllPartsInPartitions().
    /// Returns the created log entry, or an empty pointer if the partition was empty.
    std::vector<LogEntryPtr> created_entries;
    dropAllPartsInPartitions(zookeeper, Strings{partition_id}, created_entries, query_context, detach);
    return created_entries.empty() ? LogEntryPtr{} : created_entries.front();
}
2022-04-12 12:14:26 +00:00
void StorageReplicatedMergeTree : : enqueuePartForCheck ( const String & part_name , time_t delay_to_check_seconds )
{
MergeTreePartInfo covering_drop_range ;
if ( queue . hasDropRange ( MergeTreePartInfo : : fromPartName ( part_name , format_version ) , & covering_drop_range ) )
{
LOG_WARNING ( log , " Do not enqueue part {} for check because it's covered by DROP_RANGE {} and going to be removed " ,
part_name , covering_drop_range . getPartName ( ) ) ;
return ;
}
part_check_thread . enqueuePart ( part_name , delay_to_check_seconds ) ;
}
2019-07-03 13:17:19 +00:00
2021-04-10 23:33:54 +00:00
CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, ContextPtr local_context)
{
    /// Run the part-check thread over either a single partition (CHECK TABLE ... PARTITION)
    /// or every visible part, collecting one result per part.
    CheckResults results;

    const auto & check_query = query->as<ASTCheckQuery &>();
    DataPartsVector parts_to_check;
    if (check_query.partition)
    {
        String partition_id = getPartitionIDFromQuery(check_query.partition, local_context);
        parts_to_check = getVisibleDataPartsVectorInPartition(local_context, partition_id);
    }
    else
    {
        parts_to_check = getVisibleDataPartsVector(local_context);
    }

    for (auto & part : parts_to_check)
    {
        try
        {
            results.push_back(part_check_thread.checkPart(part->name));
        }
        catch (const Exception & ex)
        {
            /// A failed check becomes a per-part failure entry instead of aborting the whole query.
            results.emplace_back(part->name, false, "Check of part finished with error: '" + ex.message() + "'");
        }
    }

    return results;
}
2021-02-26 09:48:57 +00:00
2022-09-09 10:14:42 +00:00
bool StorageReplicatedMergeTree : : canUseZeroCopyReplication ( ) const
{
auto settings_ptr = getSettings ( ) ;
if ( ! settings_ptr - > allow_remote_fs_zero_copy_replication )
return false ;
auto disks = getStoragePolicy ( ) - > getDisks ( ) ;
for ( const auto & disk : disks )
{
if ( disk - > supportZeroCopyReplication ( ) )
return true ;
}
return false ;
}
2022-01-31 20:47:04 +00:00
void StorageReplicatedMergeTree::checkBrokenDisks()
{
    /// Scan the storage policy's disks. For a disk newly observed as broken, invoke
    /// broken_part_callback for every part residing on it; for a disk that has
    /// recovered, remove it from last_broken_disks so a future breakage is handled again.
    auto disks = getStoragePolicy()->getDisks();
    std::unique_ptr<DataPartsVector> parts;

    for (auto disk_it = disks.rbegin(); disk_it != disks.rend(); ++disk_it)
    {
        auto disk_ptr = *disk_it;
        if (disk_ptr->isBroken())
        {
            {
                std::lock_guard lock(last_broken_disks_mutex);
                /// insert().second == false means this broken disk was already handled on a previous scan.
                if (!last_broken_disks.insert(disk_ptr->getName()).second)
                    continue;
            }

            LOG_INFO(log, "Scanning parts to recover on broken disk {} with path {}", disk_ptr->getName(), disk_ptr->getPath());

            /// Fetch the parts list lazily, only once a broken disk is actually found.
            if (!parts)
                parts = std::make_unique<DataPartsVector>(getDataPartsVectorForInternalUsage());

            for (auto & part : *parts)
            {
                if (part->data_part_storage && part->data_part_storage->getDiskName() == disk_ptr->getName())
                    broken_part_callback(part->name);
            }
            continue;
        }
        else
        {
            {
                std::lock_guard lock(last_broken_disks_mutex);
                if (last_broken_disks.erase(disk_ptr->getName()) > 0)
                    LOG_INFO(
                        log,
                        "Disk {} with path {} is recovered. Exclude it from last_broken_disks",
                        disk_ptr->getName(),
                        disk_ptr->getPath());
            }
        }
    }
}
2019-08-12 13:30:29 +00:00
bool StorageReplicatedMergeTree : : canUseAdaptiveGranularity ( ) const
{
2019-08-26 18:08:58 +00:00
const auto storage_settings_ptr = getSettings ( ) ;
return storage_settings_ptr - > index_granularity_bytes ! = 0 & &
( storage_settings_ptr - > enable_mixed_granularity_parts | |
2019-08-12 13:30:29 +00:00
( ! has_non_adaptive_index_granularity_parts & & ! other_replicas_fixed_granularity ) ) ;
}
2020-11-28 08:17:20 +00:00
/// Delegates to the replication queue: returns the commands of the first
/// ALTER-mutation that applies to the given part.
MutationCommands StorageReplicatedMergeTree::getFirstAlterMutationCommandsForPart(const DataPartPtr & part) const
{
    return queue.getFirstAlterMutationCommandsForPart(part);
}
2020-06-23 16:40:58 +00:00
2021-02-26 09:48:57 +00:00
2020-06-23 16:40:58 +00:00
void StorageReplicatedMergeTree : : startBackgroundMovesIfNeeded ( )
{
2020-10-20 11:27:50 +00:00
if ( areBackgroundMovesNeeded ( ) )
2021-09-08 00:21:21 +00:00
background_moves_assignee . start ( ) ;
2020-06-23 16:40:58 +00:00
}
2021-11-23 13:57:24 +00:00
2021-07-05 12:44:58 +00:00
std : : unique_ptr < MergeTreeSettings > StorageReplicatedMergeTree : : getDefaultSettings ( ) const
{
return std : : make_unique < MergeTreeSettings > ( getContext ( ) - > getReplicatedMergeTreeSettings ( ) ) ;
}
2021-02-26 09:48:57 +00:00
2021-12-27 16:27:06 +00:00
/// Returns the table shared ID used in zero-copy lock paths, lazily initializing
/// it from ZooKeeper when necessary.
/// Throws TABLE_IS_DROPPED if the table metadata was removed from ZooKeeper, and
/// NO_ZOOKEEPER if there is no connection to decide either way.
String StorageReplicatedMergeTree::getTableSharedID() const
{
    /// Lock is not required in other places because createTableSharedID()
    /// can be called only during table initialization
    std::lock_guard lock(table_shared_id_mutex);

    /// Can happen if table was partially initialized before drop by DatabaseCatalog
    if (table_shared_id == UUIDHelpers::Nil)
    {
        if (has_metadata_in_zookeeper.has_value())
        {
            if (*has_metadata_in_zookeeper)
                createTableSharedID();
            else
                throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {} is already dropped", getStorageID().getNameForLogs());
        }
        else
        {
            /// Fixed grammar of the user-facing error message ("will be resolve" -> "will be resolved").
            throw Exception(ErrorCodes::NO_ZOOKEEPER, "No connection to ZooKeeper, cannot get shared table ID for table {}. "
                "It will be resolved automatically when connection will be established", getStorageID().getNameForLogs());
        }
    }

    return toString(table_shared_id);
}
2022-09-20 15:55:06 +00:00
/// Creates (or adopts) the per-table "table_shared_id" node in ZooKeeper and caches
/// its value in table_shared_id. Per the note in getTableSharedID(), this is called
/// only during table initialization, under table_shared_id_mutex.
void StorageReplicatedMergeTree::createTableSharedID() const
{
    LOG_DEBUG(log, "Creating shared ID for table {}", getStorageID().getNameForLogs());
    if (table_shared_id != UUIDHelpers::Nil)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Table shared id already initialized");

    auto zookeeper = getZooKeeper();
    String zookeeper_table_id_path = fs::path(zookeeper_path) / "table_shared_id";
    String id;
    if (!zookeeper->tryGet(zookeeper_table_id_path, id))
    {
        LOG_DEBUG(log, "Shared ID for table {} doesn't exist in ZooKeeper on path {}", getStorageID().getNameForLogs(), zookeeper_table_id_path);

        /// Prefer the table's own UUID as the candidate; fall back to a freshly
        /// generated UUID when the table has no UUID (uuid == Nil).
        UUID table_id_candidate;
        auto local_storage_id = getStorageID();
        if (local_storage_id.uuid != UUIDHelpers::Nil)
            table_id_candidate = local_storage_id.uuid;
        else
            table_id_candidate = UUIDHelpers::generateV4();

        id = toString(table_id_candidate);
        LOG_DEBUG(log, "Got candidate ID {}, will try to create it in ZooKeeper on path {}", id, zookeeper_table_id_path);

        auto code = zookeeper->tryCreate(zookeeper_table_id_path, id, zkutil::CreateMode::Persistent);
        if (code == Coordination::Error::ZNODEEXISTS)
        { /// Other replica create node early; adopt the ID it stored.
            id = zookeeper->get(zookeeper_table_id_path);
            LOG_DEBUG(log, "Shared ID on path {} concurrently created, will set ID {}", zookeeper_table_id_path, id);
        }
        else if (code != Coordination::Error::ZOK)
        {
            throw zkutil::KeeperException(code, zookeeper_table_id_path);
        }
    }

    LOG_DEBUG(log, "Initializing table shared ID with {}", id);
    table_shared_id = parseFromString<UUID>(id);
}
2022-06-24 19:29:38 +00:00
std : : optional < String > StorageReplicatedMergeTree : : tryGetTableSharedIDFromCreateQuery ( const IAST & create_query , const ContextPtr & global_context )
{
auto zk_path = tryExtractZkPathFromCreateQuery ( create_query , global_context ) ;
if ( ! zk_path )
return { } ;
String zk_name = zkutil : : extractZooKeeperName ( * zk_path ) ;
zk_path = zkutil : : extractZooKeeperPath ( * zk_path , false , nullptr ) ;
zkutil : : ZooKeeperPtr zookeeper = ( zk_name = = getDefaultZooKeeperName ( ) ) ? global_context - > getZooKeeper ( ) : global_context - > getAuxiliaryZooKeeper ( zk_name ) ;
String id ;
if ( ! zookeeper - > tryGet ( fs : : path ( * zk_path ) / " table_shared_id " , id ) )
return { } ;
return id ;
}
2022-02-14 09:20:27 +00:00
void StorageReplicatedMergeTree : : lockSharedDataTemporary ( const String & part_name , const String & part_id , const DiskPtr & disk ) const
{
2022-04-15 14:24:38 +00:00
auto settings = getSettings ( ) ;
if ( ! disk | | ! disk - > supportZeroCopyReplication ( ) | | ! settings - > allow_remote_fs_zero_copy_replication )
2022-02-14 09:20:27 +00:00
return ;
zkutil : : ZooKeeperPtr zookeeper = tryGetZooKeeper ( ) ;
if ( ! zookeeper )
return ;
String id = part_id ;
boost : : replace_all ( id , " / " , " _ " ) ;
2022-08-19 14:58:30 +00:00
Strings zc_zookeeper_paths = getZeroCopyPartPath ( * getSettings ( ) , toString ( disk - > getDataSourceDescription ( ) . type ) , getTableSharedID ( ) ,
2022-02-14 09:20:27 +00:00
part_name , zookeeper_path ) ;
for ( const auto & zc_zookeeper_path : zc_zookeeper_paths )
{
String zookeeper_node = fs : : path ( zc_zookeeper_path ) / id / replica_name ;
LOG_TRACE ( log , " Set zookeeper temporary ephemeral lock {} " , zookeeper_node ) ;
createZeroCopyLockNode ( zookeeper , zookeeper_node , zkutil : : CreateMode : : Ephemeral , false ) ;
}
}
2022-06-30 20:51:27 +00:00
/// Creates persistent zero-copy lock nodes for the part on every applicable path,
/// optionally recording the set of files hardlinked from a source part so that
/// they are not removed together with this part later.
void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional<HardlinkedFiles> hardlinked_files) const
{
    auto settings = getSettings();
    if (!part.data_part_storage || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication)
        return;

    if (!part.data_part_storage->supportZeroCopyReplication())
        return;

    zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
    if (!zookeeper)
        return;

    /// '/' is not allowed inside a ZooKeeper node name.
    String id = part.getUniqueId();
    boost::replace_all(id, "/", "_");

    Strings zc_zookeeper_paths = getZeroCopyPartPath(
        *getSettings(), part.data_part_storage->getDiskType(), getTableSharedID(),
        part.name, zookeeper_path);

    String path_to_set_hardlinked_files;
    NameSet hardlinks;

    if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty())
    {
        /// The hardlinked file names are written to the source part's zero-copy node
        /// (first/new-style path only, hence [0]).
        path_to_set_hardlinked_files = getZeroCopyPartPath(
            *getSettings(), part.data_part_storage->getDiskType(), hardlinked_files->source_table_shared_id,
            hardlinked_files->source_part_name, zookeeper_path)[0];

        hardlinks = hardlinked_files->hardlinks_from_source_part;
    }

    for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
    {
        String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name;

        LOG_TRACE(log, "Set zookeeper persistent lock {}", zookeeper_node);

        createZeroCopyLockNode(
            zookeeper, zookeeper_node, zkutil::CreateMode::Persistent,
            replace_existing_lock, path_to_set_hardlinked_files, hardlinks);
    }
}
2022-04-18 23:09:09 +00:00
std : : pair < bool , NameSet > StorageReplicatedMergeTree : : unlockSharedData ( const IMergeTreeDataPart & part ) const
2021-02-26 09:48:57 +00:00
{
2022-09-06 17:25:58 +00:00
auto settings = getSettings ( ) ;
if ( ! settings - > allow_remote_fs_zero_copy_replication )
return std : : make_pair ( true , NameSet { } ) ;
2022-09-08 11:11:53 +00:00
if ( ! part . data_part_storage )
LOG_WARNING ( log , " Datapart storage for part {} (temp: {}) is not initialzied " , part . name , part . is_temp ) ;
2022-04-21 19:19:13 +00:00
if ( ! part . data_part_storage | | ! part . isStoredOnDisk ( ) )
2022-09-06 17:25:58 +00:00
{
LOG_TRACE ( log , " Part {} is not stored on disk, blobs can be removed " , part . name ) ;
2022-04-18 23:09:09 +00:00
return std : : make_pair ( true , NameSet { } ) ;
2022-09-06 17:25:58 +00:00
}
2022-02-01 11:37:12 +00:00
2022-04-26 19:08:00 +00:00
if ( ! part . data_part_storage | | ! part . data_part_storage - > supportZeroCopyReplication ( ) )
2022-09-06 17:25:58 +00:00
{
2022-09-09 11:24:22 +00:00
LOG_TRACE ( log , " Part {} is not stored on zero-copy replicated disk, blobs can be removed " , part . name ) ;
2022-04-18 23:09:09 +00:00
return std : : make_pair ( true , NameSet { } ) ;
2022-09-06 17:25:58 +00:00
}
2021-02-26 09:48:57 +00:00
2022-02-07 09:33:37 +00:00
/// If part is temporary refcount file may be absent
2022-04-21 19:19:13 +00:00
if ( part . data_part_storage - > exists ( IMergeTreeDataPart : : FILE_FOR_REFERENCES_CHECK ) )
2022-02-07 09:33:37 +00:00
{
2022-04-21 19:19:13 +00:00
auto ref_count = part . data_part_storage - > getRefCount ( IMergeTreeDataPart : : FILE_FOR_REFERENCES_CHECK ) ;
2022-02-07 09:33:37 +00:00
if ( ref_count > 0 ) /// Keep part shard info for frozen backups
2022-09-06 17:25:58 +00:00
{
LOG_TRACE ( log , " Part {} has more than zero local references ({}), blobs cannot be removed " , part . name , ref_count ) ;
2022-04-18 23:09:09 +00:00
return std : : make_pair ( false , NameSet { } ) ;
2022-09-06 17:25:58 +00:00
}
else
{
LOG_TRACE ( log , " Part {} local references is zero, will check blobs can be removed in zookeeper " , part . name ) ;
}
2022-02-07 09:33:37 +00:00
}
else
{
2022-09-16 11:49:39 +00:00
LOG_TRACE ( log , " Part {} looks temporary, because {} file doesn't exists, blobs can be removed " , part . name , IMergeTreeDataPart : : FILE_FOR_REFERENCES_CHECK ) ;
2022-02-07 09:33:37 +00:00
/// Temporary part with some absent file cannot be locked in shared mode
2022-04-18 23:09:09 +00:00
return std : : make_pair ( true , NameSet { } ) ;
2022-02-07 09:33:37 +00:00
}
2021-02-26 09:48:57 +00:00
2022-10-05 18:36:55 +00:00
/// If table was completely dropped (no meta in zookeeper) we can safely remove parts
2022-10-05 18:32:43 +00:00
if ( has_metadata_in_zookeeper . has_value ( ) & & ! has_metadata_in_zookeeper )
return std : : make_pair ( true , NameSet { } ) ;
2022-09-11 11:47:04 +00:00
/// We remove parts during table shutdown. If exception happen, restarting thread will be already turned
2022-09-11 11:37:39 +00:00
/// off and nobody will reconnect our zookeeper connection. In this case we use zookeeper connection from
/// context.
zkutil : : ZooKeeperPtr zookeeper ;
if ( shutdown_called . load ( ) )
zookeeper = getZooKeeperIfTableShutDown ( ) ;
else
zookeeper = getZooKeeper ( ) ;
2022-10-05 18:32:43 +00:00
/// It can happen that we didn't had the connection to zookeeper during table creation, but actually
/// table is completely dropped, so we can drop it without any additional checks.
if ( ! has_metadata_in_zookeeper . has_value ( ) & & ! zookeeper - > exists ( zookeeper_path ) )
return std : : make_pair ( true , NameSet { } ) ;
2022-10-03 17:06:12 +00:00
return unlockSharedDataByID (
part . getUniqueId ( ) , getTableSharedID ( ) , part . name , replica_name ,
2022-10-03 21:30:50 +00:00
part . data_part_storage - > getDiskType ( ) , zookeeper , * getSettings ( ) , log , zookeeper_path , format_version ) ;
}
namespace
{

/// What is going on here?
/// Actually we need this code because of flaws in hardlinks tracking. When we create child part during mutation we can hardlink some files from parent part, like
/// all_0_0_0:
///     a.bin a.mrk2 columns.txt ...
/// all_0_0_0_1:          ^  ^
///     a.bin a.mrk2 columns.txt
/// So when we deleting all_0_0_0 it doesn't remove blobs for a.bin and a.mrk2 because all_0_0_0_1 use them.
/// But sometimes we need an opposite. When we deleting all_0_0_0_1 it can be non replicated to other replicas, so we are the only owner of this part.
/// In this case when we will drop all_0_0_0_1 we will drop blobs for all_0_0_0. But it will lead to dataloss. For such case we need to check that other replicas
/// still need parent part.
///
/// Returns {found_mutation_parent, files_not_to_remove}: whether a mutation parent
/// exists in ZooKeeper, and the file names hardlinked from it that must be kept.
std::pair<bool, NameSet> getParentLockedBlobs(zkutil::ZooKeeperPtr zookeeper_ptr, const std::string & zero_copy_part_path_prefix, const std::string & part_info_str, MergeTreeDataFormatVersion format_version, Poco::Logger * log)
{
    NameSet files_not_to_remove;
    MergeTreePartInfo part_info = MergeTreePartInfo::fromPartName(part_info_str, format_version);

    /// No mutations -- no hardlinks -- no issues
    if (part_info.mutation == 0)
        return {false, files_not_to_remove};

    /// Getting all zero copy parts
    Strings parts_str;
    zookeeper_ptr->tryGetChildren(zero_copy_part_path_prefix, parts_str);

    /// Parsing infos. It's hard to convert info -> string for old-format merge tree
    /// so storing string as is.
    std::vector<std::pair<MergeTreePartInfo, std::string>> parts_infos;
    for (const auto & part_str : parts_str)
    {
        MergeTreePartInfo parent_candidate_info = MergeTreePartInfo::fromPartName(part_str, format_version);
        parts_infos.emplace_back(parent_candidate_info, part_str);
    }

    /// Sort is important. We need to find our closest parent, like:
    /// for part all_0_0_0_64 we can have parents
    /// all_0_0_0_6 < we need the closest parent, not others
    /// all_0_0_0_1
    /// all_0_0_0
    std::sort(parts_infos.begin(), parts_infos.end());

    /// In reverse order to process from bigger to smaller
    for (const auto & [parent_candidate_info, part_candidate_info_str] : parts_infos | std::views::reverse)
    {
        if (parent_candidate_info == part_info)
            continue;

        /// We are mutation child of this parent
        if (part_info.isMutationChildOf(parent_candidate_info))
        {
            LOG_TRACE(log, "Found mutation parent {} for part {}", part_candidate_info_str, part_info_str);

            /// Get hardlinked files
            String files_not_to_remove_str;
            Coordination::Error code;
            zookeeper_ptr->tryGet(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, files_not_to_remove_str, nullptr, nullptr, &code);
            if (code != Coordination::Error::ZOK)
                LOG_TRACE(log, "Cannot get parent files from ZooKeeper on path ({}), error {}", (fs::path(zero_copy_part_path_prefix) / part_candidate_info_str).string(), errorMessage(code));

            if (!files_not_to_remove_str.empty())
            {
                /// The node stores newline-separated file names.
                boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n"));
                LOG_TRACE(log, "Found files not to remove from parent part {}: [{}]", part_candidate_info_str, fmt::join(files_not_to_remove, ", "));
            }

            return {true, files_not_to_remove};
        }
    }

    return {false, files_not_to_remove};
}

}
2021-02-26 09:48:57 +00:00
2022-04-18 23:09:09 +00:00
std : : pair < bool , NameSet > StorageReplicatedMergeTree : : unlockSharedDataByID (
2022-04-15 16:36:23 +00:00
String part_id , const String & table_uuid , const String & part_name ,
2022-04-21 19:19:13 +00:00
const String & replica_name_ , std : : string disk_type , zkutil : : ZooKeeperPtr zookeeper_ptr , const MergeTreeSettings & settings ,
2022-10-03 21:30:50 +00:00
Poco : : Logger * logger , const String & zookeeper_path_old , MergeTreeDataFormatVersion data_format_version )
2021-11-23 13:57:24 +00:00
{
2022-01-14 15:44:10 +00:00
boost : : replace_all ( part_id , " / " , " _ " ) ;
2021-02-26 09:48:57 +00:00
2022-04-21 19:19:13 +00:00
Strings zc_zookeeper_paths = getZeroCopyPartPath ( settings , disk_type , table_uuid , part_name , zookeeper_path_old ) ;
2021-02-26 09:48:57 +00:00
2022-02-02 16:40:21 +00:00
bool part_has_no_more_locks = true ;
2022-04-18 23:09:09 +00:00
NameSet files_not_to_remove ;
2021-02-26 09:48:57 +00:00
2021-11-23 13:57:24 +00:00
for ( const auto & zc_zookeeper_path : zc_zookeeper_paths )
2021-02-26 09:48:57 +00:00
{
2022-04-20 12:13:29 +00:00
String files_not_to_remove_str ;
zookeeper_ptr - > tryGet ( zc_zookeeper_path , files_not_to_remove_str ) ;
2022-04-15 16:36:23 +00:00
files_not_to_remove . clear ( ) ;
2022-04-18 23:12:07 +00:00
if ( ! files_not_to_remove_str . empty ( ) )
boost : : split ( files_not_to_remove , files_not_to_remove_str , boost : : is_any_of ( " \n " ) ) ;
2022-04-18 23:09:09 +00:00
2022-10-14 14:50:48 +00:00
auto [ has_parent , parent_not_to_remove ] = getParentLockedBlobs ( zookeeper_ptr , fs : : path ( zc_zookeeper_path ) . parent_path ( ) , part_name , data_format_version , logger ) ;
2022-10-03 21:30:50 +00:00
files_not_to_remove . insert ( parent_not_to_remove . begin ( ) , parent_not_to_remove . end ( ) ) ;
2022-01-14 15:44:10 +00:00
String zookeeper_part_uniq_node = fs : : path ( zc_zookeeper_path ) / part_id ;
/// Delete our replica node for part from zookeeper (we are not interested in it anymore)
String zookeeper_part_replica_node = fs : : path ( zookeeper_part_uniq_node ) / replica_name_ ;
2021-02-26 09:48:57 +00:00
2022-04-22 13:20:02 +00:00
LOG_TRACE ( logger , " Remove zookeeper lock {} for part {} " , zookeeper_part_replica_node , part_name ) ;
2021-02-26 09:48:57 +00:00
2022-10-14 14:50:48 +00:00
if ( auto ec = zookeeper_ptr - > tryRemove ( zookeeper_part_replica_node ) ; ec ! = Coordination : : Error : : ZOK )
2022-04-22 13:20:02 +00:00
{
2022-10-14 14:50:48 +00:00
/// Very complex case. It means that lock already doesn't exist when we tried to remove it.
/// So we don't know are we owner of this part or not. Maybe we just mutated it, renamed on disk and failed to lock in ZK.
/// But during mutation we can have hardlinks to another part. So it's not Ok to remove blobs of this part if it was mutated.
2022-10-17 10:27:05 +00:00
if ( ec = = Coordination : : Error : : ZNONODE )
2022-10-14 14:50:48 +00:00
{
2022-10-17 10:27:05 +00:00
if ( has_parent )
{
LOG_INFO ( logger , " Lock on path {} for part {} doesn't exist, refuse to remove blobs " , zookeeper_part_replica_node , part_name ) ;
return { false , { } } ;
}
}
else
{
throw zkutil : : KeeperException ( ec , zookeeper_part_replica_node ) ;
2022-10-14 14:50:48 +00:00
}
2022-04-22 13:20:02 +00:00
}
2021-02-26 09:48:57 +00:00
2022-01-14 15:44:10 +00:00
/// Check, maybe we were the last replica and can remove part forever
2021-11-23 13:57:24 +00:00
Strings children ;
2021-12-01 13:11:26 +00:00
zookeeper_ptr - > tryGetChildren ( zookeeper_part_uniq_node , children ) ;
2021-11-23 13:57:24 +00:00
if ( ! children . empty ( ) )
{
2022-09-26 15:13:10 +00:00
LOG_TRACE ( logger , " Found {} ({}) zookeeper locks for {} " , children . size ( ) , fmt : : join ( children , " , " ) , zookeeper_part_uniq_node ) ;
2022-02-02 16:40:21 +00:00
part_has_no_more_locks = false ;
2021-11-23 13:57:24 +00:00
continue ;
}
2022-09-06 17:25:58 +00:00
else
{
LOG_TRACE ( logger , " No more children left for for {}, will try to remove the whole node " , zookeeper_part_uniq_node ) ;
}
2021-11-23 13:57:24 +00:00
2022-01-14 15:44:10 +00:00
auto error_code = zookeeper_ptr - > tryRemove ( zookeeper_part_uniq_node ) ;
2021-12-17 11:03:20 +00:00
2022-04-22 13:20:02 +00:00
if ( error_code = = Coordination : : Error : : ZOK )
{
LOG_TRACE ( logger , " Removed last parent zookeeper lock {} for part {} with id {} " , zookeeper_part_uniq_node , part_name , part_id ) ;
}
else if ( error_code = = Coordination : : Error : : ZNOTEMPTY )
{
LOG_TRACE ( logger , " Cannot remove last parent zookeeper lock {} for part {} with id {}, another replica locked part concurrently " , zookeeper_part_uniq_node , part_name , part_id ) ;
}
else if ( error_code = = Coordination : : Error : : ZNONODE )
{
LOG_TRACE ( logger , " Node with parent zookeeper lock {} for part {} with id {} doesn't exist " , zookeeper_part_uniq_node , part_name , part_id ) ;
}
else
{
throw zkutil : : KeeperException ( error_code , zookeeper_part_uniq_node ) ;
}
2021-11-23 13:57:24 +00:00
/// Even when we have lock with same part name, but with different uniq, we can remove files on S3
children . clear ( ) ;
2021-12-17 11:03:20 +00:00
String zookeeper_part_node = fs : : path ( zookeeper_part_uniq_node ) . parent_path ( ) ;
2021-12-01 13:11:26 +00:00
zookeeper_ptr - > tryGetChildren ( zookeeper_part_node , children ) ;
2022-04-22 13:20:02 +00:00
2021-11-23 13:57:24 +00:00
if ( children . empty ( ) )
2021-12-01 13:11:26 +00:00
{
2021-11-23 13:57:24 +00:00
/// Cleanup after last uniq removing
2022-01-14 15:44:10 +00:00
error_code = zookeeper_ptr - > tryRemove ( zookeeper_part_node ) ;
2021-12-17 11:03:20 +00:00
2022-04-22 13:20:02 +00:00
if ( error_code = = Coordination : : Error : : ZOK )
{
LOG_TRACE ( logger , " Removed last parent zookeeper lock {} for part {} (part is finally unlocked) " , zookeeper_part_uniq_node , part_name ) ;
}
else if ( error_code = = Coordination : : Error : : ZNOTEMPTY )
{
LOG_TRACE ( logger , " Cannot remove last parent zookeeper lock {} for part {}, another replica locked part concurrently " , zookeeper_part_uniq_node , part_name ) ;
}
else if ( error_code = = Coordination : : Error : : ZNONODE )
{
LOG_TRACE ( logger , " Node with parent zookeeper lock {} for part {} doesn't exist (part was unlocked before) " , zookeeper_part_uniq_node , part_name ) ;
}
else
{
throw zkutil : : KeeperException ( error_code , zookeeper_part_uniq_node ) ;
}
2021-12-17 11:03:20 +00:00
}
else
{
2022-09-06 11:59:55 +00:00
LOG_TRACE ( logger , " Can't remove parent zookeeper lock {} for part {}, because children {} ({}) exists " ,
2022-10-03 21:30:50 +00:00
zookeeper_part_node , part_name , children . size ( ) , fmt : : join ( children , " , " ) ) ;
2021-12-01 13:11:26 +00:00
}
2021-11-23 13:57:24 +00:00
}
2022-04-15 16:36:23 +00:00
return std : : make_pair ( part_has_no_more_locks , files_not_to_remove ) ;
2021-02-26 09:48:57 +00:00
}
2022-10-22 22:51:59 +00:00
bool StorageReplicatedMergeTree : : tryToFetchIfShared (
2021-02-26 09:48:57 +00:00
const IMergeTreeDataPart & part ,
const DiskPtr & disk ,
2021-03-05 17:24:06 +00:00
const String & path )
2021-02-26 09:48:57 +00:00
{
2021-06-24 08:25:05 +00:00
const auto settings = getSettings ( ) ;
2022-08-19 14:58:30 +00:00
auto data_source_description = disk - > getDataSourceDescription ( ) ;
2021-07-05 03:32:56 +00:00
if ( ! ( disk - > supportZeroCopyReplication ( ) & & settings - > allow_remote_fs_zero_copy_replication ) )
2022-10-22 22:51:59 +00:00
return false ;
2021-02-26 09:48:57 +00:00
2022-08-19 14:58:30 +00:00
String replica = getSharedDataReplica ( part , data_source_description . type ) ;
2021-02-26 09:48:57 +00:00
2021-06-24 08:25:05 +00:00
/// We can't fetch part when none replicas have this part on a same type remote disk
2021-02-26 09:48:57 +00:00
if ( replica . empty ( ) )
2022-10-22 22:51:59 +00:00
return false ;
2021-02-26 09:48:57 +00:00
2021-03-05 17:24:06 +00:00
return executeFetchShared ( replica , part . name , disk , path ) ;
2021-02-26 09:48:57 +00:00
}
/// Finds the best replica to fetch a zero-copy shared part from: among replicas
/// that hold a zero-copy lock for the part and are active, pick the one with the
/// maximum log_pointer and, as a tiebreaker, the minimum queue size.
/// Returns an empty string if no suitable replica (or no ZooKeeper connection).
String StorageReplicatedMergeTree::getSharedDataReplica(
    const IMergeTreeDataPart & part, DataSourceType data_source_type) const
{
    String best_replica;

    zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
    if (!zookeeper)
        return "";

    Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), toString(data_source_type), getTableSharedID(), part.name,
        zookeeper_path);

    /// Collect every replica that holds a lock for this part, for any unique part id.
    std::set<String> replicas;

    for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
    {
        Strings ids;
        zookeeper->tryGetChildren(zc_zookeeper_path, ids);

        for (const auto & id : ids)
        {
            String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id;
            Strings id_replicas;
            zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas);
            LOG_TRACE(log, "Found zookeeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size());
            replicas.insert(id_replicas.begin(), id_replicas.end());
        }
    }

    LOG_TRACE(log, "Found zookeeper replicas for part {}: {}", part.name, replicas.size());

    Strings active_replicas;

    /// TODO: Move best replica choose in common method (here is the same code as in StorageReplicatedMergeTree::fetchPartition)

    /// Leave only active replicas.
    active_replicas.reserve(replicas.size());
    for (const String & replica : replicas)
        if ((replica != replica_name) && (zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active")))
            active_replicas.push_back(replica);

    LOG_TRACE(log, "Found zookeeper active replicas for part {}: {}", part.name, active_replicas.size());

    if (active_replicas.empty())
        return "";

    /** You must select the best (most relevant) replica.
    * This is a replica with the maximum `log_pointer`, then with the minimum `queue` size.
    * NOTE This is not exactly the best criteria. It does not make sense to download old partitions,
    * and it would be nice to be able to choose the replica closest by network.
    * NOTE Of course, there are data races here. You can solve it by retrying.
    */
    Int64 max_log_pointer = -1;
    UInt64 min_queue_size = std::numeric_limits<UInt64>::max();

    for (const String & replica : active_replicas)
    {
        String current_replica_path = fs::path(zookeeper_path) / "replicas" / replica;

        String log_pointer_str = zookeeper->get(fs::path(current_replica_path) / "log_pointer");
        Int64 log_pointer = log_pointer_str.empty() ? 0 : parse<UInt64>(log_pointer_str);

        Coordination::Stat stat;
        zookeeper->get(fs::path(current_replica_path) / "queue", &stat);
        size_t queue_size = stat.numChildren;

        if (log_pointer > max_log_pointer
            || (log_pointer == max_log_pointer && queue_size < min_queue_size))
        {
            max_log_pointer = log_pointer;
            min_queue_size = queue_size;
            best_replica = replica;
        }
    }

    return best_replica;
}
2021-11-23 13:57:24 +00:00
2022-05-11 22:04:54 +00:00
/// Builds the list of ZooKeeper paths under which zero-copy locks for the part live:
/// always the new-style path (under remote_fs_zero_copy_zookeeper_path), plus the
/// old-style path when compatibility mode is enabled.
Strings StorageReplicatedMergeTree::getZeroCopyPartPath(
    const MergeTreeSettings & settings, std::string disk_type, const String & table_uuid,
    const String & part_name, const String & zookeeper_path_old)
{
    const String zero_copy = fmt::format("zero_copy_{}", disk_type);

    Strings result;
    String new_style_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name;
    result.push_back(new_style_path);

    if (settings.remote_fs_zero_copy_path_compatible_mode && !zookeeper_path_old.empty())
    {
        /// Compatibility mode for cluster with old and new versions
        String old_style_path = fs::path(zookeeper_path_old) / zero_copy / "shared" / part_name;
        result.push_back(old_style_path);
    }

    return result;
}
2022-02-10 19:45:52 +00:00
bool StorageReplicatedMergeTree : : checkZeroCopyLockExists ( const String & part_name , const DiskPtr & disk )
{
auto path = getZeroCopyPartPath ( part_name , disk ) ;
if ( path )
{
/// FIXME
auto lock_path = fs : : path ( * path ) / " part_exclusive_lock " ;
if ( getZooKeeper ( ) - > exists ( lock_path ) )
{
return true ;
}
}
2021-11-23 13:57:24 +00:00
2022-02-10 19:45:52 +00:00
return false ;
}
std : : optional < String > StorageReplicatedMergeTree : : getZeroCopyPartPath ( const String & part_name , const DiskPtr & disk )
{
if ( ! disk | | ! disk - > supportZeroCopyReplication ( ) )
return std : : nullopt ;
2022-08-19 14:58:30 +00:00
return getZeroCopyPartPath ( * getSettings ( ) , toString ( disk - > getDataSourceDescription ( ) . type ) , getTableSharedID ( ) , part_name , zookeeper_path ) [ 0 ] ;
2022-02-10 19:45:52 +00:00
}
2021-11-23 13:57:24 +00:00
2022-02-10 19:45:52 +00:00
std : : optional < ZeroCopyLock > StorageReplicatedMergeTree : : tryCreateZeroCopyExclusiveLock ( const String & part_name , const DiskPtr & disk )
2022-01-17 11:52:51 +00:00
{
if ( ! disk | | ! disk - > supportZeroCopyReplication ( ) )
return std : : nullopt ;
zkutil : : ZooKeeperPtr zookeeper = tryGetZooKeeper ( ) ;
if ( ! zookeeper )
return std : : nullopt ;
2022-02-10 19:45:52 +00:00
String zc_zookeeper_path = * getZeroCopyPartPath ( part_name , disk ) ;
2022-01-17 11:52:51 +00:00
2022-01-18 08:27:01 +00:00
/// Just recursively create ancestors for lock
zookeeper - > createAncestors ( zc_zookeeper_path ) ;
zookeeper - > createIfNotExists ( zc_zookeeper_path , " " ) ;
2022-01-17 11:52:51 +00:00
/// Create actual lock
2022-01-18 08:27:01 +00:00
ZeroCopyLock lock ( zookeeper , zc_zookeeper_path ) ;
2022-01-17 11:52:51 +00:00
if ( lock . lock - > tryLock ( ) )
return lock ;
else
return std : : nullopt ;
}
2021-11-23 13:57:24 +00:00
2021-04-13 04:40:33 +00:00
/// Picks, in uniformly random order, an active replica under zookeeper_path_ that
/// has the given part, and returns the full ZooKeeper path of that replica
/// (empty string when none is found).
String StorageReplicatedMergeTree::findReplicaHavingPart(
    const String & part_name, const String & zookeeper_path_, zkutil::ZooKeeper::Ptr zookeeper_ptr)
{
    Strings candidates = zookeeper_ptr->getChildren(fs::path(zookeeper_path_) / "replicas");

    /// Select replicas in uniformly random order.
    std::shuffle(candidates.begin(), candidates.end(), thread_local_rng);

    for (const String & candidate : candidates)
    {
        const auto candidate_path = fs::path(zookeeper_path_) / "replicas" / candidate;
        if (zookeeper_ptr->exists(candidate_path / "parts" / part_name)
            && zookeeper_ptr->exists(candidate_path / "is_active"))
            return candidate_path;
    }

    return {};
}
2021-11-23 13:57:24 +00:00
2021-04-14 02:05:41 +00:00
bool StorageReplicatedMergeTree : : checkIfDetachedPartExists ( const String & part_name )
2021-04-13 04:40:33 +00:00
{
2021-05-08 10:59:55 +00:00
fs : : directory_iterator dir_end ;
2021-04-13 04:40:33 +00:00
for ( const std : : string & path : getDataPaths ( ) )
2021-05-08 10:59:55 +00:00
for ( fs : : directory_iterator dir_it { fs : : path ( path ) / " detached/ " } ; dir_it ! = dir_end ; + + dir_it )
if ( dir_it - > path ( ) . filename ( ) . string ( ) = = part_name )
2021-04-13 04:40:33 +00:00
return true ;
return false ;
}
2021-11-23 13:57:24 +00:00
2021-04-14 02:05:41 +00:00
bool StorageReplicatedMergeTree : : checkIfDetachedPartitionExists ( const String & partition_name )
2021-04-13 04:40:33 +00:00
{
2021-05-08 10:59:55 +00:00
fs : : directory_iterator dir_end ;
2021-08-24 12:57:49 +00:00
2021-04-13 04:40:33 +00:00
for ( const std : : string & path : getDataPaths ( ) )
{
2021-05-08 10:59:55 +00:00
for ( fs : : directory_iterator dir_it { fs : : path ( path ) / " detached/ " } ; dir_it ! = dir_end ; + + dir_it )
2021-04-13 04:40:33 +00:00
{
2021-08-24 12:57:49 +00:00
const String file_name = dir_it - > path ( ) . filename ( ) . string ( ) ;
auto part_info = MergeTreePartInfo : : tryParsePartName ( file_name , format_version ) ;
if ( part_info & & part_info - > partition_id = = partition_name )
2021-04-13 04:40:33 +00:00
return true ;
}
}
return false ;
}
2021-06-29 15:14:44 +00:00
2021-06-30 15:24:51 +00:00
/// Replaces a part lost on all replicas with a freshly written empty part of
/// the same name, so the replication queue can make progress again.
/// Returns true on success; returns false when the empty part cannot or should
/// not be created (partition value cannot be determined for an empty partition,
/// no covering part remains in the virtual parts, the part is covered by a
/// DROP_RANGE, or the final ZooKeeper commit fails).
bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name)
{
    LOG_INFO(log, "Going to replace lost part {} with empty part", lost_part_name);

    auto metadata_snapshot = getInMemoryMetadataPtr();
    auto settings = getSettings();

    constexpr static auto TMP_PREFIX = "tmp_empty_";

    auto new_part_info = MergeTreePartInfo::fromPartName(lost_part_name, format_version);
    /// An empty sample block: the part is written with zero rows but the full column set.
    auto block = metadata_snapshot->getSampleBlock();

    DB::IMergeTreeDataPart::TTLInfos move_ttl_infos;

    NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames());
    ReservationPtr reservation = reserveSpacePreferringTTLRules(metadata_snapshot, 0, move_ttl_infos, time(nullptr), 0, true);
    VolumePtr volume = getStoragePolicy()->getVolume(0);

    /// Minmax index computed over the empty block (still required part metadata).
    auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
    minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));

    auto new_volume = createVolumeFromReservation(reservation, volume);

    /// The part is first written into a temporary ("tmp_empty_<name>") directory
    /// and only renamed into place when the ZooKeeper commit below succeeds.
    auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
        new_volume,
        relative_data_path,
        TMP_PREFIX + lost_part_name);

    data_part_storage->beginTransaction();

    auto new_data_part = createPart(
        lost_part_name,
        choosePartType(0, block.rows()),
        new_part_info,
        data_part_storage);

    if (settings->assign_part_uuids)
        new_data_part->uuid = UUIDHelpers::generateV4();

    new_data_part->setColumns(columns, {});
    new_data_part->rows_count = block.rows();

    {
        /// Determine the partition value: prefer copying it from an existing part
        /// of the same partition; otherwise try reconstructing it from the
        /// partition ID. If neither works, the empty part cannot be created.
        auto lock = lockParts();
        auto parts_in_partition = getDataPartsPartitionRange(new_part_info.partition_id);
        if (!parts_in_partition.empty())
        {
            new_data_part->partition = (*parts_in_partition.begin())->partition;
        }
        else if (auto parsed_partition = MergeTreePartition::tryParseValueFromID(
                     new_part_info.partition_id,
                     metadata_snapshot->getPartitionKey().sample_block))
        {
            new_data_part->partition = MergeTreePartition(*parsed_partition);
        }
        else
        {
            LOG_WARNING(log, "Empty part {} is not created instead of lost part because there are no parts in partition {} (it's empty), "
                             "resolve this manually using DROP/DETACH PARTITION.", lost_part_name, new_part_info.partition_id);
            return false;
        }
    }

    new_data_part->minmax_idx = std::move(minmax_idx);
    new_data_part->is_temp = true;

    SyncGuardPtr sync_guard;
    if (new_data_part->isStoredOnDisk())
    {
        /// The name could be non-unique in case of stale files from previous runs.
        if (data_part_storage->exists())
        {
            LOG_WARNING(log, "Removing old temporary directory {}", new_data_part->data_part_storage->getFullPath());
            data_part_storage->removeRecursive();
        }

        data_part_storage->createDirectories();

        if (getSettings()->fsync_part_directory)
            sync_guard = data_part_storage->getDirectorySyncGuard();
    }

    /// This effectively chooses minimal compression method:
    /// either default lz4 or compression method with zero thresholds on absolute and relative part size.
    auto compression_codec = getContext()->chooseCompressionCodec(0, 0);

    const auto & index_factory = MergeTreeIndexFactory::instance();
    MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
        index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, NO_TRANSACTION_PTR);

    bool sync_on_insert = settings->fsync_after_insert;

    out.write(block);
    /// TODO(ab): What projections should we add to the empty part? How can we make sure that it
    /// won't block future merges? Perhaps we should also check part emptiness when selecting parts
    /// to merge.
    out.finalizePart(new_data_part, sync_on_insert);

    try
    {
        MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW);
        auto replaced_parts = renameTempPartAndReplace(new_data_part, transaction);

        /// The empty part is only meant to replace the lost part, not real data.
        if (!replaced_parts.empty())
        {
            Strings part_names;
            for (const auto & part : replaced_parts)
                part_names.emplace_back(part->name);

            /// Why this exception is not a LOGICAL_ERROR? Because it's possible
            /// to have some source parts for the lost part if replica currently
            /// cloning from another replica, but source replica lost covering
            /// part and finished MERGE_PARTS before clone. It's an extremely
            /// rare case and it's unclear how to resolve it better. Eventually
            /// source replica will replace lost part with empty part and we
            /// will fetch this empty part instead of our source parts. This
            /// will make replicas consistent, but some data will be lost.
            throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to create empty part {}, but it replaces existing parts {}.", lost_part_name, fmt::join(part_names, ", "));
        }

        lockSharedData(*new_data_part, false, {});

        /// Retry loop: commit the part to ZooKeeper under a check on the log
        /// version and replicas list; ZBADVERSION means the world changed under
        /// us (new log entries or a new replica) and we must re-validate.
        while (true)
        {
            /// We should be careful when creating an empty part, because we are not sure that this part is still needed.
            /// For example, it's possible that part (or partition) was dropped (or replaced) concurrently.
            /// We can enqueue part for check from DataPartExchange or SelectProcessor
            /// and it's hard to synchronize it with ReplicatedMergeTreeQueue and PartCheckThread...
            /// But at least we can ignore parts that are definitely not needed according to virtual parts and drop ranges.
            auto pred = queue.getMergePredicate(zookeeper);
            String covering_virtual = pred.getCoveringVirtualPart(lost_part_name);
            if (covering_virtual.empty())
            {
                LOG_WARNING(log, "Will not create empty part instead of lost {}, because there's no covering part in replication queue", lost_part_name);
                return false;
            }
            if (pred.hasDropRange(MergeTreePartInfo::fromPartName(covering_virtual, format_version)))
            {
                LOG_WARNING(log, "Will not create empty part instead of lost {}, because it's covered by DROP_RANGE", lost_part_name);
                return false;
            }

            Coordination::Requests ops;
            Coordination::Stat replicas_stat;
            auto replicas_path = fs::path(zookeeper_path) / "replicas";
            Strings replicas = zookeeper->getChildren(replicas_path, &replicas_stat);

            /// Pin the log version observed by the merge predicate.
            ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/log", pred.getVersion()));

            /// In rare cases new replica can appear during check
            ops.emplace_back(zkutil::makeCheckRequest(replicas_path, replicas_stat.version));

            for (const String & replica : replicas)
            {
                String current_part_path = fs::path(zookeeper_path) / "replicas" / replica / "parts" / lost_part_name;

                /// We must be sure that this part doesn't exist on other replicas
                if (!zookeeper->exists(current_part_path))
                {
                    /// create+remove pair: fails the multi-op if the node appears concurrently.
                    ops.emplace_back(zkutil::makeCreateRequest(current_part_path, "", zkutil::CreateMode::Persistent));
                    ops.emplace_back(zkutil::makeRemoveRequest(current_part_path, -1));
                }
                else
                {
                    throw Exception(ErrorCodes::DUPLICATE_DATA_PART, "Part {} already exists on replica {} on path {}", lost_part_name, replica, current_part_path);
                }
            }

            getCommitPartOps(ops, new_data_part);

            Coordination::Responses responses;
            if (auto code = zookeeper->tryMulti(ops, responses); code == Coordination::Error::ZOK)
            {
                transaction.commit();
                break;
            }
            else if (code == Coordination::Error::ZBADVERSION)
            {
                LOG_INFO(log, "Looks like log was updated or new replica appeared while creating new empty part, will retry");
            }
            else
            {
                zkutil::KeeperMultiException::check(code, ops, responses);
            }
        }
    }
    catch (const Exception & ex)
    {
        LOG_WARNING(log, "Cannot commit empty part {} with error {}", lost_part_name, ex.displayText());
        return false;
    }

    LOG_INFO(log, "Created empty part {} instead of lost part", lost_part_name);
    return true;
}
2021-11-23 13:57:24 +00:00
2022-04-19 12:01:30 +00:00
/// Creates a zero-copy lock node in ZooKeeper, either persistent or ephemeral
/// depending on `mode`. Optionally replaces an existing lock node
/// (`replace_existing_lock`) and records the set of hardlinked files on the
/// source part's node (`path_to_set_hardlinked_files` / `hardlinked_files`).
/// Throws NOT_FOUND_NODE if the lock cannot be created because the part was
/// already unlocked (its parent nodes removed) in ZooKeeper.
void StorageReplicatedMergeTree::createZeroCopyLockNode(
    const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode,
    bool replace_existing_lock, const String & path_to_set_hardlinked_files, const NameSet & hardlinked_files)
{
    /// In rare case other replica can remove path between createAncestors and createIfNotExists
    /// So we make up to 5 attempts

    bool created = false;
    for (int attempts = 5; attempts > 0; --attempts)
    {
        try
        {
            /// Ephemeral locks can be created only when we fetch shared data.
            /// So it never require to create ancestors. If we create them
            /// race condition with source replica drop is possible.
            if (mode == zkutil::CreateMode::Persistent)
                zookeeper->createAncestors(zookeeper_node);

            if (replace_existing_lock && zookeeper->exists(zookeeper_node))
            {
                /// Atomically swap the existing lock node for a fresh one (remove + create
                /// in a single multi-op), optionally updating the hardlinked-files list.
                Coordination::Requests ops;
                ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1));
                ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));

                if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
                {
                    std::string data = boost::algorithm::join(hardlinked_files, "\n");
                    /// List of files used to detect hardlinks. path_to_set_hardlinked_files --
                    /// is a path to source part zero copy node. During part removal hardlinked
                    /// files will be left for source part.
                    ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
                }

                Coordination::Responses responses;
                auto error = zookeeper->tryMulti(ops, responses);
                if (error == Coordination::Error::ZOK)
                {
                    created = true;
                    break;
                }
                else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent)
                {
                    /// For ephemeral locks a missing parent is fatal: ancestors are never
                    /// (re)created in this mode, see the comment above.
                    throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node);
                }
            }
            else
            {
                Coordination::Requests ops;

                if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
                {
                    std::string data = boost::algorithm::join(hardlinked_files, "\n");
                    /// List of files used to detect hardlinks. path_to_set_hardlinked_files --
                    /// is a path to source part zero copy node. During part removal hardlinked
                    /// files will be left for source part.
                    ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
                }
                ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));

                Coordination::Responses responses;
                auto error = zookeeper->tryMulti(ops, responses);
                /// ZNODEEXISTS is fine here: the lock node is already in place.
                if (error == Coordination::Error::ZOK || error == Coordination::Error::ZNODEEXISTS)
                {
                    created = true;
                    break;
                }
                else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent)
                {
                    /// Ephemeral locks used during fetches so if parent node was removed we cannot do anything
                    throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node);
                }
            }
        }
        catch (const zkutil::KeeperException & e)
        {
            /// Parent disappeared between createAncestors and the multi-op — retry.
            if (e.code == Coordination::Error::ZNONODE)
                continue;

            throw;
        }
    }

    if (!created)
    {
        String mode_str = mode == zkutil::CreateMode::Persistent ? "persistent" : "ephemeral";
        throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create {} zero copy lock {} because part was unlocked from zookeeper", mode_str, zookeeper_node);
    }
}
2022-06-21 09:17:52 +00:00
bool StorageReplicatedMergeTree : : removeDetachedPart ( DiskPtr disk , const String & path , const String & part_name )
2021-12-21 14:29:50 +00:00
{
if ( disk - > supportZeroCopyReplication ( ) )
{
2022-06-21 09:17:52 +00:00
String table_id = getTableSharedID ( ) ;
2022-07-21 13:57:28 +00:00
return removeSharedDetachedPart ( disk , path , part_name , table_id , zookeeper_name , replica_name , zookeeper_path , getContext ( ) , current_zookeeper ) ;
2021-12-21 14:29:50 +00:00
}
disk - > removeRecursive ( path ) ;
return false ;
}
bool StorageReplicatedMergeTree : : removeSharedDetachedPart ( DiskPtr disk , const String & path , const String & part_name , const String & table_uuid ,
2022-07-21 13:57:28 +00:00
const String & , const String & detached_replica_name , const String & detached_zookeeper_path , ContextPtr local_context , const zkutil : : ZooKeeperPtr & zookeeper )
2021-12-21 14:29:50 +00:00
{
bool keep_shared = false ;
2022-04-18 23:09:09 +00:00
NameSet files_not_to_remove ;
2021-12-21 14:29:50 +00:00
2022-02-02 16:40:21 +00:00
fs : : path checksums = fs : : path ( path ) / IMergeTreeDataPart : : FILE_FOR_REFERENCES_CHECK ;
if ( disk - > exists ( checksums ) )
2021-12-21 14:29:50 +00:00
{
2022-02-02 16:40:21 +00:00
if ( disk - > getRefCount ( checksums ) = = 0 )
2021-12-21 14:29:50 +00:00
{
2022-02-02 16:40:21 +00:00
String id = disk - > getUniqueId ( checksums ) ;
2022-04-15 16:36:23 +00:00
bool can_remove = false ;
2022-10-03 17:06:12 +00:00
std : : tie ( can_remove , files_not_to_remove ) = StorageReplicatedMergeTree : : unlockSharedDataByID (
id , table_uuid , part_name ,
detached_replica_name ,
toString ( disk - > getDataSourceDescription ( ) . type ) ,
zookeeper , local_context - > getReplicatedMergeTreeSettings ( ) ,
& Poco : : Logger : : get ( " StorageReplicatedMergeTree " ) ,
detached_zookeeper_path ,
2022-10-03 21:30:50 +00:00
MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING ) ;
2022-04-15 16:36:23 +00:00
keep_shared = ! can_remove ;
2021-12-21 14:29:50 +00:00
}
2022-02-02 16:40:21 +00:00
else
keep_shared = true ;
2021-12-21 14:29:50 +00:00
}
2022-04-18 23:09:09 +00:00
disk - > removeSharedRecursive ( path , keep_shared , files_not_to_remove ) ;
2021-12-21 14:29:50 +00:00
return keep_shared ;
}
2021-12-27 16:27:06 +00:00
void StorageReplicatedMergeTree : : createAndStoreFreezeMetadata ( DiskPtr disk , DataPartPtr , String backup_part_path ) const
2021-12-21 14:29:50 +00:00
{
if ( disk - > supportZeroCopyReplication ( ) )
{
FreezeMetaData meta ;
meta . fill ( * this ) ;
meta . save ( disk , backup_part_path ) ;
}
}
2022-06-23 10:17:54 +00:00
void StorageReplicatedMergeTree : : adjustCreateQueryForBackup ( ASTPtr & create_query ) const
2022-05-23 12:05:35 +00:00
{
2022-06-26 15:53:20 +00:00
/// Adjust the create query using values from ZooKeeper.
auto zookeeper = getZooKeeper ( ) ;
auto columns_from_entry = ColumnsDescription : : parse ( zookeeper - > get ( fs : : path ( zookeeper_path ) / " columns " ) ) ;
auto metadata_from_entry = ReplicatedMergeTreeTableMetadata : : parse ( zookeeper - > get ( fs : : path ( zookeeper_path ) / " metadata " ) ) ;
2022-05-29 19:53:56 +00:00
2022-06-26 15:53:20 +00:00
auto current_metadata = getInMemoryMetadataPtr ( ) ;
auto metadata_diff = ReplicatedMergeTreeTableMetadata ( * this , current_metadata ) . checkAndFindDiff ( metadata_from_entry , current_metadata - > getColumns ( ) , getContext ( ) ) ;
auto adjusted_metadata = metadata_diff . getNewMetadata ( columns_from_entry , getContext ( ) , * current_metadata ) ;
applyMetadataChangesToCreateQuery ( create_query , adjusted_metadata ) ;
2022-06-14 08:53:22 +00:00
2022-06-26 15:53:20 +00:00
/// Check that tryGetTableSharedIDFromCreateQuery() works for this storage.
if ( tryGetTableSharedIDFromCreateQuery ( * create_query , getContext ( ) ) ! = getTableSharedID ( ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Table {} has its shared ID to be different from one from the create query " ) ;
2022-05-31 09:33:23 +00:00
}
2022-05-29 19:53:56 +00:00
2022-05-31 09:33:23 +00:00
/// Collects backup entries for this replicated table.
/// Entries are produced locally like for an ordinary MergeTree, but are NOT
/// added to the collector immediately: part names (with checksums) and
/// mutations are first sent to the backup coordination so that replicas can
/// be compared, and a post-task later adds only the parts assigned to this
/// replica by the coordination.
void StorageReplicatedMergeTree::backupData(
    BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional<ASTs> & partitions)
{
    /// First we generate backup entries in the same way as an ordinary MergeTree does.
    /// But then we don't add them to the BackupEntriesCollector right away,
    /// because we need to coordinate them with other replicas (other replicas can have better parts).

    auto local_context = backup_entries_collector.getContext();

    DataPartsVector data_parts;
    if (partitions)
        data_parts = getVisibleDataPartsVectorInPartitions(local_context, getPartitionIDsFromQuery(*partitions, local_context));
    else
        data_parts = getVisibleDataPartsVector(local_context);

    /// Paths are generated relative to the table ("" here); the real destination
    /// paths come from the coordination in the post-task below.
    auto backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", local_context);

    auto coordination = backup_entries_collector.getBackupCoordination();
    String shared_id = getTableSharedID();
    coordination->addReplicatedDataPath(shared_id, data_path_in_backup);

    /// Compute a per-part checksum over the ".bin" entries (path, size, entry checksum)
    /// so the coordination can compare parts across replicas.
    std::unordered_map<String, SipHash> part_names_with_hashes_calculating;
    for (auto & [relative_path, backup_entry] : backup_entries)
    {
        size_t slash_pos = relative_path.find('/');
        if (slash_pos != String::npos)
        {
            String part_name = relative_path.substr(0, slash_pos);
            if (MergeTreePartInfo::tryParsePartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING))
            {
                auto & hash = part_names_with_hashes_calculating[part_name];
                if (relative_path.ends_with(".bin"))
                {
                    auto checksum = backup_entry->getChecksum();
                    hash.update(relative_path);
                    hash.update(backup_entry->getSize());
                    hash.update(*checksum);
                }
                continue;
            }
        }
        /// Not a part name, probably error.
        throw Exception(ErrorCodes::LOGICAL_ERROR, "{} doesn't follow the format <part_name>/<path>", quoteString(relative_path));
    }

    /// Finalize the accumulated hashes into (part_name, checksum) pairs.
    std::vector<IBackupCoordination::PartNameAndChecksum> part_names_with_hashes;
    part_names_with_hashes.reserve(part_names_with_hashes_calculating.size());
    for (auto & [part_name, hash] : part_names_with_hashes_calculating)
    {
        UInt128 checksum;
        hash.get128(checksum);
        auto & part_name_with_hash = part_names_with_hashes.emplace_back();
        part_name_with_hash.part_name = part_name;
        part_name_with_hash.checksum = checksum;
    }

    /// Send our list of part names to the coordination (to compare with other replicas).
    coordination->addReplicatedPartNames(shared_id, getStorageID().getFullTableName(), getReplicaName(), part_names_with_hashes);

    /// Send a list of mutations to the coordination too (we need to find the mutations which are not finished for added part names).
    {
        std::vector<IBackupCoordination::MutationInfo> mutation_infos;
        auto zookeeper = getZooKeeper();
        Strings mutation_ids = zookeeper->getChildren(fs::path(zookeeper_path) / "mutations");
        mutation_infos.reserve(mutation_ids.size());
        for (const auto & mutation_id : mutation_ids)
        {
            mutation_infos.emplace_back(
                IBackupCoordination::MutationInfo{mutation_id, zookeeper->get(fs::path(zookeeper_path) / "mutations" / mutation_id)});
        }
        coordination->addReplicatedMutations(shared_id, getStorageID().getFullTableName(), getReplicaName(), mutation_infos);
    }

    /// This task will be executed after all replicas have collected their parts and the coordination is ready to
    /// give us the final list of parts to add to the BackupEntriesCollector.
    /// NOTE(review): the lambda captures backup_entries_collector by reference —
    /// it must outlive the post-task; presumably guaranteed by the collector
    /// running its own post-tasks.
    auto post_collecting_task = [shared_id,
                                 replica_name = getReplicaName(),
                                 coordination,
                                 backup_entries = std::move(backup_entries),
                                 &backup_entries_collector]()
    {
        Strings data_paths = coordination->getReplicatedDataPaths(shared_id);
        std::vector<fs::path> data_paths_fs;
        data_paths_fs.reserve(data_paths.size());
        for (const auto & data_path : data_paths)
            data_paths_fs.push_back(data_path);

        /// Only the parts the coordination assigned to this replica are added.
        Strings part_names = coordination->getReplicatedPartNames(shared_id, replica_name);
        std::unordered_set<std::string_view> part_names_set{part_names.begin(), part_names.end()};

        for (const auto & [relative_path, backup_entry] : backup_entries)
        {
            size_t slash_pos = relative_path.find('/');
            String part_name = relative_path.substr(0, slash_pos);
            if (!part_names_set.contains(part_name))
                continue;
            for (const auto & data_path : data_paths_fs)
                backup_entries_collector.addBackupEntry(data_path / relative_path, backup_entry);
        }

        /// Store the assigned mutations as "<id>.txt" under "mutations/".
        auto mutation_infos = coordination->getReplicatedMutations(shared_id, replica_name);
        for (const auto & mutation_info : mutation_infos)
        {
            auto backup_entry = ReplicatedMergeTreeMutationEntry::parse(mutation_info.entry, mutation_info.id).backup();
            for (const auto & data_path : data_paths_fs)
                backup_entries_collector.addBackupEntry(data_path / "mutations" / (mutation_info.id + ".txt"), backup_entry);
        }
    };

    backup_entries_collector.addPostTask(post_collecting_task);
}
2022-05-31 09:33:23 +00:00
void StorageReplicatedMergeTree : : restoreDataFromBackup ( RestorerFromBackup & restorer , const String & data_path_in_backup , const std : : optional < ASTs > & partitions )
2022-05-29 19:53:56 +00:00
{
2022-05-31 09:33:23 +00:00
String full_zk_path = getZooKeeperName ( ) + getZooKeeperPath ( ) ;
if ( ! restorer . getRestoreCoordination ( ) - > acquireInsertingDataIntoReplicatedTable ( full_zk_path ) )
{
/// Other replica is already restoring the data of this table.
/// We'll get them later due to replication, it's not necessary to read it from the backup.
return ;
}
2022-05-23 12:05:35 +00:00
2022-05-31 09:33:23 +00:00
if ( ! restorer . isNonEmptyTableAllowed ( ) )
2022-05-29 19:53:56 +00:00
{
2022-05-31 09:33:23 +00:00
bool empty = ! getTotalActiveSizeInBytes ( ) ;
if ( empty )
2022-05-29 19:53:56 +00:00
{
2022-05-31 09:33:23 +00:00
/// New parts could be in the replication queue but not fetched yet.
/// In that case we consider the table as not empty.
StorageReplicatedMergeTree : : Status status ;
getStatus ( status , /* with_zk_fields = */ false ) ;
if ( status . queue . inserts_in_queue )
empty = false ;
2022-05-29 19:53:56 +00:00
}
2022-05-31 09:33:23 +00:00
auto backup = restorer . getBackup ( ) ;
2022-06-06 09:50:20 +00:00
if ( ! empty & & backup - > hasFiles ( data_path_in_backup ) )
2022-05-31 09:33:23 +00:00
restorer . throwTableIsNotEmpty ( getStorageID ( ) ) ;
2022-05-29 19:53:56 +00:00
}
2022-05-23 12:05:35 +00:00
2022-05-31 09:33:23 +00:00
restorePartsFromBackup ( restorer , data_path_in_backup , partitions ) ;
2022-05-19 12:36:27 +00:00
}
2022-04-13 13:26:17 +00:00
2022-05-19 12:36:27 +00:00
void StorageReplicatedMergeTree : : attachRestoredParts ( MutableDataPartsVector & & parts )
{
auto metadata_snapshot = getInMemoryMetadataPtr ( ) ;
2022-08-08 05:23:49 +00:00
auto sink = std : : make_shared < ReplicatedMergeTreeSink > ( * this , metadata_snapshot , 0 , 0 , 0 , false , false , false , getContext ( ) , /*is_attach*/ true ) ;
2022-05-19 12:36:27 +00:00
for ( auto part : parts )
sink - > writeExistingPart ( part ) ;
}
2022-04-13 13:26:17 +00:00
#if 0
PartsTemporaryRename renamed_parts ( * this , " detached/ " ) ;
MutableDataPartsVector loaded_parts = tryLoadPartsToAttach ( partition , attach_part , query_context , renamed_parts ) ;
/// TODO Allow to use quorum here.
ReplicatedMergeTreeSink output ( * this , metadata_snapshot , 0 , 0 , 0 , false , false , query_context ,
/*is_attach*/ true ) ;
for ( size_t i = 0 ; i < loaded_parts . size ( ) ; + + i )
{
const String old_name = loaded_parts [ i ] - > name ;
output . writeExistingPart ( loaded_parts [ i ] ) ;
renamed_parts . old_and_new_names [ i ] . old_name . clear ( ) ;
LOG_DEBUG ( log , " Attached part {} as {} " , old_name , loaded_parts [ i ] - > name ) ;
results . push_back ( PartitionCommandResultInfo {
. partition_id = loaded_parts [ i ] - > info . partition_id ,
. part_name = loaded_parts [ i ] - > name ,
. old_part_name = old_name ,
} ) ;
}
# endif
2014-03-21 13:42:14 +00:00
}