2020-09-18 10:57:33 +00:00
# include <Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.h>
# include <Storages/StorageReplicatedMergeTree.h>
# include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
2021-10-02 07:13:14 +00:00
# include <base/types.h>
2022-01-30 19:49:48 +00:00
# include <base/sort.h>
2020-09-18 10:57:33 +00:00
# include <optional>
# include <mutex>
# include <city.h>
# include <algorithm>
# include <atomic>
2022-01-30 19:49:48 +00:00
2020-09-18 10:57:33 +00:00
namespace DB
{
2022-02-10 19:45:52 +00:00
/// Lower bound, in seconds, on how long to wait before asking again whether the chosen replica has finished the merge.
static constexpr auto RECHECK_MERGE_READYNESS_INTERVAL_SECONDS = 1;
2020-11-03 15:58:17 +00:00
/// Lower bound, in seconds, between two state refreshes (keeps the number of ZooKeeper operations down).
static constexpr auto REFRESH_STATE_MINIMUM_INTERVAL_SECONDS = 3;
2020-09-18 10:57:33 +00:00
2020-11-03 15:58:17 +00:00
/// refresh the state automatically if it was not refreshed for a longer time
2020-09-18 10:57:33 +00:00
static constexpr auto REFRESH_STATE_MAXIMUM_INTERVAL_SECONDS = 30;
/// Only stores a reference to the storage; the constructor body itself does no work.
ReplicatedMergeTreeMergeStrategyPicker::ReplicatedMergeTreeMergeStrategyPicker(
    StorageReplicatedMergeTree & storage_)
    : storage(storage_)
{
}
2022-02-10 19:45:52 +00:00
bool ReplicatedMergeTreeMergeStrategyPicker : : isMergeFinishedByReplica ( const String & replica , const ReplicatedMergeTreeLogEntryData & entry )
2022-02-09 19:56:22 +00:00
{
2022-02-10 19:45:52 +00:00
/// those have only seconds resolution, so recheck period is quite rough
auto reference_timestamp = entry . last_postpone_time ;
if ( reference_timestamp = = 0 )
reference_timestamp = entry . create_time ;
2020-09-18 10:57:33 +00:00
2022-02-10 19:45:52 +00:00
/// we don't want to check zookeeper too frequent
if ( time ( nullptr ) - reference_timestamp > = RECHECK_MERGE_READYNESS_INTERVAL_SECONDS )
{
return storage . checkReplicaHavePart ( replica , entry . new_part_name ) ;
}
2020-09-18 10:57:33 +00:00
2022-02-10 19:45:52 +00:00
return false ;
2020-11-03 15:58:17 +00:00
}
2020-09-18 10:57:33 +00:00
2022-02-10 19:45:52 +00:00
bool ReplicatedMergeTreeMergeStrategyPicker : : shouldMergeOnSingleReplica ( const ReplicatedMergeTreeLogEntryData & entry ) const
2021-03-09 14:34:28 +00:00
{
2022-02-10 19:45:52 +00:00
time_t threshold = execute_merges_on_single_replica_time_threshold ;
2021-06-24 08:25:05 +00:00
return (
threshold > 0 /// feature turned on
2022-02-10 19:45:52 +00:00
& & entry . type = = ReplicatedMergeTreeLogEntry : : MERGE_PARTS /// it is a merge log entry
2021-06-24 08:25:05 +00:00
& & entry . create_time + threshold > time ( nullptr ) /// not too much time waited
) ;
}
2020-11-03 15:58:17 +00:00
/// that will return the same replica name for ReplicatedMergeTreeLogEntry on all the replicas (if the replica set is the same).
/// that way each replica knows who is responsible for doing a certain merge.
2020-09-18 10:57:33 +00:00
2020-11-03 15:58:17 +00:00
/// in some corner cases (added / removed / deactivated replica)
/// nodes can pick different replicas to execute merge and wait for it (or to execute the same merge together)
/// but that doesn't have a significant impact (in one case it will wait for the execute_merges_on_single_replica_time_threshold,
/// in another just 2 replicas will do the merge)
2022-02-10 19:45:52 +00:00
std : : optional < String > ReplicatedMergeTreeMergeStrategyPicker : : pickReplicaToExecuteMerge ( const ReplicatedMergeTreeLogEntryData & entry )
2020-11-03 15:58:17 +00:00
{
/// last state refresh was too long ago, need to sync up the replicas list
if ( time ( nullptr ) - last_refresh_time > REFRESH_STATE_MAXIMUM_INTERVAL_SECONDS )
2020-09-18 10:57:33 +00:00
refreshState ( ) ;
auto hash = getEntryHash ( entry ) ;
std : : lock_guard lock ( mutex ) ;
auto num_replicas = active_replicas . size ( ) ;
2020-11-03 15:58:17 +00:00
if ( num_replicas = = 0 )
2020-09-18 10:57:33 +00:00
return std : : nullopt ;
auto replica_index = static_cast < int > ( hash % num_replicas ) ;
if ( replica_index = = current_replica_index )
return std : : nullopt ;
return active_replicas . at ( replica_index ) ;
}
void ReplicatedMergeTreeMergeStrategyPicker : : refreshState ( )
{
2021-06-24 08:25:05 +00:00
const auto settings = storage . getSettings ( ) ;
2022-10-07 10:46:45 +00:00
time_t threshold = settings - > execute_merges_on_single_replica_time_threshold . totalSeconds ( ) ;
time_t threshold_init = 0 ;
2021-07-05 03:32:56 +00:00
if ( settings - > allow_remote_fs_zero_copy_replication )
threshold_init = settings - > remote_fs_execute_merges_on_single_replica_time_threshold . totalSeconds ( ) ;
2020-09-18 10:57:33 +00:00
if ( threshold = = 0 )
2022-04-17 23:02:49 +00:00
/// we can reset the settings without lock (it's atomic)
2020-09-18 10:57:33 +00:00
execute_merges_on_single_replica_time_threshold = threshold ;
2021-07-05 03:32:56 +00:00
if ( threshold_init = = 0 )
remote_fs_execute_merges_on_single_replica_time_threshold = threshold_init ;
if ( threshold = = 0 & & threshold_init = = 0 )
2020-09-18 10:57:33 +00:00
return ;
2020-11-17 11:48:08 +00:00
auto now = time ( nullptr ) ;
/// the setting was already enabled, and last state refresh was done recently
2021-03-10 11:08:49 +00:00
if ( ( ( threshold ! = 0 & & execute_merges_on_single_replica_time_threshold ! = 0 )
2021-07-05 03:32:56 +00:00
| | ( threshold_init ! = 0 & & remote_fs_execute_merges_on_single_replica_time_threshold ! = 0 ) )
2020-11-17 11:48:08 +00:00
& & now - last_refresh_time < REFRESH_STATE_MINIMUM_INTERVAL_SECONDS )
return ;
2022-11-07 19:39:33 +00:00
LOG_DEBUG ( storage . log , " Updating strategy picker state " ) ;
2020-09-18 10:57:33 +00:00
auto zookeeper = storage . getZooKeeper ( ) ;
auto all_replicas = zookeeper - > getChildren ( storage . zookeeper_path + " /replicas " ) ;
2022-01-30 19:49:48 +00:00
: : sort ( all_replicas . begin ( ) , all_replicas . end ( ) ) ;
2020-09-18 10:57:33 +00:00
std : : vector < String > active_replicas_tmp ;
int current_replica_index_tmp = - 1 ;
for ( const String & replica : all_replicas )
{
2022-02-10 19:45:52 +00:00
if ( zookeeper - > exists ( storage . zookeeper_path + " /replicas/ " + replica + " /is_active " ) )
2020-09-18 10:57:33 +00:00
{
active_replicas_tmp . push_back ( replica ) ;
if ( replica = = storage . replica_name )
{
2022-10-07 10:46:45 +00:00
current_replica_index_tmp = static_cast < int > ( active_replicas_tmp . size ( ) - 1 ) ;
2020-09-18 10:57:33 +00:00
}
}
}
if ( current_replica_index_tmp < 0 | | active_replicas_tmp . size ( ) < 2 )
{
2021-07-29 08:25:22 +00:00
if ( execute_merges_on_single_replica_time_threshold > 0 )
{
2021-08-08 01:43:59 +00:00
LOG_WARNING ( storage . log , " Can't find current replica in the active replicas list, or too few active replicas to use 'execute_merges_on_single_replica_time_threshold' " ) ;
2022-04-17 23:02:49 +00:00
/// we can reset the settings without lock (it's atomic)
2021-07-29 08:25:22 +00:00
execute_merges_on_single_replica_time_threshold = 0 ;
}
/// default value of remote_fs_execute_merges_on_single_replica_time_threshold is not 0
/// so we write no warning in log here
2021-07-05 03:32:56 +00:00
remote_fs_execute_merges_on_single_replica_time_threshold = 0 ;
2020-09-18 10:57:33 +00:00
return ;
}
std : : lock_guard lock ( mutex ) ;
2021-03-09 14:34:28 +00:00
if ( threshold ! = 0 ) /// Zeros already reset
execute_merges_on_single_replica_time_threshold = threshold ;
2021-07-05 03:32:56 +00:00
if ( threshold_init ! = 0 )
remote_fs_execute_merges_on_single_replica_time_threshold = threshold_init ;
2020-09-18 10:57:33 +00:00
last_refresh_time = now ;
current_replica_index = current_replica_index_tmp ;
active_replicas = active_replicas_tmp ;
2022-11-07 19:39:33 +00:00
LOG_DEBUG ( storage . log , " Strategy picker state updated, current replica: {}, active replicas: [{}] " , current_replica_index , fmt : : join ( active_replicas , " , " ) ) ;
2020-09-18 10:57:33 +00:00
}
uint64_t ReplicatedMergeTreeMergeStrategyPicker : : getEntryHash ( const ReplicatedMergeTreeLogEntryData & entry ) const
{
auto hash_data = storage . zookeeper_path + entry . new_part_name ;
return CityHash_v1_0_2 : : CityHash64 ( hash_data . c_str ( ) , hash_data . length ( ) ) ;
}
}