mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-18 21:51:57 +00:00
Use 'merge on single replica' option instead of zookeeper lock
This commit is contained in:
parent
d80c2cef06
commit
265d293934
@ -71,6 +71,7 @@ struct Settings;
|
|||||||
M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \
|
M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \
|
||||||
M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \
|
M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \
|
||||||
M(Seconds, execute_merges_on_single_replica_time_threshold, 0, "When greater than zero only a single replica starts the merge immediately, others wait up to that amount of time to download the result instead of doing merges locally. If the chosen replica doesn't finish the merge during that amount of time, fallback to standard behavior happens.", 0) \
|
M(Seconds, execute_merges_on_single_replica_time_threshold, 0, "When greater than zero only a single replica starts the merge immediately, others wait up to that amount of time to download the result instead of doing merges locally. If the chosen replica doesn't finish the merge during that amount of time, fallback to standard behavior happens.", 0) \
|
||||||
|
M(Seconds, s3_execute_merges_on_single_replica_time_threshold, 3 * 60 * 60, "When greater than zero only a single replica starts the merge immediatelys when merged part on S3 storage and 'allow_s3_zero_copy_replication' is enabled.", 0) \
|
||||||
M(Seconds, try_fetch_recompressed_part_timeout, 7200, "Recompression works slow in most cases, so we don't start merge with recompression until this timeout and trying to fetch recompressed part from replica which assigned this merge with recompression.", 0) \
|
M(Seconds, try_fetch_recompressed_part_timeout, 7200, "Recompression works slow in most cases, so we don't start merge with recompression until this timeout and trying to fetch recompressed part from replica which assigned this merge with recompression.", 0) \
|
||||||
M(Bool, always_fetch_merged_part, 0, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \
|
M(Bool, always_fetch_merged_part, 0, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \
|
||||||
M(UInt64, max_suspicious_broken_parts, 10, "Max broken parts, if more - deny automatic deletion.", 0) \
|
M(UInt64, max_suspicious_broken_parts, 10, "Max broken parts, if more - deny automatic deletion.", 0) \
|
||||||
|
@ -56,6 +56,17 @@ bool ReplicatedMergeTreeMergeStrategyPicker::shouldMergeOnSingleReplica(const Re
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool ReplicatedMergeTreeMergeStrategyPicker::shouldMergeOnSingleReplicaS3Shared(const ReplicatedMergeTreeLogEntryData & entry) const
|
||||||
|
{
|
||||||
|
time_t threshold = s3_execute_merges_on_single_replica_time_threshold;
|
||||||
|
return (
|
||||||
|
threshold > 0 /// feature turned on
|
||||||
|
&& entry.type == ReplicatedMergeTreeLogEntry::MERGE_PARTS /// it is a merge log entry
|
||||||
|
&& entry.create_time + threshold > time(nullptr) /// not too much time waited
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/// that will return the same replica name for ReplicatedMergeTreeLogEntry on all the replicas (if the replica set is the same).
|
/// that will return the same replica name for ReplicatedMergeTreeLogEntry on all the replicas (if the replica set is the same).
|
||||||
/// that way each replica knows who is responsible for doing a certain merge.
|
/// that way each replica knows who is responsible for doing a certain merge.
|
||||||
|
|
||||||
@ -90,18 +101,23 @@ std::optional<String> ReplicatedMergeTreeMergeStrategyPicker::pickReplicaToExecu
|
|||||||
void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
|
void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
|
||||||
{
|
{
|
||||||
auto threshold = storage.getSettings()->execute_merges_on_single_replica_time_threshold.totalSeconds();
|
auto threshold = storage.getSettings()->execute_merges_on_single_replica_time_threshold.totalSeconds();
|
||||||
|
auto threshold_s3 = 0;
|
||||||
|
if (storage.getSettings()->allow_s3_zero_copy_replication)
|
||||||
|
threshold_s3 = storage.getSettings()->s3_execute_merges_on_single_replica_time_threshold.totalSeconds();
|
||||||
|
|
||||||
if (threshold == 0)
|
if (threshold == 0)
|
||||||
{
|
|
||||||
/// we can reset the settings w/o lock (it's atomic)
|
/// we can reset the settings w/o lock (it's atomic)
|
||||||
execute_merges_on_single_replica_time_threshold = threshold;
|
execute_merges_on_single_replica_time_threshold = threshold;
|
||||||
|
if (threshold_s3 == 0)
|
||||||
|
s3_execute_merges_on_single_replica_time_threshold = threshold_s3;
|
||||||
|
if (threshold == 0 && threshold_s3 == 0)
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
auto now = time(nullptr);
|
auto now = time(nullptr);
|
||||||
|
|
||||||
/// the setting was already enabled, and last state refresh was done recently
|
/// the setting was already enabled, and last state refresh was done recently
|
||||||
if (execute_merges_on_single_replica_time_threshold != 0
|
if ((execute_merges_on_single_replica_time_threshold != 0
|
||||||
|
|| s3_execute_merges_on_single_replica_time_threshold != 0)
|
||||||
&& now - last_refresh_time < REFRESH_STATE_MINIMUM_INTERVAL_SECONDS)
|
&& now - last_refresh_time < REFRESH_STATE_MINIMUM_INTERVAL_SECONDS)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -130,11 +146,15 @@ void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
|
|||||||
LOG_WARNING(storage.log, "Can't find current replica in the active replicas list, or too few active replicas to use execute_merges_on_single_replica_time_threshold!");
|
LOG_WARNING(storage.log, "Can't find current replica in the active replicas list, or too few active replicas to use execute_merges_on_single_replica_time_threshold!");
|
||||||
/// we can reset the settings w/o lock (it's atomic)
|
/// we can reset the settings w/o lock (it's atomic)
|
||||||
execute_merges_on_single_replica_time_threshold = 0;
|
execute_merges_on_single_replica_time_threshold = 0;
|
||||||
|
s3_execute_merges_on_single_replica_time_threshold = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::lock_guard lock(mutex);
|
std::lock_guard lock(mutex);
|
||||||
|
if (threshold != 0) /// Zeros already reset
|
||||||
execute_merges_on_single_replica_time_threshold = threshold;
|
execute_merges_on_single_replica_time_threshold = threshold;
|
||||||
|
if (threshold_s3 != 0)
|
||||||
|
s3_execute_merges_on_single_replica_time_threshold = threshold_s3;
|
||||||
last_refresh_time = now;
|
last_refresh_time = now;
|
||||||
current_replica_index = current_replica_index_tmp;
|
current_replica_index = current_replica_index_tmp;
|
||||||
active_replicas = active_replicas_tmp;
|
active_replicas = active_replicas_tmp;
|
||||||
|
@ -52,6 +52,10 @@ public:
|
|||||||
/// and we may need to do a fetch (or postpone) instead of merge
|
/// and we may need to do a fetch (or postpone) instead of merge
|
||||||
bool shouldMergeOnSingleReplica(const ReplicatedMergeTreeLogEntryData & entry) const;
|
bool shouldMergeOnSingleReplica(const ReplicatedMergeTreeLogEntryData & entry) const;
|
||||||
|
|
||||||
|
/// return true if s3_execute_merges_on_single_replica_time_threshold feature is active
|
||||||
|
/// and we may need to do a fetch (or postpone) instead of merge
|
||||||
|
bool shouldMergeOnSingleReplicaS3Shared(const ReplicatedMergeTreeLogEntryData & entry) const;
|
||||||
|
|
||||||
/// returns the replica name
|
/// returns the replica name
|
||||||
/// and it's not current replica should do the merge
|
/// and it's not current replica should do the merge
|
||||||
/// used in shouldExecuteLogEntry and in tryExecuteMerge
|
/// used in shouldExecuteLogEntry and in tryExecuteMerge
|
||||||
@ -68,6 +72,7 @@ private:
|
|||||||
uint64_t getEntryHash(const ReplicatedMergeTreeLogEntryData & entry) const;
|
uint64_t getEntryHash(const ReplicatedMergeTreeLogEntryData & entry) const;
|
||||||
|
|
||||||
std::atomic<time_t> execute_merges_on_single_replica_time_threshold = 0;
|
std::atomic<time_t> execute_merges_on_single_replica_time_threshold = 0;
|
||||||
|
std::atomic<time_t> s3_execute_merges_on_single_replica_time_threshold = 0;
|
||||||
std::atomic<time_t> last_refresh_time = 0;
|
std::atomic<time_t> last_refresh_time = 0;
|
||||||
|
|
||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
|
@ -528,7 +528,6 @@ void StorageReplicatedMergeTree::waitMutationToFinishOnReplicas(
|
|||||||
|
|
||||||
void StorageReplicatedMergeTree::createNewZooKeeperNodes()
|
void StorageReplicatedMergeTree::createNewZooKeeperNodes()
|
||||||
{
|
{
|
||||||
auto storage_settings = getSettings();
|
|
||||||
auto zookeeper = getZooKeeper();
|
auto zookeeper = getZooKeeper();
|
||||||
|
|
||||||
/// Working with quorum.
|
/// Working with quorum.
|
||||||
@ -546,10 +545,9 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes()
|
|||||||
zookeeper->createIfNotExists(replica_path + "/mutation_pointer", String());
|
zookeeper->createIfNotExists(replica_path + "/mutation_pointer", String());
|
||||||
|
|
||||||
/// Nodes for zero-copy S3 replication
|
/// Nodes for zero-copy S3 replication
|
||||||
if (storage_settings->allow_s3_zero_copy_replication)
|
if (storage_settings.get()->allow_s3_zero_copy_replication)
|
||||||
{
|
{
|
||||||
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3", String());
|
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3", String());
|
||||||
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/merged", String());
|
|
||||||
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String());
|
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1459,9 +1457,12 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
|
|||||||
/// In some use cases merging can be more expensive than fetching
|
/// In some use cases merging can be more expensive than fetching
|
||||||
/// and it may be better to spread merges tasks across the replicas
|
/// and it may be better to spread merges tasks across the replicas
|
||||||
/// instead of doing exactly the same merge cluster-wise
|
/// instead of doing exactly the same merge cluster-wise
|
||||||
|
std::optional<String> replica_to_execute_merge;
|
||||||
|
bool replica_to_execute_merge_picked = false;
|
||||||
if (merge_strategy_picker.shouldMergeOnSingleReplica(entry))
|
if (merge_strategy_picker.shouldMergeOnSingleReplica(entry))
|
||||||
{
|
{
|
||||||
auto replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry);
|
replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry);
|
||||||
|
replica_to_execute_merge_picked = true;
|
||||||
|
|
||||||
if (replica_to_execute_merge)
|
if (replica_to_execute_merge)
|
||||||
{
|
{
|
||||||
@ -1547,17 +1548,19 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
|
|||||||
auto disk = reserved_space->getDisk();
|
auto disk = reserved_space->getDisk();
|
||||||
if (disk->getType() == DB::DiskType::Type::S3)
|
if (disk->getType() == DB::DiskType::Type::S3)
|
||||||
{
|
{
|
||||||
auto zookeeper = getZooKeeper();
|
if (merge_strategy_picker.shouldMergeOnSingleReplicaS3Shared(entry))
|
||||||
String zookeeper_node = zookeeper_path + "/zero_copy_s3/merged/" + entry.new_part_name;
|
{
|
||||||
|
if (!replica_to_execute_merge_picked)
|
||||||
|
replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry);
|
||||||
|
|
||||||
auto code = zookeeper->tryCreate(zookeeper_node, "lock", zkutil::CreateMode::Ephemeral);
|
if (replica_to_execute_merge)
|
||||||
|
{
|
||||||
/// Someone else created or started create this merge,
|
LOG_DEBUG(log, "Prefer fetching part {} from replica {} due s3_execute_merges_on_single_replica_time_threshold", entry.new_part_name, replica_to_execute_merge.value());
|
||||||
/// so will try to fetch.
|
|
||||||
if (code == Coordination::Error::ZNODEEXISTS)
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Account TTL merge
|
/// Account TTL merge
|
||||||
if (isTTLMergeType(future_merged_part.merge_type))
|
if (isTTLMergeType(future_merged_part.merge_type))
|
||||||
|
Loading…
Reference in New Issue
Block a user