Add number of errors to ignore while choosing replicas (distributed_replica_max_ignored_errors)

This will allow avoid switching to another replica in case of error
(since error can be temporary).
This commit is contained in:
Azat Khuzhin 2020-06-13 20:32:21 +03:00
parent 687eb24f50
commit caa195c034
5 changed files with 41 additions and 6 deletions

View File

@ -821,6 +821,10 @@ ClickHouse supports the following algorithms of choosing replicas:
- [First or random](#load_balancing-first_or_random)
- [Round robin](#load_balancing-round_robin)
See also:
- [distributed\_replica\_max\_ignored\_errors](#settings-distributed_replica_max_ignored_errors)
### Random (by Default) {#load_balancing-random}
``` sql
@ -1170,8 +1174,10 @@ Controls how fast errors in distributed tables are zeroed. If a replica is unava
See also:
- [load\_balancing](#load_balancing-round_robin)
- [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed\_replica\_error\_cap](#settings-distributed_replica_error_cap)
- [distributed\_replica\_max\_ignored\_errors](#settings-distributed_replica_max_ignored_errors)
## distributed\_replica\_error\_cap {#settings-distributed_replica_error_cap}
@ -1182,8 +1188,24 @@ Error count of each replica is capped at this value, preventing a single replica
See also:
- [load\_balancing](#load_balancing-round_robin)
- [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed\_replica\_error\_half\_life](#settings-distributed_replica_error_half_life)
- [distributed\_replica\_max\_ignored\_errors](#settings-distributed_replica_max_ignored_errors)
## distributed\_replica\_max\_ignored\_errors {#settings-distributed_replica_max_ignored_errors}
- Type: unsigned int
- Default value: 0
Number of errors that will be ignored while choosing replicas (according to `load_balancing` algorithm).
See also:
- [load\_balancing](#load_balancing-round_robin)
- [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed\_replica\_error\_cap](#settings-distributed_replica_error_cap)
- [distributed\_replica\_error\_half\_life](#settings-distributed_replica_error_half_life)
## distributed\_directory\_monitor\_sleep\_time\_ms {#distributed_directory_monitor_sleep_time_ms}

View File

@ -208,7 +208,10 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
bool fallback_to_stale_replicas = settings ? bool(settings->fallback_to_stale_replicas_for_distributed_queries) : true;
return Base::getMany(min_entries, max_entries, max_tries, try_get_entry, get_priority, fallback_to_stale_replicas);
return Base::getMany(min_entries, max_entries, max_tries,
try_get_entry, get_priority,
fallback_to_stale_replicas,
settings ? settings->distributed_replica_max_ignored_errors.value : 0);
}
ConnectionPoolWithFailover::TryResult

View File

@ -111,7 +111,8 @@ public:
size_t min_entries, size_t max_entries, size_t max_tries,
const TryGetEntryFunc & try_get_entry,
const GetPriorityFunc & get_priority = GetPriorityFunc(),
bool fallback_to_stale_replicas = true);
bool fallback_to_stale_replicas = true,
size_t max_ignored_errors = 0);
protected:
struct PoolState;
@ -119,7 +120,7 @@ protected:
using PoolStates = std::vector<PoolState>;
/// This function returns a copy of pool states to avoid race conditions when modifying shared pool states.
PoolStates updatePoolStates();
PoolStates updatePoolStates(size_t max_ignored_errors);
PoolStates getPoolStates() const;
NestedPools nested_pools;
@ -153,10 +154,11 @@ PoolWithFailoverBase<TNestedPool>::getMany(
size_t min_entries, size_t max_entries, size_t max_tries,
const TryGetEntryFunc & try_get_entry,
const GetPriorityFunc & get_priority,
bool fallback_to_stale_replicas)
bool fallback_to_stale_replicas,
size_t max_ignored_errors)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates();
PoolStates pool_states = updatePoolStates(max_ignored_errors);
if (get_priority)
{
for (size_t i = 0; i < pool_states.size(); ++i)
@ -317,7 +319,7 @@ private:
template <typename TNestedPool>
typename PoolWithFailoverBase<TNestedPool>::PoolStates
PoolWithFailoverBase<TNestedPool>::updatePoolStates()
PoolWithFailoverBase<TNestedPool>::updatePoolStates(size_t max_ignored_errors)
{
PoolStates result;
result.reserve(nested_pools.size());
@ -363,6 +365,11 @@ PoolWithFailoverBase<TNestedPool>::updatePoolStates()
result.assign(shared_pool_states.begin(), shared_pool_states.end());
}
/// distributed_replica_max_ignored_errors
for (auto & state : result)
state.error_count = std::max<UInt64>(0, state.error_count - max_ignored_errors);
return result;
}

View File

@ -45,6 +45,8 @@
#define DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD 60
/// replica error max cap, this is to prevent replica from accumulating too many errors and taking to long to recover.
#define DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT 1000
/// Number of errors that will be ignored while choosing replicas
#define DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_ERROR_IGNORE 0
#define DBMS_MIN_REVISION_WITH_CLIENT_INFO 54032
#define DBMS_MIN_REVISION_WITH_SERVER_TIMEZONE 54058

View File

@ -348,6 +348,7 @@ struct Settings : public SettingsCollection<Settings>
\
M(SettingSeconds, distributed_replica_error_half_life, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_DECREASE_ERROR_PERIOD, "Time period reduces replica error counter by 2 times.", 0) \
M(SettingUInt64, distributed_replica_error_cap, DBMS_CONNECTION_POOL_WITH_FAILOVER_MAX_ERROR_COUNT, "Max number of errors per replica, prevents piling up an incredible amount of errors if replica was offline for some time and allows it to be reconsidered in a shorter amount of time.", 0) \
M(SettingUInt64, distributed_replica_max_ignored_errors, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_ERROR_IGNORE, "Number of errors that will be ignored while choosing replicas", 0) \
\
M(SettingBool, allow_experimental_live_view, false, "Enable LIVE VIEW. Not mature enough.", 0) \
M(SettingSeconds, live_view_heartbeat_interval, DEFAULT_LIVE_VIEW_HEARTBEAT_INTERVAL_SEC, "The heartbeat interval in seconds to indicate live query is alive.", 0) \