Merge pull request #58993 from ClickHouse/revert-58992-revert_flaky

Revive: Parallel replicas custom key: skip unavailable replicas
Igor Nikonov 2024-01-20 19:44:56 +01:00 committed by GitHub
commit 4be068c73b
37 changed files with 646 additions and 187 deletions


@@ -118,18 +118,18 @@ ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const
     return result;
 }
 
-std::vector<IConnectionPool::Entry> ConnectionPoolWithFailover::getMany(const ConnectionTimeouts & timeouts,
-                                                                        const Settings & settings,
-                                                                        PoolMode pool_mode,
-                                                                        AsyncCallback async_callback,
-                                                                        std::optional<bool> skip_unavailable_endpoints)
+std::vector<IConnectionPool::Entry> ConnectionPoolWithFailover::getMany(
+    const ConnectionTimeouts & timeouts,
+    const Settings & settings,
+    PoolMode pool_mode,
+    AsyncCallback async_callback,
+    std::optional<bool> skip_unavailable_endpoints,
+    GetPriorityForLoadBalancing::Func priority_func)
 {
     TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message)
-    {
-        return tryGetEntry(pool, timeouts, fail_message, settings, nullptr, async_callback);
-    };
+    { return tryGetEntry(pool, timeouts, fail_message, settings, nullptr, async_callback); };
 
-    std::vector<TryResult> results = getManyImpl(settings, pool_mode, try_get_entry, skip_unavailable_endpoints);
+    std::vector<TryResult> results = getManyImpl(settings, pool_mode, try_get_entry, skip_unavailable_endpoints, priority_func);
 
     std::vector<Entry> entries;
     entries.reserve(results.size());
@@ -153,17 +153,17 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
 std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyChecked(
     const ConnectionTimeouts & timeouts,
-    const Settings & settings, PoolMode pool_mode,
+    const Settings & settings,
+    PoolMode pool_mode,
     const QualifiedTableName & table_to_check,
     AsyncCallback async_callback,
-    std::optional<bool> skip_unavailable_endpoints)
+    std::optional<bool> skip_unavailable_endpoints,
+    GetPriorityForLoadBalancing::Func priority_func)
 {
     TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message)
-    {
-        return tryGetEntry(pool, timeouts, fail_message, settings, &table_to_check, async_callback);
-    };
+    { return tryGetEntry(pool, timeouts, fail_message, settings, &table_to_check, async_callback); };
 
-    return getManyImpl(settings, pool_mode, try_get_entry, skip_unavailable_endpoints);
+    return getManyImpl(settings, pool_mode, try_get_entry, skip_unavailable_endpoints, priority_func);
 }
 
 ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::makeGetPriorityFunc(const Settings & settings)
@@ -175,14 +175,16 @@ ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::ma
 }
 
 std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyImpl(
     const Settings & settings,
     PoolMode pool_mode,
     const TryGetEntryFunc & try_get_entry,
-    std::optional<bool> skip_unavailable_endpoints)
+    std::optional<bool> skip_unavailable_endpoints,
+    GetPriorityForLoadBalancing::Func priority_func)
 {
     if (nested_pools.empty())
-        throw DB::Exception(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED,
-                            "Cannot get connection from ConnectionPoolWithFailover cause nested pools are empty");
+        throw DB::Exception(
+            DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED,
+            "Cannot get connection from ConnectionPoolWithFailover cause nested pools are empty");
 
     if (!skip_unavailable_endpoints.has_value())
         skip_unavailable_endpoints = settings.skip_unavailable_shards;
@@ -203,14 +205,13 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
     else
         throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown pool allocation mode");
 
-    GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
+    if (!priority_func)
+        priority_func = makeGetPriorityFunc(settings);
 
     UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value;
     bool fallback_to_stale_replicas = settings.fallback_to_stale_replicas_for_distributed_queries.value;
 
-    return Base::getMany(min_entries, max_entries, max_tries,
-                         max_ignored_errors, fallback_to_stale_replicas,
-                         try_get_entry, get_priority);
+    return Base::getMany(min_entries, max_entries, max_tries, max_ignored_errors, fallback_to_stale_replicas, try_get_entry, priority_func);
 }
 
 ConnectionPoolWithFailover::TryResult
@@ -251,11 +252,14 @@ ConnectionPoolWithFailover::tryGetEntry(
     return result;
 }
 
-std::vector<ConnectionPoolWithFailover::Base::ShuffledPool> ConnectionPoolWithFailover::getShuffledPools(const Settings & settings)
+std::vector<ConnectionPoolWithFailover::Base::ShuffledPool>
+ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func)
 {
-    GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
+    if (!priority_func)
+        priority_func = makeGetPriorityFunc(settings);
+
     UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value;
-    return Base::getShuffledPools(max_ignored_errors, get_priority);
+    return Base::getShuffledPools(max_ignored_errors, priority_func);
 }
 
 }


@@ -54,10 +54,13 @@ public:
     /** Allocates up to the specified number of connections to work.
      * Connections provide access to different replicas of one shard.
      */
-    std::vector<Entry> getMany(const ConnectionTimeouts & timeouts,
-                               const Settings & settings, PoolMode pool_mode,
-                               AsyncCallback async_callback = {},
-                               std::optional<bool> skip_unavailable_endpoints = std::nullopt);
+    std::vector<Entry> getMany(
+        const ConnectionTimeouts & timeouts,
+        const Settings & settings,
+        PoolMode pool_mode,
+        AsyncCallback async_callback = {},
+        std::optional<bool> skip_unavailable_endpoints = std::nullopt,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     /// The same as getMany(), but return std::vector<TryResult>.
     std::vector<TryResult> getManyForTableFunction(const ConnectionTimeouts & timeouts,
@@ -69,12 +72,13 @@ public:
     /// The same as getMany(), but check that replication delay for table_to_check is acceptable.
     /// Delay threshold is taken from settings.
     std::vector<TryResult> getManyChecked(
         const ConnectionTimeouts & timeouts,
         const Settings & settings,
         PoolMode pool_mode,
         const QualifiedTableName & table_to_check,
         AsyncCallback async_callback = {},
-        std::optional<bool> skip_unavailable_endpoints = std::nullopt);
+        std::optional<bool> skip_unavailable_endpoints = std::nullopt,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     struct NestedPoolStatus
     {
@@ -87,7 +91,7 @@ public:
     using Status = std::vector<NestedPoolStatus>;
     Status getStatus() const;
 
-    std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings);
+    std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {});
 
     size_t getMaxErrorCup() const { return Base::max_error_cap; }
@@ -96,13 +100,16 @@ public:
         Base::updateSharedErrorCounts(shuffled_pools);
     }
 
+    size_t getPoolSize() const { return Base::getPoolSize(); }
+
 private:
     /// Get the values of relevant settings and call Base::getMany()
     std::vector<TryResult> getManyImpl(
         const Settings & settings,
         PoolMode pool_mode,
         const TryGetEntryFunc & try_get_entry,
-        std::optional<bool> skip_unavailable_endpoints = std::nullopt);
+        std::optional<bool> skip_unavailable_endpoints = std::nullopt,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     /// Try to get a connection from the pool and check that it is good.
     /// If table_to_check is not null and the check is enabled in settings, check that replication delay
@@ -115,7 +122,7 @@ private:
         const QualifiedTableName * table_to_check = nullptr,
         AsyncCallback async_callback = {});
 
-    GetPriorityFunc makeGetPriorityFunc(const Settings & settings);
+    GetPriorityForLoadBalancing::Func makeGetPriorityFunc(const Settings & settings);
 
     GetPriorityForLoadBalancing get_priority_load_balancing;
 };


@@ -28,16 +28,18 @@ HedgedConnections::HedgedConnections(
     const ThrottlerPtr & throttler_,
     PoolMode pool_mode,
     std::shared_ptr<QualifiedTableName> table_to_check_,
-    AsyncCallback async_callback)
+    AsyncCallback async_callback,
+    GetPriorityForLoadBalancing::Func priority_func)
     : hedged_connections_factory(
         pool_,
         context_->getSettingsRef(),
         timeouts_,
         context_->getSettingsRef().connections_with_failover_max_tries.value,
         context_->getSettingsRef().fallback_to_stale_replicas_for_distributed_queries.value,
         context_->getSettingsRef().max_parallel_replicas.value,
         context_->getSettingsRef().skip_unavailable_shards.value,
-        table_to_check_)
+        table_to_check_,
+        priority_func)
     , context(std::move(context_))
     , settings(context->getSettingsRef())
     , throttler(throttler_)


@@ -70,13 +70,15 @@ public:
         size_t index;
     };
 
-    HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_,
-                      ContextPtr context_,
-                      const ConnectionTimeouts & timeouts_,
-                      const ThrottlerPtr & throttler,
-                      PoolMode pool_mode,
-                      std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr,
-                      AsyncCallback async_callback = {});
+    HedgedConnections(
+        const ConnectionPoolWithFailoverPtr & pool_,
+        ContextPtr context_,
+        const ConnectionTimeouts & timeouts_,
+        const ThrottlerPtr & throttler,
+        PoolMode pool_mode,
+        std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr,
+        AsyncCallback async_callback = {},
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     void sendScalarsData(Scalars & data) override;


@@ -29,7 +29,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory(
     bool fallback_to_stale_replicas_,
     UInt64 max_parallel_replicas_,
     bool skip_unavailable_shards_,
-    std::shared_ptr<QualifiedTableName> table_to_check_)
+    std::shared_ptr<QualifiedTableName> table_to_check_,
+    GetPriorityForLoadBalancing::Func priority_func)
     : pool(pool_)
     , timeouts(timeouts_)
     , table_to_check(table_to_check_)
@@ -39,7 +40,7 @@ HedgedConnectionsFactory::HedgedConnectionsFactory(
     , max_parallel_replicas(max_parallel_replicas_)
     , skip_unavailable_shards(skip_unavailable_shards_)
 {
-    shuffled_pools = pool->getShuffledPools(settings_);
+    shuffled_pools = pool->getShuffledPools(settings_, priority_func);
     for (auto shuffled_pool : shuffled_pools)
         replicas.emplace_back(std::make_unique<ConnectionEstablisherAsync>(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get()));
 }
@@ -323,8 +324,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::processFinishedConnect
     else
     {
         ShuffledPool & shuffled_pool = shuffled_pools[index];
-        LOG_WARNING(
-            log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message);
+        LOG_INFO(log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message);
         ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry);
 
         shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1);

@@ -53,7 +53,8 @@ public:
         bool fallback_to_stale_replicas_,
         UInt64 max_parallel_replicas_,
         bool skip_unavailable_shards_,
-        std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr);
+        std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     /// Create and return active connections according to pool_mode.
     std::vector<Connection *> getManyConnections(PoolMode pool_mode, AsyncCallback async_callback = {});


@@ -9,7 +9,8 @@ namespace ErrorCodes
     extern const int LOGICAL_ERROR;
 }
 
-std::function<Priority(size_t index)> GetPriorityForLoadBalancing::getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const
+GetPriorityForLoadBalancing::Func
+GetPriorityForLoadBalancing::getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const
 {
     std::function<Priority(size_t index)> get_priority;
     switch (load_balance)
@@ -33,19 +34,26 @@ std::function<Priority(size_t index)> GetPriorityForLoadBalancing::getPriorityFu
             get_priority = [offset](size_t i) { return i != offset ? Priority{1} : Priority{0}; };
             break;
         case LoadBalancing::ROUND_ROBIN:
-            if (last_used >= pool_size)
-                last_used = 0;
+            auto local_last_used = last_used % pool_size;
             ++last_used;
-            /* Consider pool_size equals to 5
-             * last_used = 1 -> get_priority: 0 1 2 3 4
-             * last_used = 2 -> get_priority: 4 0 1 2 3
-             * last_used = 3 -> get_priority: 4 3 0 1 2
-             * ...
-             * */
-            get_priority = [this, pool_size](size_t i)
+
+            // Example: pool_size = 5
+            // | local_last_used | i=0 | i=1 | i=2 | i=3 | i=4 |
+            // | 0               | 4   | 0   | 1   | 2   | 3   |
+            // | 1               | 3   | 4   | 0   | 1   | 2   |
+            // | 2               | 2   | 3   | 4   | 0   | 1   |
+            // | 3               | 1   | 2   | 3   | 4   | 0   |
+            // | 4               | 0   | 1   | 2   | 3   | 4   |
+            get_priority = [pool_size, local_last_used](size_t i)
             {
-                ++i; // To make `i` indexing start with 1 instead of 0 as `last_used` does
-                return Priority{static_cast<Int64>(i < last_used ? pool_size - i : i - last_used)};
+                size_t priority = pool_size - 1;
+                if (i < local_last_used)
+                    priority = pool_size - 1 - (local_last_used - i);
+                if (i > local_last_used)
+                    priority = i - local_last_used - 1;
+
+                return Priority{static_cast<Int64>(priority)};
             };
             break;
     }
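The rewritten ROUND_ROBIN branch is plain modular arithmetic, so a row of the table in the comment can be re-derived with a one-off query. A minimal SQL sketch (illustrative only, not part of the commit) that recomputes the local_last_used = 2 row for pool_size = 5 with the same expressions as the lambda; lower priority values are preferred when picking replicas:

SELECT
    number AS i,
    multiIf(
        number < 2, 5 - 1 - (2 - number),
        number > 2, number - 2 - 1,
        5 - 1) AS priority
FROM numbers(5);
-- expected: (i, priority) = (0,2), (1,3), (2,4), (3,0), (4,1), matching the local_last_used = 2 row above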


@@ -8,7 +8,12 @@ namespace DB
 class GetPriorityForLoadBalancing
 {
 public:
-    explicit GetPriorityForLoadBalancing(LoadBalancing load_balancing_) : load_balancing(load_balancing_) {}
+    using Func = std::function<Priority(size_t index)>;
+
+    explicit GetPriorityForLoadBalancing(LoadBalancing load_balancing_, size_t last_used_ = 0)
+        : load_balancing(load_balancing_), last_used(last_used_)
+    {
+    }
     GetPriorityForLoadBalancing() = default;
 
     bool operator == (const GetPriorityForLoadBalancing & other) const
@@ -23,7 +28,7 @@ public:
         return !(*this == other);
     }
 
-    std::function<Priority(size_t index)> getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const;
+    Func getPriorityFunc(LoadBalancing load_balance, size_t offset, size_t pool_size) const;
 
     std::vector<size_t> hostname_prefix_distance; /// Prefix distances from name of this host to the names of hosts of pools.
     std::vector<size_t> hostname_levenshtein_distance; /// Levenshtein Distances from name of this host to the names of hosts of pools.


@@ -124,7 +124,9 @@ public:
         size_t max_ignored_errors,
         bool fallback_to_stale_replicas,
         const TryGetEntryFunc & try_get_entry,
-        const GetPriorityFunc & get_priority = GetPriorityFunc());
+        const GetPriorityFunc & get_priority);
+
+    size_t getPoolSize() const { return nested_pools.size(); }
 
 protected:
@@ -147,7 +149,7 @@ protected:
         return std::make_tuple(shared_pool_states, nested_pools, last_error_decrease_time);
     }
 
-    NestedPools nested_pools;
+    const NestedPools nested_pools;
 
     const time_t decrease_error_period;
     const size_t max_error_cap;


@@ -117,13 +117,13 @@ void SelectStreamFactory::createForShard(
     std::vector<QueryPlanPtr> & local_plans,
     Shards & remote_shards,
     UInt32 shard_count,
-    bool parallel_replicas_enabled)
+    bool parallel_replicas_enabled,
+    AdditionalShardFilterGenerator shard_filter_generator)
 {
     auto it = objects_by_shard.find(shard_info.shard_num);
     if (it != objects_by_shard.end())
         replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast);
 
     auto emplace_local_stream = [&]()
     {
         local_plans.emplace_back(createLocalPlan(
@@ -139,6 +139,7 @@ void SelectStreamFactory::createForShard(
             .shard_info = shard_info,
             .lazy = lazy,
             .local_delay = local_delay,
+            .shard_filter_generator = std::move(shard_filter_generator),
         });
     };


@@ -40,6 +40,7 @@ ASTPtr rewriteSelectQuery(
     ASTPtr table_function_ptr = nullptr);
 
 using ColumnsDescriptionByShardNum = std::unordered_map<UInt32, ColumnsDescription>;
+using AdditionalShardFilterGenerator = std::function<ASTPtr(uint64_t)>;
 
 class SelectStreamFactory
 {
@@ -59,6 +60,7 @@ public:
         /// (When there is a local replica with big delay).
         bool lazy = false;
         time_t local_delay = 0;
+        AdditionalShardFilterGenerator shard_filter_generator{};
     };
 
     using Shards = std::vector<Shard>;
@@ -78,7 +80,8 @@ public:
         std::vector<QueryPlanPtr> & local_plans,
         Shards & remote_shards,
         UInt32 shard_count,
-        bool parallel_replicas_enabled);
+        bool parallel_replicas_enabled,
+        AdditionalShardFilterGenerator shard_filter_generator);
 
     const Block header;
     const ColumnsDescriptionByShardNum objects_by_shard;


@@ -158,6 +158,13 @@ ContextMutablePtr updateSettingsForCluster(const Cluster & cluster,
         new_settings.timeout_overflow_mode = settings.timeout_overflow_mode_leaf;
     }
 
+    /// In case of parallel replicas with custom key, use round-robin load balancing
+    /// so custom key partitions will be spread over nodes in round-robin fashion.
+    if (context->canUseParallelReplicasCustomKey(cluster) && !settings.load_balancing.changed)
+    {
+        new_settings.load_balancing = LoadBalancing::ROUND_ROBIN;
+    }
+
     auto new_context = Context::createCopy(context);
     new_context->setSettings(new_settings);
     return new_context;
@@ -247,21 +254,6 @@ void executeQuery(
             visitor.visit(query_ast_for_shard);
         }
 
-        if (shard_filter_generator)
-        {
-            auto shard_filter = shard_filter_generator(shard_info.shard_num);
-            if (shard_filter)
-            {
-                auto & select_query = query_ast_for_shard->as<ASTSelectQuery &>();
-
-                auto where_expression = select_query.where();
-                if (where_expression)
-                    shard_filter = makeASTFunction("and", where_expression, shard_filter);
-
-                select_query.setExpression(ASTSelectQuery::Expression::WHERE, std::move(shard_filter));
-            }
-        }
-
         // decide for each shard if parallel reading from replicas should be enabled
         // according to settings and number of replicas declared per shard
         const auto & addresses = cluster->getShardsAddresses().at(i);
@@ -276,7 +268,8 @@ void executeQuery(
             plans,
             remote_shards,
             static_cast<UInt32>(shards),
-            parallel_replicas_enabled);
+            parallel_replicas_enabled,
+            shard_filter_generator);
     }
 
     if (!remote_shards.empty())


@@ -65,7 +65,7 @@ void executeQuery(
     const std::string & sharding_key_column_name,
     const ClusterPtr & not_optimized_cluster,
     const DistributedSettings & distributed_settings,
-    AdditionalShardFilterGenerator shard_filter_generator = {});
+    AdditionalShardFilterGenerator shard_filter_generator);
 
 void executeQueryWithParallelReplicas(


@@ -5113,6 +5113,12 @@ bool Context::canUseParallelReplicasOnFollower() const
     return canUseTaskBasedParallelReplicas() && getClientInfo().collaborate_with_initiator;
 }
 
+bool Context::canUseParallelReplicasCustomKey(const Cluster & cluster) const
+{
+    return settings.max_parallel_replicas > 1 && getParallelReplicasMode() == Context::ParallelReplicasMode::CUSTOM_KEY
+        && cluster.getShardCount() == 1 && cluster.getShardsInfo()[0].getAllNodeCount() > 1;
+}
+
 void Context::setPreparedSetsCache(const PreparedSetsCachePtr & cache)
 {
     prepared_sets_cache = cache;
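The new predicate matches exactly the setup exercised by the tests added further down: CUSTOM_KEY mode is entered by setting a non-empty parallel_replicas_custom_key, max_parallel_replicas must exceed 1, and the cluster must consist of a single shard with more than one replica. An illustrative query shape (cluster and table names are placeholders):

SELECT y, count()
FROM cluster(single_shard_multi_replica_cluster, currentDatabase(), t)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas = 3, parallel_replicas_custom_key = 'cityHash64(y)';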


@@ -1246,6 +1246,7 @@ public:
     bool canUseTaskBasedParallelReplicas() const;
     bool canUseParallelReplicasOnInitiator() const;
     bool canUseParallelReplicasOnFollower() const;
+    bool canUseParallelReplicasCustomKey(const Cluster & cluster) const;
 
     enum class ParallelReplicasMode : uint8_t
     {


@@ -589,9 +589,8 @@ InterpreterSelectQuery::InterpreterSelectQuery(
         }
     }
     else if (auto * distributed = dynamic_cast<StorageDistributed *>(storage.get());
-             distributed && canUseCustomKey(settings, *distributed->getCluster(), *context))
+             distributed && context->canUseParallelReplicasCustomKey(*distributed->getCluster()))
     {
-        query_info.use_custom_key = true;
         context->setSetting("distributed_group_by_no_merge", 2);
     }
 }


@@ -20,12 +20,6 @@ namespace ErrorCodes
     extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
 }
 
-bool canUseCustomKey(const Settings & settings, const Cluster & cluster, const Context & context)
-{
-    return settings.max_parallel_replicas > 1 && context.getParallelReplicasMode() == Context::ParallelReplicasMode::CUSTOM_KEY
-        && cluster.getShardCount() == 1 && cluster.getShardsInfo()[0].getAllNodeCount() > 1;
-}
-
 ASTPtr getCustomKeyFilterForParallelReplica(
     size_t replicas_count,
     size_t replica_num,
@@ -34,7 +28,7 @@ ASTPtr getCustomKeyFilterForParallelReplica(
     const ColumnsDescription & columns,
     const ContextPtr & context)
 {
-    assert(replicas_count > 1);
+    chassert(replicas_count > 1);
 
     if (filter_type == ParallelReplicasCustomKeyFilterType::DEFAULT)
     {
         // first we do modulo with replica count
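To make the DEFAULT branch concrete: with custom_key = cityHash64(y) and replicas_count = 3, the generated per-replica predicates are roughly the following (a sketch; the implementation may wrap the key in additional casts):

-- replica_num = 0:  cityHash64(y) % 3 = 0
-- replica_num = 1:  cityHash64(y) % 3 = 1
-- replica_num = 2:  cityHash64(y) % 3 = 2

The RANGE filter type instead splits the custom key's value domain into replicas_count contiguous intervals and assigns one interval to each replica.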


@@ -9,9 +9,6 @@
 namespace DB
 {
 
-bool canUseCustomKey(const Settings & settings, const Cluster & cluster, const Context & context);
-
 /// Get AST for filter created from custom_key
 /// replica_num is the number of the replica for which we are generating filter starting from 0
 ASTPtr getCustomKeyFilterForParallelReplica(


@@ -809,9 +809,8 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres
     else
     {
         if (auto * distributed = typeid_cast<StorageDistributed *>(storage.get());
-            distributed && canUseCustomKey(settings, *distributed->getCluster(), *query_context))
+            distributed && query_context->canUseParallelReplicasCustomKey(*distributed->getCluster()))
         {
-            table_expression_query_info.use_custom_key = true;
             planner_context->getMutableQueryContext()->setSetting("distributed_group_by_no_merge", 2);
         }
     }


@@ -18,6 +18,7 @@
 #include <Client/ConnectionPool.h>
 #include <Client/ConnectionPoolWithFailover.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
+#include <Parsers/ASTFunction.h>
 
 #include <boost/algorithm/string/join.hpp>
@@ -231,8 +232,6 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact
         add_extremes = context->getSettingsRef().extremes;
     }
 
-    String query_string = formattedAST(shard.query);
-
     scalars["_shard_num"]
         = Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared<DataTypeUInt32>(), "_shard_num"}};
@@ -254,29 +253,81 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact
         context->setSetting("cluster_for_parallel_replicas", cluster_name);
     }
 
-    auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
-        shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage);
-    remote_query_executor->setLogger(log);
-
-    if (context->canUseTaskBasedParallelReplicas())
-    {
-        // when doing parallel reading from replicas (ParallelReplicasMode::READ_TASKS) on a shard:
-        // establish a connection to a replica on the shard, the replica will instantiate coordinator to manage parallel reading from replicas on the shard.
-        // The coordinator will return query result from the shard.
-        // Only one coordinator per shard is necessary. Therefore using PoolMode::GET_ONE to establish only one connection per shard.
-        // Using PoolMode::GET_MANY for this mode will(can) lead to instantiation of several coordinators (depends on max_parallel_replicas setting)
-        // each will execute parallel reading from replicas, so the query result will be multiplied by the number of created coordinators
-        remote_query_executor->setPoolMode(PoolMode::GET_ONE);
-    }
-    else
-        remote_query_executor->setPoolMode(PoolMode::GET_MANY);
-
-    if (!table_func_ptr)
-        remote_query_executor->setMainTable(shard.main_table ? shard.main_table : main_table);
-
-    pipes.emplace_back(
-        createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending));
-    addConvertingActions(pipes.back(), output_stream->header);
+    /// parallel replicas custom key case
+    if (shard.shard_filter_generator)
+    {
+        for (size_t i = 0; i < shard.shard_info.per_replica_pools.size(); ++i)
+        {
+            auto query = shard.query->clone();
+            auto & select_query = query->as<ASTSelectQuery &>();
+            auto shard_filter = shard.shard_filter_generator(i + 1);
+            if (shard_filter)
+            {
+                auto where_expression = select_query.where();
+                if (where_expression)
+                    shard_filter = makeASTFunction("and", where_expression, shard_filter);
+
+                select_query.setExpression(ASTSelectQuery::Expression::WHERE, std::move(shard_filter));
+            }
+
+            const String query_string = formattedAST(query);
+
+            if (!priority_func_factory.has_value())
+                priority_func_factory = GetPriorityForLoadBalancing(LoadBalancing::ROUND_ROBIN, randomSeed());
+
+            GetPriorityForLoadBalancing::Func priority_func
+                = priority_func_factory->getPriorityFunc(LoadBalancing::ROUND_ROBIN, 0, shard.shard_info.pool->getPoolSize());
+
+            auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
+                shard.shard_info.pool,
+                query_string,
+                output_stream->header,
+                context,
+                throttler,
+                scalars,
+                external_tables,
+                stage,
+                std::nullopt,
+                priority_func);
+            remote_query_executor->setLogger(log);
+            remote_query_executor->setPoolMode(PoolMode::GET_ONE);
+
+            if (!table_func_ptr)
+                remote_query_executor->setMainTable(shard.main_table ? shard.main_table : main_table);
+
+            pipes.emplace_back(
+                createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending));
+            addConvertingActions(pipes.back(), output_stream->header);
+        }
+    }
+    else
+    {
+        const String query_string = formattedAST(shard.query);
+
+        auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
+            shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage);
+        remote_query_executor->setLogger(log);
+
+        if (context->canUseTaskBasedParallelReplicas())
+        {
+            // when doing parallel reading from replicas (ParallelReplicasMode::READ_TASKS) on a shard:
+            // establish a connection to a replica on the shard, the replica will instantiate coordinator to manage parallel reading from replicas on the shard.
+            // The coordinator will return query result from the shard.
+            // Only one coordinator per shard is necessary. Therefore using PoolMode::GET_ONE to establish only one connection per shard.
+            // Using PoolMode::GET_MANY for this mode will(can) lead to instantiation of several coordinators (depends on max_parallel_replicas setting)
+            // each will execute parallel reading from replicas, so the query result will be multiplied by the number of created coordinators
+            remote_query_executor->setPoolMode(PoolMode::GET_ONE);
+        }
+        else
+            remote_query_executor->setPoolMode(PoolMode::GET_MANY);
+
+        if (!table_func_ptr)
+            remote_query_executor->setMainTable(shard.main_table ? shard.main_table : main_table);
+
+        pipes.emplace_back(
+            createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending));
+        addConvertingActions(pipes.back(), output_stream->header);
+    }
 }
 
 void ReadFromRemote::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
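Net effect of the new custom-key branch in addPipe: one pipe per replica instead of one per shard, each replica receiving a clone of the shard query with its own filter ANDed into any existing WHERE clause. A sketch of what the second replica (i + 1 = 2) of a three-replica shard might receive, assuming the default modulo filter:

-- original shard query:   SELECT y, count() FROM t WHERE x != '' GROUP BY y
-- query sent to replica:  SELECT y, count() FROM t WHERE (x != '') AND (cityHash64(y) % 3 = 1) GROUP BY y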


@@ -60,6 +60,7 @@ private:
     Poco::Logger * log;
     UInt32 shard_count;
     const String cluster_name;
+    std::optional<GetPriorityForLoadBalancing> priority_func_factory;
 
     void addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard);
     void addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard);


@@ -43,13 +43,24 @@ namespace ErrorCodes
 }
 
 RemoteQueryExecutor::RemoteQueryExecutor(
-    const String & query_, const Block & header_, ContextPtr context_,
-    const Scalars & scalars_, const Tables & external_tables_,
-    QueryProcessingStage::Enum stage_, std::optional<Extension> extension_)
-    : header(header_), query(query_), context(context_), scalars(scalars_)
-    , external_tables(external_tables_), stage(stage_)
+    const String & query_,
+    const Block & header_,
+    ContextPtr context_,
+    const Scalars & scalars_,
+    const Tables & external_tables_,
+    QueryProcessingStage::Enum stage_,
+    std::optional<Extension> extension_,
+    GetPriorityForLoadBalancing::Func priority_func_)
+    : header(header_)
+    , query(query_)
+    , context(context_)
+    , scalars(scalars_)
+    , external_tables(external_tables_)
+    , stage(stage_)
     , extension(extension_)
-{}
+    , priority_func(priority_func_)
+{
+}
 
 RemoteQueryExecutor::RemoteQueryExecutor(
     Connection & connection,
@@ -100,10 +111,16 @@ RemoteQueryExecutor::RemoteQueryExecutor(
 RemoteQueryExecutor::RemoteQueryExecutor(
     const ConnectionPoolWithFailoverPtr & pool,
-    const String & query_, const Block & header_, ContextPtr context_,
-    const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_,
-    QueryProcessingStage::Enum stage_, std::optional<Extension> extension_)
-    : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_)
+    const String & query_,
+    const Block & header_,
+    ContextPtr context_,
+    const ThrottlerPtr & throttler,
+    const Scalars & scalars_,
+    const Tables & external_tables_,
+    QueryProcessingStage::Enum stage_,
+    std::optional<Extension> extension_,
+    GetPriorityForLoadBalancing::Func priority_func_)
+    : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_, priority_func_)
 {
     create_connections = [this, pool, throttler](AsyncCallback async_callback)->std::unique_ptr<IConnections>
     {
@@ -117,7 +134,8 @@ RemoteQueryExecutor::RemoteQueryExecutor(
             if (main_table)
                 table_to_check = std::make_shared<QualifiedTableName>(main_table.getQualifiedName());
 
-            auto res = std::make_unique<HedgedConnections>(pool, context, timeouts, throttler, pool_mode, table_to_check, std::move(async_callback));
+            auto res = std::make_unique<HedgedConnections>(
+                pool, context, timeouts, throttler, pool_mode, table_to_check, std::move(async_callback), priority_func);
             if (extension && extension->replica_info)
                 res->setReplicaInfo(*extension->replica_info);
             return res;
@@ -137,14 +155,16 @@ RemoteQueryExecutor::RemoteQueryExecutor(
                 pool_mode,
                 main_table.getQualifiedName(),
                 std::move(async_callback),
-                skip_unavailable_endpoints);
+                skip_unavailable_endpoints,
+                priority_func);
             connection_entries.reserve(try_results.size());
             for (auto & try_result : try_results)
                 connection_entries.emplace_back(std::move(try_result.entry));
         }
         else
         {
-            connection_entries = pool->getMany(timeouts, current_settings, pool_mode, std::move(async_callback), skip_unavailable_endpoints);
+            connection_entries = pool->getMany(
+                timeouts, current_settings, pool_mode, std::move(async_callback), skip_unavailable_endpoints, priority_func);
         }
 
         auto res = std::make_unique<MultiplexedConnections>(std::move(connection_entries), current_settings, throttler);


@@ -50,6 +50,7 @@ public:
         std::shared_ptr<TaskIterator> task_iterator = nullptr;
         std::shared_ptr<ParallelReplicasReadingCoordinator> parallel_reading_coordinator = nullptr;
         std::optional<IConnections::ReplicaInfo> replica_info = {};
+        GetPriorityForLoadBalancing::Func priority_func;
     };
 
     /// Takes already set connection.
@@ -76,9 +77,15 @@ public:
     /// Takes a pool and gets one or several connections from it.
     RemoteQueryExecutor(
         const ConnectionPoolWithFailoverPtr & pool,
-        const String & query_, const Block & header_, ContextPtr context_,
-        const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(),
-        QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional<Extension> extension_ = std::nullopt);
+        const String & query_,
+        const Block & header_,
+        ContextPtr context_,
+        const ThrottlerPtr & throttler = nullptr,
+        const Scalars & scalars_ = Scalars(),
+        const Tables & external_tables_ = Tables(),
+        QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete,
+        std::optional<Extension> extension_ = std::nullopt,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     ~RemoteQueryExecutor();
@@ -191,9 +198,14 @@ public:
 private:
     RemoteQueryExecutor(
-        const String & query_, const Block & header_, ContextPtr context_,
-        const Scalars & scalars_, const Tables & external_tables_,
-        QueryProcessingStage::Enum stage_, std::optional<Extension> extension_);
+        const String & query_,
+        const Block & header_,
+        ContextPtr context_,
+        const Scalars & scalars_,
+        const Tables & external_tables_,
+        QueryProcessingStage::Enum stage_,
+        std::optional<Extension> extension_,
+        GetPriorityForLoadBalancing::Func priority_func = {});
 
     Block header;
     Block totals;
@@ -273,6 +285,8 @@ private:
     Poco::Logger * log = nullptr;
 
+    GetPriorityForLoadBalancing::Func priority_func;
+
     /// Send all scalars to remote servers
     void sendScalars();


@@ -176,8 +176,6 @@ struct SelectQueryInfo
     ///
     /// Configured in StorageDistributed::getQueryProcessingStage()
     ClusterPtr optimized_cluster;
-    /// should we use custom key with the cluster
-    bool use_custom_key = false;
 
     TreeRewriterResultPtr syntax_analyzer_result;


@@ -429,15 +429,10 @@ QueryProcessingStage::Enum StorageDistributed::getQueryProcessingStage(
     size_t nodes = getClusterQueriedNodes(settings, cluster);
 
-    if (query_info.use_custom_key)
-    {
-        LOG_INFO(log, "Single shard cluster used with custom_key, transforming replicas into virtual shards");
-        query_info.cluster = cluster->getClusterWithReplicasAsShards(settings, settings.max_parallel_replicas);
-    }
-    else
-    {
-        query_info.cluster = cluster;
+    query_info.cluster = cluster;
 
+    if (!local_context->canUseParallelReplicasCustomKey(*cluster))
+    {
         if (nodes > 1 && settings.optimize_skip_unused_shards)
         {
             /// Always calculate optimized cluster here, to avoid conditions during read()
@@ -880,30 +875,22 @@ void StorageDistributed::read(
         storage_snapshot,
         processed_stage);
 
-    auto settings = local_context->getSettingsRef();
+    const auto & settings = local_context->getSettingsRef();
 
     ClusterProxy::AdditionalShardFilterGenerator additional_shard_filter_generator;
-    if (query_info.use_custom_key)
+    if (local_context->canUseParallelReplicasCustomKey(*query_info.getCluster()))
     {
         if (auto custom_key_ast = parseCustomKeyForTable(settings.parallel_replicas_custom_key, *local_context))
         {
-            if (query_info.getCluster()->getShardCount() == 1)
-            {
-                // we are reading from single shard with multiple replicas but didn't transform replicas
-                // into virtual shards with custom_key set
-                throw Exception(ErrorCodes::LOGICAL_ERROR, "Replicas weren't transformed into virtual shards");
-            }
-
             additional_shard_filter_generator =
-                [&, my_custom_key_ast = std::move(custom_key_ast), shard_count = query_info.cluster->getShardCount()](uint64_t shard_num) -> ASTPtr
+                [my_custom_key_ast = std::move(custom_key_ast),
+                 column_description = this->getInMemoryMetadataPtr()->columns,
+                 custom_key_type = settings.parallel_replicas_custom_key_filter_type.value,
+                 context = local_context,
+                 replica_count = query_info.getCluster()->getShardsInfo().front().per_replica_pools.size()](uint64_t replica_num) -> ASTPtr
                 {
-                    return getCustomKeyFilterForParallelReplica(
-                        shard_count,
-                        shard_num - 1,
-                        my_custom_key_ast,
-                        settings.parallel_replicas_custom_key_filter_type,
-                        this->getInMemoryMetadataPtr()->columns,
-                        local_context);
+                    return getCustomKeyFilterForParallelReplica(
+                        replica_count, replica_num - 1, my_custom_key_ast, custom_key_type, column_description, context);
                 };
         }
     }


@@ -144,6 +144,24 @@
             </replica>
         </shard>
     </parallel_replicas>
+    <test_cluster_1_shard_3_replicas_1_unavailable>
+        <shard>
+            <internal_replication>false</internal_replication>
+            <replica>
+                <host>127.0.0.1</host>
+                <port>9000</port>
+            </replica>
+            <replica>
+                <host>127.0.0.2</host>
+                <port>9000</port>
+            </replica>
+            <!-- Unavailable replica -->
+            <replica>
+                <host>127.0.0.3</host>
+                <port>1234</port>
+            </replica>
+        </shard>
+    </test_cluster_1_shard_3_replicas_1_unavailable>
     <test_cluster_one_shard_three_replicas_localhost>
         <shard>
             <internal_replication>false</internal_replication>


@@ -87,8 +87,3 @@ def test_parallel_replicas_custom_key(start_cluster, cluster, custom_key, filter
             node.contains_in_log("Processing query on a replica using custom_key")
             for node in nodes
         )
-    else:
-        # we first transform all replicas into shards and then append for each shard filter
-        assert n1.contains_in_log(
-            "Single shard cluster used with custom_key, transforming replicas into virtual shards"
-        )


@@ -0,0 +1,26 @@
<clickhouse>
    <remote_servers>
        <test_single_shard_multiple_replicas>
            <shard>
                <internal_replication>false</internal_replication>
                <replica>
                    <host>n1</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>n2</host>
                    <port>1234</port>
                </replica>
                <replica>
                    <host>n3</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>n4</host>
                    <port>1234</port>
                </replica>
            </shard>
        </test_single_shard_multiple_replicas>
    </remote_servers>
</clickhouse>


@@ -0,0 +1,122 @@
import pytest
import uuid
from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)

node1 = cluster.add_instance(
    "n1", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node3 = cluster.add_instance(
    "n3", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)

nodes = [node1, node3]


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


def create_tables(cluster, table_name):
    node1.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
    node3.query(f"DROP TABLE IF EXISTS {table_name} SYNC")

    node1.query(
        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
    )
    node3.query(
        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
    )

    # populate data
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(2000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(3000, 1000)"
    )
    node3.query(f"SYSTEM SYNC REPLICA {table_name}")


@pytest.mark.parametrize("use_hedged_requests", [1, 0])
@pytest.mark.parametrize("custom_key", ["sipHash64(key)", "key"])
@pytest.mark.parametrize("filter_type", ["default", "range"])
@pytest.mark.parametrize("prefer_localhost_replica", [0, 1])
def test_parallel_replicas_custom_key_failover(
    start_cluster,
    use_hedged_requests,
    custom_key,
    filter_type,
    prefer_localhost_replica,
):
    cluster_name = "test_single_shard_multiple_replicas"
    table = "test_table"

    create_tables(cluster_name, table)

    expected_result = ""
    for i in range(4):
        expected_result += f"{i}\t1000\n"

    log_comment = uuid.uuid4()
    assert (
        node1.query(
            f"SELECT key, count() FROM cluster('{cluster_name}', currentDatabase(), test_table) GROUP BY key ORDER BY key",
            settings={
                "log_comment": log_comment,
                "prefer_localhost_replica": prefer_localhost_replica,
                "max_parallel_replicas": 4,
                "parallel_replicas_custom_key": custom_key,
                "parallel_replicas_custom_key_filter_type": filter_type,
                "use_hedged_requests": use_hedged_requests,
                # avoid considering replica delay on connection choice,
                # otherwise connections can be distributed unevenly among available nodes,
                # and so can custom key secondary queries (we check it below)
                "max_replica_delay_for_distributed_queries": 0,
            },
        )
        == expected_result
    )

    for node in nodes:
        node.query("system flush logs")

    # the subqueries should be spread over available nodes
    query_id = node1.query(
        f"SELECT query_id FROM system.query_log WHERE current_database = currentDatabase() AND log_comment = '{log_comment}' AND type = 'QueryFinish' AND initial_query_id = query_id"
    )
    assert query_id != ""
    query_id = query_id[:-1]

    if prefer_localhost_replica == 0:
        assert (
            node1.query(
                f"SELECT 'subqueries', count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' AND query_id != initial_query_id SETTINGS skip_unavailable_shards=1"
            )
            == "subqueries\t4\n"
        )

        # currently this assert is flaky with asan and tsan builds, disable the assert in such cases for now
        # will be investigated separately
        if (
            not node1.is_built_with_thread_sanitizer()
            and not node1.is_built_with_address_sanitizer()
        ):
            assert (
                node1.query(
                    f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1"
                )
                == "n1\t3\nn3\t2\n"
            )

@@ -0,0 +1,26 @@
<clickhouse>
    <remote_servers>
        <test_single_shard_multiple_replicas>
            <shard>
                <internal_replication>false</internal_replication>
                <replica>
                    <host>n1</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>n2</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>n3</host>
                    <port>9000</port>
                </replica>
                <replica>
                    <host>n4</host>
                    <port>9000</port>
                </replica>
            </shard>
        </test_single_shard_multiple_replicas>
    </remote_servers>
</clickhouse>


@@ -0,0 +1,118 @@
import pytest
import uuid
from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)

node1 = cluster.add_instance(
    "n1", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node2 = cluster.add_instance(
    "n2", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node3 = cluster.add_instance(
    "n3", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node4 = cluster.add_instance(
    "n4", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)

nodes = [node1, node2, node3, node4]


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


def create_tables(table_name):
    for i in range(0, 4):
        nodes[i].query(f"DROP TABLE IF EXISTS {table_name} SYNC")
        nodes[i].query(
            f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r{i+1}') ORDER BY (key)"
        )

    # populate data
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(2000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(3000, 1000)"
    )
    node2.query(f"SYSTEM SYNC REPLICA {table_name}")
    node3.query(f"SYSTEM SYNC REPLICA {table_name}")
    node4.query(f"SYSTEM SYNC REPLICA {table_name}")


@pytest.mark.parametrize("use_hedged_requests", [1, 0])
@pytest.mark.parametrize("custom_key", ["sipHash64(key)", "key"])
@pytest.mark.parametrize("filter_type", ["default", "range"])
def test_parallel_replicas_custom_key_load_balancing(
    start_cluster,
    use_hedged_requests,
    custom_key,
    filter_type,
):
    cluster_name = "test_single_shard_multiple_replicas"
    table = "test_table"

    create_tables(table)

    expected_result = ""
    for i in range(4):
        expected_result += f"{i}\t1000\n"

    log_comment = uuid.uuid4()
    assert (
        node1.query(
            f"SELECT key, count() FROM cluster('{cluster_name}', currentDatabase(), test_table) GROUP BY key ORDER BY key",
            settings={
                "log_comment": log_comment,
                "prefer_localhost_replica": 0,
                "max_parallel_replicas": 4,
                "parallel_replicas_custom_key": custom_key,
                "parallel_replicas_custom_key_filter_type": filter_type,
                "use_hedged_requests": use_hedged_requests,
                # avoid considering replica delay on connection choice,
                # otherwise connections can be distributed unevenly among available nodes,
                # and so can custom key secondary queries (we check it below)
                "max_replica_delay_for_distributed_queries": 0,
            },
        )
        == expected_result
    )

    for node in nodes:
        node.query("system flush logs")

    # the subqueries should be spread over available nodes
    query_id = node1.query(
        f"SELECT query_id FROM system.query_log WHERE current_database = currentDatabase() AND log_comment = '{log_comment}' AND type = 'QueryFinish' AND initial_query_id = query_id"
    )
    assert query_id != ""
    query_id = query_id[:-1]

    assert (
        node1.query(
            f"SELECT 'subqueries', count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' AND query_id != initial_query_id SETTINGS skip_unavailable_shards=1"
        )
        == "subqueries\t4\n"
    )

    # check queries per node
    assert (
        node1.query(
            f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1"
        )
        == "n1\t2\nn2\t1\nn3\t1\nn4\t1\n"
    )


@@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
 . "$CURDIR"/../shell_config.sh
 
-$CLICKHOUSE_CLIENT --connections_with_failover_max_tries 10 --query "SELECT hostName() FROM remote('128.1.2.3', default.tmp)" 2>&1 | grep -o -P 'Timeout exceeded while connecting to socket|Network is unreachable|Timeout: connect timed out' | wc -l
+$CLICKHOUSE_CLIENT --connections_with_failover_max_tries 10 --connect_timeout_with_failover_ms 1 --query "SELECT hostName() FROM remote('128.1.2.3', default.tmp)" 2>&1 | grep -o -P 'Timeout exceeded while connecting to socket|Network is unreachable|Timeout: connect timed out' | wc -l


@@ -0,0 +1,29 @@
-- { echoOn }
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='default';
0 250
1 250
2 250
3 250
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='range';
0 250
1 250
2 250
3 250
SET use_hedged_requests=0;
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='default';
0 250
1 250
2 250
3 250


@@ -0,0 +1,30 @@
DROP TABLE IF EXISTS 02918_parallel_replicas;
CREATE TABLE 02918_parallel_replicas (x String, y Int32) ENGINE = MergeTree ORDER BY cityHash64(x);
INSERT INTO 02918_parallel_replicas SELECT toString(number), number % 4 FROM numbers(1000);
SET prefer_localhost_replica=0;
-- { echoOn }
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='default';
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='range';
SET use_hedged_requests=0;
SELECT y, count()
FROM cluster(test_cluster_1_shard_3_replicas_1_unavailable, currentDatabase(), 02918_parallel_replicas)
GROUP BY y
ORDER BY y
SETTINGS max_parallel_replicas=3, parallel_replicas_custom_key='cityHash64(y)', parallel_replicas_custom_key_filter_type='default';
-- { echoOff }
DROP TABLE 02918_parallel_replicas;