From 68f427398d8ba974f974e1f2189079e1526d4f9b Mon Sep 17 00:00:00 2001
From: Alexey Arno
Date: Fri, 15 Apr 2016 01:00:39 +0300
Subject: [PATCH] dbms: server: Retry after a timeout issue. [#METR-20825]

---
 .../Storages/MergeTree/ReshardingWorker.cpp | 54 +++++++++++++++++--
 1 file changed, 50 insertions(+), 4 deletions(-)

diff --git a/dbms/src/Storages/MergeTree/ReshardingWorker.cpp b/dbms/src/Storages/MergeTree/ReshardingWorker.cpp
index 263a7b565db..cc19c0be07f 100644
--- a/dbms/src/Storages/MergeTree/ReshardingWorker.cpp
+++ b/dbms/src/Storages/MergeTree/ReshardingWorker.cpp
@@ -697,13 +697,16 @@ void ReshardingWorker::perform(const std::string & job_descriptor, const std::st
 	}
 	else if (ex.code() == ErrorCodes::RESHARDING_REMOTE_NODE_UNAVAILABLE)
 	{
-		/// A remote performer has gone offline.
+		/// A remote performer has gone offline or we are experiencing network problems.
 		/// Put the current distributed job on hold. Also jab the job scheduler
 		/// so that it will come accross this distributed job even if no new jobs
 		/// are submitted.
-		setStatus(current_job.coordinator_id, getFQDNOrHostName(), STATUS_ON_HOLD,
-			ex.message());
-		dumped_coordinator_state = dumpCoordinatorState(current_job.coordinator_id);
+		if (current_job.isCoordinated())
+		{
+			setStatus(current_job.coordinator_id, getFQDNOrHostName(), STATUS_ON_HOLD,
+				ex.message());
+			dumped_coordinator_state = dumpCoordinatorState(current_job.coordinator_id);
+		}
 		softCleanup();
 		wakeUpTrackerThread();
 	}
@@ -956,6 +959,20 @@ void ReshardingWorker::publishShardedPartitions()
 				pool.schedule([j, &tasks]{ tasks[j](); });
 		}
 	}
+	catch (const Poco::TimeoutException & ex)
+	{
+		try
+		{
+			pool.wait();
+		}
+		catch (...)
+		{
+			tryLogCurrentException(__PRETTY_FUNCTION__);
+		}
+
+		throw Exception{"Sharded partition upload operation timed out",
+			ErrorCodes::RESHARDING_REMOTE_NODE_UNAVAILABLE};
+	}
 	catch (...)
 	{
 		try
@@ -1095,6 +1112,20 @@ void ReshardingWorker::commit()
 			}
 		}
 	}
+	catch (const Poco::TimeoutException & ex)
+	{
+		try
+		{
+			pool.wait();
+		}
+		catch (...)
+		{
+			tryLogCurrentException(__PRETTY_FUNCTION__);
+		}
+
+		throw Exception{"A remote operation timed out while committing",
+			ErrorCodes::RESHARDING_REMOTE_NODE_UNAVAILABLE};
+	}
 	catch (...)
 	{
 		try
@@ -1260,6 +1291,21 @@ bool ReshardingWorker::checkAttachLogRecord(LogRecord & log_record)
 				pool.schedule([i, &tasks]{ tasks[i](); });
 		}
 	}
+	catch (const Poco::TimeoutException & ex)
+	{
+		try
+		{
+			pool.wait();
+		}
+		catch (...)
+		{
+			tryLogCurrentException(__PRETTY_FUNCTION__);
+		}
+
+		throw Exception{"Part checking on remote node timed out while attempting "
+			"to fix a failed ATTACH operation",
+			ErrorCodes::RESHARDING_REMOTE_NODE_UNAVAILABLE};
+	}
 	catch (...)
	{
 		try
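
Note: the sketch below illustrates the pattern this patch applies in the same way in publishShardedPartitions(), commit() and checkAttachLogRecord(): a Poco::TimeoutException raised by a remote call is caught, the thread pool is drained so no task is left running, and the timeout is rethrown as RESHARDING_REMOTE_NODE_UNAVAILABLE, the error code that perform() already handles by putting the distributed job on hold and waking the tracker thread. It is a standalone, simplified sketch: ThreadPool, Exception and the error-code value are illustrative stand-ins, not the server's real classes.

// timeout_retry_sketch.cpp -- illustrative only; ThreadPool, Exception and the
// error-code value are stand-ins, not the actual ClickHouse types.
#include <Poco/Exception.h>

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace ErrorCodes
{
	/// Illustrative value; the real constant is defined elsewhere in the server.
	constexpr int RESHARDING_REMOTE_NODE_UNAVAILABLE = 1;
}

/// Minimal stand-in for the server's Exception type (message + error code).
struct Exception : public std::runtime_error
{
	Exception(const std::string & msg, int code_) : std::runtime_error(msg), code(code_) {}
	int code;
};

/// Minimal stand-in for the worker pool: schedule() queues a task, wait() runs
/// whatever is still queued and leaves the queue empty.
struct ThreadPool
{
	void schedule(std::function<void()> task) { queued.push_back(std::move(task)); }

	void wait()
	{
		std::vector<std::function<void()>> tasks;
		tasks.swap(queued);
		for (auto & task : tasks)
			task();
	}

	std::vector<std::function<void()>> queued;
};

/// The pattern added by the patch: a timeout is not fatal. Drain the pool so no
/// task is left behind, then convert the timeout into the error code that the
/// caller treats as "put the job on hold and retry later" instead of aborting.
void runRemoteOperations(ThreadPool & pool, const std::vector<std::function<void()>> & remote_calls)
{
	try
	{
		for (const auto & call : remote_calls)
			pool.schedule(call);
		pool.wait();
	}
	catch (const Poco::TimeoutException &)
	{
		try
		{
			pool.wait();
		}
		catch (...)
		{
			std::cerr << "Error while draining the pool after a timeout\n";
		}

		throw Exception{"A remote operation timed out",
			ErrorCodes::RESHARDING_REMOTE_NODE_UNAVAILABLE};
	}
}

int main()
{
	ThreadPool pool;
	try
	{
		runRemoteOperations(pool, {[]{ throw Poco::TimeoutException("remote node too slow"); }});
	}
	catch (const Exception & e)
	{
		std::cerr << e.what() << " (code " << e.code << ")\n";
	}
}

Mapping the timeout onto the same error code as an offline node lets the existing handler in perform() cover both cases: for a coordinated job it records STATUS_ON_HOLD, performs a soft cleanup and wakes the tracker thread, so the job is retried later instead of being failed outright.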