mirror of https://github.com/ClickHouse/ClickHouse.git (synced 2024-11-24 00:22:29 +00:00)
Merge pull request #17089 from ClickHouse/fix_ddl_worker_non_leader
Fix ON CLUSTER queries hanging for non-leader replicas
commit 0bc60e2d53
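In short, per the diff below: a replica waiting for the shard leader used to poll the shard's is_executed_path node with tryGet plus an attached event, inside a while (true) loop. ZooKeeper does not arm a watch when a get request is made for a node that does not exist yet, so on a non-leader replica the event never fired and an ON CLUSTER query could leave the DDL worker stuck. The patch switches the check to exists() (which arms the watch even for a missing node) and bounds the wait with a retry counter (MAX_TRIES_TO_EXECUTE = 3) and a wall-clock timeout (MAX_EXECUTION_TIMEOUT_SEC = 3600 s). Below is a minimal standalone sketch of that bounded-wait pattern in plain C++; waitForExecution and its is_executed() predicate are stand-ins for illustration, not ClickHouse's Stopwatch/zkutil/Poco::Event code.

#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#include <thread>

/// Sketch of the bounded wait loop the patch introduces: instead of spinning in
/// while (true), give up after a wall-clock timeout so the DDL worker can move
/// on to other queries. is_executed() stands in for the ZooKeeper checks on
/// is_executed_path; the randomized sleep mimics event->tryWait(0..1000 ms).
bool waitForExecution(const std::function<bool()> & is_executed, int max_timeout_sec)
{
    const auto start = std::chrono::steady_clock::now();
    std::mt19937 rng(std::random_device{}());
    std::uniform_int_distribution<int> backoff_ms(0, 1000);

    while (std::chrono::steady_clock::now() - start <= std::chrono::seconds(max_timeout_sec))
    {
        if (is_executed())
            return true;    /// some leader replica has executed the task
        std::this_thread::sleep_for(std::chrono::milliseconds(backoff_ms(rng)));
    }
    return false;           /// timeout exceeded: report an error instead of hanging
}

int main()
{
    int polls = 0;
    /// Toy predicate: pretend the leader finishes the task after a few polls.
    bool done = waitForExecution([&] { return ++polls >= 5; }, /* max_timeout_sec = */ 10);
    std::cout << (done ? "executed" : "timed out") << '\n';
}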
@@ -871,13 +871,16 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica(
     zookeeper->tryCreate(tries_to_execute_path, "0", zkutil::CreateMode::Persistent);

     static constexpr int MAX_TRIES_TO_EXECUTE = 3;
+    static constexpr int MAX_EXECUTION_TIMEOUT_SEC = 3600;

     String executed_by;

     zkutil::EventPtr event = std::make_shared<Poco::Event>();
-    if (zookeeper->tryGet(is_executed_path, executed_by, nullptr, event))
+    /// We must use exists request instead of get, because zookeeper will not setup event
+    /// for non existing node after get request
+    if (zookeeper->exists(is_executed_path, nullptr, event))
     {
-        LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, executed_by);
+        LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path));
         return true;
     }
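An aside on the exists-vs-get change above: ZooKeeper arms a watch for a node that does not exist yet only on an exists request; a get on a missing node returns "no node" and leaves no watch behind, which is why the old tryGet-based wait was never woken up when the leader later created is_executed_path. The toy model below (plain, self-contained C++; ToyZk and its methods are invented for illustration and are not ZooKeeper or ClickHouse APIs) mimics that distinction with an in-memory map and a condition variable.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <thread>

struct ToyZk
{
    std::mutex mutex;
    std::condition_variable cv;
    std::map<std::string, std::string> nodes;
    std::map<std::string, bool> watches;    /// path -> "a watch is armed"

    /// Like ZooKeeper's get: fails on a missing node and arms no watch.
    bool get(const std::string & path, std::string & out)
    {
        std::lock_guard<std::mutex> lock(mutex);
        auto it = nodes.find(path);
        if (it == nodes.end())
            return false;
        out = it->second;
        return true;
    }

    /// Like ZooKeeper's exists: arms a watch even when the node is absent.
    bool exists(const std::string & path)
    {
        std::lock_guard<std::mutex> lock(mutex);
        watches[path] = true;
        return nodes.count(path) > 0;
    }

    void create(const std::string & path, const std::string & value)
    {
        std::lock_guard<std::mutex> lock(mutex);
        nodes[path] = value;
        if (watches[path])
            cv.notify_all();    /// the armed watch fires on creation
    }

    /// Wait until the watched node appears or the timeout elapses.
    bool waitCreated(const std::string & path, std::chrono::milliseconds timeout)
    {
        std::unique_lock<std::mutex> lock(mutex);
        return cv.wait_for(lock, timeout, [&] { return nodes.count(path) > 0; });
    }
};

int main()
{
    ToyZk zk;
    const std::string path = "/shard/executed";

    /// Non-leader replica: arm the watch via exists() before the node is there.
    /// A get() here would report "missing" and leave nothing to wake us up later.
    if (!zk.exists(path))
        std::cout << "node absent, watch armed\n";

    /// Leader replica creates the node a little later.
    std::thread leader([&] { std::this_thread::sleep_for(std::chrono::milliseconds(50)); zk.create(path, "node1"); });

    std::cout << (zk.waitCreated(path, std::chrono::seconds(1)) ? "woken up: executed\n" : "timed out\n");
    leader.join();
}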
@@ -885,8 +888,13 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica(

     auto lock = createSimpleZooKeeperLock(zookeeper, shard_path, "lock", task.host_id_str);

+    Stopwatch stopwatch;
+
     bool executed_by_leader = false;
-    while (true)
+    /// Defensive programming. One hour is more than enough to execute almost all DDL queries.
+    /// If it is a very long query like an ALTER DELETE for a huge table, it will still be executed,
+    /// but the DDL worker can continue processing other queries.
+    while (stopwatch.elapsedSeconds() <= MAX_EXECUTION_TIMEOUT_SEC)
     {
         StorageReplicatedMergeTree::Status status;
         replicated_storage->getStatus(status);
@@ -895,8 +903,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica(
         if (status.is_leader && lock->tryLock())
         {
             /// In replicated merge tree we can have multiple leaders. So we can
-            /// be "leader", but another "leader" replica may already execute
-            /// this task.
+            /// be "leader" and have taken the lock, but another "leader" replica may have
+            /// already executed this task.
             if (zookeeper->tryGet(is_executed_path, executed_by))
             {
                 LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, executed_by);
@@ -904,7 +912,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica(
                 break;
             }

-            /// Doing it exclusively
+            /// Checking and incrementing counter exclusively.
             size_t counter = parse<int>(zookeeper->get(tries_to_execute_path));
             if (counter > MAX_TRIES_TO_EXECUTE)
                 break;
@@ -923,24 +931,45 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica(
             lock->unlock();
         }

+        /// Waiting for someone who will execute query and change is_executed_path node
         if (event->tryWait(std::uniform_int_distribution<int>(0, 1000)(rng)))
         {
             LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path));
             executed_by_leader = true;
             break;
         }
-        else if (parse<int>(zookeeper->get(tries_to_execute_path)) > MAX_TRIES_TO_EXECUTE)
+        else
         {
-            /// Nobody will try to execute query again
-            break;
+            String tries_count;
+            zookeeper->tryGet(tries_to_execute_path, tries_count);
+            if (parse<int>(tries_count) > MAX_TRIES_TO_EXECUTE)
+            {
+                /// Nobody will try to execute query again
+                LOG_WARNING(log, "Maximum retries count for task {} exceeded, cannot execute replicated DDL query", task.entry_name);
+                break;
+            }
+            else
+            {
+                /// Will try to wait or execute
+                LOG_TRACE(log, "Task {} still not executed, will try to wait for it or execute ourselves, tries count {}", task.entry_name, tries_count);
+            }
         }
     }

     /// Not executed by leader so was not executed at all
     if (!executed_by_leader)
     {
-        task.execution_status = ExecutionStatus(ErrorCodes::NOT_IMPLEMENTED, "Cannot execute replicated DDL query");
+        /// If we failed with timeout
+        if (stopwatch.elapsedSeconds() >= MAX_EXECUTION_TIMEOUT_SEC)
+        {
+            LOG_WARNING(log, "Task {} was not executed by anyone, maximum timeout {} seconds exceeded", task.entry_name, MAX_EXECUTION_TIMEOUT_SEC);
+            task.execution_status = ExecutionStatus(ErrorCodes::TIMEOUT_EXCEEDED, "Cannot execute replicated DDL query, timeout exceeded");
+        }
+        else /// If we exceeded amount of tries
+        {
+            LOG_WARNING(log, "Task {} was not executed by anyone, maximum number of retries exceeded", task.entry_name);
+            task.execution_status = ExecutionStatus(ErrorCodes::UNFINISHED, "Cannot execute replicated DDL query, maximum retries exceeded");
+        }
         return false;
     }
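Taken together, the wait loop in tryExecuteQueryOnLeaderReplica now exits in one of three ways: the watched is_executed_path node appears (some leader executed the task), the shared counter in tries_to_execute_path exceeds MAX_TRIES_TO_EXECUTE, or the Stopwatch passes MAX_EXECUTION_TIMEOUT_SEC; the latter two set task.execution_status to UNFINISHED or TIMEOUT_EXCEEDED respectively, so the worker reports failure and moves on instead of hanging (compare the wait-loop sketch near the top). The rest of the commit adds an integration test: a two-replica cluster config (referenced as configs/remote_servers.xml from the test, so presumably living next to it under tests/integration/test_ddl_worker_non_leader/) and the new tests/integration/test_ddl_worker_non_leader/test.py shown below.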
@@ -0,0 +1,17 @@
+<yandex>
+    <remote_servers>
+        <test_cluster>
+            <shard>
+                <internal_replication>true</internal_replication>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </test_cluster>
+    </remote_servers>
+</yandex>
59 tests/integration/test_ddl_worker_non_leader/test.py Normal file
@@ -0,0 +1,59 @@
+import pytest
+import time
+from helpers.cluster import ClickHouseCluster
+from helpers.network import PartitionManager
+from helpers.client import QueryRuntimeException
+
+cluster = ClickHouseCluster(__file__)
+node1 = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml'], with_zookeeper=True)
+node2 = cluster.add_instance('node2', main_configs=['configs/remote_servers.xml'], with_zookeeper=True)
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+def test_non_leader_replica(started_cluster):
+
+    node1.query('''CREATE TABLE sometable(id UInt32, value String)
+                    ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/sometable', '1') ORDER BY tuple()''')
+
+    node2.query('''CREATE TABLE sometable(id UInt32, value String)
+                    ENGINE = ReplicatedMergeTree('/clickhouse/tables/0/sometable', '2') ORDER BY tuple() SETTINGS replicated_can_become_leader = 0''')
+
+    node1.query("INSERT INTO sometable SELECT number, toString(number) FROM numbers(100)")
+    node2.query("SYSTEM SYNC REPLICA sometable", timeout=10)
+
+    assert node1.query("SELECT COUNT() FROM sometable") == "100\n"
+    assert node2.query("SELECT COUNT() FROM sometable") == "100\n"
+
+
+    with PartitionManager() as pm:
+        pm.drop_instance_zk_connections(node1)
+
+        # this query should be executed by the leader, but the leader is partitioned from ZooKeeper
+        with pytest.raises(Exception):
+            node2.query("ALTER TABLE sometable ON CLUSTER 'test_cluster' MODIFY COLUMN value UInt64 SETTINGS distributed_ddl_task_timeout=5")
+
+    for _ in range(100):
+        if 'UInt64' in node1.query("SELECT type FROM system.columns WHERE name='value' and table = 'sometable'"):
+            break
+        time.sleep(0.1)
+
+    for _ in range(100):
+        if 'UInt64' in node2.query("SELECT type FROM system.columns WHERE name='value' and table = 'sometable'"):
+            break
+        time.sleep(0.1)
+
+    assert 'UInt64' in node1.query("SELECT type FROM system.columns WHERE name='value' and table = 'sometable'")
+    assert 'UInt64' in node2.query("SELECT type FROM system.columns WHERE name='value' and table = 'sometable'")
+
+    # Checking that DDLWorker doesn't hang and is still able to execute DDL queries
+    node1.query("CREATE TABLE new_table_with_ddl ON CLUSTER 'test_cluster' (key UInt32) ENGINE=MergeTree() ORDER BY tuple()", settings={"distributed_ddl_task_timeout": "10"})
+    assert node1.query("EXISTS new_table_with_ddl") == "1\n"
+    assert node2.query("EXISTS new_table_with_ddl") == "1\n"
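What the test exercises: node2 is created with replicated_can_become_leader = 0, so node1 is the only replica that can execute the distributed DDL for the shard. While PartitionManager keeps node1 cut off from ZooKeeper, the ON CLUSTER ALTER issued from node2 cannot be executed by any leader, so the client call fails after distributed_ddl_task_timeout = 5 seconds (the pytest.raises block). Once the partition is lifted, node1 picks the task up and the two polling loops wait for the column type to become UInt64 on both replicas. The final CREATE TABLE ... ON CLUSTER then confirms that the DDL worker on the non-leader replica did not get stuck on the earlier task and keeps processing new queries.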