Merge pull request #46308 from ClickHouse/keeper-retries-by-default

Enable retries for INSERT by default in case of ZooKeeper session loss
Alexey Milovidov 2023-02-22 07:57:40 +03:00 committed by GitHub
commit 5788deeadd
4 changed files with 35 additions and 12 deletions


@@ -706,7 +706,7 @@ class IColumn;
 M(UInt64, grace_hash_join_max_buckets, 1024, "Limit on the number of grace hash join buckets", 0) \
 M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
 M(Bool, optimize_sorting_by_input_stream_properties, true, "Optimize sorting by sorting properties of input stream", 0) \
-M(UInt64, insert_keeper_max_retries, 0, "Max retries for keeper operations during insert", 0) \
+M(UInt64, insert_keeper_max_retries, 20, "Max retries for keeper operations during insert", 0) \
 M(UInt64, insert_keeper_retry_initial_backoff_ms, 100, "Initial backoff timeout for keeper operations during insert", 0) \
 M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \
 M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f]", 0) \


@@ -83,7 +83,8 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
 {"23.2", {{"output_format_parquet_fixed_string_as_fixed_byte_array", false, true, "Use Parquet FIXED_LENGTH_BYTE_ARRAY type for FixedString by default"},
 {"output_format_arrow_fixed_string_as_fixed_byte_array", false, true, "Use Arrow FIXED_SIZE_BINARY type for FixedString by default"},
 {"query_plan_remove_redundant_distinct", false, true, "Remove redundant Distinct step in query plan"},
-{"optimize_duplicate_order_by_and_distinct", true, false, "Remove duplicate ORDER BY and DISTINCT if it's possible"}}},
+{"optimize_duplicate_order_by_and_distinct", true, false, "Remove duplicate ORDER BY and DISTINCT if it's possible"},
+{"insert_keeper_max_retries", 0, 20, "Enable reconnections to Keeper on INSERT, improve reliability"}}},
 {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
 {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"},
 {"input_format_csv_detect_header", false, true, "Detect header in CSV format by default"},


@@ -89,7 +89,9 @@ def test_blocade_leader(started_cluster):
 print("Got exception from node", smaller_exception(ex))
 time.sleep(0.1)
-node2.query("INSERT INTO ordinary.t1 SELECT number FROM numbers(10)")
+node2.query(
+    "INSERT INTO ordinary.t1 SELECT number FROM numbers(10) SETTINGS insert_keeper_max_retries = 0"
+)
 node1.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10)
 node3.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10)
@@ -107,7 +109,9 @@ def test_blocade_leader(started_cluster):
 restart_replica_for_sure(
     node2, "ordinary.t1", "/clickhouse/t1/replicas/2"
 )
-node2.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)")
+node2.query(
+    "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+)
 break
 except Exception as ex:
 try:
@@ -128,7 +132,9 @@ def test_blocade_leader(started_cluster):
 restart_replica_for_sure(
     node3, "ordinary.t1", "/clickhouse/t1/replicas/3"
 )
-node3.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)")
+node3.query(
+    "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+)
 break
 except Exception as ex:
 try:
@@ -167,7 +173,9 @@ def test_blocade_leader(started_cluster):
 for i in range(100):
     try:
-        node1.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)")
+        node1.query(
+            "INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+        )
         break
     except Exception as ex:
         print("Got exception node1", smaller_exception(ex))
@@ -293,7 +301,9 @@ def test_blocade_leader_twice(started_cluster):
 print("Got exception from node", smaller_exception(ex))
 time.sleep(0.1)
-node2.query("INSERT INTO ordinary.t2 SELECT number FROM numbers(10)")
+node2.query(
+    "INSERT INTO ordinary.t2 SELECT number FROM numbers(10) SETTINGS insert_keeper_max_retries = 0"
+)
 node1.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
 node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
@@ -311,7 +321,9 @@ def test_blocade_leader_twice(started_cluster):
 restart_replica_for_sure(
     node2, "ordinary.t2", "/clickhouse/t2/replicas/2"
 )
-node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
+node2.query(
+    "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+)
 break
 except Exception as ex:
 try:
@@ -333,7 +345,9 @@ def test_blocade_leader_twice(started_cluster):
     node3, "ordinary.t2", "/clickhouse/t2/replicas/3"
 )
 node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
-node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
+node3.query(
+    "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+)
 break
 except Exception as ex:
 try:
@@ -359,14 +373,18 @@ def test_blocade_leader_twice(started_cluster):
 for i in range(10):
     try:
-        node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
+        node3.query(
+            "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+        )
         assert False, "Node3 became leader?"
     except Exception as ex:
         time.sleep(0.5)
 for i in range(10):
     try:
-        node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
+        node2.query(
+            "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+        )
         assert False, "Node2 became leader?"
     except Exception as ex:
         time.sleep(0.5)
@@ -399,7 +417,9 @@ def test_blocade_leader_twice(started_cluster):
 for n, node in enumerate([node1, node2, node3]):
     for i in range(100):
         try:
-            node.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
+            node.query(
+                "INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100) SETTINGS insert_keeper_max_retries = 0"
+            )
             break
         except Exception as ex:
             print("Got exception node{}".format(n + 1), smaller_exception(ex))


@@ -2,6 +2,8 @@
     <profiles>
         <default>
             <stream_like_engine_allow_direct_select>1</stream_like_engine_allow_direct_select>
+            <!-- One test is expecting the interruption after blocking ZooKeeper -->
+            <insert_keeper_max_retries>0</insert_keeper_max_retries>
         </default>
     </profiles>
 </clickhouse>