Mirror of https://github.com/ClickHouse/ClickHouse.git, synced 2024-11-22 07:31:57 +00:00
Better logging and tests
This commit is contained in:
parent b4328ad8dc
commit bfcd8b7de5
@@ -1736,7 +1736,11 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional<std::un
                broken_parts_to_detach.push_back(res.part);

                bool unexpected = expected_parts != std::nullopt && !expected_parts->contains(res.part->name);
                if (unexpected)
                {
                    LOG_DEBUG(log, "loadDataParts: Part {} is broken, but it's not expected to be in the parts set, "
                        "will not count it as a suspicious broken part", res.part->name);
                    ++suspicious_broken_unexpected_parts;
                }
                else
                    ++suspicious_broken_parts;
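In plain terms: a broken part now increments the suspicious counter only if the expected parts set actually contains it; broken parts outside that set get their own counter. A minimal Python sketch of that decision (illustrative only; names mirror the C++ above, this is not ClickHouse code):

# Sketch of the counting rule above.
def classify_broken_part(name, expected_parts):
    """Pick the counter a broken part should increment.
    expected_parts is None when no expected set was supplied."""
    unexpected = expected_parts is not None and name not in expected_parts
    return "suspicious_broken_unexpected_parts" if unexpected else "suspicious_broken_parts"

assert classify_broken_part("all_0_0_0", {"all_0_1_1"}) == "suspicious_broken_unexpected_parts"
assert classify_broken_part("all_0_1_1", {"all_0_1_1"}) == "suspicious_broken_parts"
assert classify_broken_part("all_0_0_0", None) == "suspicious_broken_parts"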
@@ -1782,12 +1786,6 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional<std::un

    if (!skip_sanity_checks)
    {
        if (settings->strict_suspicious_broken_parts_check_on_start)
        {
            suspicious_broken_parts += suspicious_broken_unexpected_parts;
            suspicious_broken_parts_bytes += suspicious_broken_unexpected_parts_bytes;
        }

        if (suspicious_broken_parts > settings->max_suspicious_broken_parts)
            throw Exception(
                ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS,
@@ -1811,6 +1809,9 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional<std::un
            formatReadableSizeWithBinarySuffix(settings->max_suspicious_broken_parts_bytes));
    }

    if (suspicious_broken_unexpected_parts != 0)
        LOG_WARNING(log, "Found {} suspicious broken unexpected parts with total size {}", suspicious_broken_unexpected_parts, suspicious_broken_unexpected_parts_bytes);

    if (!is_static_storage)
        for (auto & part : broken_parts_to_detach)
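Unexpected broken parts can only trip the startup thresholds when strict_suspicious_broken_parts_check_on_start is enabled; otherwise they are merely logged. A hedged Python sketch of that flow (the exception name echoes the error code; everything else is illustrative):

# Simplified sketch of the startup threshold check above.
def check_on_start(broken, broken_unexpected, strict, max_suspicious_broken_parts):
    if strict:
        # Strict mode: parts that other replicas still know about count as broken too.
        broken += broken_unexpected
    if broken > max_suspicious_broken_parts:
        raise RuntimeError("TOO_MANY_UNEXPECTED_DATA_PARTS")

# Non-strict startup ignores unexpected broken parts, so this passes:
check_on_start(broken=0, broken_unexpected=5, strict=False, max_suspicious_broken_parts=0)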
@@ -105,7 +105,6 @@ struct Settings;

    M(Bool, always_fetch_merged_part, false, "If true, replica never merge parts and always download merged parts from other replicas.", 0) \
    M(UInt64, max_suspicious_broken_parts, 100, "Max broken parts, if more - deny automatic deletion.", 0) \
    M(UInt64, max_suspicious_broken_parts_bytes, 1ULL * 1024 * 1024 * 1024, "Max size of all broken parts, if more - deny automatic deletion.", 0) \
    M(Bool, strict_suspicious_broken_parts_check_on_start, false, "If false, server will not count parts existing on other replicas as broken (will enqueue them for fetch); otherwise it will fail startup when broken parts thresholds are exceeded.", 0) \
    M(UInt64, max_files_to_modify_in_alter_columns, 75, "Do not apply ALTER if number of files for modification (deletion, addition) is more than this.", 0) \
    M(UInt64, max_files_to_remove_in_alter_columns, 50, "Do not apply ALTER if number of files for deletion is more than this.", 0) \
    M(Float, replicated_max_ratio_of_wrong_parts, 0.5, "If ratio of wrong parts to total number of parts is less than this - allow to start.", 0) \
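Like the other entries in this list, the new setting is a per-table MergeTree setting, so it can be supplied in the SETTINGS clause of CREATE TABLE. A hypothetical usage sketch in the style of the integration tests added below (assumes the same node fixture; the table name is made up):

node.query(
    """
    CREATE TABLE strict_checks (key Int)
    ENGINE = ReplicatedMergeTree('/tables/strict_checks', '1') ORDER BY tuple()
    SETTINGS max_suspicious_broken_parts = 0,
             strict_suspicious_broken_parts_check_on_start = 1
    """
)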
@@ -1374,6 +1374,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)

     Strings covered_unexpected_parts;
     std::unordered_set<String> uncovered_unexpected_parts;
+    std::unordered_set<String> restorable_unexpected_parts;
     UInt64 uncovered_unexpected_parts_rows = 0;

     for (const auto & part : unexpected_parts)
@@ -1393,7 +1394,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)

         if (MergeTreePartInfo::areAllBlockNumbersCovered(part->info, covered_parts))
         {
-            covered_unexpected_parts.push_back(part->name);
+            restorable_unexpected_parts.insert(part->name);
             continue;
         }

@@ -1439,11 +1440,11 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
         "There are {} uncovered unexpected parts with {} rows ({} of them is not just-written with {} rows), "
         "{} missing parts (with {} blocks), {} covered unexpected parts (with {} rows).";

-    constexpr auto sanity_report_debug_fmt = "Uncovered unexpected parts: {}. Missing parts: {}. Covered unexpected parts: {}. Expected parts: {}.";
+    constexpr auto sanity_report_debug_fmt = "Uncovered unexpected parts: {}. Restorable unexpected parts: {}. Missing parts: {}. Covered unexpected parts: {}. Expected parts: {}.";

     if (insane && !skip_sanity_checks)
     {
-        LOG_DEBUG(log, sanity_report_debug_fmt, fmt::join(uncovered_unexpected_parts, ", "), fmt::join(parts_to_fetch, ", "),
+        LOG_DEBUG(log, sanity_report_debug_fmt, fmt::join(uncovered_unexpected_parts, ", "), fmt::join(restorable_unexpected_parts, ", "), fmt::join(parts_to_fetch, ", "),
             fmt::join(covered_unexpected_parts, ", "), fmt::join(expected_parts, ", "));
         throw Exception(ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS, sanity_report_fmt, getStorageID().getNameForLogs(),
             formatReadableQuantity(uncovered_unexpected_parts_rows),
@@ -1455,7 +1456,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)

     if (unexpected_parts_nonnew_rows > 0 || uncovered_unexpected_parts_rows > 0)
     {
-        LOG_DEBUG(log, sanity_report_debug_fmt, fmt::join(uncovered_unexpected_parts, ", "), fmt::join(parts_to_fetch, ", "),
+        LOG_DEBUG(log, sanity_report_debug_fmt, fmt::join(uncovered_unexpected_parts, ", "), fmt::join(restorable_unexpected_parts, ", "), fmt::join(parts_to_fetch, ", "),
             fmt::join(covered_unexpected_parts, ", "), fmt::join(expected_parts, ", "));
         LOG_WARNING(log, sanity_report_fmt, getStorageID().getNameForLogs(),
             formatReadableQuantity(uncovered_unexpected_parts_rows), formatReadableQuantity(total_rows_on_filesystem),
@@ -1474,7 +1475,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
     /// Remove extra local parts.
     for (const DataPartPtr & part : unexpected_parts)
     {
-        bool restore_covered = uncovered_unexpected_parts.contains(part->name);
+        bool restore_covered = restorable_unexpected_parts.contains(part->name) || uncovered_unexpected_parts.contains(part->name);
         LOG_ERROR(log, "Renaming unexpected part {} to ignored_{}{}", part->name, part->name, restore_covered ? ", restoring covered parts" : "");
         forcefullyMovePartToDetachedAndRemoveFromMemory(part, "ignored", restore_covered);
     }
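The restorable_unexpected_parts bookkeeping hinges on MergeTreePartInfo::areAllBlockNumbersCovered: an unexpected part is restorable when other parts cover its whole block range, so detaching it loses no data. A rough Python sketch of the coverage idea using the all_<min>_<max>_<level> part names from the tests below (real coverage checking works on part infos per partition; this is only an illustration):

# Rough sketch: is every block number of `part` covered by `covering_parts`?
def block_range(part_name):
    _partition, min_block, max_block, _level = part_name.split("_")
    return range(int(min_block), int(max_block) + 1)

def all_blocks_covered(part, covering_parts):
    covered = set()
    for p in covering_parts:
        covered.update(block_range(p))
    return set(block_range(part)) <= covered

# The merged part all_0_1_1 (blocks 0..1) is covered by its two sources,
# so it can be detached and the sources restored in its place:
assert all_blocks_covered("all_0_1_1", ["all_0_0_0", "all_1_1_0"])
assert not all_blocks_covered("all_0_1_1", ["all_1_1_0"])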
@@ -0,0 +1 @@
#!/usr/bin/env python3
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
#
# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long

import os

import pytest

from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)
node = cluster.add_instance("node", stay_alive=True, with_zookeeper=True)


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


def break_part(table, part_name):
    node.exec_in_container(
        [
            "bash",
            "-c",
            f"rm /var/lib/clickhouse/data/default/{table}/{part_name}/columns.txt",
        ]
    )


def remove_part(table, part_name):
    node.exec_in_container(
        ["bash", "-c", f"rm -r /var/lib/clickhouse/data/default/{table}/{part_name}"]
    )


def get_count(table):
    return int(node.query(f"SELECT count() FROM {table}").strip())


def detach_table(table):
    node.query(f"DETACH TABLE {table}")


def attach_table(table):
    node.query(f"ATTACH TABLE {table}")


def remove_part_from_zookeeper(replica_path, part_name):
    zk = cluster.get_kazoo_client("zoo1")
    zk.delete(os.path.join(replica_path, f"parts/{part_name}"))


def test_unexpected_uncommitted_merge():
    node.query(
        """
        CREATE TABLE broken_table (key Int) ENGINE = ReplicatedMergeTree('/tables/broken', '1') ORDER BY tuple()
        SETTINGS max_suspicious_broken_parts = 0, replicated_max_ratio_of_wrong_parts = 0"""
    )

    node.query("INSERT INTO broken_table SELECT number FROM numbers(10)")
    node.query("INSERT INTO broken_table SELECT number FROM numbers(10, 10)")

    node.query("OPTIMIZE TABLE broken_table FINAL")

    assert node.query("SELECT sum(key) FROM broken_table") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table' AND active") == "all_0_1_1\n"

    remove_part_from_zookeeper("/tables/broken/replicas/1", "all_0_1_1")

    detach_table("broken_table")
    attach_table("broken_table")

    assert node.query("SELECT sum(key) FROM broken_table") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table' AND active ORDER BY name") == "all_0_0_0\nall_1_1_0\n"


def test_corrupted_random_part():
    node.query(
        """
        CREATE TABLE broken_table_1 (key Int) ENGINE = ReplicatedMergeTree('/tables/broken_1', '1') ORDER BY tuple()
        SETTINGS max_suspicious_broken_parts = 0, replicated_max_ratio_of_wrong_parts = 0"""
    )

    node.query("INSERT INTO broken_table_1 SELECT number FROM numbers(10)")
    node.query("INSERT INTO broken_table_1 SELECT number FROM numbers(10, 10)")

    assert node.query("SELECT sum(key) FROM broken_table_1") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table_1' AND active ORDER BY name") == "all_0_0_0\nall_1_1_0\n"

    break_part("broken_table_1", "all_0_0_0")

    detach_table("broken_table_1")
    with pytest.raises(QueryRuntimeException):
        attach_table("broken_table_1")


def test_corrupted_unexpected_part():
    node.query(
        """
        CREATE TABLE broken_table_2 (key Int) ENGINE = ReplicatedMergeTree('/tables/broken_2', '1') ORDER BY tuple()
        SETTINGS max_suspicious_broken_parts = 0, replicated_max_ratio_of_wrong_parts = 0"""
    )

    node.query("INSERT INTO broken_table_2 SELECT number FROM numbers(10)")
    node.query("INSERT INTO broken_table_2 SELECT number FROM numbers(10, 10)")

    node.query("OPTIMIZE TABLE broken_table_2 FINAL")

    assert node.query("SELECT sum(key) FROM broken_table_2") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table_2' AND active") == "all_0_1_1\n"

    remove_part_from_zookeeper("/tables/broken_2/replicas/1", "all_0_0_0")
    break_part("broken_table_2", "all_0_0_0")

    detach_table("broken_table_2")
    attach_table("broken_table_2")

    assert node.query("SELECT sum(key) FROM broken_table_2") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table_2' AND active") == "all_0_1_1\n"


def test_corrupted_unexpected_part_ultimate():
    node.query(
        """
        CREATE TABLE broken_table_3 (key Int) ENGINE = ReplicatedMergeTree('/tables/broken_3', '1') ORDER BY tuple()
        SETTINGS max_suspicious_broken_parts = 0, replicated_max_ratio_of_wrong_parts = 0"""
    )

    node.query("INSERT INTO broken_table_3 SELECT number FROM numbers(10)")
    node.query("INSERT INTO broken_table_3 SELECT number FROM numbers(10, 10)")

    node.query("OPTIMIZE TABLE broken_table_3 FINAL")

    assert node.query("SELECT sum(key) FROM broken_table_3") == "190\n"
    assert node.query("SELECT name FROM system.parts WHERE table = 'broken_table_3' AND active") == "all_0_1_1\n"

    remove_part_from_zookeeper("/tables/broken_3/replicas/1", "all_0_0_0")
    break_part("broken_table_3", "all_0_0_0")
    remove_part_from_zookeeper("/tables/broken_3/replicas/1", "all_0_1_1")

    detach_table("broken_table_3")
    attach_table("broken_table_3")

    assert node.query("SELECT is_readonly FROM system.replicas WHERE table = 'broken_table_3'") == "1\n"

    assert node.query("SELECT sum(key) FROM broken_table_3") == "145\n"
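The get_count helper defined in this file is not exercised by these tests; a sketch of how it could back up the final assertion (hypothetical, same fixtures):

# Hypothetical follow-up check using this file's get_count helper:
# only all_1_1_0 (keys 10..19) survives, hence 10 rows summing to 145.
assert get_count("broken_table_3") == 10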