Merge pull request #55184 from ClickHouse/remove-garbage-broken-detached-parts

Remove the removal of broken detached parts
This commit is contained in:
Alexey Milovidov 2023-09-30 06:10:40 +03:00 committed by GitHub
commit de8e068da7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 0 additions and 468 deletions

View File

@ -2628,97 +2628,6 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
"({} != {} + {}), it's a bug", parts_to_remove.size(), sum_of_ranges, excluded_parts.size());
}
size_t MergeTreeData::clearOldBrokenPartsFromDetachedDirectory()
{
/**
* Remove old (configured by setting) broken detached parts.
* Only parts with certain prefixes are removed. These prefixes
* are such that it is guaranteed that they will never be needed
* and need to be cleared. ctime is used to check when file was
* moved to detached/ directory (see https://unix.stackexchange.com/a/211134)
*/
DetachedPartsInfo detached_parts = getDetachedParts();
if (detached_parts.empty())
return 0;
auto get_last_touched_time = [&](const DetachedPartInfo & part_info) -> time_t
{
auto path = fs::path(relative_data_path) / "detached" / part_info.dir_name;
time_t last_change_time = part_info.disk->getLastChanged(path);
time_t last_modification_time = part_info.disk->getLastModified(path).epochTime();
return std::max(last_change_time, last_modification_time);
};
time_t ttl_seconds = getSettings()->merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds;
size_t unfinished_deleting_parts = 0;
time_t current_time = time(nullptr);
for (const auto & part_info : detached_parts)
{
if (!part_info.dir_name.starts_with("deleting_"))
continue;
time_t startup_time = current_time - static_cast<time_t>(Context::getGlobalContextInstance()->getUptimeSeconds());
time_t last_touch_time = get_last_touched_time(part_info);
/// Maybe it's being deleted right now (for example, in ALTER DROP DETACHED)
bool had_restart = last_touch_time < startup_time;
bool ttl_expired = last_touch_time + ttl_seconds <= current_time;
if (!had_restart && !ttl_expired)
continue;
/// We were trying to delete this detached part but did not finish deleting, probably because the server crashed
LOG_INFO(log, "Removing detached part {} that we failed to remove previously", part_info.dir_name);
try
{
removeDetachedPart(part_info.disk, fs::path(relative_data_path) / "detached" / part_info.dir_name / "", part_info.dir_name);
++unfinished_deleting_parts;
}
catch (...)
{
tryLogCurrentException(log);
}
}
if (!getSettings()->merge_tree_enable_clear_old_broken_detached)
return unfinished_deleting_parts;
const auto full_path = fs::path(relative_data_path) / "detached";
size_t removed_count = 0;
for (const auto & part_info : detached_parts)
{
if (!part_info.valid_name || part_info.prefix.empty())
continue;
const auto & removable_detached_parts_prefixes = DetachedPartInfo::DETACHED_REASONS_REMOVABLE_BY_TIMEOUT;
bool can_be_removed_by_timeout = std::find(
removable_detached_parts_prefixes.begin(),
removable_detached_parts_prefixes.end(),
part_info.prefix) != removable_detached_parts_prefixes.end();
if (!can_be_removed_by_timeout)
continue;
ssize_t threshold = current_time - ttl_seconds;
time_t last_touch_time = get_last_touched_time(part_info);
if (last_touch_time == 0 || last_touch_time >= threshold)
continue;
const String & old_name = part_info.dir_name;
String new_name = "deleting_" + part_info.dir_name;
part_info.disk->moveFile(fs::path(full_path) / old_name, fs::path(full_path) / new_name);
removeDetachedPart(part_info.disk, fs::path(relative_data_path) / "detached" / new_name / "", old_name);
LOG_WARNING(log, "Removed broken detached part {} due to a timeout for broken detached parts", old_name);
++removed_count;
}
LOG_INFO(log, "Cleaned up {} detached parts", removed_count);
return removed_count + unfinished_deleting_parts;
}
size_t MergeTreeData::clearOldWriteAheadLogs()
{

View File

@ -692,8 +692,6 @@ public:
/// Delete WAL files containing parts, that all already stored on disk.
size_t clearOldWriteAheadLogs();
size_t clearOldBrokenPartsFromDetachedDirectory();
/// Delete all directories which names begin with "tmp"
/// Must be called with locked lockForShare() because it's using relative_data_path.
size_t clearOldTemporaryDirectories(size_t custom_directories_lifetime_seconds, const NameSet & valid_prefixes = {"tmp_", "tmp-fetch_"});

View File

@ -63,11 +63,9 @@ struct Settings;
M(Float, merge_selecting_sleep_slowdown_factor, 1.2f, "The sleep time for merge selecting task is multiplied by this factor when there's nothing to merge and divided when a merge was assigned", 0) \
M(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, "The period of executing the clear old temporary directories operation in background.", 0) \
M(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, "The period of executing the clear old parts operation in background.", 0) \
M(UInt64, merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds, 1ULL * 3600 * 24 * 30, "Remove old broken detached parts in the background if they remained intouched for a specified by this setting period of time.", 0) \
M(UInt64, min_age_to_force_merge_seconds, 0, "If all parts in a certain range are older than this value, range will be always eligible for merging. Set to 0 to disable.", 0) \
M(Bool, min_age_to_force_merge_on_partition_only, false, "Whether min_age_to_force_merge_seconds should be applied only on the entire partition and not on subset.", false) \
M(UInt64, number_of_free_entries_in_pool_to_execute_optimize_entire_partition, 25, "When there is less than specified number of free entries in pool, do not try to execute optimize entire partition with a merge (this merge is created when set min_age_to_force_merge_seconds > 0 and min_age_to_force_merge_on_partition_only = true). This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
M(UInt64, merge_tree_enable_clear_old_broken_detached, false, "Enable clearing old broken detached parts operation in background.", 0) \
M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \
M(CleanDeletedRows, clean_deleted_rows, CleanDeletedRows::Never, "Is the Replicated Merge cleanup has to be done automatically at each merge or manually (possible values are 'Always'/'Never' (default))", 0) \
M(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \

View File

@ -155,7 +155,6 @@ Float32 ReplicatedMergeTreeCleanupThread::iterate()
/// do it under share lock
cleaned_other += storage.clearOldWriteAheadLogs();
cleaned_part_like += storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds());
cleaned_part_like += storage.clearOldBrokenPartsFromDetachedDirectory();
}
/// This is loose condition: no problem if we actually had lost leadership at this moment

View File

@ -1385,7 +1385,6 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
cleared_count += clearOldWriteAheadLogs();
cleared_count += clearOldMutations();
cleared_count += clearEmptyParts();
cleared_count += clearOldBrokenPartsFromDetachedDirectory();
return cleared_count;
/// TODO maybe take into account number of cleared objects when calculating backoff
}, common_assignee_trigger, getStorageID()), /* need_trigger */ false);

View File

@ -1,11 +0,0 @@
<clickhouse>
<database_catalog_unused_dir_hide_timeout_sec>0</database_catalog_unused_dir_hide_timeout_sec>
<database_catalog_unused_dir_rm_timeout_sec>60</database_catalog_unused_dir_rm_timeout_sec>
<database_catalog_unused_dir_cleanup_period_sec>1</database_catalog_unused_dir_cleanup_period_sec>
<!-- We don't really need [Zoo]Keeper for this test.
And it makes sense to have at least one test with TestKeeper. -->
<zookeeper>
<implementation>testkeeper</implementation>
</zookeeper>
</clickhouse>

View File

@ -1,360 +0,0 @@
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.corrupt_part_data_on_disk import corrupt_part_data_on_disk
from helpers.corrupt_part_data_on_disk import break_part
import time
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance(
"node1", stay_alive=True, main_configs=["configs/store_cleanup.xml"]
)
path_to_data = "/var/lib/clickhouse/"
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def remove_broken_detached_part_impl(table, node, expect_broken_prefix):
assert (
node.query(
f"SELECT COUNT() FROM system.parts WHERE table='{table}' AND active=1"
)
== "4\n"
)
path_to_detached = path_to_data + f"data/default/{table}/detached/"
result = node.exec_in_container(["ls", path_to_detached])
assert result.strip() == ""
corrupt_part_data_on_disk(node, table, "all_3_3_0")
break_part(node, table, "all_3_3_0")
node.query(f"ALTER TABLE {table} DETACH PART 'all_1_1_0'")
result = node.exec_in_container(["touch", f"{path_to_detached}trash"])
node.exec_in_container(["mkdir", f"{path_to_detached}../broken_all_fake"])
node.exec_in_container(
["touch", "-t", "1312031429.30", f"{path_to_detached}../broken_all_fake"]
)
result = node.exec_in_container(["stat", f"{path_to_detached}../broken_all_fake"])
print(result)
assert "Modify: 2013-12-03" in result
node.exec_in_container(
[
"mv",
f"{path_to_detached}../broken_all_fake",
f"{path_to_detached}broken_all_fake",
]
)
for name in [
"unexpected_all_42_1337_5",
"deleting_all_123_456_7",
"covered-by-broken_all_12_34_5",
]:
node.exec_in_container(["mkdir", f"{path_to_detached}../{name}"])
node.exec_in_container(
[
"touch",
"-t",
"1312031429.30",
f"{path_to_detached}../{name}",
]
)
result = node.exec_in_container(["stat", f"{path_to_detached}../{name}"])
print(result)
assert "Modify: 2013-12-03" in result
node.exec_in_container(
[
"mv",
f"{path_to_detached}../{name}",
f"{path_to_detached}{name}",
]
)
result = node.query(
f"CHECK TABLE {table}", settings={"check_query_single_value_result": 0}
)
assert "all_3_3_0\t0" in result
node.query(f"DETACH TABLE {table}")
node.query(f"ATTACH TABLE {table}")
node.wait_for_log_line(
"Removing detached part deleting_all_123_456_7",
timeout=90,
look_behind_lines=1000000,
)
node.wait_for_log_line(
f"Removed broken detached part {expect_broken_prefix}_all_3_3_0 due to a timeout",
timeout=10,
look_behind_lines=1000000,
)
node.wait_for_log_line(
"Removed broken detached part unexpected_all_42_1337_5 due to a timeout",
timeout=10,
look_behind_lines=1000000,
)
result = node.exec_in_container(["ls", path_to_detached])
print(result)
assert f"{expect_broken_prefix}_all_3_3_0" not in result
assert "all_1_1_0" in result
assert "trash" in result
assert "broken_all_fake" in result
assert "covered-by-broken_all_12_34_5" in result
assert "unexpected_all_42_1337_5" not in result
assert "deleting_all_123_456_7" not in result
node.query(
f"ALTER TABLE {table} DROP DETACHED PART 'covered-by-broken_all_12_34_5'",
settings={"allow_drop_detached": 1},
)
result = node.exec_in_container(["ls", path_to_detached])
assert "covered-by-broken_all_12_34_5" not in result
node.query(f"DROP TABLE {table} SYNC")
def test_remove_broken_detached_part_merge_tree(started_cluster):
node1.query(
"""
CREATE TABLE
mt(id UInt32, value Int32)
ENGINE = MergeTree() ORDER BY id
SETTINGS
merge_tree_enable_clear_old_broken_detached=1,
merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds=5;
"""
)
for i in range(4):
node1.query(
f"INSERT INTO mt SELECT number, number * number FROM numbers ({i * 100000}, 100000)"
)
remove_broken_detached_part_impl("mt", node1, "broken-on-start")
def test_remove_broken_detached_part_replicated_merge_tree(started_cluster):
node1.query(
f"""
CREATE TABLE
replicated_mt(date Date, id UInt32, value Int32)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/replicated_mt', '{node1.name}') ORDER BY id
SETTINGS
merge_tree_enable_clear_old_broken_detached=1,
merge_tree_clear_old_broken_detached_parts_ttl_timeout_seconds=5,
cleanup_delay_period=1,
cleanup_delay_period_random_add=0,
cleanup_thread_preferred_points_per_iteration=0;
"""
)
for i in range(4):
node1.query(
f"INSERT INTO replicated_mt SELECT toDate('2019-10-01'), number, number * number FROM numbers ({i * 100000}, 100000)"
)
remove_broken_detached_part_impl("replicated_mt", node1, "broken")
def test_store_cleanup(started_cluster):
node1.query("CREATE DATABASE db UUID '10000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db.log UUID '10000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query(
"CREATE TABLE db.mt UUID '10000000-1000-4000-8000-000000000003' ENGINE=MergeTree ORDER BY tuple() AS SELECT 1"
)
node1.query(
"CREATE TABLE db.mem UUID '10000000-1000-4000-8000-000000000004' ENGINE=Memory AS SELECT 1"
)
node1.query("CREATE DATABASE db2 UUID '20000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db2.log UUID '20000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query("DETACH DATABASE db2")
node1.query("CREATE DATABASE db3 UUID '30000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db3.log UUID '30000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query(
"CREATE TABLE db3.log2 UUID '30000000-1000-4000-8000-000000000003' ENGINE=Log AS SELECT 1"
)
node1.query("DETACH TABLE db3.log")
node1.query("DETACH TABLE db3.log2 PERMANENTLY")
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
node1.stop_clickhouse(kill=True)
# All dirs related to `db` will be removed
node1.exec_in_container(["rm", f"{path_to_data}/metadata/db.sql"])
node1.exec_in_container(["mkdir", f"{path_to_data}/store/kek"])
node1.exec_in_container(["touch", f"{path_to_data}/store/12"])
try:
node1.exec_in_container(["mkdir", f"{path_to_data}/store/456"])
except Exception as e:
print("Failed to create 456/:", str(e))
node1.exec_in_container(["mkdir", f"{path_to_data}/store/456/testgarbage"])
node1.exec_in_container(
["mkdir", f"{path_to_data}/store/456/30000000-1000-4000-8000-000000000003"]
)
node1.exec_in_container(
["touch", f"{path_to_data}/store/456/45600000-1000-4000-8000-000000000003"]
)
node1.exec_in_container(
["mkdir", f"{path_to_data}/store/456/45600000-1000-4000-8000-000000000004"]
)
node1.start_clickhouse()
node1.query("DETACH DATABASE db2")
node1.query("DETACH TABLE db3.log")
node1.wait_for_log_line(
"Removing access rights for unused directory",
timeout=60,
look_behind_lines=1000000,
)
node1.wait_for_log_line(
"directories from store", timeout=60, look_behind_lines=1000000
)
store = node1.exec_in_container(["ls", f"{path_to_data}/store"])
assert "100" in store
assert "200" in store
assert "300" in store
assert "456" in store
assert "kek" in store
assert "12" in store
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
)
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
)
# Metadata is removed, so store/100 contains garbage
store100 = node1.exec_in_container(["ls", f"{path_to_data}/store/100"])
assert "10000000-1000-4000-8000-000000000001" in store100
assert "10000000-1000-4000-8000-000000000002" in store100
assert "10000000-1000-4000-8000-000000000003" in store100
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
)
# Database is detached, nothing to clean up
store200 = node1.exec_in_container(["ls", f"{path_to_data}/store/200"])
assert "20000000-1000-4000-8000-000000000001" in store200
assert "20000000-1000-4000-8000-000000000002" in store200
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
# Tables are detached, nothing to clean up
store300 = node1.exec_in_container(["ls", f"{path_to_data}/store/300"])
assert "30000000-1000-4000-8000-000000000001" in store300
assert "30000000-1000-4000-8000-000000000002" in store300
assert "30000000-1000-4000-8000-000000000003" in store300
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
# Manually created garbage
store456 = node1.exec_in_container(["ls", f"{path_to_data}/store/456"])
assert "30000000-1000-4000-8000-000000000003" in store456
assert "45600000-1000-4000-8000-000000000003" in store456
assert "45600000-1000-4000-8000-000000000004" in store456
assert "testgarbage" in store456
assert "----------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
)
node1.wait_for_log_line(
"Removing unused directory", timeout=90, look_behind_lines=1000000
)
node1.wait_for_log_line(
"directories from store", timeout=90, look_behind_lines=1000000
)
node1.wait_for_log_line(
"Nothing to clean up from store/", timeout=90, look_behind_lines=1000000
)
store = node1.exec_in_container(["ls", f"{path_to_data}/store"])
assert "100" in store
assert "200" in store
assert "300" in store
assert "456" in store
assert "kek" not in store # changed
assert "\n12\n" not in store # changed
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
) # changed
# Metadata is removed, so store/100 contains garbage
store100 = node1.exec_in_container(["ls", f"{path_to_data}/store/100"]) # changed
assert "10000000-1000-4000-8000-000000000001" not in store100 # changed
assert "10000000-1000-4000-8000-000000000002" not in store100 # changed
assert "10000000-1000-4000-8000-000000000003" not in store100 # changed
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
) # changed
# Database is detached, nothing to clean up
store200 = node1.exec_in_container(["ls", f"{path_to_data}/store/200"])
assert "20000000-1000-4000-8000-000000000001" in store200
assert "20000000-1000-4000-8000-000000000002" in store200
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
# Tables are detached, nothing to clean up
store300 = node1.exec_in_container(["ls", f"{path_to_data}/store/300"])
assert "30000000-1000-4000-8000-000000000001" in store300
assert "30000000-1000-4000-8000-000000000002" in store300
assert "30000000-1000-4000-8000-000000000003" in store300
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
# Manually created garbage
store456 = node1.exec_in_container(["ls", f"{path_to_data}/store/456"])
assert "30000000-1000-4000-8000-000000000003" not in store456 # changed
assert "45600000-1000-4000-8000-000000000003" not in store456 # changed
assert "45600000-1000-4000-8000-000000000004" not in store456 # changed
assert "testgarbage" not in store456 # changed
assert "---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
) # changed
node1.query("ATTACH TABLE db3.log2")
node1.query("ATTACH DATABASE db2")
node1.query("ATTACH TABLE db3.log")
assert "1\n" == node1.query("SELECT * FROM db3.log")
assert "1\n" == node1.query("SELECT * FROM db3.log2")
assert "1\n" == node1.query("SELECT * FROM db2.log")