Merge pull request #39872 from ClickHouse/store_cleanup_all_disks

Cleanup unused dirs from `store/` on all disks
This commit is contained in:
Alexander Tokmakov 2022-08-05 12:21:59 +03:00 committed by GitHub
commit d9190a8121
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 269 additions and 103 deletions

View File

@ -99,6 +99,12 @@ public:
void syncRevision(UInt64 revision) override { delegate->syncRevision(revision); }
UInt64 getRevision() const override { return delegate->getRevision(); }
bool supportsStat() const override { return delegate->supportsStat(); }
struct stat stat(const String & path) const override { return delegate->stat(path); }
bool supportsChmod() const override { return delegate->supportsChmod(); }
void chmod(const String & path, mode_t mode) override { delegate->chmod(path, mode); }
protected:
Executor & getExecutor() override;

View File

@ -11,6 +11,8 @@
#include <fstream>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <Disks/DiskFactory.h>
#include <Disks/DiskMemory.h>
@ -39,6 +41,7 @@ namespace ErrorCodes
extern const int CANNOT_UNLINK;
extern const int CANNOT_RMDIR;
extern const int BAD_ARGUMENTS;
extern const int CANNOT_STAT;
}
std::mutex DiskLocal::reservation_mutex;
@ -671,6 +674,23 @@ bool DiskLocal::setup()
return true;
}
struct stat DiskLocal::stat(const String & path) const
{
struct stat st;
auto full_path = fs::path(disk_path) / path;
if (::stat(full_path.string().c_str(), &st) == 0)
return st;
DB::throwFromErrnoWithPath("Cannot stat file: " + path, path, DB::ErrorCodes::CANNOT_STAT);
}
void DiskLocal::chmod(const String & path, mode_t mode)
{
auto full_path = fs::path(disk_path) / path;
if (::chmod(full_path.string().c_str(), mode) == 0)
return;
DB::throwFromErrnoWithPath("Cannot chmod file: " + path, path, DB::ErrorCodes::PATH_ACCESS_DENIED);
}
void registerDiskLocal(DiskFactory & factory)
{
auto creator = [](const String & name,

View File

@ -122,6 +122,12 @@ public:
bool canRead() const noexcept;
bool canWrite() const noexcept;
bool supportsStat() const override { return true; }
struct stat stat(const String & path) const override;
bool supportsChmod() const override { return true; }
void chmod(const String & path, mode_t mode) override;
private:
std::optional<UInt64> tryReserve(UInt64 bytes);

View File

@ -112,6 +112,11 @@ public:
disk.setLastModified(path, timestamp);
}
void chmod(const String & path, mode_t mode) override
{
disk.chmod(path, mode);
}
void setReadOnly(const std::string & path) override
{
disk.setReadOnly(path);

View File

@ -351,6 +351,12 @@ public:
getType());
}
virtual bool supportsStat() const { return false; }
virtual struct stat stat(const String & /*path*/) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk does not support stat"); }
virtual bool supportsChmod() const { return false; }
virtual void chmod(const String & /*path*/, mode_t /*mode*/) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk does not support chmod"); }
protected:
friend class DiskDecorator;

View File

@ -103,6 +103,9 @@ public:
/// Set last modified time to file or directory at `path`.
virtual void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) = 0;
/// Just chmod.
virtual void chmod(const String & path, mode_t mode) = 0;
/// Set file at `path` as read-only.
virtual void setReadOnly(const std::string & path) = 0;

View File

@ -367,6 +367,18 @@ time_t DiskObjectStorage::getLastChanged(const String & path) const
return metadata_storage->getLastChanged(path);
}
struct stat DiskObjectStorage::stat(const String & path) const
{
return metadata_storage->stat(path);
}
void DiskObjectStorage::chmod(const String & path, mode_t mode)
{
auto transaction = createObjectStorageTransaction();
transaction->chmod(path, mode);
transaction->commit();
}
void DiskObjectStorage::shutdown()
{
LOG_INFO(log, "Shutting down disk {}", name);

View File

@ -168,6 +168,12 @@ public:
bool supportsCache() const override;
bool supportsStat() const override { return metadata_storage->supportsStat(); }
struct stat stat(const String & path) const override;
bool supportsChmod() const override { return metadata_storage->supportsChmod(); }
void chmod(const String & path, mode_t mode) override;
private:
/// Create actual disk object storage transaction for operations

View File

@ -613,6 +613,15 @@ void DiskObjectStorageTransaction::setLastModified(const std::string & path, con
}));
}
void DiskObjectStorageTransaction::chmod(const String & path, mode_t mode)
{
operations_to_execute.emplace_back(
std::make_unique<PureMetadataObjectStorageOperation>(object_storage, metadata_storage, [path, mode](MetadataTransactionPtr tx)
{
tx->chmod(path, mode);
}));
}
void DiskObjectStorageTransaction::createFile(const std::string & path)
{
operations_to_execute.emplace_back(

View File

@ -109,6 +109,7 @@ public:
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override;
void chmod(const String & path, mode_t mode) override;
void setReadOnly(const std::string & path) override;
void createHardLink(const std::string & src_path, const std::string & dst_path) override;
};

View File

@ -42,6 +42,12 @@ public:
time_t getLastChanged(const std::string & path) const override;
bool supportsChmod() const override { return disk->supportsChmod(); }
bool supportsStat() const override { return disk->supportsStat(); }
struct stat stat(const String & path) const override { return disk->stat(path); }
std::vector<std::string> listDirectory(const std::string & path) const override;
DirectoryIteratorPtr iterateDirectory(const std::string & path) const override;
@ -89,6 +95,10 @@ public:
void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override;
bool supportsChmod() const override { return disk->supportsChmod(); }
void chmod(const String & path, mode_t mode) override { disk->chmod(path, mode); }
void setReadOnly(const std::string & path) override;
void unlinkFile(const std::string & path) override;

View File

@ -37,6 +37,9 @@ public:
virtual void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) = 0;
virtual bool supportsChmod() const = 0;
virtual void chmod(const String & path, mode_t mode) = 0;
virtual void setReadOnly(const std::string & path) = 0;
virtual void unlinkFile(const std::string & path) = 0;
@ -107,6 +110,11 @@ public:
virtual time_t getLastChanged(const std::string & path) const = 0;
virtual bool supportsChmod() const = 0;
virtual bool supportsStat() const = 0;
virtual struct stat stat(const String & path) const = 0;
virtual std::vector<std::string> listDirectory(const std::string & path) const = 0;
virtual DirectoryIteratorPtr iterateDirectory(const std::string & path) const = 0;

View File

@ -250,6 +250,11 @@ void MetadataStorageFromDiskTransaction::setLastModified(const std::string & pat
addOperation(std::make_unique<SetLastModifiedOperation>(path, timestamp, *metadata_storage.getDisk()));
}
void MetadataStorageFromDiskTransaction::chmod(const String & path, mode_t mode)
{
addOperation(std::make_unique<ChmodOperation>(path, mode, *metadata_storage.getDisk()));
}
void MetadataStorageFromDiskTransaction::unlinkFile(const std::string & path)
{
addOperation(std::make_unique<UnlinkFileOperation>(path, *metadata_storage.getDisk()));

View File

@ -39,6 +39,12 @@ public:
time_t getLastChanged(const std::string & path) const override;
bool supportsChmod() const override { return disk->supportsChmod(); }
bool supportsStat() const override { return disk->supportsStat(); }
struct stat stat(const String & path) const override { return disk->stat(path); }
std::vector<std::string> listDirectory(const std::string & path) const override;
DirectoryIteratorPtr iterateDirectory(const std::string & path) const override;
@ -94,6 +100,10 @@ public:
void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override;
bool supportsChmod() const override { return metadata_storage.supportsChmod(); }
void chmod(const String & path, mode_t mode) override;
void setReadOnly(const std::string & path) override;
void unlinkFile(const std::string & path) override;

View File

@ -36,6 +36,24 @@ void SetLastModifiedOperation::undo()
disk.setLastModified(path, old_timestamp);
}
ChmodOperation::ChmodOperation(const std::string & path_, mode_t mode_, IDisk & disk_)
: path(path_)
, mode(mode_)
, disk(disk_)
{
}
void ChmodOperation::execute(std::unique_lock<std::shared_mutex> &)
{
old_mode = disk.stat(path).st_mode;
disk.chmod(path, mode);
}
void ChmodOperation::undo()
{
disk.chmod(path, old_mode);
}
UnlinkFileOperation::UnlinkFileOperation(const std::string & path_, IDisk & disk_)
: path(path_)
, disk(disk_)

View File

@ -37,6 +37,21 @@ private:
IDisk & disk;
};
struct ChmodOperation final : public IMetadataOperation
{
ChmodOperation(const std::string & path_, mode_t mode_, IDisk & disk_);
void execute(std::unique_lock<std::shared_mutex> & metadata_lock) override;
void undo() override;
private:
std::string path;
mode_t mode;
mode_t old_mode;
IDisk & disk;
};
struct UnlinkFileOperation final : public IMetadataOperation
{

View File

@ -5,6 +5,7 @@
#include <Databases/IDatabase.h>
#include <Databases/DatabaseMemory.h>
#include <Databases/DatabaseOnDisk.h>
#include <Disks/IDisk.h>
#include <Common/quoteString.h>
#include <Storages/StorageMemory.h>
#include <Storages/LiveView/TemporaryLiveViewCleaner.h>
@ -19,10 +20,6 @@
#include <Common/filesystemHelpers.h>
#include <Common/noexcept_scope.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <utime.h>
#include "config_core.h"
#if USE_MYSQL
@ -894,7 +891,7 @@ void DatabaseCatalog::enqueueDroppedTableCleanup(StorageID table_id, StoragePtr
create->setTable(table_id.table_name);
try
{
table = createTableFromAST(*create, table_id.getDatabaseName(), data_path, getContext(), false).second;
table = createTableFromAST(*create, table_id.getDatabaseName(), data_path, getContext(), /* force_restore */ true).second;
table->is_dropped = true;
}
catch (...)
@ -979,7 +976,6 @@ void DatabaseCatalog::dropTableDataTask()
if (table.table_id)
{
try
{
dropTableFinally(table);
@ -1019,13 +1015,15 @@ void DatabaseCatalog::dropTableFinally(const TableMarkedAsDropped & table)
table.table->drop();
}
/// Even if table is not loaded, try remove its data from disk.
/// TODO remove data from all volumes
fs::path data_path = fs::path(getContext()->getPath()) / "store" / getPathForUUID(table.table_id.uuid);
if (fs::exists(data_path))
/// Even if table is not loaded, try remove its data from disks.
for (const auto & [disk_name, disk] : getContext()->getDisksMap())
{
LOG_INFO(log, "Removing data directory {} of dropped table {}", data_path.string(), table.table_id.getNameForLogs());
fs::remove_all(data_path);
String data_path = "store/" + getPathForUUID(table.table_id.uuid);
if (!disk->exists(data_path))
continue;
LOG_INFO(log, "Removing data directory {} of dropped table {} from disk {}", data_path, table.table_id.getNameForLogs(), disk_name);
disk->removeRecursive(data_path);
}
LOG_INFO(log, "Removing metadata {} of dropped table {}", table.metadata_path, table.table_id.getNameForLogs());
@ -1169,121 +1167,118 @@ void DatabaseCatalog::updateLoadingDependencies(const StorageID & table_id, Tabl
void DatabaseCatalog::cleanupStoreDirectoryTask()
{
fs::path store_path = fs::path(getContext()->getPath()) / "store";
size_t affected_dirs = 0;
for (const auto & prefix_dir : fs::directory_iterator{store_path})
for (const auto & [disk_name, disk] : getContext()->getDisksMap())
{
String prefix = prefix_dir.path().filename();
bool expected_prefix_dir = prefix_dir.is_directory() &&
prefix.size() == 3 &&
isHexDigit(prefix[0]) &&
isHexDigit(prefix[1]) &&
isHexDigit(prefix[2]);
if (!expected_prefix_dir)
{
LOG_WARNING(log, "Found invalid directory {}, will try to remove it", prefix_dir.path().string());
affected_dirs += maybeRemoveDirectory(prefix_dir.path());
if (!disk->supportsStat() || !disk->supportsChmod())
continue;
}
for (const auto & uuid_dir : fs::directory_iterator{prefix_dir.path()})
size_t affected_dirs = 0;
for (auto it = disk->iterateDirectory("store"); it->isValid(); it->next())
{
String uuid_str = uuid_dir.path().filename();
UUID uuid;
bool parsed = tryParse(uuid, uuid_str);
String prefix = it->name();
bool expected_prefix_dir = disk->isDirectory(it->path()) && prefix.size() == 3 && isHexDigit(prefix[0]) && isHexDigit(prefix[1])
&& isHexDigit(prefix[2]);
bool expected_dir = uuid_dir.is_directory() &&
parsed &&
uuid != UUIDHelpers::Nil &&
uuid_str.starts_with(prefix);
if (!expected_dir)
if (!expected_prefix_dir)
{
LOG_WARNING(log, "Found invalid directory {}, will try to remove it", uuid_dir.path().string());
affected_dirs += maybeRemoveDirectory(uuid_dir.path());
LOG_WARNING(log, "Found invalid directory {} on disk {}, will try to remove it", it->path(), disk_name);
affected_dirs += maybeRemoveDirectory(disk_name, disk, it->path());
continue;
}
/// Order is important
if (!hasUUIDMapping(uuid))
for (auto jt = disk->iterateDirectory(it->path()); jt->isValid(); jt->next())
{
/// We load uuids even for detached and permanently detached tables,
/// so it looks safe enough to remove directory if we don't have uuid mapping for it.
/// No table or database using this directory should concurrently appear,
/// because creation of new table would fail with "directory already exists".
affected_dirs += maybeRemoveDirectory(uuid_dir.path());
String uuid_str = jt->name();
UUID uuid;
bool parsed = tryParse(uuid, uuid_str);
bool expected_dir = disk->isDirectory(jt->path()) && parsed && uuid != UUIDHelpers::Nil && uuid_str.starts_with(prefix);
if (!expected_dir)
{
LOG_WARNING(log, "Found invalid directory {} on disk {}, will try to remove it", jt->path(), disk_name);
affected_dirs += maybeRemoveDirectory(disk_name, disk, jt->path());
continue;
}
/// Order is important
if (!hasUUIDMapping(uuid))
{
/// We load uuids even for detached and permanently detached tables,
/// so it looks safe enough to remove directory if we don't have uuid mapping for it.
/// No table or database using this directory should concurrently appear,
/// because creation of new table would fail with "directory already exists".
affected_dirs += maybeRemoveDirectory(disk_name, disk, jt->path());
}
}
}
}
if (affected_dirs)
LOG_INFO(log, "Cleaned up {} directories from store/", affected_dirs);
if (affected_dirs)
LOG_INFO(log, "Cleaned up {} directories from store/ on disk {}", affected_dirs, disk_name);
}
(*cleanup_task)->scheduleAfter(unused_dir_cleanup_period_sec * 1000);
}
bool DatabaseCatalog::maybeRemoveDirectory(const fs::path & unused_dir)
bool DatabaseCatalog::maybeRemoveDirectory(const String & disk_name, const DiskPtr & disk, const String & unused_dir)
{
/// "Safe" automatic removal of some directory.
/// At first we do not remove anything and only revoke all access right.
/// And remove only if nobody noticed it after, for example, one month.
struct stat st;
if (stat(unused_dir.string().c_str(), &st))
try
{
LOG_ERROR(log, "Failed to stat {}, errno: {}", unused_dir.string(), errno);
struct stat st = disk->stat(unused_dir);
if (st.st_uid != geteuid())
{
/// Directory is not owned by clickhouse, it's weird, let's ignore it (chmod will likely fail anyway).
LOG_WARNING(log, "Found directory {} with unexpected owner (uid={}) on disk {}", unused_dir, st.st_uid, disk_name);
return false;
}
time_t max_modification_time = std::max(st.st_atime, std::max(st.st_mtime, st.st_ctime));
time_t current_time = time(nullptr);
if (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))
{
if (current_time <= max_modification_time + unused_dir_hide_timeout_sec)
return false;
LOG_INFO(log, "Removing access rights for unused directory {} from disk {} (will remove it when timeout exceed)", unused_dir, disk_name);
/// Explicitly update modification time just in case
disk->setLastModified(unused_dir, Poco::Timestamp::fromEpochTime(current_time));
/// Remove all access right
disk->chmod(unused_dir, 0);
return true;
}
else
{
if (!unused_dir_rm_timeout_sec)
return false;
if (current_time <= max_modification_time + unused_dir_rm_timeout_sec)
return false;
LOG_INFO(log, "Removing unused directory {} from disk {}", unused_dir, disk_name);
/// We have to set these access rights to make recursive removal work
disk->chmod(unused_dir, S_IRWXU);
disk->removeRecursive(unused_dir);
return true;
}
}
catch (...)
{
tryLogCurrentException(log, fmt::format("Failed to remove unused directory {} from disk {} ({})",
unused_dir, disk->getName(), disk->getPath()));
return false;
}
if (st.st_uid != geteuid())
{
/// Directory is not owned by clickhouse, it's weird, let's ignore it (chmod will likely fail anyway).
LOG_WARNING(log, "Found directory {} with unexpected owner (uid={})", unused_dir.string(), st.st_uid);
return false;
}
time_t max_modification_time = std::max(st.st_atime, std::max(st.st_mtime, st.st_ctime));
time_t current_time = time(nullptr);
if (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))
{
if (current_time <= max_modification_time + unused_dir_hide_timeout_sec)
return false;
LOG_INFO(log, "Removing access rights for unused directory {} (will remove it when timeout exceed)", unused_dir.string());
/// Explicitly update modification time just in case
struct utimbuf tb;
tb.actime = current_time;
tb.modtime = current_time;
if (utime(unused_dir.string().c_str(), &tb) != 0)
LOG_ERROR(log, "Failed to utime {}, errno: {}", unused_dir.string(), errno);
/// Remove all access right
if (chmod(unused_dir.string().c_str(), 0))
LOG_ERROR(log, "Failed to chmod {}, errno: {}", unused_dir.string(), errno);
return true;
}
else
{
if (!unused_dir_rm_timeout_sec)
return false;
if (current_time <= max_modification_time + unused_dir_rm_timeout_sec)
return false;
LOG_INFO(log, "Removing unused directory {}", unused_dir.string());
/// We have to set these access rights to make recursive removal work
if (chmod(unused_dir.string().c_str(), S_IRWXU))
LOG_ERROR(log, "Failed to chmod {}, errno: {}", unused_dir.string(), errno);
fs::remove_all(unused_dir);
return true;
}
}
static void maybeUnlockUUID(UUID uuid)

View File

@ -31,10 +31,12 @@ class IDatabase;
class Exception;
class ColumnsDescription;
struct ConstraintsDescription;
class IDisk;
using DatabasePtr = std::shared_ptr<IDatabase>;
using DatabaseAndTable = std::pair<DatabasePtr, StoragePtr>;
using Databases = std::map<String, std::shared_ptr<IDatabase>>;
using DiskPtr = std::shared_ptr<IDisk>;
/// Table -> set of table-views that make SELECT from it.
using ViewDependencies = std::map<StorageID, std::set<StorageID>>;
@ -271,7 +273,7 @@ private:
void dropTableFinally(const TableMarkedAsDropped & table);
void cleanupStoreDirectoryTask();
bool maybeRemoveDirectory(const fs::path & unused_dir);
bool maybeRemoveDirectory(const String & disk_name, const DiskPtr & disk, const String & unused_dir);
static constexpr size_t reschedule_time_ms = 100;
static constexpr time_t drop_error_cooldown_sec = 5;

View File

@ -90,4 +90,8 @@
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
<database_catalog_unused_dir_hide_timeout_sec>0</database_catalog_unused_dir_hide_timeout_sec>
<database_catalog_unused_dir_rm_timeout_sec>60</database_catalog_unused_dir_rm_timeout_sec>
<database_catalog_unused_dir_cleanup_period_sec>1</database_catalog_unused_dir_cleanup_period_sec>
</clickhouse>

View File

@ -20,6 +20,7 @@ def cluster():
"configs/config.d/storage_conf.xml",
"configs/config.d/bg_processing_pool_conf.xml",
],
stay_alive=True,
with_minio=True,
)
@ -712,3 +713,27 @@ def test_cache_with_full_disk_space(cluster, node_name):
"Insert into cache is skipped due to insufficient disk space"
)
node.query("DROP TABLE IF EXISTS s3_test NO DELAY")
@pytest.mark.parametrize("node_name", ["node"])
def test_store_cleanup_disk_s3(cluster, node_name):
node = cluster.instances[node_name]
node.query("DROP TABLE IF EXISTS s3_test SYNC")
node.query(
"CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';"
)
node.query("INSERT INTO s3_test SELECT 1")
node.stop_clickhouse(kill=True)
path_to_data = "/var/lib/clickhouse/"
node.exec_in_container(["rm", f"{path_to_data}/metadata/default/s3_test.sql"])
node.start_clickhouse()
node.wait_for_log_line(
"Removing unused directory", timeout=90, look_behind_lines=1000
)
node.wait_for_log_line("directories from store")
node.query(
"CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';"
)
node.query("INSERT INTO s3_test SELECT 1")