Merge pull request #38265 from ClickHouse/cleanup_garbage_in_store_dir

Add background cleanup of store/ subdirs
Alexander Tokmakov 2022-06-27 16:53:13 +03:00 committed by GitHub
commit bec921c9d9
21 changed files with 625 additions and 122 deletions

View File

@ -193,6 +193,35 @@ Sets the delay before removing table data, in seconds. If the query has `SYNC` modi
Default value: `480` (8 minutes).
## database_catalog_unused_dir_hide_timeout_sec {#database_catalog_unused_dir_hide_timeout_sec}
Parameter of a task that cleans up garbage from the `store/` directory.
If some subdirectory is not used by clickhouse-server and has not been modified for the last
`database_catalog_unused_dir_hide_timeout_sec` seconds, the task "hides" this directory by
removing all access rights. It also works for directories that clickhouse-server does not
expect to see inside `store/`. Zero means "immediately".
Default value: `3600` (1 hour).
## database_catalog_unused_dir_rm_timeout_sec {#database_catalog_unused_dir_rm_timeout_sec}
Parameter of a task that cleans up garbage from the `store/` directory.
If some subdirectory is not used by clickhouse-server, was previously "hidden"
(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec)),
and has not been modified for the last
`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory.
It also works for directories that clickhouse-server does not
expect to see inside `store/`. Zero means "never".
Default value: `2592000` (30 days).
## database_catalog_unused_dir_cleanup_period_sec {#database_catalog_unused_dir_cleanup_period_sec}
Parameter of the task that cleans up garbage from the `store/` directory.
Sets the scheduling period of the task. Zero means "never".
Default value: `86400` (1 day).
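
Taken together, these settings drive a two-phase cleanup: an unused directory is first hidden (all access rights revoked) once it has been untouched for `database_catalog_unused_dir_hide_timeout_sec`, and removed only after it has stayed hidden and untouched for another `database_catalog_unused_dir_rm_timeout_sec`. A minimal sketch of the per-directory decision, condensed from `DatabaseCatalog::maybeRemoveDirectory` below (the names here are illustrative, not ClickHouse API; in the real task `last_modified` is the newest of the directory's atime/mtime/ctime):

```cpp
#include <ctime>

enum class Action { None, Hide, Remove };

/// `hidden` means the directory already has all access rights removed (mode 000).
Action decideCleanup(time_t last_modified, time_t now, bool hidden,
                     time_t hide_timeout, time_t rm_timeout)
{
    if (!hidden)
        return now > last_modified + hide_timeout ? Action::Hide : Action::None;
    if (rm_timeout == 0)    /// zero means "never"
        return Action::None;
    return now > last_modified + rm_timeout ? Action::Remove : Action::None;
}
```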
## default_database {#default-database}
The default database.

View File

@ -153,15 +153,7 @@ namespace
bool tryParseUUID(const String & str, UUID & id)
{
try
{
id = parseFromString<UUID>(str);
return true;
}
catch (...)
{
return false;
}
return tryParse(id, str);
}
}
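
The exception-based fallback is replaced with the non-throwing `tryParse`, which matters when scanning `store/`, where names that fail to parse as UUIDs are expected rather than exceptional. This is enabled by the `tryReadText(UUID &, ReadBuffer &)` overload added in ReadHelpers.h below. A minimal sketch of the calling pattern (`dir_name` and `process` are hypothetical):

```cpp
UUID uuid;
/// Returns false on malformed input instead of throwing.
if (tryParse(uuid, dir_name))
    process(uuid);
```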

View File

@ -1,12 +1,10 @@
#include "filesystemHelpers.h"
#if defined(OS_LINUX)
# include <cstdio>
# include <mntent.h>
# include <sys/sysmacros.h>
#endif
#include <cerrno>
#include <Poco/Version.h>
#include <Poco/Timestamp.h>
#include <filesystem>
#include <fcntl.h>

View File

@ -292,7 +292,6 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
{
DetachedTables not_in_use;
auto table_data_path = getTableDataPath(query);
bool locked_uuid = false;
try
{
std::unique_lock lock{mutex};
@ -302,9 +301,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
/// Do some checks before renaming file from .tmp to .sql
not_in_use = cleanupDetachedTables();
assertDetachedTableNotInUse(query.uuid);
/// We will get an exception if some table with the same UUID exists (even if it's a detached table or a table from another database)
DatabaseCatalog::instance().addUUIDMapping(query.uuid);
locked_uuid = true;
chassert(DatabaseCatalog::instance().hasUUIDMapping(query.uuid));
auto txn = query_context->getZooKeeperMetadataTransaction();
if (txn && !query_context->isInternalSubquery())
@ -321,8 +318,6 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
catch (...)
{
fs::remove(table_metadata_tmp_path);
if (locked_uuid)
DatabaseCatalog::instance().removeUUIDMappingFinally(query.uuid);
throw;
}
if (table->storesDataOnDisk())

View File

@ -395,6 +395,18 @@ void DatabaseOnDisk::renameTable(
if (auto * target_db = dynamic_cast<DatabaseOnDisk *>(&to_database))
target_db->checkMetadataFilenameAvailability(to_table_name);
/// This place is actually quite dangerous. Since the data directory is moved to store/,
/// DatabaseCatalog may try to clean it up as unused. We add a UUID mapping to avoid this.
/// However, we may fail after the data directory is moved, but before the metadata file is created in the destination db.
/// In this case nothing will protect the data directory (except the 30-day timeout).
/// But this situation (when a table in an Ordinary database is partially renamed) requires manual intervention anyway.
if (from_ordinary_to_atomic)
{
DatabaseCatalog::instance().addUUIDMapping(create.uuid);
if (table->storesDataOnDisk())
LOG_INFO(log, "Moving table from {} to {}", table_data_relative_path, to_database.getTableDataPath(create));
}
/// Notify the table that it is renamed. It will move data to new path (if it stores data on disk) and update StorageID
table->rename(to_database.getTableDataPath(create), StorageID(create));
}

View File

@ -29,6 +29,12 @@ namespace fs = std::filesystem;
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;
namespace
@ -83,7 +89,7 @@ void DatabaseOrdinary::loadStoredObjects(
*/
ParsedTablesMetadata metadata;
loadTablesMetadata(local_context, metadata);
loadTablesMetadata(local_context, metadata, force_attach);
size_t total_tables = metadata.parsed_tables.size() - metadata.total_dictionaries;
@ -151,12 +157,12 @@ void DatabaseOrdinary::loadStoredObjects(
}
}
void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTablesMetadata & metadata)
void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTablesMetadata & metadata, bool is_startup)
{
size_t prev_tables_count = metadata.parsed_tables.size();
size_t prev_total_dictionaries = metadata.total_dictionaries;
auto process_metadata = [&metadata, this](const String & file_name)
auto process_metadata = [&metadata, is_startup, this](const String & file_name)
{
fs::path path(getMetadataPath());
fs::path file_path(file_name);
@ -170,12 +176,26 @@ void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTables
auto * create_query = ast->as<ASTCreateQuery>();
create_query->setDatabase(database_name);
/// Even if we don't load the table we can still mark the uuid of it as taken.
if (create_query->uuid != UUIDHelpers::Nil)
{
/// A slightly tricky way to distinguish ATTACH DATABASE from server startup (it's actually the "force_attach" flag).
if (is_startup)
{
/// Server is starting up. Lock UUID used by permanently detached table.
DatabaseCatalog::instance().addUUIDMapping(create_query->uuid);
}
else if (!DatabaseCatalog::instance().hasUUIDMapping(create_query->uuid))
{
/// It's ATTACH DATABASE. The UUID of a permanently detached table must already be locked.
/// FIXME MaterializedPostgreSQL works with UUIDs incorrectly and breaks invariants
if (getEngineName() != "MaterializedPostgreSQL")
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find UUID mapping for {}, it's a bug", create_query->uuid);
}
}
if (fs::exists(full_path.string() + detached_suffix))
{
/// FIXME: even if we don't load the table we can still mark the uuid of it as taken.
/// if (create_query->uuid != UUIDHelpers::Nil)
/// DatabaseCatalog::instance().addUUIDMapping(create_query->uuid);
const std::string table_name = unescapeForFileName(file_name.substr(0, file_name.size() - 4));
LOG_DEBUG(log, "Skipping permanently detached table {}.", backQuote(table_name));
return;

View File

@ -25,7 +25,7 @@ public:
bool supportsLoadingInTopologicalOrder() const override { return true; }
void loadTablesMetadata(ContextPtr context, ParsedTablesMetadata & metadata) override;
void loadTablesMetadata(ContextPtr context, ParsedTablesMetadata & metadata, bool is_startup) override;
void loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast, bool force_restore) override;

View File

@ -148,7 +148,7 @@ public:
{
}
virtual void loadTablesMetadata(ContextPtr /*local_context*/, ParsedTablesMetadata & /*metadata*/)
virtual void loadTablesMetadata(ContextPtr /*local_context*/, ParsedTablesMetadata & /*metadata*/, bool /*is_startup*/)
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented");
}

View File

@ -259,6 +259,7 @@ void DatabaseMaterializedPostgreSQL::createTable(ContextPtr local_context, const
auto * create_query = assert_cast<ASTCreateQuery *>(query_copy.get());
create_query->attach = false;
create_query->attach_short_syntax = false;
DatabaseCatalog::instance().addUUIDMapping(create->uuid);
DatabaseAtomic::createTable(StorageMaterializedPostgreSQL::makeNestedTableContext(local_context), table_name, table, query_copy);
/// Attach MaterializedPostgreSQL table.

View File

@ -93,7 +93,7 @@ void TablesLoader::loadTables()
for (auto & database_name : databases_to_load)
{
databases[database_name]->beforeLoadingMetadata(global_context, force_restore, force_attach);
databases[database_name]->loadTablesMetadata(global_context, metadata);
databases[database_name]->loadTablesMetadata(global_context, metadata, force_attach);
}
LOG_INFO(log, "Parsed metadata of {} tables in {} databases in {} sec",

View File

@ -1065,6 +1065,8 @@ inline bool tryReadText(is_integer auto & x, ReadBuffer & buf)
return tryReadIntText(x, buf);
}
inline bool tryReadText(UUID & x, ReadBuffer & buf) { return tryReadUUIDText(x, buf); }
inline void readText(is_floating_point auto & x, ReadBuffer & buf) { readFloatText(x, buf); }
inline void readText(String & x, ReadBuffer & buf) { readEscapedString(x, buf); }

View File

@ -16,8 +16,12 @@
#include <Common/CurrentMetrics.h>
#include <Common/logger_useful.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <filesystem>
#include <Common/filesystemHelpers.h>
#include <Common/noexcept_scope.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <utime.h>
#include "config_core.h"
@ -27,12 +31,10 @@
#endif
#if USE_LIBPQXX
# include <Storages/PostgreSQL/StorageMaterializedPostgreSQL.h>
# include <Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.h>
# include <Storages/PostgreSQL/StorageMaterializedPostgreSQL.h>
#endif
namespace fs = std::filesystem;
namespace CurrentMetrics
{
extern const Metric TablesToDropQueueSize;
@ -77,6 +79,7 @@ TemporaryTableHolder::TemporaryTableHolder(ContextPtr context_, const TemporaryT
}
auto table_id = StorageID(DatabaseCatalog::TEMPORARY_DATABASE, global_name, id);
auto table = creator(table_id);
DatabaseCatalog::instance().addUUIDMapping(id);
temporary_tables->createTable(getContext(), global_name, table, original_create);
table->startup();
}
@ -143,6 +146,9 @@ StoragePtr TemporaryTableHolder::getTable() const
void DatabaseCatalog::initializeAndLoadTemporaryDatabase()
{
drop_delay_sec = getContext()->getConfigRef().getInt("database_atomic_delay_before_drop_table_sec", default_drop_delay_sec);
unused_dir_hide_timeout_sec = getContext()->getConfigRef().getInt("database_catalog_unused_dir_hide_timeout_sec", unused_dir_hide_timeout_sec);
unused_dir_rm_timeout_sec = getContext()->getConfigRef().getInt("database_catalog_unused_dir_rm_timeout_sec", unused_dir_rm_timeout_sec);
unused_dir_cleanup_period_sec = getContext()->getConfigRef().getInt("database_catalog_unused_dir_cleanup_period_sec", unused_dir_cleanup_period_sec);
auto db_for_temporary_and_external_tables = std::make_shared<DatabaseMemory>(TEMPORARY_DATABASE, getContext());
attachDatabase(TEMPORARY_DATABASE, db_for_temporary_and_external_tables);
@ -150,6 +156,16 @@ void DatabaseCatalog::initializeAndLoadTemporaryDatabase()
void DatabaseCatalog::loadDatabases()
{
if (Context::getGlobalContextInstance()->getApplicationType() == Context::ApplicationType::SERVER && unused_dir_cleanup_period_sec)
{
auto cleanup_task_holder
= getContext()->getSchedulePool().createTask("DatabaseCatalog", [this]() { this->cleanupStoreDirectoryTask(); });
cleanup_task = std::make_unique<BackgroundSchedulePoolTaskHolder>(std::move(cleanup_task_holder));
(*cleanup_task)->activate();
/// Do not start task immediately on server startup, it's not urgent.
(*cleanup_task)->scheduleAfter(unused_dir_hide_timeout_sec * 1000);
}
auto task_holder = getContext()->getSchedulePool().createTask("DatabaseCatalog", [this](){ this->dropTableDataTask(); });
drop_task = std::make_unique<BackgroundSchedulePoolTaskHolder>(std::move(task_holder));
(*drop_task)->activate();
@ -166,6 +182,9 @@ void DatabaseCatalog::shutdownImpl()
{
TemporaryLiveViewCleaner::shutdown();
if (cleanup_task)
(*cleanup_task)->deactivate();
if (drop_task)
(*drop_task)->deactivate();
@ -189,19 +208,25 @@ void DatabaseCatalog::shutdownImpl()
tables_marked_dropped.clear();
std::lock_guard lock(databases_mutex);
for (const auto & db : databases)
{
UUID db_uuid = db.second->getUUID();
if (db_uuid != UUIDHelpers::Nil)
removeUUIDMapping(db_uuid);
}
assert(std::find_if(uuid_map.begin(), uuid_map.end(), [](const auto & elem)
{
/// Ensure that all UUID mappings are empty (i.e. all mappings contain nullptr instead of a pointer to storage)
const auto & not_empty_mapping = [] (const auto & mapping)
{
auto & db = mapping.second.first;
auto & table = mapping.second.second;
return table;
return db || table;
};
auto it = std::find_if(elem.map.begin(), elem.map.end(), not_empty_mapping);
return it != elem.map.end();
}) == uuid_map.end());
databases.clear();
db_uuid_map.clear();
view_dependencies.clear();
}
@ -331,9 +356,10 @@ void DatabaseCatalog::attachDatabase(const String & database_name, const Databas
std::lock_guard lock{databases_mutex};
assertDatabaseDoesntExistUnlocked(database_name);
databases.emplace(database_name, database);
NOEXCEPT_SCOPE;
UUID db_uuid = database->getUUID();
if (db_uuid != UUIDHelpers::Nil)
db_uuid_map.emplace(db_uuid, database);
addUUIDMapping(db_uuid, database, nullptr);
}
@ -347,7 +373,9 @@ DatabasePtr DatabaseCatalog::detachDatabase(ContextPtr local_context, const Stri
std::lock_guard lock{databases_mutex};
assertDatabaseExistsUnlocked(database_name);
db = databases.find(database_name)->second;
db_uuid_map.erase(db->getUUID());
UUID db_uuid = db->getUUID();
if (db_uuid != UUIDHelpers::Nil)
removeUUIDMapping(db_uuid);
databases.erase(database_name);
}
@ -372,6 +400,8 @@ DatabasePtr DatabaseCatalog::detachDatabase(ContextPtr local_context, const Stri
if (drop)
{
UUID db_uuid = db->getUUID();
/// Delete the database.
db->drop(local_context);
@ -381,6 +411,9 @@ DatabasePtr DatabaseCatalog::detachDatabase(ContextPtr local_context, const Stri
fs::remove(database_metadata_dir);
fs::path database_metadata_file = fs::path(getContext()->getPath()) / "metadata" / (escapeForFileName(database_name) + ".sql");
fs::remove(database_metadata_file);
if (db_uuid != UUIDHelpers::Nil)
removeUUIDMappingFinally(db_uuid);
}
return db;
@ -427,21 +460,19 @@ DatabasePtr DatabaseCatalog::tryGetDatabase(const String & database_name) const
DatabasePtr DatabaseCatalog::getDatabase(const UUID & uuid) const
{
std::lock_guard lock{databases_mutex};
auto it = db_uuid_map.find(uuid);
if (it == db_uuid_map.end())
auto db_and_table = tryGetByUUID(uuid);
if (!db_and_table.first || db_and_table.second)
throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database UUID {} does not exist", toString(uuid));
return it->second;
return db_and_table.first;
}
DatabasePtr DatabaseCatalog::tryGetDatabase(const UUID & uuid) const
{
assert(uuid != UUIDHelpers::Nil);
std::lock_guard lock{databases_mutex};
auto it = db_uuid_map.find(uuid);
if (it == db_uuid_map.end())
auto db_and_table = tryGetByUUID(uuid);
if (!db_and_table.first || db_and_table.second)
return {};
return it->second;
return db_and_table.first;
}
bool DatabaseCatalog::isDatabaseExist(const String & database_name) const
@ -496,18 +527,22 @@ void DatabaseCatalog::addUUIDMapping(const UUID & uuid)
void DatabaseCatalog::addUUIDMapping(const UUID & uuid, const DatabasePtr & database, const StoragePtr & table)
{
assert(uuid != UUIDHelpers::Nil && getFirstLevelIdx(uuid) < uuid_map.size());
assert((database && table) || (!database && !table));
assert(database || !table);
UUIDToStorageMapPart & map_part = uuid_map[getFirstLevelIdx(uuid)];
std::lock_guard lock{map_part.mutex};
auto [it, inserted] = map_part.map.try_emplace(uuid, database, table);
if (inserted)
{
/// Mapping must be locked before actually inserting something
chassert((!database && !table));
return;
}
auto & prev_database = it->second.first;
auto & prev_table = it->second.second;
assert((prev_database && prev_table) || (!prev_database && !prev_table));
assert(prev_database || !prev_table);
if (!prev_table && table)
if (!prev_database && database)
{
/// It's an empty mapping, created to "lock" the UUID and prevent collisions. Just update it.
prev_database = database;
@ -515,8 +550,8 @@ void DatabaseCatalog::addUUIDMapping(const UUID & uuid, const DatabasePtr & data
return;
}
/// We are trying to replace an existing mapping (prev_table != nullptr), it's a logical error
if (table)
/// We are trying to replace an existing mapping (prev_database != nullptr), it's a logical error
if (database || table)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mapping for table with UUID={} already exists", toString(uuid));
/// Normally this should never happen, but it's possible when the same UUIDs are explicitly specified in different CREATE queries,
/// so it's not LOGICAL_ERROR
@ -560,6 +595,14 @@ void DatabaseCatalog::updateUUIDMapping(const UUID & uuid, DatabasePtr database,
prev_table = std::move(table);
}
bool DatabaseCatalog::hasUUIDMapping(const UUID & uuid)
{
assert(uuid != UUIDHelpers::Nil && getFirstLevelIdx(uuid) < uuid_map.size());
UUIDToStorageMapPart & map_part = uuid_map[getFirstLevelIdx(uuid)];
std::lock_guard lock{map_part.mutex};
return map_part.map.contains(uuid);
}
std::unique_ptr<DatabaseCatalog> DatabaseCatalog::database_catalog;
DatabaseCatalog::DatabaseCatalog(ContextMutablePtr global_context_)
@ -701,6 +744,8 @@ DatabaseAndTable DatabaseCatalog::tryGetDatabaseAndTable(const StorageID & table
void DatabaseCatalog::loadMarkedAsDroppedTables()
{
assert(!cleanup_task);
/// /clickhouse_root/metadata_dropped/ contains files with metadata of tables,
/// which were marked as dropped by Atomic databases.
/// Data directories of such tables still exist in store/
@ -780,6 +825,7 @@ void DatabaseCatalog::enqueueDroppedTableCleanup(StorageID table_id, StoragePtr
time_t drop_time;
if (table)
{
chassert(hasUUIDMapping(table_id.uuid));
drop_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
table->is_dropped = true;
}
@ -1072,6 +1118,166 @@ void DatabaseCatalog::updateLoadingDependencies(const StorageID & table_id, Tabl
old_dependencies = std::move(new_dependencies);
}
void DatabaseCatalog::cleanupStoreDirectoryTask()
{
fs::path store_path = fs::path(getContext()->getPath()) / "store";
size_t affected_dirs = 0;
for (const auto & prefix_dir : fs::directory_iterator{store_path})
{
String prefix = prefix_dir.path().filename();
bool expected_prefix_dir = prefix_dir.is_directory() &&
prefix.size() == 3 &&
isHexDigit(prefix[0]) &&
isHexDigit(prefix[1]) &&
isHexDigit(prefix[2]);
if (!expected_prefix_dir)
{
LOG_WARNING(log, "Found invalid directory {}, will try to remove it", prefix_dir.path().string());
affected_dirs += maybeRemoveDirectory(prefix_dir.path());
continue;
}
for (const auto & uuid_dir : fs::directory_iterator{prefix_dir.path()})
{
String uuid_str = uuid_dir.path().filename();
UUID uuid;
bool parsed = tryParse(uuid, uuid_str);
bool expected_dir = uuid_dir.is_directory() &&
parsed &&
uuid != UUIDHelpers::Nil &&
uuid_str.starts_with(prefix);
if (!expected_dir)
{
LOG_WARNING(log, "Found invalid directory {}, will try to remove it", uuid_dir.path().string());
affected_dirs += maybeRemoveDirectory(uuid_dir.path());
continue;
}
/// Order is important
if (!hasUUIDMapping(uuid))
{
/// We load uuids even for detached and permanently detached tables,
/// so it looks safe enough to remove a directory if we don't have a uuid mapping for it.
/// No table or database using this directory should appear concurrently,
/// because creation of a new table would fail with "directory already exists".
affected_dirs += maybeRemoveDirectory(uuid_dir.path());
}
}
}
if (affected_dirs)
LOG_INFO(log, "Cleaned up {} directories from store/", affected_dirs);
(*cleanup_task)->scheduleAfter(unused_dir_cleanup_period_sec * 1000);
}
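
The validation above mirrors how these paths are generated in the first place: the prefix directory is the first three hex characters of the UUID. A hedged sketch of what `DatabaseCatalog::getPathForUUID` presumably produces (its implementation is not part of this diff):

```cpp
/// e.g. 10000000-1000-4000-8000-000000000001 -> "100/10000000-1000-4000-8000-000000000001/"
String getPathForUUID(const UUID & uuid)
{
    String uuid_str = toString(uuid);
    return uuid_str.substr(0, 3) + '/' + uuid_str + '/';
}
```

So a prefix directory that is not exactly three hex digits, or a subdirectory whose name does not start with its parent prefix, can only be garbage.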
bool DatabaseCatalog::maybeRemoveDirectory(const fs::path & unused_dir)
{
/// "Safe" automatic removal of some directory.
/// At first we do not remove anything and only revoke all access right.
/// And remove only if nobody noticed it after, for example, one month.
struct stat st;
if (stat(unused_dir.string().c_str(), &st))
{
LOG_ERROR(log, "Failed to stat {}, errno: {}", unused_dir.string(), errno);
return false;
}
if (st.st_uid != geteuid())
{
/// Directory is not owned by clickhouse, it's weird, let's ignore it (chmod will likely fail anyway).
LOG_WARNING(log, "Found directory {} with unexpected owner (uid={})", unused_dir.string(), st.st_uid);
return false;
}
time_t max_modification_time = std::max(st.st_atime, std::max(st.st_mtime, st.st_ctime));
time_t current_time = time(nullptr);
if (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))
{
if (current_time <= max_modification_time + unused_dir_hide_timeout_sec)
return false;
LOG_INFO(log, "Removing access rights for unused directory {} (will remove it when timeout exceed)", unused_dir.string());
/// Explicitly update modification time just in case
struct utimbuf tb;
tb.actime = current_time;
tb.modtime = current_time;
if (utime(unused_dir.string().c_str(), &tb) != 0)
LOG_ERROR(log, "Failed to utime {}, errno: {}", unused_dir.string(), errno);
/// Remove all access rights
if (chmod(unused_dir.string().c_str(), 0))
LOG_ERROR(log, "Failed to chmod {}, errno: {}", unused_dir.string(), errno);
return true;
}
else
{
if (!unused_dir_rm_timeout_sec)
return false;
if (current_time <= max_modification_time + unused_dir_rm_timeout_sec)
return false;
LOG_INFO(log, "Removing unused directory {}", unused_dir.string());
/// We have to set these access rights to make recursive removal work
if (chmod(unused_dir.string().c_str(), S_IRWXU))
LOG_ERROR(log, "Failed to chmod {}, errno: {}", unused_dir.string(), errno);
fs::remove_all(unused_dir);
return true;
}
}
static void maybeUnlockUUID(UUID uuid)
{
if (uuid == UUIDHelpers::Nil)
return;
chassert(DatabaseCatalog::instance().hasUUIDMapping(uuid));
auto db_and_table = DatabaseCatalog::instance().tryGetByUUID(uuid);
if (!db_and_table.first && !db_and_table.second)
{
DatabaseCatalog::instance().removeUUIDMappingFinally(uuid);
return;
}
chassert(db_and_table.first || !db_and_table.second);
}
TemporaryLockForUUIDDirectory::TemporaryLockForUUIDDirectory(UUID uuid_)
: uuid(uuid_)
{
if (uuid != UUIDHelpers::Nil)
DatabaseCatalog::instance().addUUIDMapping(uuid);
}
TemporaryLockForUUIDDirectory::~TemporaryLockForUUIDDirectory()
{
maybeUnlockUUID(uuid);
}
TemporaryLockForUUIDDirectory::TemporaryLockForUUIDDirectory(TemporaryLockForUUIDDirectory && rhs) noexcept
: uuid(rhs.uuid)
{
rhs.uuid = UUIDHelpers::Nil;
}
TemporaryLockForUUIDDirectory & TemporaryLockForUUIDDirectory::operator = (TemporaryLockForUUIDDirectory && rhs) noexcept
{
maybeUnlockUUID(uuid);
uuid = rhs.uuid;
rhs.uuid = UUIDHelpers::Nil;
return *this;
}
DDLGuard::DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock<std::mutex> guards_lock_, const String & elem, const String & database_name)
: map(map_), db_mutex(db_mutex_), guards_lock(std::move(guards_lock_))

View File

@ -20,7 +20,9 @@
#include <shared_mutex>
#include <unordered_map>
#include <unordered_set>
#include <filesystem>
namespace fs = std::filesystem;
namespace DB
{
@ -203,6 +205,8 @@ public:
/// this method will throw an exception.
void addUUIDMapping(const UUID & uuid);
bool hasUUIDMapping(const UUID & uuid);
static String getPathForUUID(const UUID & uuid);
DatabaseAndTable tryGetByUUID(const UUID & uuid) const;
@ -261,17 +265,17 @@ private:
void dropTableDataTask();
void dropTableFinally(const TableMarkedAsDropped & table);
void cleanupStoreDirectoryTask();
bool maybeRemoveDirectory(const fs::path & unused_dir);
static constexpr size_t reschedule_time_ms = 100;
static constexpr time_t drop_error_cooldown_sec = 5;
using UUIDToDatabaseMap = std::unordered_map<UUID, DatabasePtr>;
mutable std::mutex databases_mutex;
ViewDependencies view_dependencies;
Databases databases;
UUIDToDatabaseMap db_uuid_map;
UUIDToStorageMap uuid_map;
DependenciesInfos loading_dependencies;
@ -298,6 +302,33 @@ private:
static constexpr time_t default_drop_delay_sec = 8 * 60;
time_t drop_delay_sec = default_drop_delay_sec;
std::condition_variable wait_table_finally_dropped;
std::unique_ptr<BackgroundSchedulePoolTaskHolder> cleanup_task;
static constexpr time_t default_unused_dir_hide_timeout_sec = 60 * 60; /// 1 hour
time_t unused_dir_hide_timeout_sec = default_unused_dir_hide_timeout_sec;
static constexpr time_t default_unused_dir_rm_timeout_sec = 30 * 24 * 60 * 60; /// 30 days
time_t unused_dir_rm_timeout_sec = default_unused_dir_rm_timeout_sec;
static constexpr time_t default_unused_dir_cleanup_period_sec = 24 * 60 * 60; /// 1 day
time_t unused_dir_cleanup_period_sec = default_unused_dir_cleanup_period_sec;
};
/// This class is useful when creating a table or a database.
/// Usually we create the IStorage/IDatabase object first and then add it to the IDatabase/DatabaseCatalog.
/// But such an object may start using a directory in store/ as soon as it is created.
/// To avoid a race with cleanupStoreDirectoryTask() we have to mark the UUID as used first.
/// Then we can either add the DatabasePtr/StoragePtr to the created UUID mapping
/// or remove the lock if creation failed.
/// See also addUUIDMapping(...)
class TemporaryLockForUUIDDirectory : private boost::noncopyable
{
UUID uuid = UUIDHelpers::Nil;
public:
TemporaryLockForUUIDDirectory() = default;
TemporaryLockForUUIDDirectory(UUID uuid_);
~TemporaryLockForUUIDDirectory();
TemporaryLockForUUIDDirectory(TemporaryLockForUUIDDirectory && rhs) noexcept;
TemporaryLockForUUIDDirectory & operator = (TemporaryLockForUUIDDirectory && rhs) noexcept;
};
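
A sketch of the intended usage (a hypothetical call site; the real ones are in InterpreterCreateQuery below). The lock is taken before any directory under store/ can appear, and the destructor removes the mapping only if it was never filled with a real database/table:

```cpp
TemporaryLockForUUIDDirectory uuid_lock{create.uuid};  /// marks the UUID as used
StoragePtr table = constructStorage();                 /// hypothetical; may create store/xxx/<uuid>/
/// Fill the mapping; once it is non-empty, the destructor of uuid_lock leaves it intact.
DatabaseCatalog::instance().addUUIDMapping(create.uuid, database, table);
/// If constructStorage() throws instead, ~TemporaryLockForUUIDDirectory removes the
/// still-empty mapping, so cleanupStoreDirectoryTask() may later collect the orphaned directory.
```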
}

View File

@ -249,13 +249,22 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
"Enable allow_experimental_database_materialized_postgresql to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE);
}
bool need_write_metadata = !create.attach || !fs::exists(metadata_file_path);
bool need_lock_uuid = internal || need_write_metadata;
/// Lock the uuid, so we will know it's already in use.
/// We do it when attaching databases on server startup (internal) and on CREATE query (!create.attach).
TemporaryLockForUUIDDirectory uuid_lock;
if (need_lock_uuid)
uuid_lock = TemporaryLockForUUIDDirectory{create.uuid};
else if (create.uuid != UUIDHelpers::Nil && !DatabaseCatalog::instance().hasUUIDMapping(create.uuid))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find UUID mapping for {}, it's a bug", create.uuid);
DatabasePtr database = DatabaseFactory::get(create, metadata_path / "", getContext());
if (create.uuid != UUIDHelpers::Nil)
create.setDatabase(TABLE_WITH_UUID_NAME_PLACEHOLDER);
bool need_write_metadata = !create.attach || !fs::exists(metadata_file_path);
if (need_write_metadata)
{
create.attach = true;
@ -1163,70 +1172,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
const InterpreterCreateQuery::TableProperties & properties)
{
std::unique_ptr<DDLGuard> guard;
String data_path;
DatabasePtr database;
bool need_add_to_database = !create.temporary;
if (need_add_to_database)
{
/** If the request specifies IF NOT EXISTS, we allow concurrent CREATE queries (which do nothing).
* If the table doesn't exist, one thread creates the table while the others wait in DDLGuard.
*/
guard = DatabaseCatalog::instance().getDDLGuard(create.getDatabase(), create.getTable());
database = DatabaseCatalog::instance().getDatabase(create.getDatabase());
assertOrSetUUID(create, database);
String storage_name = create.is_dictionary ? "Dictionary" : "Table";
auto storage_already_exists_error_code = create.is_dictionary ? ErrorCodes::DICTIONARY_ALREADY_EXISTS : ErrorCodes::TABLE_ALREADY_EXISTS;
/// The table could have been created earlier, or concurrently in another thread while we were waiting in DDLGuard.
if (database->isTableExist(create.getTable(), getContext()))
{
/// TODO Check structure of table
if (create.if_not_exists)
return false;
else if (create.replace_view)
{
/// when executing CREATE OR REPLACE VIEW, drop current existing view
auto drop_ast = std::make_shared<ASTDropQuery>();
drop_ast->setDatabase(create.getDatabase());
drop_ast->setTable(create.getTable());
drop_ast->no_ddl_lock = true;
auto drop_context = Context::createCopy(context);
InterpreterDropQuery interpreter(drop_ast, drop_context);
interpreter.execute();
}
else
throw Exception(storage_already_exists_error_code,
"{} {}.{} already exists", storage_name, backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable()));
}
else if (!create.attach)
{
/// Check whether the table exists in a detached or permanently detached state
try
{
database->checkMetadataFilenameAvailability(create.getTable());
}
catch (const Exception &)
{
if (create.if_not_exists)
return false;
throw;
}
}
data_path = database->getTableDataPath(create);
if (!create.attach && !data_path.empty() && fs::exists(fs::path{getContext()->getPath()} / data_path))
throw Exception(storage_already_exists_error_code,
"Directory for {} data {} already exists", Poco::toLower(storage_name), String(data_path));
}
else
if (create.temporary)
{
if (create.if_not_exists && getContext()->tryResolveStorageID({"", create.getTable()}, Context::ResolveExternal))
return false;
@ -1237,6 +1183,65 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
return true;
}
std::unique_ptr<DDLGuard> guard;
String data_path;
DatabasePtr database;
/** If the request specifies IF NOT EXISTS, we allow concurrent CREATE queries (which do nothing).
* If the table doesn't exist, one thread creates the table while the others wait in DDLGuard.
*/
guard = DatabaseCatalog::instance().getDDLGuard(create.getDatabase(), create.getTable());
database = DatabaseCatalog::instance().getDatabase(create.getDatabase());
assertOrSetUUID(create, database);
String storage_name = create.is_dictionary ? "Dictionary" : "Table";
auto storage_already_exists_error_code = create.is_dictionary ? ErrorCodes::DICTIONARY_ALREADY_EXISTS : ErrorCodes::TABLE_ALREADY_EXISTS;
/// The table could have been created earlier, or concurrently in another thread while we were waiting in DDLGuard.
if (database->isTableExist(create.getTable(), getContext()))
{
/// TODO Check structure of table
if (create.if_not_exists)
return false;
else if (create.replace_view)
{
/// when executing CREATE OR REPLACE VIEW, drop current existing view
auto drop_ast = std::make_shared<ASTDropQuery>();
drop_ast->setDatabase(create.getDatabase());
drop_ast->setTable(create.getTable());
drop_ast->no_ddl_lock = true;
auto drop_context = Context::createCopy(context);
InterpreterDropQuery interpreter(drop_ast, drop_context);
interpreter.execute();
}
else
throw Exception(storage_already_exists_error_code,
"{} {}.{} already exists", storage_name, backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable()));
}
else if (!create.attach)
{
/// Check whether the table exists in a detached or permanently detached state
try
{
database->checkMetadataFilenameAvailability(create.getTable());
}
catch (const Exception &)
{
if (create.if_not_exists)
return false;
throw;
}
}
data_path = database->getTableDataPath(create);
if (!create.attach && !data_path.empty() && fs::exists(fs::path{getContext()->getPath()} / data_path))
throw Exception(storage_already_exists_error_code,
"Directory for {} data {} already exists", Poco::toLower(storage_name), String(data_path));
bool from_path = create.attach_from_path.has_value();
String actual_data_path = data_path;
if (from_path)
@ -1261,6 +1266,19 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
database->checkDetachedTableNotInUse(create.uuid);
}
/// We should lock the UUID on a CREATE query (for ATTACH it must have been locked previously).
/// But ATTACH without the create.attach_short_syntax flag actually works like CREATE, which is why we check that flag.
bool need_lock_uuid = !create.attach_short_syntax;
TemporaryLockForUUIDDirectory uuid_lock;
if (need_lock_uuid)
uuid_lock = TemporaryLockForUUIDDirectory{create.uuid};
else if (create.uuid != UUIDHelpers::Nil && !DatabaseCatalog::instance().hasUUIDMapping(create.uuid))
{
/// FIXME MaterializedPostgreSQL works with UUIDs incorrectly and breaks invariants
if (database->getEngineName() != "MaterializedPostgreSQL")
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find UUID mapping for {}, it's a bug", create.uuid);
}
StoragePtr res;
/// NOTE: CREATE query may be rewritten by Storage creator or table function
if (create.as_table_function)

View File

@ -22,6 +22,7 @@ void attach(ContextPtr context, IDatabase & system_database, const String & tabl
/// NOTE: UUIDs are not persistent, but it's ok since no data are stored on disk for these storages
/// and path is actually not used
auto table_id = StorageID(DatabaseCatalog::SYSTEM_DATABASE, table_name, UUIDHelpers::generateV4());
DatabaseCatalog::instance().addUUIDMapping(table_id.uuid);
String path = "store/" + DatabaseCatalog::getPathForUUID(table_id.uuid);
system_database.attachTable(context, table_name, std::make_shared<StorageT>(table_id, std::forward<StorageArgs>(args)...), path);
}

View File

@ -2172,4 +2172,7 @@ if __name__ == "__main__":
if args.jobs is None:
args.jobs = multiprocessing.cpu_count()
if args.db_engine and args.db_engine == "Ordinary":
MESSAGES_TO_RETRY.append(" locking attempt on ")
main(args)

View File

@ -1,3 +1,8 @@
<clickhouse>
<database_atomic_delay_before_drop_table_sec>60</database_atomic_delay_before_drop_table_sec>
<!-- Aggressive cleanup for tests to catch more issues -->
<database_catalog_unused_dir_hide_timeout_sec>0</database_catalog_unused_dir_hide_timeout_sec>
<database_catalog_unused_dir_rm_timeout_sec>5</database_catalog_unused_dir_rm_timeout_sec>
<database_catalog_unused_dir_cleanup_period_sec>10</database_catalog_unused_dir_cleanup_period_sec>
</clickhouse>

View File

@ -0,0 +1,11 @@
<clickhouse>
<database_catalog_unused_dir_hide_timeout_sec>0</database_catalog_unused_dir_hide_timeout_sec>
<database_catalog_unused_dir_rm_timeout_sec>15</database_catalog_unused_dir_rm_timeout_sec>
<database_catalog_unused_dir_cleanup_period_sec>1</database_catalog_unused_dir_cleanup_period_sec>
<!-- We don't really need [Zoo]Keeper for this test.
And it makes sense to have at least one test with TestKeeper. -->
<zookeeper>
<implementation>testkeeper</implementation>
</zookeeper>
</clickhouse>

View File

@ -1,14 +1,15 @@
import pytest
from helpers.cluster import ClickHouseCluster
from multiprocessing.dummy import Pool
from helpers.corrupt_part_data_on_disk import corrupt_part_data_on_disk
from helpers.corrupt_part_data_on_disk import break_part
import time
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance("node1", stay_alive=True, with_zookeeper=True)
node1 = cluster.add_instance(
"node1", stay_alive=True, main_configs=["configs/store_cleanup.xml"]
)
path_to_data = "/var/lib/clickhouse/"
@ -147,3 +148,181 @@ def test_remove_broken_detached_part_replicated_merge_tree(started_cluster):
)
remove_broken_detached_part_impl("replicated_mt", node1, "broken")
def test_store_cleanup(started_cluster):
node1.query("CREATE DATABASE db UUID '10000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db.log UUID '10000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query(
"CREATE TABLE db.mt UUID '10000000-1000-4000-8000-000000000003' ENGINE=MergeTree ORDER BY tuple() AS SELECT 1"
)
node1.query(
"CREATE TABLE db.mem UUID '10000000-1000-4000-8000-000000000004' ENGINE=Memory AS SELECT 1"
)
node1.query("CREATE DATABASE db2 UUID '20000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db2.log UUID '20000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query("DETACH DATABASE db2")
node1.query("CREATE DATABASE db3 UUID '30000000-1000-4000-8000-000000000001'")
node1.query(
"CREATE TABLE db3.log UUID '30000000-1000-4000-8000-000000000002' ENGINE=Log AS SELECT 1"
)
node1.query(
"CREATE TABLE db3.log2 UUID '30000000-1000-4000-8000-000000000003' ENGINE=Log AS SELECT 1"
)
node1.query("DETACH TABLE db3.log")
node1.query("DETACH TABLE db3.log2 PERMANENTLY")
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
node1.stop_clickhouse(kill=True)
# All dirs related to `db` will be removed
node1.exec_in_container(["rm", f"{path_to_data}/metadata/db.sql"])
node1.exec_in_container(["mkdir", f"{path_to_data}/store/kek"])
node1.exec_in_container(["touch", f"{path_to_data}/store/12"])
node1.exec_in_container(["mkdir", f"{path_to_data}/store/456"])
node1.exec_in_container(["mkdir", f"{path_to_data}/store/456/testgarbage"])
node1.exec_in_container(
["mkdir", f"{path_to_data}/store/456/30000000-1000-4000-8000-000000000003"]
)
node1.exec_in_container(
["touch", f"{path_to_data}/store/456/45600000-1000-4000-8000-000000000003"]
)
node1.exec_in_container(
["mkdir", f"{path_to_data}/store/456/45600000-1000-4000-8000-000000000004"]
)
node1.start_clickhouse()
node1.query("DETACH DATABASE db2")
node1.query("DETACH TABLE db3.log")
node1.wait_for_log_line("Removing access rights for unused directory")
time.sleep(1)
node1.wait_for_log_line("testgarbage")
node1.wait_for_log_line("directories from store")
store = node1.exec_in_container(["ls", f"{path_to_data}/store"])
assert "100" in store
assert "200" in store
assert "300" in store
assert "456" in store
assert "kek" in store
assert "12" in store
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
)
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
)
# Metadata is removed, so store/100 contains garbage
store100 = node1.exec_in_container(["ls", f"{path_to_data}/store/100"])
assert "10000000-1000-4000-8000-000000000001" in store100
assert "10000000-1000-4000-8000-000000000002" in store100
assert "10000000-1000-4000-8000-000000000003" in store100
assert "d---------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
)
# Database is detached, nothing to clean up
store200 = node1.exec_in_container(["ls", f"{path_to_data}/store/200"])
assert "20000000-1000-4000-8000-000000000001" in store200
assert "20000000-1000-4000-8000-000000000002" in store200
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
# Tables are detached, nothing to clean up
store300 = node1.exec_in_container(["ls", f"{path_to_data}/store/300"])
assert "30000000-1000-4000-8000-000000000001" in store300
assert "30000000-1000-4000-8000-000000000002" in store300
assert "30000000-1000-4000-8000-000000000003" in store300
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
# Manually created garbage
store456 = node1.exec_in_container(["ls", f"{path_to_data}/store/456"])
assert "30000000-1000-4000-8000-000000000003" in store456
assert "45600000-1000-4000-8000-000000000003" in store456
assert "45600000-1000-4000-8000-000000000004" in store456
assert "testgarbage" in store456
assert "----------" in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
)
node1.wait_for_log_line("Removing unused directory")
time.sleep(1)
node1.wait_for_log_line("directories from store")
store = node1.exec_in_container(["ls", f"{path_to_data}/store"])
assert "100" in store
assert "200" in store
assert "300" in store
assert "456" in store
assert "kek" not in store # changed
assert "\n12\n" not in store # changed
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store"]
) # changed
# Metadata is removed, so store/100 contains garbage
store100 = node1.exec_in_container(["ls", f"{path_to_data}/store/100"]) # changed
assert "10000000-1000-4000-8000-000000000001" not in store100 # changed
assert "10000000-1000-4000-8000-000000000002" not in store100 # changed
assert "10000000-1000-4000-8000-000000000003" not in store100 # changed
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/100"]
) # changed
# Database is detached, nothing to clean up
store200 = node1.exec_in_container(["ls", f"{path_to_data}/store/200"])
assert "20000000-1000-4000-8000-000000000001" in store200
assert "20000000-1000-4000-8000-000000000002" in store200
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/200"]
)
# Tables are detached, nothing to clean up
store300 = node1.exec_in_container(["ls", f"{path_to_data}/store/300"])
assert "30000000-1000-4000-8000-000000000001" in store300
assert "30000000-1000-4000-8000-000000000002" in store300
assert "30000000-1000-4000-8000-000000000003" in store300
assert "d---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/300"]
)
# Manually created garbage
store456 = node1.exec_in_container(["ls", f"{path_to_data}/store/456"])
assert "30000000-1000-4000-8000-000000000003" not in store456 # changed
assert "45600000-1000-4000-8000-000000000003" not in store456 # changed
assert "45600000-1000-4000-8000-000000000004" not in store456 # changed
assert "testgarbage" not in store456 # changed
assert "---------" not in node1.exec_in_container(
["ls", "-l", f"{path_to_data}/store/456"]
) # changed
node1.query("ATTACH TABLE db3.log2")
node1.query("ATTACH DATABASE db2")
node1.query("ATTACH TABLE db3.log")
assert "1\n" == node1.query("SELECT * FROM db3.log")
assert "1\n" == node1.query("SELECT * FROM db3.log2")
assert "1\n" == node1.query("SELECT * FROM db2.log")

View File

@ -45,8 +45,8 @@ function check_replication_consistency()
while [[ $($CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query LIKE '%$table_name_prefix%'") -ne 1 ]]; do
sleep 0.5;
num_tries=$((num_tries+1))
if [ $num_tries -eq 100 ]; then
$CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query LIKE '%$table_name_prefix%' FORMAT Vertical"
if [ $num_tries -eq 200 ]; then
$CLICKHOUSE_CLIENT -q "SELECT * FROM system.processes WHERE current_database=currentDatabase() AND query LIKE '%$table_name_prefix%' FORMAT Vertical"
break
fi
done

View File

@ -138,7 +138,7 @@ function wait_for_queries_to_finish()
sleep 0.5;
num_tries=$((num_tries+1))
if [ $num_tries -eq 20 ]; then
$CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query NOT LIKE '%system.processes%' FORMAT Vertical"
$CLICKHOUSE_CLIENT -q "SELECT * FROM system.processes WHERE current_database=currentDatabase() AND query NOT LIKE '%system.processes%' FORMAT Vertical"
break
fi
done