Merge pull request #60585 from aalexfvk/traverse_shadow_remote_data_paths

Traverse shadow directory for system.remote_data_paths
This commit is contained in:
Kseniia Sumarokova 2024-03-21 19:02:36 +01:00 committed by GitHub
commit 0c824dab17
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 64 additions and 7 deletions

View File

@ -859,6 +859,7 @@ class IColumn;
M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \
M(Bool, use_variant_as_common_type, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \
M(Bool, enable_order_by_all, true, "Enable sorting expression ORDER BY ALL.", 0) \
M(Bool, traverse_shadow_remote_data_paths, false, "Traverse shadow directory when query system.remote_data_paths", 0) \
\
/** Experimental functions */ \
M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \

View File

@ -92,6 +92,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{"page_cache_inject_eviction", false, false, "Added userspace page cache"},
{"default_table_engine", "None", "MergeTree", "Set default table engine to MergeTree for better usability"},
{"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"},
{"traverse_shadow_remote_data_paths", false, false, "Traverse shadow directory when query system.remote_data_paths."},
{"throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert", false, true, "Deduplication is dependent materialized view cannot work together with async inserts."},
{"parallel_replicas_allow_in_with_subquery", false, true, "If true, subquery for IN will be executed on every follower replica"},
{"function_locate_has_mysql_compatible_argument_order", false, true, "Increase compatibility with MySQL's locate function."},

View File

@ -320,9 +320,11 @@ public:
{}
};
virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
virtual void getRemotePathsRecursive(
const String &, std::vector<LocalPathWithObjectStoragePaths> &, const std::function<bool(const String &)> & /* skip_predicate */)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Method `getRemotePathsRecursive() not implemented for disk: {}`",
getDataSourceDescription().toString());
}

View File

@ -91,11 +91,17 @@ StoredObjects DiskObjectStorage::getStorageObjects(const String & local_path) co
return metadata_storage->getStorageObjects(local_path);
}
void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::vector<LocalPathWithObjectStoragePaths> & paths_map)
void DiskObjectStorage::getRemotePathsRecursive(
const String & local_path,
std::vector<LocalPathWithObjectStoragePaths> & paths_map,
const std::function<bool(const String &)> & skip_predicate)
{
if (!metadata_storage->exists(local_path))
return;
if (skip_predicate && skip_predicate(local_path))
return;
/// Protect against concurrent deletion of files (for example because of a merge).
if (metadata_storage->isFile(local_path))
{
@ -143,7 +149,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
}
for (; it->isValid(); it->next())
DiskObjectStorage::getRemotePathsRecursive(fs::path(local_path) / it->name(), paths_map);
DiskObjectStorage::getRemotePathsRecursive(fs::path(local_path) / it->name(), paths_map, skip_predicate);
}
}

View File

@ -48,7 +48,10 @@ public:
StoredObjects getStorageObjects(const String & local_path) const override;
void getRemotePathsRecursive(const String & local_path, std::vector<LocalPathWithObjectStoragePaths> & paths_map) override;
void getRemotePathsRecursive(
const String & local_path,
std::vector<LocalPathWithObjectStoragePaths> & paths_map,
const std::function<bool(const String &)> & skip_predicate) override;
const std::string & getCacheName() const override { return object_storage->getCacheName(); }

View File

@ -10,6 +10,7 @@
#include <Interpreters/Context.h>
#include <Disks/IDisk.h>
namespace fs = std::filesystem;
namespace DB
{
@ -58,8 +59,20 @@ Pipe StorageSystemRemoteDataPaths::read(
if (disk->isRemote())
{
std::vector<IDisk::LocalPathWithObjectStoragePaths> remote_paths_by_local_path;
disk->getRemotePathsRecursive("store", remote_paths_by_local_path);
disk->getRemotePathsRecursive("data", remote_paths_by_local_path);
disk->getRemotePathsRecursive("store", remote_paths_by_local_path, /* skip_predicate = */ {});
disk->getRemotePathsRecursive("data", remote_paths_by_local_path, /* skip_predicate = */ {});
if (context->getSettingsRef().traverse_shadow_remote_data_paths)
disk->getRemotePathsRecursive(
"shadow",
remote_paths_by_local_path,
[](const String & local_path)
{
// `shadow/{backup_name}/revision.txt` is not an object metadata file
const auto path = fs::path(local_path);
return path.filename() == "revision.txt" &&
path.parent_path().has_parent_path() &&
path.parent_path().parent_path().filename() == "shadow";
});
FileCachePtr cache;

View File

@ -0,0 +1,29 @@
#!/usr/bin/env bash
# Tags: no-fasttest

# Exercises the `traverse_shadow_remote_data_paths` setting: after a table on
# the `s3_cache` storage policy is frozen and then dropped, the script asks
# system.remote_data_paths for rows whose local_path mentions the backup
# under `shadow/`.

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
source "${CURDIR}/../shell_config.sh"

TABLE="03000_traverse_shadow_system_data_path_table"
BACKUP="03000_traverse_shadow_system_data_path_backup"

# Send one SQL text (first argument) to the client configured by shell_config.sh.
# CLICKHOUSE_CLIENT is left unquoted on purpose so it can expand to a command
# plus its arguments.
function run_query()
{
    ${CLICKHOUSE_CLIENT} --query "$1"
}

# Step 1: create a table on the s3_cache policy and put one row into it.
run_query "CREATE TABLE ${TABLE} (
id Int64,
data String
) ENGINE=MergeTree()
ORDER BY id
SETTINGS storage_policy='s3_cache';"
run_query "INSERT INTO ${TABLE} VALUES (0, 'data');"

# Step 2: print whether system.remote_data_paths reports anything for the disk
# (the setting under test is not passed here).
run_query "SELECT count() > 0 FROM system.remote_data_paths WHERE disk_name = 's3_cache'"

# Step 3: freeze the table under a known backup name, then drop the table itself.
run_query "ALTER TABLE ${TABLE} FREEZE WITH NAME '${BACKUP}';"
run_query "DROP TABLE ${TABLE} SYNC;"

# Step 4: with the setting enabled, print whether any reported local path
# contains `shadow/<backup name>`.
run_query "
SELECT count() > 0
FROM system.remote_data_paths
WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%'
SETTINGS traverse_shadow_remote_data_paths=1;"

# Step 5: best-effort cleanup of the frozen data; output is discarded and a
# failure is ignored so it cannot affect the test result.
run_query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null || true