mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-12 17:32:32 +00:00
Merge pull request #72810 from ClickHouse/disable-metadata-deletion-in-s3-queue
Better deletion of metadata in s3queue
This commit is contained in:
commit
63603cc6d7
@ -17,6 +17,7 @@
|
||||
#include <Common/ZooKeeper/ZooKeeper.h>
|
||||
#include <Common/getRandomASCIIString.h>
|
||||
#include <Common/randomSeed.h>
|
||||
#include <Common/DNSResolver.h>
|
||||
#include <numeric>
|
||||
|
||||
|
||||
@ -438,6 +439,163 @@ ObjectStorageQueueTableMetadata ObjectStorageQueueMetadata::syncWithKeeper(
|
||||
"of wrong zookeeper path or because of logical error");
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
struct Info
|
||||
{
|
||||
std::string hostname;
|
||||
std::string table_id;
|
||||
|
||||
bool operator ==(const Info & other) const
|
||||
{
|
||||
return hostname == other.hostname && table_id == other.table_id;
|
||||
}
|
||||
|
||||
static Info create(const StorageID & storage_id)
|
||||
{
|
||||
Info self;
|
||||
self.hostname = DNSResolver::instance().getHostName();
|
||||
self.table_id = storage_id.hasUUID() ? toString(storage_id.uuid) : storage_id.getFullTableName();
|
||||
return self;
|
||||
}
|
||||
|
||||
std::string serialize() const
|
||||
{
|
||||
WriteBufferFromOwnString buf;
|
||||
size_t version = 0;
|
||||
buf << version << "\n";
|
||||
buf << hostname << "\n";
|
||||
buf << table_id << "\n";
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
static Info deserialize(const std::string & str)
|
||||
{
|
||||
ReadBufferFromString buf(str);
|
||||
Info info;
|
||||
size_t version;
|
||||
buf >> version >> "\n";
|
||||
buf >> info.hostname >> "\n";
|
||||
buf >> info.table_id >> "\n";
|
||||
return info;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
void ObjectStorageQueueMetadata::registerIfNot(const StorageID & storage_id)
|
||||
{
|
||||
const auto registry_path = zookeeper_path / "registry";
|
||||
const auto self = Info::create(storage_id);
|
||||
|
||||
Coordination::Error code;
|
||||
for (size_t i = 0; i < 1000; ++i)
|
||||
{
|
||||
Coordination::Stat stat;
|
||||
std::string registry_str;
|
||||
auto zk_client = getZooKeeper();
|
||||
|
||||
if (zk_client->tryGet(registry_path, registry_str, &stat))
|
||||
{
|
||||
Strings registered;
|
||||
splitInto<','>(registered, registry_str);
|
||||
|
||||
for (const auto & elem : registered)
|
||||
{
|
||||
if (elem.empty())
|
||||
continue;
|
||||
|
||||
auto info = Info::deserialize(elem);
|
||||
if (info == self)
|
||||
{
|
||||
LOG_TRACE(log, "Table {} is already registered", self.table_id);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto new_registry_str = registry_str + "," + self.serialize();
|
||||
code = zk_client->trySet(registry_path, new_registry_str, stat.version);
|
||||
}
|
||||
else
|
||||
code = zk_client->tryCreate(registry_path, self.serialize(), zkutil::CreateMode::Persistent);
|
||||
|
||||
if (code == Coordination::Error::ZOK)
|
||||
{
|
||||
LOG_TRACE(log, "Added {} to registry", self.table_id);
|
||||
return;
|
||||
}
|
||||
|
||||
if (code == Coordination::Error::ZBADVERSION
|
||||
|| code == Coordination::Error::ZSESSIONEXPIRED)
|
||||
continue;
|
||||
|
||||
throw zkutil::KeeperException(code);
|
||||
}
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot register in keeper. Last error: {}", code);
|
||||
}
|
||||
|
||||
size_t ObjectStorageQueueMetadata::unregister(const StorageID & storage_id)
|
||||
{
|
||||
const auto registry_path = zookeeper_path / "registry";
|
||||
const auto self = Info::create(storage_id);
|
||||
|
||||
Coordination::Error code = Coordination::Error::ZOK;
|
||||
for (size_t i = 0; i < 1000; ++i)
|
||||
{
|
||||
Coordination::Stat stat;
|
||||
std::string registry_str;
|
||||
auto zk_client = getZooKeeper();
|
||||
|
||||
bool node_exists = zk_client->tryGet(registry_path, registry_str, &stat);
|
||||
if (!node_exists)
|
||||
{
|
||||
LOG_WARNING(log, "Cannot unregister: registry does not exist");
|
||||
chassert(false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
Strings registered;
|
||||
splitInto<','>(registered, registry_str);
|
||||
|
||||
bool found = false;
|
||||
std::string new_registry_str;
|
||||
size_t count = 0;
|
||||
for (const auto & elem : registered)
|
||||
{
|
||||
if (elem.empty())
|
||||
continue;
|
||||
|
||||
auto info = Info::deserialize(elem);
|
||||
if (info == self)
|
||||
found = true;
|
||||
else
|
||||
{
|
||||
if (!new_registry_str.empty())
|
||||
new_registry_str += ",";
|
||||
new_registry_str += elem;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
if (!found)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unregister: not registered");
|
||||
|
||||
code = zk_client->trySet(registry_path, new_registry_str, stat.version);
|
||||
|
||||
if (code == Coordination::Error::ZOK)
|
||||
return count;
|
||||
|
||||
if (Coordination::isHardwareError(code)
|
||||
|| code == Coordination::Error::ZBADVERSION)
|
||||
continue;
|
||||
|
||||
throw zkutil::KeeperException(code);
|
||||
}
|
||||
|
||||
if (Coordination::isHardwareError(code))
|
||||
throw zkutil::KeeperException(code);
|
||||
else
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unregister in keeper. Last error: {}", code);
|
||||
}
|
||||
|
||||
void ObjectStorageQueueMetadata::cleanupThreadFunc()
|
||||
{
|
||||
/// A background task is responsible for maintaining
|
||||
|
@ -71,6 +71,9 @@ public:
|
||||
bool is_attach,
|
||||
LoggerPtr log);
|
||||
|
||||
/// Add the table `storage_id` (together with the current hostname) to the registry node in keeper,
/// unless the same entry is already registered there.
void registerIfNot(const StorageID & storage_id);
|
||||
/// Remove the table `storage_id` (together with the current hostname) from the registry node in keeper.
/// Returns the number of entries which remain registered after the removal.
size_t unregister(const StorageID & storage_id);
|
||||
|
||||
void shutdown();
|
||||
|
||||
FileMetadataPtr getFileMetadata(const std::string & path, ObjectStorageQueueOrderedFileMetadata::BucketInfoPtr bucket_info = {});
|
||||
|
@ -16,7 +16,8 @@ ObjectStorageQueueMetadataFactory & ObjectStorageQueueMetadataFactory::instance(
|
||||
|
||||
ObjectStorageQueueMetadataFactory::FilesMetadataPtr ObjectStorageQueueMetadataFactory::getOrCreate(
|
||||
const std::string & zookeeper_path,
|
||||
ObjectStorageQueueMetadataPtr metadata)
|
||||
ObjectStorageQueueMetadataPtr metadata,
|
||||
const StorageID & storage_id)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
auto it = metadata_by_path.find(zookeeper_path);
|
||||
@ -30,13 +31,14 @@ ObjectStorageQueueMetadataFactory::FilesMetadataPtr ObjectStorageQueueMetadataFa
|
||||
auto & metadata_from_keeper = it->second.metadata->getTableMetadata();
|
||||
|
||||
metadata_from_table.checkEquals(metadata_from_keeper);
|
||||
|
||||
it->second.ref_count += 1;
|
||||
}
|
||||
|
||||
it->second.metadata->registerIfNot(storage_id);
|
||||
it->second.ref_count += 1;
|
||||
return it->second.metadata;
|
||||
}
|
||||
|
||||
void ObjectStorageQueueMetadataFactory::remove(const std::string & zookeeper_path)
|
||||
void ObjectStorageQueueMetadataFactory::remove(const std::string & zookeeper_path, const StorageID & storage_id)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
auto it = metadata_by_path.find(zookeeper_path);
|
||||
@ -44,28 +46,52 @@ void ObjectStorageQueueMetadataFactory::remove(const std::string & zookeeper_pat
|
||||
if (it == metadata_by_path.end())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Metadata with zookeeper path {} does not exist", zookeeper_path);
|
||||
|
||||
chassert(it->second.ref_count > 0);
|
||||
if (--it->second.ref_count == 0)
|
||||
{
|
||||
it->second.ref_count -= 1;
|
||||
|
||||
size_t registry_size;
|
||||
try
|
||||
{
|
||||
auto zk_client = Context::getGlobalContextInstance()->getZooKeeper();
|
||||
zk_client->tryRemove(it->first);
|
||||
registry_size = it->second.metadata->unregister(storage_id);
|
||||
LOG_TRACE(log, "Remaining registry size: {}", registry_size);
|
||||
}
|
||||
catch (const zkutil::KeeperException & e)
|
||||
{
|
||||
if (!Coordination::isHardwareError(e.code))
|
||||
{
|
||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||
}
|
||||
/// Any non-zero value would do.
|
||||
registry_size = 1;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(__PRETTY_FUNCTION__);
|
||||
/// Any non-zero value would do.
|
||||
registry_size = 1;
|
||||
}
|
||||
|
||||
metadata_by_path.erase(it);
|
||||
if (registry_size == 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto zk_client = Context::getGlobalContextInstance()->getZooKeeper();
|
||||
zk_client->removeRecursive(it->first);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(log);
|
||||
}
|
||||
}
|
||||
|
||||
if (!it->second.ref_count)
|
||||
metadata_by_path.erase(it);
|
||||
}
|
||||
|
||||
std::unordered_map<std::string, ObjectStorageQueueMetadataFactory::FilesMetadataPtr> ObjectStorageQueueMetadataFactory::getAll()
|
||||
{
|
||||
std::unordered_map<std::string, ObjectStorageQueueMetadataFactory::FilesMetadataPtr> result;
|
||||
for (const auto & [zk_path, metadata_and_ref_count] : metadata_by_path)
|
||||
result.emplace(zk_path, metadata_and_ref_count.metadata);
|
||||
for (const auto & [zk_path, metadata] : metadata_by_path)
|
||||
result.emplace(zk_path, metadata.metadata);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -14,25 +14,25 @@ public:
|
||||
|
||||
FilesMetadataPtr getOrCreate(
|
||||
const std::string & zookeeper_path,
|
||||
ObjectStorageQueueMetadataPtr metadata);
|
||||
ObjectStorageQueueMetadataPtr metadata,
|
||||
const StorageID & storage_id);
|
||||
|
||||
void remove(const std::string & zookeeper_path);
|
||||
void remove(const std::string & zookeeper_path, const StorageID & storage_id);
|
||||
|
||||
std::unordered_map<std::string, FilesMetadataPtr> getAll();
|
||||
|
||||
private:
|
||||
struct Metadata
|
||||
struct MetadataWithRefCount
|
||||
{
|
||||
explicit Metadata(std::shared_ptr<ObjectStorageQueueMetadata> metadata_) : metadata(metadata_), ref_count(1) {}
|
||||
|
||||
explicit MetadataWithRefCount(std::shared_ptr<ObjectStorageQueueMetadata> metadata_) : metadata(metadata_) {}
|
||||
std::shared_ptr<ObjectStorageQueueMetadata> metadata;
|
||||
/// TODO: the ref count should be kept in keeper, because of the case with distributed processing.
|
||||
size_t ref_count = 0;
|
||||
};
|
||||
using MetadataByPath = std::unordered_map<std::string, Metadata>;
|
||||
using MetadataByPath = std::unordered_map<std::string, MetadataWithRefCount>;
|
||||
|
||||
MetadataByPath metadata_by_path;
|
||||
std::mutex mutex;
|
||||
LoggerPtr log = getLogger("QueueMetadataFactory");
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -214,9 +214,12 @@ StorageObjectStorageQueue::StorageObjectStorageQueue(
|
||||
zk_path, *queue_settings_, storage_metadata.getColumns(), configuration_->format, context_, is_attach, log);
|
||||
|
||||
auto queue_metadata = std::make_unique<ObjectStorageQueueMetadata>(
|
||||
zk_path, std::move(table_metadata), (*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_min_ms], (*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_max_ms]);
|
||||
zk_path,
|
||||
std::move(table_metadata),
|
||||
(*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_min_ms],
|
||||
(*queue_settings_)[ObjectStorageQueueSetting::cleanup_interval_max_ms]);
|
||||
|
||||
files_metadata = ObjectStorageQueueMetadataFactory::instance().getOrCreate(zk_path, std::move(queue_metadata));
|
||||
files_metadata = ObjectStorageQueueMetadataFactory::instance().getOrCreate(zk_path, std::move(queue_metadata), table_id_);
|
||||
|
||||
task = getContext()->getSchedulePool().createTask("ObjectStorageQueueStreamingTask", [this] { threadFunc(); });
|
||||
}
|
||||
@ -248,7 +251,7 @@ void StorageObjectStorageQueue::shutdown(bool is_drop)
|
||||
|
||||
void StorageObjectStorageQueue::drop()
|
||||
{
|
||||
ObjectStorageQueueMetadataFactory::instance().remove(zk_path);
|
||||
ObjectStorageQueueMetadataFactory::instance().remove(zk_path, getStorageID());
|
||||
}
|
||||
|
||||
bool StorageObjectStorageQueue::supportsSubsetOfColumns(const ContextPtr & context_) const
|
||||
|
@ -2403,3 +2403,127 @@ def test_list_and_delete_race(started_cluster):
|
||||
assert node.contains_in_log(
|
||||
"because of the race with list & delete"
|
||||
) or node_2.contains_in_log("because of the race with list & delete")
|
||||
|
||||
|
||||
def test_registry(started_cluster):
    """Check the keeper registry of a queue shared by tables in a Replicated database.

    Every (host, table uuid) pair must be listed in `<keeper_path>/registry`,
    the entries must survive a server restart, dropping one table must remove
    only its own entries, and dropping the last table must remove the whole
    keeper path.
    """
    node1 = started_cluster.instances["node1"]
    node2 = started_cluster.instances["node2"]

    table_name = f"test_registry_{uuid.uuid4().hex[:8]}"
    db_name = f"db_{table_name}"
    dst_table_name = f"{table_name}_dst"
    keeper_path = f"/clickhouse/test_{table_name}"
    files_path = f"{table_name}_data"
    files_to_generate = 1000

    for node in (node1, node2):
        node.query(f"DROP DATABASE IF EXISTS {db_name}")

    node1.query(
        f"CREATE DATABASE {db_name} ENGINE=Replicated('/clickhouse/databases/replicateddb2', 'shard1', 'node1')"
    )
    node2.query(
        f"CREATE DATABASE {db_name} ENGINE=Replicated('/clickhouse/databases/replicateddb2', 'shard1', 'node2')"
    )

    zk = None

    def make_queue_table(name):
        # All tables of this test share the same keeper path, hence the same registry.
        create_table(
            started_cluster,
            node1,
            name,
            "ordered",
            files_path,
            additional_settings={"keeper_path": keeper_path, "buckets": 3},
            database_name=db_name,
        )

    def read_registry():
        # The registry is compared in its printable (str of bytes) form.
        data, _stat = zk.get(f"{keeper_path}/registry/")
        return str(data)

    def uuid_of(name):
        return node1.query(
            f"SELECT uuid FROM system.tables WHERE database = '{db_name}' and table = '{name}'"
        ).strip()

    def entries_for(table_uuid):
        # One serialized entry per host: "<version>\n<hostname>\n<table uuid>\n".
        return [f"0\\nnode1\\n{table_uuid}\\n", f"0\\nnode2\\n{table_uuid}\\n"]

    def check_contains(registry, entries):
        for entry in entries:
            assert entry in registry

    # A single table: both replicas must be registered.
    make_queue_table(table_name)

    zk = started_cluster.get_kazoo_client("zoo1")
    registry = read_registry()

    uuid1 = uuid_of(table_name)
    assert uuid1 in registry
    check_contains(registry, entries_for(uuid1))

    generate_random_files(
        started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
    )

    for node in (node1, node2):
        create_mv(node, f"{db_name}.{table_name}", dst_table_name)

    def get_count():
        return int(
            node1.query(
                f"SELECT count() FROM clusterAllReplicas(cluster, default.{dst_table_name})"
            )
        )

    # Wait (up to 20 checks, one second apart) until all files are processed.
    expected_rows = files_to_generate
    checks_left = 20
    while checks_left > 0 and get_count() != expected_rows:
        time.sleep(1)
        checks_left -= 1
    assert expected_rows == get_count()

    # A second table on the same keeper path adds its own entries.
    table_name_2 = f"test_registry_{uuid.uuid4().hex[:8]}_2"
    make_queue_table(table_name_2)

    registry = read_registry()
    uuid2 = uuid_of(table_name_2)

    assert uuid1 in registry
    assert uuid2 in registry
    check_contains(registry, entries_for(uuid1) + entries_for(uuid2))

    # The registry must not lose entries on restart.
    node1.restart_clickhouse()
    node2.restart_clickhouse()

    registry = read_registry()
    assert uuid1 in registry
    assert uuid2 in registry

    # Dropping the second table removes only its entries and keeps the metadata.
    node1.query(f"DROP TABLE {db_name}.{table_name_2} SYNC")

    assert zk.exists(keeper_path) is not None
    registry = read_registry()

    assert uuid1 in registry
    assert uuid2 not in registry
    check_contains(registry, entries_for(uuid1))

    # Dropping the last table removes the whole keeper path.
    node1.query(f"DROP TABLE {db_name}.{table_name} SYNC")

    assert zk.exists(keeper_path) is None
|
||||
|
Loading…
Reference in New Issue
Block a user