mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 10:52:30 +00:00
921518db0a
* add the query data deduplication excluding duplicated parts in MergeTree family engines. query deduplication is based on parts' UUID which should be enabled first with merge_tree setting assign_part_uuids=1. The allow_experimental_query_deduplication setting enables part deduplication; it defaults to false. data part UUID is a mechanism of giving a data part a unique identifier. Having UUID and deduplication mechanism provides a potential of moving parts between shards preserving data consistency on a read path: duplicated UUIDs will cause root executor to retry query against one of the replicas explicitly asking to exclude encountered duplicated fingerprints during a distributed query execution. NOTE: this implementation doesn't provide any knobs to lock part and hence its UUID. Any mutations/merge will update part's UUID. * add _part_uuid virtual column, allowing to use UUIDs in predicates. Signed-off-by: Aleksei Semiglazov <asemiglazov@cloudflare.com> address comments
39 lines
853 B
C++
39 lines
853 B
C++
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
|
|
|
|
namespace DB
|
|
{
|
|
|
|
std::vector<UUID> PartUUIDs::add(const std::vector<UUID> & new_uuids)
|
|
{
|
|
std::lock_guard lock(mutex);
|
|
std::vector<UUID> intersection;
|
|
|
|
/// First check any presence of uuids in a uuids, return duplicates back if any
|
|
for (const auto & uuid : new_uuids)
|
|
{
|
|
if (uuids.find(uuid) != uuids.end())
|
|
intersection.emplace_back(uuid);
|
|
}
|
|
|
|
if (intersection.empty())
|
|
{
|
|
for (const auto & uuid : new_uuids)
|
|
uuids.emplace(uuid);
|
|
}
|
|
return intersection;
|
|
}
|
|
|
|
std::vector<UUID> PartUUIDs::get() const
|
|
{
|
|
std::lock_guard lock(mutex);
|
|
return std::vector<UUID>(uuids.begin(), uuids.end());
|
|
}
|
|
|
|
bool PartUUIDs::has(const UUID & uuid) const
|
|
{
|
|
std::lock_guard lock(mutex);
|
|
return uuids.find(uuid) != uuids.end();
|
|
}
|
|
|
|
}
|