This commit is contained in:
Nikolai Kochetov 2022-04-21 19:19:13 +00:00
parent bcbab2ead8
commit 9133e398b8
17 changed files with 398 additions and 191 deletions

View File

@ -95,30 +95,42 @@ size_t DataPartStorageOnDisk::getFileSize(const String & path) const
return volume->getDisk()->getFileSize(fs::path(root_path) / part_dir / path); return volume->getDisk()->getFileSize(fs::path(root_path) / part_dir / path);
} }
DiskDirectoryIteratorPtr DataPartStorageOnDisk::iterate() const UInt32 DataPartStorageOnDisk::getRefCount(const String & path) const
{ {
return volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir); return volume->getDisk()->getRefCount(fs::path(root_path) / part_dir / path);
} }
DiskDirectoryIteratorPtr DataPartStorageOnDisk::iterateDirectory(const String & path) const class DataPartStorageIteratorOnDisk final : public IDataPartStorageIterator
{ {
return volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir / path); public:
DataPartStorageIteratorOnDisk(DiskPtr disk_, DiskDirectoryIteratorPtr it_)
: disk(std::move(disk_)), it(std::move(it_))
{
}
void next() override { it->next(); }
bool isValid() const override { return it->isValid(); }
bool isFile() const override { return isValid() && disk->isFile(it->path()); }
std::string name() const override { return it->name(); }
private:
DiskPtr disk;
DiskDirectoryIteratorPtr it;
};
DataPartStorageIteratorPtr DataPartStorageOnDisk::iterate() const
{
return std::make_unique<DataPartStorageIteratorOnDisk>(
volume->getDisk(),
volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir));
} }
// namespace DataPartStorageIteratorPtr DataPartStorageOnDisk::iterateDirectory(const String & path) const
// { {
// static constexpr std::string_view non_checksum_files[] = return std::make_unique<DataPartStorageIteratorOnDisk>(
// { volume->getDisk(),
// "checksums.txt", volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir / path));
// "columns.txt", }
// "default_compression_codec.txt",
// "delete-on-destroy.txt",
// "txn_version.txt",
// };
// static constexpr std::span<std::string_view> projection_non_checksum_files(non_checksum_files, 4);
// static constexpr std::span<std::string_view> part_non_checksum_files(non_checksum_files, 5);
// }
void DataPartStorageOnDisk::remove( void DataPartStorageOnDisk::remove(
bool keep_shared_data, bool keep_shared_data,
@ -244,6 +256,11 @@ DataPartStoragePtr DataPartStorageOnDisk::getProjection(const std::string & name
return std::make_shared<DataPartStorageOnDisk>(volume, std::string(fs::path(root_path) / part_dir), name); return std::make_shared<DataPartStorageOnDisk>(volume, std::string(fs::path(root_path) / part_dir), name);
} }
DiskPtr DataPartStorageOnDisk::getDisk() const
{
return volume->getDisk();
}
static UInt64 calculateTotalSizeOnDiskImpl(const DiskPtr & disk, const String & from) static UInt64 calculateTotalSizeOnDiskImpl(const DiskPtr & disk, const String & from)
{ {
if (disk->isFile(from)) if (disk->isFile(from))
@ -281,28 +298,62 @@ std::string DataPartStorageOnDisk::getDiskPathForLogs() const
return volume->getDisk()->getPath(); return volume->getDisk()->getPath();
} }
void DataPartStorageOnDisk::writeChecksums(MergeTreeDataPartChecksums & checksums) const void DataPartStorageOnDisk::writeChecksums(const MergeTreeDataPartChecksums & checksums) const
{ {
std::string path = fs::path(root_path) / part_dir / "checksums.txt"; std::string path = fs::path(root_path) / part_dir / "checksums.txt";
try
{ {
auto out = volume->getDisk()->writeFile(path + ".tmp", 4096); {
checksums.write(*out); auto out = volume->getDisk()->writeFile(path + ".tmp", 4096);
} checksums.write(*out);
}
volume->getDisk()->moveFile(path + ".tmp", path); volume->getDisk()->moveFile(path + ".tmp", path);
}
catch (...)
{
try
{
if (volume->getDisk()->exists(path + ".tmp"))
volume->getDisk()->removeFile(path + ".tmp");
}
catch (...)
{
tryLogCurrentException("DataPartStorageOnDisk");
}
throw;
}
} }
void DataPartStorageOnDisk::writeColumns(NamesAndTypesList & columns) const void DataPartStorageOnDisk::writeColumns(const NamesAndTypesList & columns) const
{ {
std::string path = fs::path(root_path) / part_dir / "columns.txt"; std::string path = fs::path(root_path) / part_dir / "columns.txt";
try
{ {
auto buf = volume->getDisk()->writeFile(path + ".tmp", 4096); {
columns.writeText(*buf); auto buf = volume->getDisk()->writeFile(path + ".tmp", 4096);
} columns.writeText(*buf);
}
volume->getDisk()->moveFile(path + ".tmp", path); volume->getDisk()->moveFile(path + ".tmp", path);
}
catch (...)
{
try
{
if (volume->getDisk()->exists(path + ".tmp"))
volume->getDisk()->removeFile(path + ".tmp");
}
catch (...)
{
tryLogCurrentException("DataPartStorageOnDisk");
}
throw;
}
} }
void DataPartStorageOnDisk::writeDeleteOnDestroyMarker(Poco::Logger * log) const void DataPartStorageOnDisk::writeDeleteOnDestroyMarker(Poco::Logger * log) const
@ -406,6 +457,10 @@ std::string DataPartStorageOnDisk::getName() const
return volume->getDisk()->getName(); return volume->getDisk()->getName();
} }
std::string DataPartStorageOnDisk::getDiskType() const
{
return toString(volume->getDisk()->getType());
}
void DataPartStorageOnDisk::backup( void DataPartStorageOnDisk::backup(
TemporaryFilesOnDisks & temp_dirs, TemporaryFilesOnDisks & temp_dirs,
@ -517,6 +572,30 @@ void DataPartStorageBuilderOnDisk::removeRecursive()
volume->getDisk()->removeRecursive(fs::path(root_path) / part_dir); volume->getDisk()->removeRecursive(fs::path(root_path) / part_dir);
} }
void DataPartStorageBuilderOnDisk::removeSharedRecursive(bool keep_in_remote_fs)
{
volume->getDisk()->removeSharedRecursive(fs::path(root_path) / part_dir, keep_in_remote_fs);
}
SyncGuardPtr DataPartStorageBuilderOnDisk::getDirectorySyncGuard() const
{
return volume->getDisk()->getDirectorySyncGuard(fs::path(root_path) / part_dir);
}
void DataPartStorageBuilderOnDisk::createHardLinkFrom(const IDataPartStorage & source, const std::string & from, const std::string & to) const
{
const auto * source_on_disk = typeid_cast<const DataPartStorageOnDisk *>(&source);
if (!source_on_disk)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cannot create hardlink from different storage. Expected DataPartStorageOnDisk, got {}",
typeid(source).name());
volume->getDisk()->createHardLink(
fs::path(source_on_disk->getFullRelativePath()) / from,
fs::path(root_path) / part_dir / to);
}
bool DataPartStorageBuilderOnDisk::exists() const bool DataPartStorageBuilderOnDisk::exists() const
{ {
return volume->getDisk()->exists(fs::path(root_path) / part_dir); return volume->getDisk()->exists(fs::path(root_path) / part_dir);
@ -533,11 +612,21 @@ std::string DataPartStorageBuilderOnDisk::getFullPath() const
return fs::path(volume->getDisk()->getPath()) / root_path / part_dir; return fs::path(volume->getDisk()->getPath()) / root_path / part_dir;
} }
std::string DataPartStorageBuilderOnDisk::getFullRelativePath() const
{
return fs::path(root_path) / part_dir;
}
void DataPartStorageBuilderOnDisk::createDirectories() void DataPartStorageBuilderOnDisk::createDirectories()
{ {
return volume->getDisk()->createDirectories(fs::path(root_path) / part_dir); return volume->getDisk()->createDirectories(fs::path(root_path) / part_dir);
} }
void DataPartStorageBuilderOnDisk::createProjection(const std::string & name)
{
return volume->getDisk()->createDirectory(fs::path(root_path) / part_dir / name);
}
ReservationPtr DataPartStorageBuilderOnDisk::reserve(UInt64 bytes) ReservationPtr DataPartStorageBuilderOnDisk::reserve(UInt64 bytes)
{ {
auto res = volume->reserve(bytes); auto res = volume->reserve(bytes);

View File

@ -27,9 +27,10 @@ public:
Poco::Timestamp getLastModified() const override; Poco::Timestamp getLastModified() const override;
size_t getFileSize(const std::string & path) const override; size_t getFileSize(const std::string & path) const override;
UInt32 getRefCount(const String & path) const override;
DiskDirectoryIteratorPtr iterate() const override; DataPartStorageIteratorPtr iterate() const override;
DiskDirectoryIteratorPtr iterateDirectory(const std::string & path) const override; DataPartStorageIteratorPtr iterateDirectory(const std::string & path) const override;
void remove( void remove(
bool keep_shared_data, bool keep_shared_data,
@ -53,8 +54,8 @@ public:
bool isBroken() const override; bool isBroken() const override;
std::string getDiskPathForLogs() const override; std::string getDiskPathForLogs() const override;
void writeChecksums(MergeTreeDataPartChecksums & checksums) const override; void writeChecksums(const MergeTreeDataPartChecksums & checksums) const override;
void writeColumns(NamesAndTypesList & columns) const override; void writeColumns(const NamesAndTypesList & columns) const override;
void writeDeleteOnDestroyMarker(Poco::Logger * log) const override; void writeDeleteOnDestroyMarker(Poco::Logger * log) const override;
void checkConsistency(const MergeTreeDataPartChecksums & checksums) const override; void checkConsistency(const MergeTreeDataPartChecksums & checksums) const override;
@ -84,9 +85,12 @@ public:
void rename(const String & new_relative_path, Poco::Logger * log, bool remove_new_dir_if_exists, bool fsync) override; void rename(const String & new_relative_path, Poco::Logger * log, bool remove_new_dir_if_exists, bool fsync) override;
std::string getName() const override; std::string getName() const override;
std::string getDiskType() const override;
DataPartStoragePtr getProjection(const std::string & name) const override; DataPartStoragePtr getProjection(const std::string & name) const override;
DiskPtr getDisk() const;
private: private:
VolumePtr volume; VolumePtr volume;
std::string root_path; std::string root_path;
@ -103,6 +107,7 @@ private:
class DataPartStorageBuilderOnDisk final : public IDataPartStorageBuilder class DataPartStorageBuilderOnDisk final : public IDataPartStorageBuilder
{ {
public:
DataPartStorageBuilderOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_); DataPartStorageBuilderOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_);
void setRelativePath(const std::string & path) override; void setRelativePath(const std::string & path) override;
@ -111,9 +116,11 @@ class DataPartStorageBuilderOnDisk final : public IDataPartStorageBuilder
bool exists(const std::string & path) const override; bool exists(const std::string & path) const override;
void createDirectories() override; void createDirectories() override;
void createProjection(const std::string & name) override;
std::string getRelativePath() const override { return part_dir; } std::string getRelativePath() const override { return part_dir; }
std::string getFullPath() const override; std::string getFullPath() const override;
std::string getFullRelativePath() const override;
std::unique_ptr<ReadBufferFromFileBase> readFile( std::unique_ptr<ReadBufferFromFileBase> readFile(
const std::string & path, const std::string & path,
@ -127,6 +134,11 @@ class DataPartStorageBuilderOnDisk final : public IDataPartStorageBuilder
void removeFile(const String & path) override; void removeFile(const String & path) override;
void removeRecursive() override; void removeRecursive() override;
void removeSharedRecursive(bool keep_in_remote_fs) override;
SyncGuardPtr getDirectorySyncGuard() const override;
void createHardLinkFrom(const IDataPartStorage & source, const std::string & from, const std::string & to) const override;
ReservationPtr reserve(UInt64 bytes) override; ReservationPtr reserve(UInt64 bytes) override;

View File

@ -13,6 +13,7 @@
#include <Storages/StorageReplicatedMergeTree.h> #include <Storages/StorageReplicatedMergeTree.h>
#include <Common/CurrentMetrics.h> #include <Common/CurrentMetrics.h>
#include <Common/NetException.h> #include <Common/NetException.h>
#include <Storages/MergeTree/DataPartStorageOnDisk.h>
#include <Disks/IO/createReadBufferFromFileBase.h> #include <Disks/IO/createReadBufferFromFileBase.h>
#include <base/scope_guard.h> #include <base/scope_guard.h>
#include <Poco/Net/HTTPRequest.h> #include <Poco/Net/HTTPRequest.h>
@ -166,9 +167,8 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write
if (data_settings->allow_remote_fs_zero_copy_replication && if (data_settings->allow_remote_fs_zero_copy_replication &&
client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY) client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY)
{ {
auto disk = part->volume->getDisk(); auto disk_type = part->data_part_storage->getDiskType();
auto disk_type = toString(disk->getType()); if (part->data_part_storage->supportZeroCopyReplication() && std::find(capability.begin(), capability.end(), disk_type) != capability.end())
if (disk->supportZeroCopyReplication() && std::find(capability.begin(), capability.end(), disk_type) != capability.end())
{ {
/// Send metadata if the receiver's capability covers the source disk type. /// Send metadata if the receiver's capability covers the source disk type.
response.addCookie({"remote_fs_metadata", disk_type}); response.addCookie({"remote_fs_metadata", disk_type});
@ -259,7 +259,7 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk(
checksums.files[file_name] = {}; checksums.files[file_name] = {};
} }
auto disk = part->volume->getDisk(); //auto disk = part->volume->getDisk();
MergeTreeData::DataPart::Checksums data_checksums; MergeTreeData::DataPart::Checksums data_checksums;
for (const auto & [name, projection] : part->getProjectionParts()) for (const auto & [name, projection] : part->getProjectionParts())
{ {
@ -285,14 +285,14 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk(
{ {
String file_name = it.first; String file_name = it.first;
String path = fs::path(part->getFullRelativePath()) / file_name; //String path = fs::path(part->getFullRelativePath()) / file_name;
UInt64 size = disk->getFileSize(path); UInt64 size = part->data_part_storage->getFileSize(file_name);
writeStringBinary(it.first, out); writeStringBinary(it.first, out);
writeBinary(size, out); writeBinary(size, out);
auto file_in = disk->readFile(path); auto file_in = part->data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt);
HashingWriteBuffer hashing_out(out); HashingWriteBuffer hashing_out(out);
copyDataWithThrottler(*file_in, hashing_out, blocker.getCounter(), data.getSendsThrottler()); copyDataWithThrottler(*file_in, hashing_out, blocker.getCounter(), data.getSendsThrottler());
@ -300,7 +300,11 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk(
throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED); throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED);
if (hashing_out.count() != size) if (hashing_out.count() != size)
throw Exception(ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, "Unexpected size of file {}, expected {} got {}", path, hashing_out.count(), size); throw Exception(
ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART,
"Unexpected size of file {}, expected {} got {}",
std::string(fs::path(part->data_part_storage->getFullRelativePath()) / file_name),
hashing_out.count(), size);
writePODBinary(hashing_out.getHash(), out); writePODBinary(hashing_out.getHash(), out);
@ -314,7 +318,11 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk(
void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part, WriteBuffer & out) void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part, WriteBuffer & out)
{ {
auto disk = part->volume->getDisk(); const auto * data_part_storage_on_disk = typeid_cast<const DataPartStorageOnDisk *>(part->data_part_storage.get());
if (!data_part_storage_on_disk)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Storage '{}' doesn't support zero-copy replication", part->data_part_storage->getName());
auto disk = data_part_storage_on_disk->getDisk();
if (!disk->supportZeroCopyReplication()) if (!disk->supportZeroCopyReplication())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Disk '{}' doesn't support zero-copy replication", disk->getName()); throw Exception(ErrorCodes::LOGICAL_ERROR, "Disk '{}' doesn't support zero-copy replication", disk->getName());
@ -328,7 +336,7 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part
std::vector<std::string> paths; std::vector<std::string> paths;
paths.reserve(checksums.files.size()); paths.reserve(checksums.files.size());
for (const auto & it : checksums.files) for (const auto & it : checksums.files)
paths.push_back(fs::path(part->getFullRelativePath()) / it.first); paths.push_back(fs::path(part->data_part_storage->getFullRelativePath()) / it.first);
/// Serialized metadatadatas with zero ref counts. /// Serialized metadatadatas with zero ref counts.
auto metadatas = disk->getSerializedMetadata(paths); auto metadatas = disk->getSerializedMetadata(paths);
@ -340,7 +348,7 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part
for (const auto & it : checksums.files) for (const auto & it : checksums.files)
{ {
const String & file_name = it.first; const String & file_name = it.first;
String file_path_prefix = fs::path(part->getFullRelativePath()) / file_name; String file_path_prefix = fs::path(part->data_part_storage->getFullRelativePath()) / file_name;
/// Just some additional checks /// Just some additional checks
String metadata_file_path = fs::path(disk->getPath()) / file_path_prefix; String metadata_file_path = fs::path(disk->getPath()) / file_path_prefix;
@ -571,8 +579,19 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
ThrottlerPtr throttler) ThrottlerPtr throttler)
{ {
auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk, 0); auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk, 0);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
volume,
data.getRelativeDataPath(),
part_name);
auto data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
volume,
data.getRelativeDataPath(),
part_name);
MergeTreeData::MutableDataPartPtr new_data_part = MergeTreeData::MutableDataPartPtr new_data_part =
std::make_shared<MergeTreeDataPartInMemory>(data, part_name, volume); std::make_shared<MergeTreeDataPartInMemory>(data, part_name, data_part_storage);
for (auto i = 0ul; i < projections; ++i) for (auto i = 0ul; i < projections; ++i)
{ {
@ -586,9 +605,12 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
auto block = block_in.read(); auto block = block_in.read();
throttler->add(block.bytes()); throttler->add(block.bytes());
auto projection_part_storage = data_part_storage->getProjection(projection_name + ".proj");
auto projection_part_storage_builder = data_part_storage_builder->getProjection(projection_name + ".proj");
MergeTreePartInfo new_part_info("all", 0, 0, 0); MergeTreePartInfo new_part_info("all", 0, 0, 0);
MergeTreeData::MutableDataPartPtr new_projection_part = MergeTreeData::MutableDataPartPtr new_projection_part =
std::make_shared<MergeTreeDataPartInMemory>(data, projection_name, new_part_info, volume, projection_name, new_data_part.get()); std::make_shared<MergeTreeDataPartInMemory>(data, projection_name, new_part_info, projection_part_storage, new_data_part.get());
new_projection_part->is_temp = false; new_projection_part->is_temp = false;
new_projection_part->setColumns(block.getNamesAndTypesList()); new_projection_part->setColumns(block.getNamesAndTypesList());
@ -598,6 +620,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
MergedBlockOutputStream part_out( MergedBlockOutputStream part_out(
new_projection_part, new_projection_part,
projection_part_storage_builder,
metadata_snapshot->projections.get(projection_name).metadata, metadata_snapshot->projections.get(projection_name).metadata,
block.getNamesAndTypesList(), block.getNamesAndTypesList(),
{}, {},
@ -624,7 +647,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
new_data_part->partition.create(metadata_snapshot, block, 0, context); new_data_part->partition.create(metadata_snapshot, block, 0, context);
MergedBlockOutputStream part_out( MergedBlockOutputStream part_out(
new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, new_data_part, data_part_storage_builder, metadata_snapshot, block.getNamesAndTypesList(), {},
CompressionCodecFactory::instance().get("NONE", {})); CompressionCodecFactory::instance().get("NONE", {}));
part_out.write(block); part_out.write(block);
@ -636,9 +659,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
void Fetcher::downloadBaseOrProjectionPartToDisk( void Fetcher::downloadBaseOrProjectionPartToDisk(
const String & replica_path, const String & replica_path,
const String & part_download_path, DataPartStorageBuilderPtr & data_part_storage_builder,
bool sync, bool sync,
DiskPtr disk,
PooledReadWriteBufferFromHTTP & in, PooledReadWriteBufferFromHTTP & in,
MergeTreeData::DataPart::Checksums & checksums, MergeTreeData::DataPart::Checksums & checksums,
ThrottlerPtr throttler) const ThrottlerPtr throttler) const
@ -656,14 +678,14 @@ void Fetcher::downloadBaseOrProjectionPartToDisk(
/// File must be inside "absolute_part_path" directory. /// File must be inside "absolute_part_path" directory.
/// Otherwise malicious ClickHouse replica may force us to write to arbitrary path. /// Otherwise malicious ClickHouse replica may force us to write to arbitrary path.
String absolute_file_path = fs::weakly_canonical(fs::path(part_download_path) / file_name); String absolute_file_path = fs::weakly_canonical(fs::path(data_part_storage_builder->getFullRelativePath()) / file_name);
if (!startsWith(absolute_file_path, fs::weakly_canonical(part_download_path).string())) if (!startsWith(absolute_file_path, fs::weakly_canonical(data_part_storage_builder->getFullRelativePath()).string()))
throw Exception(ErrorCodes::INSECURE_PATH, throw Exception(ErrorCodes::INSECURE_PATH,
"File path ({}) doesn't appear to be inside part path ({}). " "File path ({}) doesn't appear to be inside part path ({}). "
"This may happen if we are trying to download part from malicious replica or logical error.", "This may happen if we are trying to download part from malicious replica or logical error.",
absolute_file_path, part_download_path); absolute_file_path, data_part_storage_builder->getFullRelativePath());
auto file_out = disk->writeFile(fs::path(part_download_path) / file_name); auto file_out = data_part_storage_builder->writeFile(file_name, file_size);
HashingWriteBuffer hashing_out(*file_out); HashingWriteBuffer hashing_out(*file_out);
copyDataWithThrottler(in, hashing_out, file_size, blocker.getCounter(), throttler); copyDataWithThrottler(in, hashing_out, file_size, blocker.getCounter(), throttler);
@ -672,7 +694,7 @@ void Fetcher::downloadBaseOrProjectionPartToDisk(
/// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// NOTE The is_cancelled flag also makes sense to check every time you read over the network,
/// performing a poll with a not very large timeout. /// performing a poll with a not very large timeout.
/// And now we check it only between read chunks (in the `copyData` function). /// And now we check it only between read chunks (in the `copyData` function).
disk->removeRecursive(part_download_path); data_part_storage_builder->removeRecursive();
throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);
} }
@ -682,7 +704,7 @@ void Fetcher::downloadBaseOrProjectionPartToDisk(
if (expected_hash != hashing_out.getHash()) if (expected_hash != hashing_out.getHash())
throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH,
"Checksum mismatch for file {} transferred from {}", "Checksum mismatch for file {} transferred from {}",
fullPath(disk, (fs::path(part_download_path) / file_name).string()), (fs::path(data_part_storage_builder->getFullPath()) / file_name).string(),
replica_path); replica_path);
if (file_name != "checksums.txt" && if (file_name != "checksums.txt" &&
@ -717,21 +739,33 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk(
|| std::string::npos != part_name.find_first_of("/.")) || std::string::npos != part_name.find_first_of("/."))
throw Exception("Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters.", ErrorCodes::LOGICAL_ERROR); throw Exception("Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters.", ErrorCodes::LOGICAL_ERROR);
String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; String part_dir = tmp_prefix + part_name;
String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; String part_relative_path = data.getRelativeDataPath() + String(to_detached ? "detached/" : "");
if (disk->exists(part_download_path)) auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk, 0);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
volume,
part_relative_path,
part_dir);
DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
volume,
part_relative_path,
part_dir);
if (data_part_storage_builder->exists())
{ {
LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. Will remove it before fetching part.", LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. Will remove it before fetching part.",
fullPath(disk, part_download_path)); data_part_storage_builder->getFullPath());
disk->removeRecursive(part_download_path); data_part_storage_builder->removeRecursive();
} }
disk->createDirectories(part_download_path); data_part_storage_builder->createDirectories();
SyncGuardPtr sync_guard; SyncGuardPtr sync_guard;
if (data.getSettings()->fsync_part_directory) if (data.getSettings()->fsync_part_directory)
sync_guard = disk->getDirectorySyncGuard(part_download_path); sync_guard = disk->getDirectorySyncGuard(data_part_storage->getRelativePath());
CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch};
@ -740,19 +774,22 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk(
String projection_name; String projection_name;
readStringBinary(projection_name, in); readStringBinary(projection_name, in);
MergeTreeData::DataPart::Checksums projection_checksum; MergeTreeData::DataPart::Checksums projection_checksum;
disk->createDirectories(part_download_path + projection_name + ".proj/");
auto projection_part_storage = data_part_storage->getProjection(projection_name + ".proj");
auto projection_part_storage_builder = data_part_storage_builder->getProjection(projection_name + ".proj");
projection_part_storage_builder->createDirectories();
downloadBaseOrProjectionPartToDisk( downloadBaseOrProjectionPartToDisk(
replica_path, part_download_path + projection_name + ".proj/", sync, disk, in, projection_checksum, throttler); replica_path, projection_part_storage_builder, sync, in, projection_checksum, throttler);
checksums.addFile( checksums.addFile(
projection_name + ".proj", projection_checksum.getTotalSizeOnDisk(), projection_checksum.getTotalChecksumUInt128()); projection_name + ".proj", projection_checksum.getTotalSizeOnDisk(), projection_checksum.getTotalChecksumUInt128());
} }
// Download the base part // Download the base part
downloadBaseOrProjectionPartToDisk(replica_path, part_download_path, sync, disk, in, checksums, throttler); downloadBaseOrProjectionPartToDisk(replica_path, data_part_storage_builder, sync, in, checksums, throttler);
assertEOF(in); assertEOF(in);
auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk, 0); MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, data_part_storage);
MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path);
new_data_part->is_temp = true; new_data_part->is_temp = true;
new_data_part->modification_time = time(nullptr); new_data_part->modification_time = time(nullptr);
new_data_part->loadColumnsChecksumsIndexes(true, false); new_data_part->loadColumnsChecksumsIndexes(true, false);
@ -785,21 +822,31 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
static const String TMP_PREFIX = "tmp-fetch_"; static const String TMP_PREFIX = "tmp-fetch_";
String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_;
String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; String part_dir = tmp_prefix + part_name;
String part_download_path = fs::path(data.getRelativeDataPath()) / part_relative_path / ""; String part_relative_path = data.getRelativeDataPath() + String(to_detached ? "detached/" : "");
if (disk->exists(part_download_path)) auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk);
throw Exception(ErrorCodes::DIRECTORY_ALREADY_EXISTS, "Directory {} already exists.", fullPath(disk, part_download_path));
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
volume,
part_relative_path,
part_dir);
DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
volume,
part_relative_path,
part_dir);
if (data_part_storage_builder->exists())
throw Exception(ErrorCodes::DIRECTORY_ALREADY_EXISTS, "Directory {} already exists.", data_part_storage_builder->getFullPath());
CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch};
disk->createDirectories(part_download_path); data_part_storage_builder->createDirectories();
size_t files; size_t files;
readBinary(files, in); readBinary(files, in);
auto volume = std::make_shared<SingleDiskVolume>("volume_" + part_name, disk);
for (size_t i = 0; i < files; ++i) for (size_t i = 0; i < files; ++i)
{ {
String file_name; String file_name;
@ -808,8 +855,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
readStringBinary(file_name, in); readStringBinary(file_name, in);
readBinary(file_size, in); readBinary(file_size, in);
String data_path = fs::path(part_download_path) / file_name; String metadata_file = fs::path(data_part_storage_builder->getFullPath()) / file_name;
String metadata_file = fullPath(disk, data_path);
{ {
auto file_out = std::make_unique<WriteBufferFromFile>(metadata_file, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0); auto file_out = std::make_unique<WriteBufferFromFile>(metadata_file, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0);
@ -823,7 +869,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
/// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// NOTE The is_cancelled flag also makes sense to check every time you read over the network,
/// performing a poll with a not very large timeout. /// performing a poll with a not very large timeout.
/// And now we check it only between read chunks (in the `copyData` function). /// And now we check it only between read chunks (in the `copyData` function).
disk->removeSharedRecursive(part_download_path, true); data_part_storage_builder->removeSharedRecursive(true);
throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);
} }
@ -841,7 +887,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
assertEOF(in); assertEOF(in);
MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, data_part_storage);
new_data_part->is_temp = true; new_data_part->is_temp = true;
new_data_part->modification_time = time(nullptr); new_data_part->modification_time = time(nullptr);
new_data_part->loadColumnsChecksumsIndexes(true, false); new_data_part->loadColumnsChecksumsIndexes(true, false);

View File

@ -90,9 +90,8 @@ public:
private: private:
void downloadBaseOrProjectionPartToDisk( void downloadBaseOrProjectionPartToDisk(
const String & replica_path, const String & replica_path,
const String & part_download_path, DataPartStorageBuilderPtr & data_part_storage_builder,
bool sync, bool sync,
DiskPtr disk,
PooledReadWriteBufferFromHTTP & in, PooledReadWriteBufferFromHTTP & in,
MergeTreeData::DataPart::Checksums & checksums, MergeTreeData::DataPart::Checksums & checksums,
ThrottlerPtr throttler) const; ThrottlerPtr throttler) const;

View File

@ -11,8 +11,25 @@ class ReadBufferFromFileBase;
class WriteBufferFromFileBase; class WriteBufferFromFileBase;
class IDiskDirectoryIterator; class IDataPartStorageIterator
using DiskDirectoryIteratorPtr = std::unique_ptr<IDiskDirectoryIterator>; {
public:
/// Iterate to the next file.
virtual void next() = 0;
/// Return `true` if the iterator points to a valid element.
virtual bool isValid() const = 0;
/// Return `true` if the iterator points to a file.
virtual bool isFile() const = 0;
/// Name of the file that the iterator currently points to.
virtual std::string name() const = 0;
virtual ~IDataPartStorageIterator() = default;
};
using DataPartStorageIteratorPtr = std::unique_ptr<IDataPartStorageIterator>;
struct MergeTreeDataPartChecksums; struct MergeTreeDataPartChecksums;
@ -24,6 +41,9 @@ class IStoragePolicy;
class IDisk; class IDisk;
using DiskPtr = std::shared_ptr<IDisk>; using DiskPtr = std::shared_ptr<IDisk>;
class ISyncGuard;
using SyncGuardPtr = std::unique_ptr<ISyncGuard>;
class IBackupEntry; class IBackupEntry;
using BackupEntryPtr = std::unique_ptr<IBackupEntry>; using BackupEntryPtr = std::unique_ptr<IBackupEntry>;
using BackupEntries = std::vector<std::pair<std::string, BackupEntryPtr>>; using BackupEntries = std::vector<std::pair<std::string, BackupEntryPtr>>;
@ -50,8 +70,8 @@ public:
virtual Poco::Timestamp getLastModified() const = 0; virtual Poco::Timestamp getLastModified() const = 0;
virtual DiskDirectoryIteratorPtr iterate() const = 0; virtual DataPartStorageIteratorPtr iterate() const = 0;
virtual DiskDirectoryIteratorPtr iterateDirectory(const std::string & path) const = 0; virtual DataPartStorageIteratorPtr iterateDirectory(const std::string & path) const = 0;
struct ProjectionChecksums struct ProjectionChecksums
{ {
@ -66,6 +86,7 @@ public:
Poco::Logger * log) const = 0; Poco::Logger * log) const = 0;
virtual size_t getFileSize(const std::string & path) const = 0; virtual size_t getFileSize(const std::string & path) const = 0;
virtual UInt32 getRefCount(const String &) const { return 0; }
virtual std::string getRelativePathForPrefix(Poco::Logger * log, const String & prefix, bool detached) const = 0; virtual std::string getRelativePathForPrefix(Poco::Logger * log, const String & prefix, bool detached) const = 0;
@ -85,8 +106,8 @@ public:
virtual std::string getDiskPathForLogs() const = 0; virtual std::string getDiskPathForLogs() const = 0;
/// Should remove it later /// Should remove it later
virtual void writeChecksums(MergeTreeDataPartChecksums & checksums) const = 0; virtual void writeChecksums(const MergeTreeDataPartChecksums & checksums) const = 0;
virtual void writeColumns(NamesAndTypesList & columns) const = 0; virtual void writeColumns(const NamesAndTypesList & columns) const = 0;
virtual void writeDeleteOnDestroyMarker(Poco::Logger * log) const = 0; virtual void writeDeleteOnDestroyMarker(Poco::Logger * log) const = 0;
virtual void checkConsistency(const MergeTreeDataPartChecksums & checksums) const = 0; virtual void checkConsistency(const MergeTreeDataPartChecksums & checksums) const = 0;
@ -124,6 +145,7 @@ public:
/// Disk name /// Disk name
virtual std::string getName() const = 0; virtual std::string getName() const = 0;
virtual std::string getDiskType() const = 0;
virtual std::shared_ptr<IDataPartStorage> getProjection(const std::string & name) const = 0; virtual std::shared_ptr<IDataPartStorage> getProjection(const std::string & name) const = 0;
}; };
@ -140,11 +162,13 @@ public:
virtual std::string getRelativePath() const = 0; virtual std::string getRelativePath() const = 0;
virtual std::string getFullPath() const = 0; virtual std::string getFullPath() const = 0;
virtual std::string getFullRelativePath() const = 0;
virtual bool exists() const = 0; virtual bool exists() const = 0;
virtual bool exists(const std::string & path) const = 0; virtual bool exists(const std::string & path) const = 0;
virtual void createDirectories() = 0; virtual void createDirectories() = 0;
virtual void createProjection(const std::string & name) = 0;
virtual std::unique_ptr<ReadBufferFromFileBase> readFile( virtual std::unique_ptr<ReadBufferFromFileBase> readFile(
const std::string & path, const std::string & path,
@ -152,12 +176,15 @@ public:
std::optional<size_t> read_hint, std::optional<size_t> read_hint,
std::optional<size_t> file_size) const = 0; std::optional<size_t> file_size) const = 0;
virtual std::unique_ptr<WriteBufferFromFileBase> writeFile( virtual std::unique_ptr<WriteBufferFromFileBase> writeFile(const String & path, size_t buf_size) = 0;
const String & path,
size_t buf_size /* = DBMS_DEFAULT_BUFFER_SIZE*/) = 0;
virtual void removeFile(const String & path) = 0; virtual void removeFile(const String & path) = 0;
virtual void removeRecursive() = 0; virtual void removeRecursive() = 0;
virtual void removeSharedRecursive(bool keep_in_remote_fs) = 0;
virtual SyncGuardPtr getDirectorySyncGuard() const { return nullptr; }
virtual void createHardLinkFrom(const IDataPartStorage & source, const std::string & from, const std::string & to) const = 0;
virtual ReservationPtr reserve(UInt64 /*bytes*/) { return nullptr; } virtual ReservationPtr reserve(UInt64 /*bytes*/) { return nullptr; }

View File

@ -1298,12 +1298,7 @@ void IMergeTreeDataPart::remove() const
return; return;
if (isProjectionPart()) if (isProjectionPart())
{ LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Projection part {} should be removed by its parent {}.",
name, parent_part->name);
}
metadata_manager->deleteAll(false); metadata_manager->deleteAll(false);
metadata_manager->assertAllDeleted(false); metadata_manager->assertAllDeleted(false);

View File

@ -87,8 +87,8 @@ public:
UncompressedCache * uncompressed_cache, UncompressedCache * uncompressed_cache,
MarkCache * mark_cache, MarkCache * mark_cache,
const MergeTreeReaderSettings & reader_settings_, const MergeTreeReaderSettings & reader_settings_,
const ValueSizeMap & avg_value_size_hints_ = ValueSizeMap{}, const ValueSizeMap & avg_value_size_hints_,
const ReadBufferFromFileBase::ProfileCallback & profile_callback_ = ReadBufferFromFileBase::ProfileCallback{}) const = 0; const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0;
virtual MergeTreeWriterPtr getWriter( virtual MergeTreeWriterPtr getWriter(
DataPartStorageBuilderPtr data_part_storage_builder, DataPartStorageBuilderPtr data_part_storage_builder,
@ -97,7 +97,7 @@ public:
const std::vector<MergeTreeIndexPtr> & indices_to_recalc, const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_, const CompressionCodecPtr & default_codec_,
const MergeTreeWriterSettings & writer_settings, const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity = {}) const = 0; const MergeTreeIndexGranularity & computed_index_granularity) const = 0;
virtual bool isStoredOnDisk() const = 0; virtual bool isStoredOnDisk() const = 0;

View File

@ -5,6 +5,7 @@
#include <base/logger_useful.h> #include <base/logger_useful.h>
#include <Common/ActionBlocker.h> #include <Common/ActionBlocker.h>
#include <Storages/MergeTree/DataPartStorageOnDisk.h>
#include <DataTypes/ObjectUtils.h> #include <DataTypes/ObjectUtils.h>
#include <DataTypes/Serializations/SerializationInfo.h> #include <DataTypes/Serializations/SerializationInfo.h>
@ -151,12 +152,22 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
auto local_single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + global_ctx->future_part->name, ctx->disk, 0); auto local_single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + global_ctx->future_part->name, ctx->disk, 0);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
local_single_disk_volume,
local_part_path,
local_tmp_part_basename);
global_ctx->data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
local_single_disk_volume,
local_part_path,
local_tmp_part_basename);
global_ctx->new_data_part = global_ctx->data->createPart( global_ctx->new_data_part = global_ctx->data->createPart(
global_ctx->future_part->name, global_ctx->future_part->name,
global_ctx->future_part->type, global_ctx->future_part->type,
global_ctx->future_part->part_info, global_ctx->future_part->part_info,
local_single_disk_volume, data_part_storage,
local_tmp_part_basename,
global_ctx->parent_part); global_ctx->parent_part);
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid; global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
@ -256,6 +267,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->to = std::make_shared<MergedBlockOutputStream>( global_ctx->to = std::make_shared<MergedBlockOutputStream>(
global_ctx->new_data_part, global_ctx->new_data_part,
global_ctx->data_part_storage_builder,
global_ctx->metadata_snapshot, global_ctx->metadata_snapshot,
global_ctx->merging_columns, global_ctx->merging_columns,
MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()),
@ -438,6 +450,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
ctx->executor = std::make_unique<PullingPipelineExecutor>(ctx->column_parts_pipeline); ctx->executor = std::make_unique<PullingPipelineExecutor>(ctx->column_parts_pipeline);
ctx->column_to = std::make_unique<MergedColumnOnlyOutputStream>( ctx->column_to = std::make_unique<MergedColumnOnlyOutputStream>(
global_ctx->data_part_storage_builder,
global_ctx->new_data_part, global_ctx->new_data_part,
global_ctx->metadata_snapshot, global_ctx->metadata_snapshot,
ctx->executor->getHeader(), ctx->executor->getHeader(),

View File

@ -157,6 +157,7 @@ private:
std::unique_ptr<PullingPipelineExecutor> merging_executor; std::unique_ptr<PullingPipelineExecutor> merging_executor;
MergeTreeData::MutableDataPartPtr new_data_part{nullptr}; MergeTreeData::MutableDataPartPtr new_data_part{nullptr};
DataPartStorageBuilderPtr data_part_storage_builder;
size_t rows_written{0}; size_t rows_written{0};
UInt64 watch_prev_elapsed{0}; UInt64 watch_prev_elapsed{0};

View File

@ -64,11 +64,11 @@ void MergeTreeSelectProcessor::initializeReaders()
owned_mark_cache = storage.getContext()->getMarkCache(); owned_mark_cache = storage.getContext()->getMarkCache();
reader = data_part->getReader(task_columns.columns, storage_snapshot->getMetadataForQuery(), reader = data_part->getReader(task_columns.columns, storage_snapshot->getMetadataForQuery(),
all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {});
if (prewhere_info) if (prewhere_info)
pre_reader = data_part->getReader(task_columns.pre_columns, storage_snapshot->getMetadataForQuery(), pre_reader = data_part->getReader(task_columns.pre_columns, storage_snapshot->getMetadataForQuery(),
all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {});
} }

View File

@ -66,7 +66,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata,
MarkRanges{MarkRange(0, data_part->getMarksCount())}, MarkRanges{MarkRange(0, data_part->getMarksCount())},
/* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings); /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings, {}, {});
} }
Chunk MergeTreeSequentialSource::generate() Chunk MergeTreeSequentialSource::generate()

View File

@ -10,6 +10,7 @@ namespace ErrorCodes
} }
MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
DataPartStorageBuilderPtr data_part_storage_builder_,
const MergeTreeDataPartPtr & data_part, const MergeTreeDataPartPtr & data_part,
const StorageMetadataPtr & metadata_snapshot_, const StorageMetadataPtr & metadata_snapshot_,
const Block & header_, const Block & header_,
@ -18,7 +19,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
WrittenOffsetColumns * offset_columns_, WrittenOffsetColumns * offset_columns_,
const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularity & index_granularity,
const MergeTreeIndexGranularityInfo * index_granularity_info) const MergeTreeIndexGranularityInfo * index_granularity_info)
: IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) : IMergedBlockOutputStream(std::move(data_part_storage_builder_), data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true)
, header(header_) , header(header_)
{ {
const auto & global_settings = data_part->storage.getContext()->getSettings(); const auto & global_settings = data_part->storage.getContext()->getSettings();
@ -31,6 +32,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
/* rewrite_primary_key = */false); /* rewrite_primary_key = */false);
writer = data_part->getWriter( writer = data_part->getWriter(
data_part_storage_builder,
header.getNamesAndTypesList(), header.getNamesAndTypesList(),
metadata_snapshot_, metadata_snapshot_,
indices_to_recalc, indices_to_recalc,
@ -75,13 +77,11 @@ MergedColumnOnlyOutputStream::fillChecksums(
auto removed_files = removeEmptyColumnsFromPart(new_part, columns, serialization_infos, checksums); auto removed_files = removeEmptyColumnsFromPart(new_part, columns, serialization_infos, checksums);
auto disk = new_part->volume->getDisk();
for (const String & removed_file : removed_files) for (const String & removed_file : removed_files)
{ {
auto file_path = new_part->getFullRelativePath() + removed_file;
/// Can be called multiple times, don't need to remove file twice /// Can be called multiple times, don't need to remove file twice
if (disk->exists(file_path)) if (data_part_storage_builder->exists(removed_file))
disk->removeFile(file_path); data_part_storage_builder->removeFile(removed_file);
if (all_checksums.files.count(removed_file)) if (all_checksums.files.count(removed_file))
all_checksums.files.erase(removed_file); all_checksums.files.erase(removed_file);

View File

@ -14,6 +14,7 @@ public:
/// Pass empty 'already_written_offset_columns' first time then and pass the same object to subsequent instances of MergedColumnOnlyOutputStream /// Pass empty 'already_written_offset_columns' first time then and pass the same object to subsequent instances of MergedColumnOnlyOutputStream
/// if you want to serialize elements of Nested data structure in different instances of MergedColumnOnlyOutputStream. /// if you want to serialize elements of Nested data structure in different instances of MergedColumnOnlyOutputStream.
MergedColumnOnlyOutputStream( MergedColumnOnlyOutputStream(
DataPartStorageBuilderPtr data_part_storage_builder_,
const MergeTreeDataPartPtr & data_part, const MergeTreeDataPartPtr & data_part,
const StorageMetadataPtr & metadata_snapshot_, const StorageMetadataPtr & metadata_snapshot_,
const Block & header_, const Block & header_,

View File

@ -2,6 +2,7 @@
#include <base/logger_useful.h> #include <base/logger_useful.h>
#include <Common/escapeForFileName.h> #include <Common/escapeForFileName.h>
#include <Storages/MergeTree/DataPartStorageOnDisk.h>
#include <Parsers/queryToString.h> #include <Parsers/queryToString.h>
#include <Interpreters/SquashingTransform.h> #include <Interpreters/SquashingTransform.h>
#include <Processors/Transforms/TTLTransform.h> #include <Processors/Transforms/TTLTransform.h>
@ -417,16 +418,17 @@ static NameToNameVector collectFilesForRenames(
/// Initialize and write to disk new part fields like checksums, columns, etc. /// Initialize and write to disk new part fields like checksums, columns, etc.
void finalizeMutatedPart( void finalizeMutatedPart(
const MergeTreeDataPartPtr & source_part, const MergeTreeDataPartPtr & source_part,
const DataPartStorageBuilderPtr & data_part_storage_builder,
MergeTreeData::MutableDataPartPtr new_data_part, MergeTreeData::MutableDataPartPtr new_data_part,
ExecuteTTLType execute_ttl_type, ExecuteTTLType execute_ttl_type,
const CompressionCodecPtr & codec) const CompressionCodecPtr & codec)
{ {
auto disk = new_data_part->volume->getDisk(); //auto disk = new_data_part->volume->getDisk();
auto part_path = fs::path(new_data_part->getFullRelativePath()); //auto part_path = fs::path(new_data_part->getFullRelativePath());
if (new_data_part->uuid != UUIDHelpers::Nil) if (new_data_part->uuid != UUIDHelpers::Nil)
{ {
auto out = disk->writeFile(part_path / IMergeTreeDataPart::UUID_FILE_NAME, 4096); auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::UUID_FILE_NAME, 4096);
HashingWriteBuffer out_hashing(*out); HashingWriteBuffer out_hashing(*out);
writeUUIDText(new_data_part->uuid, out_hashing); writeUUIDText(new_data_part->uuid, out_hashing);
new_data_part->checksums.files[IMergeTreeDataPart::UUID_FILE_NAME].file_size = out_hashing.count(); new_data_part->checksums.files[IMergeTreeDataPart::UUID_FILE_NAME].file_size = out_hashing.count();
@ -436,7 +438,7 @@ void finalizeMutatedPart(
if (execute_ttl_type != ExecuteTTLType::NONE) if (execute_ttl_type != ExecuteTTLType::NONE)
{ {
/// Write a file with ttl infos in json format. /// Write a file with ttl infos in json format.
auto out_ttl = disk->writeFile(part_path / "ttl.txt", 4096); auto out_ttl = data_part_storage_builder->writeFile("ttl.txt", 4096);
HashingWriteBuffer out_hashing(*out_ttl); HashingWriteBuffer out_hashing(*out_ttl);
new_data_part->ttl_infos.write(out_hashing); new_data_part->ttl_infos.write(out_hashing);
new_data_part->checksums.files["ttl.txt"].file_size = out_hashing.count(); new_data_part->checksums.files["ttl.txt"].file_size = out_hashing.count();
@ -445,7 +447,7 @@ void finalizeMutatedPart(
if (!new_data_part->getSerializationInfos().empty()) if (!new_data_part->getSerializationInfos().empty())
{ {
auto out = disk->writeFile(part_path / IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096); auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096);
HashingWriteBuffer out_hashing(*out); HashingWriteBuffer out_hashing(*out);
new_data_part->getSerializationInfos().writeJSON(out_hashing); new_data_part->getSerializationInfos().writeJSON(out_hashing);
new_data_part->checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count(); new_data_part->checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count();
@ -454,18 +456,18 @@ void finalizeMutatedPart(
{ {
/// Write file with checksums. /// Write file with checksums.
auto out_checksums = disk->writeFile(part_path / "checksums.txt", 4096); auto out_checksums = data_part_storage_builder->writeFile("checksums.txt", 4096);
new_data_part->checksums.write(*out_checksums); new_data_part->checksums.write(*out_checksums);
} /// close fd } /// close fd
{ {
auto out = disk->writeFile(part_path / IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096); auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096);
DB::writeText(queryToString(codec->getFullCodecDesc()), *out); DB::writeText(queryToString(codec->getFullCodecDesc()), *out);
} }
{ {
/// Write a file with a description of columns. /// Write a file with a description of columns.
auto out_columns = disk->writeFile(part_path / "columns.txt", 4096); auto out_columns = data_part_storage_builder->writeFile("columns.txt", 4096);
new_data_part->getColumns().writeText(*out_columns); new_data_part->getColumns().writeText(*out_columns);
} /// close fd } /// close fd
@ -475,8 +477,7 @@ void finalizeMutatedPart(
new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->minmax_idx = source_part->minmax_idx;
new_data_part->modification_time = time(nullptr); new_data_part->modification_time = time(nullptr);
new_data_part->loadProjections(false, false); new_data_part->loadProjections(false, false);
new_data_part->setBytesOnDisk( new_data_part->setBytesOnDisk(new_data_part->data_part_storage->calculateTotalSizeOnDisk());
MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), part_path));
new_data_part->default_codec = codec; new_data_part->default_codec = codec;
new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk();
new_data_part->storage.lockSharedData(*new_data_part); new_data_part->storage.lockSharedData(*new_data_part);
@ -525,10 +526,11 @@ struct MutationContext
MutationsInterpreter::MutationKind::MutationKindEnum mutation_kind MutationsInterpreter::MutationKind::MutationKindEnum mutation_kind
= MutationsInterpreter::MutationKind::MutationKindEnum::MUTATE_UNKNOWN; = MutationsInterpreter::MutationKind::MutationKindEnum::MUTATE_UNKNOWN;
VolumePtr single_disk_volume; //VolumePtr single_disk_volume;
MergeTreeData::MutableDataPartPtr new_data_part; MergeTreeData::MutableDataPartPtr new_data_part;
DiskPtr disk; //DiskPtr disk;
String new_part_tmp_path; DataPartStorageBuilderPtr data_part_storage_builder;
//String new_part_tmp_path;
IMergedBlockOutputStreamPtr out{nullptr}; IMergedBlockOutputStreamPtr out{nullptr};
@ -810,7 +812,7 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections()
if (projection_block) if (projection_block)
{ {
auto tmp_part = MergeTreeDataWriter::writeTempProjectionPart( auto tmp_part = MergeTreeDataWriter::writeTempProjectionPart(
*ctx->data, ctx->log, projection_block, projection, ctx->new_data_part.get(), ++block_num); *ctx->data, ctx->log, projection_block, projection, ctx->data_part_storage_builder, ctx->new_data_part.get(), ++block_num);
tmp_part.finalize(); tmp_part.finalize();
projection_parts[projection.name].emplace_back(std::move(tmp_part.part)); projection_parts[projection.name].emplace_back(std::move(tmp_part.part));
} }
@ -832,7 +834,7 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections()
if (projection_block) if (projection_block)
{ {
auto temp_part = MergeTreeDataWriter::writeTempProjectionPart( auto temp_part = MergeTreeDataWriter::writeTempProjectionPart(
*ctx->data, ctx->log, projection_block, projection, ctx->new_data_part.get(), ++block_num); *ctx->data, ctx->log, projection_block, projection, ctx->data_part_storage_builder, ctx->new_data_part.get(), ++block_num);
temp_part.finalize(); temp_part.finalize();
projection_parts[projection.name].emplace_back(std::move(temp_part.part)); projection_parts[projection.name].emplace_back(std::move(temp_part.part));
} }
@ -932,7 +934,7 @@ private:
void prepare() void prepare()
{ {
ctx->disk->createDirectories(ctx->new_part_tmp_path); ctx->data_part_storage_builder->createDirectories();
/// Note: this is done before creating input streams, because otherwise data.data_parts_mutex /// Note: this is done before creating input streams, because otherwise data.data_parts_mutex
/// (which is locked in data.getTotalActiveSizeInBytes()) /// (which is locked in data.getTotalActiveSizeInBytes())
@ -968,6 +970,7 @@ private:
ctx->out = std::make_shared<MergedBlockOutputStream>( ctx->out = std::make_shared<MergedBlockOutputStream>(
ctx->new_data_part, ctx->new_data_part,
ctx->data_part_storage_builder,
ctx->metadata_snapshot, ctx->metadata_snapshot,
ctx->new_data_part->getColumns(), ctx->new_data_part->getColumns(),
skip_part_indices, skip_part_indices,
@ -1056,15 +1059,15 @@ private:
if (ctx->execute_ttl_type != ExecuteTTLType::NONE) if (ctx->execute_ttl_type != ExecuteTTLType::NONE)
ctx->files_to_skip.insert("ttl.txt"); ctx->files_to_skip.insert("ttl.txt");
ctx->disk->createDirectories(ctx->new_part_tmp_path); ctx->data_part_storage_builder->createDirectories();
/// Create hardlinks for unchanged files /// Create hardlinks for unchanged files
for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) for (auto it = ctx->source_part->data_part_storage->iterate(); it->isValid(); it->next())
{ {
if (ctx->files_to_skip.count(it->name())) if (ctx->files_to_skip.count(it->name()))
continue; continue;
String destination = ctx->new_part_tmp_path; String destination; // = ctx->new_part_tmp_path;
String file_name = it->name(); String file_name = it->name();
auto rename_it = std::find_if(ctx->files_to_rename.begin(), ctx->files_to_rename.end(), [&file_name](const auto & rename_pair) auto rename_it = std::find_if(ctx->files_to_rename.begin(), ctx->files_to_rename.end(), [&file_name](const auto & rename_pair)
@ -1075,23 +1078,28 @@ private:
{ {
if (rename_it->second.empty()) if (rename_it->second.empty())
continue; continue;
destination += rename_it->second; destination = rename_it->second;
} }
else else
{ {
destination += it->name(); destination = it->name();
} }
if (!ctx->disk->isDirectory(it->path())) if (it->isFile())
ctx->disk->createHardLink(it->path(), destination); ctx->data_part_storage_builder->createHardLinkFrom(
*ctx->source_part->data_part_storage, it->name(), destination);
else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir
{ {
// it's a projection part directory // it's a projection part directory
ctx->disk->createDirectories(destination); ctx->data_part_storage_builder->createDirectory(destination);
for (auto p_it = ctx->disk->iterateDirectory(it->path()); p_it->isValid(); p_it->next())
auto projection_data_part_storage = ctx->source_part->data_part_storage->getProjection(destination);
auto projection_data_part_storage_builder = ctx->data_part_storage_builder->getProjection(destination);
for (auto p_it = projection_data_part_storage->iterate(); p_it->isValid(); p_it->next())
{ {
String p_destination = fs::path(destination) / p_it->name(); projection_data_part_storage_builder->createHardLinkFrom(
ctx->disk->createHardLink(p_it->path(), p_destination); *projection_data_part_storage, p_it->name(), p_it->name());
} }
} }
} }
@ -1162,7 +1170,7 @@ private:
} }
} }
MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec); MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->new_data_part, ctx->data_part_storage_builder, ctx->execute_ttl_type, ctx->compression_codec);
} }
@ -1293,8 +1301,19 @@ bool MutateTask::prepare()
} }
ctx->single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + ctx->future_part->name, ctx->space_reservation->getDisk(), 0); ctx->single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + ctx->future_part->name, ctx->space_reservation->getDisk(), 0);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
ctx->single_disk_volume,
ctx->data->getRelativeDataPath(),
"tmp_mut_" + ctx->future_part->name);
ctx->data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
ctx->single_disk_volume,
ctx->data->getRelativeDataPath(),
"tmp_mut_" + ctx->future_part->name);
ctx->new_data_part = ctx->data->createPart( ctx->new_data_part = ctx->data->createPart(
ctx->future_part->name, ctx->future_part->type, ctx->future_part->part_info, ctx->single_disk_volume, "tmp_mut_" + ctx->future_part->name); ctx->future_part->name, ctx->future_part->type, ctx->future_part->part_info, data_part_storage);
ctx->new_data_part->uuid = ctx->future_part->uuid; ctx->new_data_part->uuid = ctx->future_part->uuid;
ctx->new_data_part->is_temp = true; ctx->new_data_part->is_temp = true;
@ -1311,8 +1330,8 @@ bool MutateTask::prepare()
ctx->new_data_part->setSerializationInfos(new_infos); ctx->new_data_part->setSerializationInfos(new_infos);
ctx->new_data_part->partition.assign(ctx->source_part->partition); ctx->new_data_part->partition.assign(ctx->source_part->partition);
ctx->disk = ctx->new_data_part->volume->getDisk(); // ctx->disk = ctx->new_data_part->volume->getDisk();
ctx->new_part_tmp_path = ctx->new_data_part->getFullRelativePath(); //ctx->new_part_tmp_path = ctx->new_data_part->getFullRelativePath();
/// Don't change granularity type while mutating subset of columns /// Don't change granularity type while mutating subset of columns
ctx->mrk_extension = ctx->source_part->index_granularity_info.is_adaptive ? getAdaptiveMrkExtension(ctx->new_data_part->getType()) ctx->mrk_extension = ctx->source_part->index_granularity_info.is_adaptive ? getAdaptiveMrkExtension(ctx->new_data_part->getType())

View File

@ -1371,7 +1371,7 @@ void StorageMergeTree::dropPartsImpl(DataPartsVector && parts_to_remove, bool de
/// NOTE: no race with background cleanup until we hold pointers to parts /// NOTE: no race with background cleanup until we hold pointers to parts
for (const auto & part : parts_to_remove) for (const auto & part : parts_to_remove)
{ {
LOG_INFO(log, "Detaching {}", part->relative_path); LOG_INFO(log, "Detaching {}", part->data_part_storage->getRelativePath());
part->makeCloneInDetached("", metadata_snapshot); part->makeCloneInDetached("", metadata_snapshot);
} }
} }
@ -1606,29 +1606,25 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_
for (auto & part : data_parts) for (auto & part : data_parts)
{ {
auto disk = part->volume->getDisk(); //auto disk = part->volume->getDisk();
String part_path = part->getFullRelativePath(); //String part_path = part->getFullRelativePath();
/// If the checksums file is not present, calculate the checksums and write them to disk. /// If the checksums file is not present, calculate the checksums and write them to disk.
String checksums_path = fs::path(part_path) / "checksums.txt"; String checksums_path = "checksums.txt";
String tmp_checksums_path = fs::path(part_path) / "checksums.txt.tmp"; String tmp_checksums_path = "checksums.txt.tmp";
if (part->isStoredOnDisk() && !disk->exists(checksums_path)) if (part->isStoredOnDisk() && !part->data_part_storage->exists(checksums_path))
{ {
try try
{ {
auto calculated_checksums = checkDataPart(part, false); auto calculated_checksums = checkDataPart(part, false);
calculated_checksums.checkEqual(part->checksums, true); calculated_checksums.checkEqual(part->checksums, true);
auto out = disk->writeFile(tmp_checksums_path, 4096);
part->checksums.write(*out); part->data_part_storage->writeChecksums(part->checksums);
disk->moveFile(tmp_checksums_path, checksums_path);
part->checkMetadata(); part->checkMetadata();
results.emplace_back(part->name, true, "Checksums recounted and written to disk."); results.emplace_back(part->name, true, "Checksums recounted and written to disk.");
} }
catch (const Exception & ex) catch (const Exception & ex)
{ {
if (disk->exists(tmp_checksums_path))
disk->removeFile(tmp_checksums_path);
results.emplace_back(part->name, false, results.emplace_back(part->name, false,
"Check of part finished with error: '" + ex.message() + "'"); "Check of part finished with error: '" + ex.message() + "'");
} }

View File

@ -9,6 +9,7 @@
#include <Common/formatReadable.h> #include <Common/formatReadable.h>
#include <Common/thread_local_rng.h> #include <Common/thread_local_rng.h>
#include <Common/typeid_cast.h> #include <Common/typeid_cast.h>
#include <Storages/MergeTree/DataPartStorageOnDisk.h>
#include <base/sort.h> #include <base/sort.h>
@ -1437,12 +1438,16 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFo
continue; continue;
const String part_old_name = part_info->getPartName(); const String part_old_name = part_info->getPartName();
const String part_path = fs::path("detached") / part_old_name;
const VolumePtr volume = std::make_shared<SingleDiskVolume>("volume_" + part_old_name, disk); const VolumePtr volume = std::make_shared<SingleDiskVolume>("volume_" + part_old_name, disk);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
volume,
fs::path(relative_data_path) / "detached",
part_old_name);
/// actual_part_info is more recent than part_info so we use it /// actual_part_info is more recent than part_info so we use it
MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, volume, part_path); MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, data_part_storage);
try try
{ {
@ -1457,7 +1462,7 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFo
if (entry.part_checksum == part->checksums.getTotalChecksumHex()) if (entry.part_checksum == part->checksums.getTotalChecksumHex())
{ {
part->modification_time = disk->getLastModified(part->getFullRelativePath()).epochTime(); part->modification_time = data_part_storage->getLastModified().epochTime();
return part; return part;
} }
} }
@ -1823,7 +1828,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry)
/// If DETACH clone parts to detached/ directory /// If DETACH clone parts to detached/ directory
for (const auto & part : parts_to_remove) for (const auto & part : parts_to_remove)
{ {
LOG_INFO(log, "Detaching {}", part->relative_path); LOG_INFO(log, "Detaching {}", part->data_part_storage->getRelativePath());
part->makeCloneInDetached("", metadata_snapshot); part->makeCloneInDetached("", metadata_snapshot);
} }
} }
@ -2448,7 +2453,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo
for (const auto & part : parts_to_remove_from_working_set) for (const auto & part : parts_to_remove_from_working_set)
{ {
LOG_INFO(log, "Detaching {}", part->relative_path); LOG_INFO(log, "Detaching {}", part->data_part_storage->getRelativePath());
part->makeCloneInDetached("clone", metadata_snapshot); part->makeCloneInDetached("clone", metadata_snapshot);
} }
} }
@ -4037,10 +4042,10 @@ bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const
{ {
part = get_part(); part = get_part();
if (part->volume->getDisk()->getName() != replaced_disk->getName()) if (part->data_part_storage->getName() != replaced_disk->getName())
throw Exception("Part " + part->name + " fetched on wrong disk " + part->volume->getDisk()->getName(), ErrorCodes::LOGICAL_ERROR); throw Exception("Part " + part->name + " fetched on wrong disk " + part->data_part_storage->getName(), ErrorCodes::LOGICAL_ERROR);
replaced_disk->removeFileIfExists(replaced_part_path); replaced_disk->removeFileIfExists(replaced_part_path);
replaced_disk->moveDirectory(part->getFullRelativePath(), replaced_part_path); replaced_disk->moveDirectory(part->data_part_storage->getFullRelativePath(), replaced_part_path);
} }
catch (const Exception & e) catch (const Exception & e)
{ {
@ -7115,7 +7120,7 @@ void StorageReplicatedMergeTree::checkBrokenDisks()
for (auto & part : *parts) for (auto & part : *parts)
{ {
if (part->volume && part->volume->getDisk()->getName() == disk_ptr->getName()) if (part->data_part_storage && part->data_part_storage->getName() == disk_ptr->getName())
broken_part_callback(part->name); broken_part_callback(part->name);
} }
continue; continue;
@ -7216,7 +7221,7 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
String id = part_id; String id = part_id;
boost::replace_all(id, "/", "_"); boost::replace_all(id, "/", "_");
Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), toString(disk->getType()), getTableSharedID(),
part_name, zookeeper_path); part_name, zookeeper_path);
for (const auto & zc_zookeeper_path : zc_zookeeper_paths) for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
@ -7230,11 +7235,10 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const
{ {
if (!part.volume || !part.isStoredOnDisk()) if (!part.data_part_storage || !part.isStoredOnDisk())
return; return;
DiskPtr disk = part.volume->getDisk(); if (part.data_part_storage->supportZeroCopyReplication())
if (!disk || !disk->supportZeroCopyReplication())
return; return;
zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
@ -7244,7 +7248,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
String id = part.getUniqueId(); String id = part.getUniqueId();
boost::replace_all(id, "/", "_"); boost::replace_all(id, "/", "_");
Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), part.data_part_storage->getDiskType(), getTableSharedID(),
part.name, zookeeper_path); part.name, zookeeper_path);
for (const auto & zc_zookeeper_path : zc_zookeeper_paths) for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{ {
@ -7265,18 +7269,16 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par
bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const
{ {
if (!part.volume || !part.isStoredOnDisk()) if (!part.data_part_storage || !part.isStoredOnDisk())
return true; return true;
DiskPtr disk = part.volume->getDisk(); if (!part.data_part_storage->supportZeroCopyReplication())
if (!disk || !disk->supportZeroCopyReplication())
return true; return true;
/// If part is temporary refcount file may be absent /// If part is temporary refcount file may be absent
auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK; if (part.data_part_storage->exists(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK))
if (disk->exists(ref_count_path))
{ {
auto ref_count = disk->getRefCount(ref_count_path); auto ref_count = part.data_part_storage->getRefCount(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK);
if (ref_count > 0) /// Keep part shard info for frozen backups if (ref_count > 0) /// Keep part shard info for frozen backups
return false; return false;
} }
@ -7286,18 +7288,18 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par
return true; return true;
} }
return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, getZooKeeper(), *getSettings(), log, return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, part.data_part_storage->getDiskType(), getZooKeeper(), *getSettings(), log,
zookeeper_path); zookeeper_path);
} }
bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name,
const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, const String & replica_name_, std::string disk_type, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings,
Poco::Logger * logger, const String & zookeeper_path_old) Poco::Logger * logger, const String & zookeeper_path_old)
{ {
boost::replace_all(part_id, "/", "_"); boost::replace_all(part_id, "/", "_");
Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk_type, table_uuid, part_name, zookeeper_path_old);
bool part_has_no_more_locks = true; bool part_has_no_more_locks = true;
@ -7381,7 +7383,7 @@ String StorageReplicatedMergeTree::getSharedDataReplica(
if (!zookeeper) if (!zookeeper)
return ""; return "";
Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk_type, getTableSharedID(), part.name, Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), toString(disk_type), getTableSharedID(), part.name,
zookeeper_path); zookeeper_path);
std::set<String> replicas; std::set<String> replicas;
@ -7452,12 +7454,12 @@ String StorageReplicatedMergeTree::getSharedDataReplica(
} }
Strings StorageReplicatedMergeTree::getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, Strings StorageReplicatedMergeTree::getZeroCopyPartPath(const MergeTreeSettings & settings, std::string disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old) const String & part_name, const String & zookeeper_path_old)
{ {
Strings res; Strings res;
String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); String zero_copy = fmt::format("zero_copy_{}", disk_type);
String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name; String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name;
res.push_back(new_path); res.push_back(new_path);
@ -7491,7 +7493,7 @@ std::optional<String> StorageReplicatedMergeTree::getZeroCopyPartPath(const Stri
if (!disk || !disk->supportZeroCopyReplication()) if (!disk || !disk->supportZeroCopyReplication())
return std::nullopt; return std::nullopt;
return getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), part_name, zookeeper_path)[0]; return getZeroCopyPartPath(*getSettings(), toString(disk->getType()), getTableSharedID(), part_name, zookeeper_path)[0];
} }
std::optional<ZeroCopyLock> StorageReplicatedMergeTree::tryCreateZeroCopyExclusiveLock(const String & part_name, const DiskPtr & disk) std::optional<ZeroCopyLock> StorageReplicatedMergeTree::tryCreateZeroCopyExclusiveLock(const String & part_name, const DiskPtr & disk)
@ -7586,12 +7588,22 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>(); auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey())); minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
auto new_volume = createVolumeFromReservation(reservation, volume);
auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
new_volume,
relative_data_path,
TMP_PREFIX + lost_part_name);
DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared<DataPartStorageBuilderOnDisk>(
new_volume,
relative_data_path,
TMP_PREFIX + lost_part_name);
auto new_data_part = createPart( auto new_data_part = createPart(
lost_part_name, lost_part_name,
choosePartType(0, block.rows()), choosePartType(0, block.rows()),
new_part_info, new_part_info,
createVolumeFromReservation(reservation, volume), data_part_storage);
TMP_PREFIX + lost_part_name);
if (settings->assign_part_uuids) if (settings->assign_part_uuids)
new_data_part->uuid = UUIDHelpers::generateV4(); new_data_part->uuid = UUIDHelpers::generateV4();
@ -7628,19 +7640,16 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
if (new_data_part->isStoredOnDisk()) if (new_data_part->isStoredOnDisk())
{ {
/// The name could be non-unique in case of stale files from previous runs. /// The name could be non-unique in case of stale files from previous runs.
String full_path = new_data_part->getFullRelativePath(); if (data_part_storage_builder->exists())
if (new_data_part->volume->getDisk()->exists(full_path))
{ {
LOG_WARNING(log, "Removing old temporary directory {}", fullPath(new_data_part->volume->getDisk(), full_path)); LOG_WARNING(log, "Removing old temporary directory {}", new_data_part->data_part_storage->getFullPath());
new_data_part->volume->getDisk()->removeRecursive(full_path); data_part_storage_builder->removeRecursive();
} }
const auto disk = new_data_part->volume->getDisk(); data_part_storage_builder->createDirectories();
disk->createDirectories(full_path);
if (getSettings()->fsync_part_directory) if (getSettings()->fsync_part_directory)
sync_guard = disk->getDirectorySyncGuard(full_path); sync_guard = data_part_storage_builder->getDirectorySyncGuard();
} }
/// This effectively chooses minimal compression method: /// This effectively chooses minimal compression method:
@ -7648,7 +7657,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
auto compression_codec = getContext()->chooseCompressionCodec(0, 0); auto compression_codec = getContext()->chooseCompressionCodec(0, 0);
const auto & index_factory = MergeTreeIndexFactory::instance(); const auto & index_factory = MergeTreeIndexFactory::instance();
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, MergedBlockOutputStream out(new_data_part, data_part_storage_builder, metadata_snapshot, columns,
index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec);
bool sync_on_insert = settings->fsync_after_insert; bool sync_on_insert = settings->fsync_after_insert;
@ -7909,7 +7918,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
{ {
String id = disk->getUniqueId(checksums); String id = disk->getUniqueId(checksums);
keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log, detached_replica_name, toString(disk->getType()), zookeeper, getContext()->getReplicatedMergeTreeSettings(), log,
detached_zookeeper_path); detached_zookeeper_path);
} }
else else

View File

@ -247,7 +247,7 @@ public:
/// Return true if data unlocked /// Return true if data unlocked
/// Return false if data is still used by another node /// Return false if data is still used by another node
static bool unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, static bool unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, std::string disk_type, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger,
const String & zookeeper_path_old); const String & zookeeper_path_old);
/// Fetch part only if some replica has it on shared storage like S3 /// Fetch part only if some replica has it on shared storage like S3
@ -758,7 +758,7 @@ private:
PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions( PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions(
const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const; const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const;
static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, std::string disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old); const String & part_name, const String & zookeeper_path_old);
static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false); static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false);