Merge pull request #59170 from azat/disks/s3-plain-improvements

Small improvements for tables on write-once (s3_plain)/read-only (web) disks
Kseniia Sumarokova 2024-01-29 10:44:37 +01:00 committed by GitHub
commit a29ab8f29c
14 changed files with 260 additions and 7 deletions
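
For context, a minimal sketch of the behavior this change enforces for tables attached from a write-once (s3_plain) disk, modeled on the tests added below; the table definition and disk parameters are illustrative placeholders, not part of this commit:

    -- Attach an existing MergeTree table whose data lives on an s3_plain disk
    -- (endpoint and credentials are placeholders; see the tests below).
    ATTACH TABLE data (key Int) ENGINE = MergeTree() ORDER BY key
    SETTINGS disk = disk(type = s3_plain,
                         endpoint = 'http://localhost:11111/test/s3_plain/backups/db',
                         access_key_id = 'test',
                         secret_access_key = 'testtest');

    SELECT count() FROM data;      -- reads keep working
    INSERT INTO data VALUES (1);   -- now fails with TABLE_IS_READ_ONLY (static storage)
    OPTIMIZE TABLE data FINAL;     -- now fails with TABLE_IS_READ_ONLY
    DETACH TABLE data;             -- still allowed; objects stay in the bucket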

View File

@@ -98,6 +98,8 @@ DirectoryIteratorPtr MetadataStorageFromPlainObjectStorage::iterateDirectory(con
{
/// Required for MergeTree
auto paths = listDirectory(path);
// Prepend path, since iterateDirectory() includes path, unlike listDirectory()
std::for_each(paths.begin(), paths.end(), [&](auto & child) { child = fs::path(path) / child; });
std::vector<std::filesystem::path> fs_paths(paths.begin(), paths.end());
return std::make_unique<StaticDirectoryIterator>(std::move(fs_paths));
}
@@ -121,6 +123,12 @@ void MetadataStorageFromPlainObjectStorageTransaction::unlinkFile(const std::str
metadata_storage.object_storage->removeObject(object);
}
void MetadataStorageFromPlainObjectStorageTransaction::removeDirectory(const std::string & path)
{
for (auto it = metadata_storage.iterateDirectory(path); it->isValid(); it->next())
metadata_storage.object_storage->removeObject(StoredObject(it->path()));
}
void MetadataStorageFromPlainObjectStorageTransaction::createDirectory(const std::string &)
{
/// Noop. It is an Object Storage not a filesystem.

View File

@@ -101,12 +101,13 @@ public:
void createDirectoryRecursive(const std::string & path) override;
void unlinkFile(const std::string & path) override;
void removeDirectory(const std::string & path) override;
UnlinkMetadataFileOperationOutcomePtr unlinkMetadata(const std::string & path) override;
void commit() override
{
/// Nothing to commit.
/// TODO: rewrite with transactions
}
bool supportsChmod() const override { return false; }

View File

@@ -90,6 +90,7 @@
#include <base/insertAtEnd.h>
#include <base/interpolate.h>
#include <base/defines.h>
#include <algorithm>
#include <atomic>
@@ -300,7 +301,11 @@ void MergeTreeData::initializeDirectoriesAndFormatVersion(const std::string & re
if (disk->isBroken())
continue;
if (!disk->isReadOnly())
/// A write-once disk is almost the same as a read-only disk for MergeTree,
/// since it does not support move, which is required for any
/// operation over MergeTree, so avoid writing format_version.txt
/// to it as well, to avoid leaving it behind after DROP.
if (!disk->isReadOnly() && !disk->isWriteOnce())
{
auto buf = disk->writeFile(format_version_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, getContext()->getWriteSettings());
writeIntText(format_version.toUnderType(), *buf);
@@ -2740,6 +2745,15 @@ void MergeTreeData::renameInMemory(const StorageID & new_table_id)
void MergeTreeData::dropAllData()
{
/// If there is a read-only/write-once disk, we cannot allow dropAllData() to run, but dropping the table itself is still allowed.
///
/// Note that one may think DROP on a write-once disk should be
/// supported, since it is fairly trivial to implement
/// MetadataStorageFromPlainObjectStorageTransaction::removeDirectory(),
/// however removing a part requires moveDirectory() as well.
if (isStaticStorage())
return;
LOG_TRACE(log, "dropAllData: waiting for locks.");
auto settings_ptr = getSettings();
@@ -7071,6 +7085,8 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> MergeTreeData::cloneAn
const ReadSettings & read_settings,
const WriteSettings & write_settings)
{
chassert(!isStaticStorage());
/// Check that the storage policy contains the disk where the src_part is located.
bool does_storage_policy_allow_same_disk = false;
for (const DiskPtr & disk : getStoragePolicy()->getDisks())
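
To make the dropAllData() comment earlier in this file concrete, a hedged sketch of the intended DROP semantics on static storage (table name and file names illustrative; the tests below check the same by listing the bucket before and after DETACH):

    DROP TABLE data SYNC;
    -- The drop itself succeeds, but dropAllData() returns early for static storage,
    -- so the part files (checksums.txt, data.bin, primary.cidx, ...) remain
    -- in the s3_plain bucket and are not removed.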

View File

@@ -65,6 +65,7 @@ namespace ErrorCodes
extern const int NO_SUCH_DATA_PART;
extern const int ABORTED;
extern const int SUPPORT_IS_DISABLED;
extern const int TABLE_IS_READ_ONLY;
}
namespace ActionLocks
@@ -294,6 +295,8 @@ std::optional<UInt64> StorageMergeTree::totalBytesUncompressed(const Settings &)
SinkToStoragePtr
StorageMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /*async_insert*/)
{
assertNotReadonly();
const auto & settings = local_context->getSettingsRef();
return std::make_shared<MergeTreeSink>(
*this, metadata_snapshot, settings.max_partitions_per_insert_block, local_context);
@@ -319,9 +322,6 @@ void StorageMergeTree::checkTableCanBeDropped(ContextPtr query_context) const
void StorageMergeTree::drop()
{
shutdown(true);
/// In case there is read-only disk we cannot allow to call dropAllData(), but dropping tables is allowed.
if (isStaticStorage())
return;
dropAllData();
}
@@ -330,6 +330,8 @@ void StorageMergeTree::alter(
ContextPtr local_context,
AlterLockHolder & table_lock_holder)
{
assertNotReadonly();
if (local_context->getCurrentTransaction() && local_context->getSettingsRef().throw_on_unsupported_query_inside_transaction)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ALTER METADATA is not supported inside transactions");
@@ -620,6 +622,8 @@ void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn)
void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context)
{
assertNotReadonly();
delayMutationOrThrowIfNeeded(nullptr, query_context);
/// Validate partition IDs (if any) before starting mutation
@@ -810,6 +814,8 @@ std::vector<MergeTreeMutationStatus> StorageMergeTree::getMutationsStatus() cons
CancellationCode StorageMergeTree::killMutation(const String & mutation_id)
{
assertNotReadonly();
LOG_TRACE(log, "Killing mutation {}", mutation_id);
UInt64 mutation_version = MergeTreeMutationEntry::tryParseFileName(mutation_id);
if (!mutation_version)
@@ -1520,6 +1526,8 @@ bool StorageMergeTree::optimize(
bool cleanup,
ContextPtr local_context)
{
assertNotReadonly();
if (deduplicate)
{
if (deduplicate_by_columns.empty())
@@ -1765,6 +1773,8 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa
void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr query_context, TableExclusiveLockHolder &)
{
assertNotReadonly();
{
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
@@ -2039,6 +2049,8 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition(
void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr local_context)
{
assertNotReadonly();
auto lock1 = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
auto lock2 = source_table->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
auto merges_blocker = stopMergesAndWait();
@@ -2437,6 +2449,12 @@ PreparedSetsCachePtr StorageMergeTree::getPreparedSetsCache(Int64 mutation_id)
return cache;
}
void StorageMergeTree::assertNotReadonly() const
{
if (isStaticStorage())
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage");
}
void StorageMergeTree::fillNewPartName(MutableDataPartPtr & part, DataPartsLock &)
{
part->info.min_block = part->info.max_block = increment.get();
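
Taken together, the assertNotReadonly() guard added in this file rejects every mutating entry point up front on static storage. Roughly, each statement below now fails with TABLE_IS_READ_ONLY ("due to static storage"); the list is illustrative, not exhaustive:

    ALTER TABLE data ADD COLUMN value Int;                   -- alter()
    ALTER TABLE data UPDATE key = key + 1 WHERE 1;           -- mutate()
    KILL MUTATION WHERE table = 'data';                      -- killMutation()
    TRUNCATE TABLE data;                                     -- truncate()
    ALTER TABLE data REPLACE PARTITION tuple() FROM other;   -- replacePartitionFrom()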

View File

@@ -273,6 +273,8 @@ private:
PreparedSetsCachePtr getPreparedSetsCache(Int64 mutation_id);
void assertNotReadonly() const;
friend class MergeTreeSink;
friend class MergeTreeData;
friend class MergePlainMergeTreeTask;

View File

@@ -4661,6 +4661,9 @@ bool StorageReplicatedMergeTree::fetchPart(
zkutil::ZooKeeper::Ptr zookeeper_,
bool try_fetch_shared)
{
if (isStaticStorage())
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage");
auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper();
const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
@@ -5498,6 +5501,8 @@ void StorageReplicatedMergeTree::assertNotReadonly() const
{
if (is_readonly)
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode (replica path: {})", replica_path);
if (isStaticStorage())
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage");
}
@@ -5506,6 +5511,8 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con
if (!initialization_done)
throw Exception(ErrorCodes::NOT_INITIALIZED, "Table is not initialized yet");
if (isStaticStorage())
throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to static storage");
/// If table is read-only because it doesn't have metadata in zk yet, then it's not possible to insert into it
/// Without this check, we'll write data parts on disk, and afterwards will remove them since we'll fail to commit them into zk
/// In case of remote storage like s3, it'll generate unnecessary PUT requests
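
For ReplicatedMergeTree the static-storage check also covers replication itself: part fetches are refused, which is why the replicated test below expects SYSTEM SYNC REPLICA to fail too. A brief illustrative sketch:

    -- on a replica attached from an s3_plain disk:
    INSERT INTO data_read VALUES (1);    -- TABLE_IS_READ_ONLY (static storage)
    SYSTEM SYNC REPLICA data_read;       -- TABLE_IS_READ_ONLY: cannot fetch parts
    SELECT count() FROM data_read;       -- reads still work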

View File

@@ -4,11 +4,17 @@
<s3_disk>
<type>s3</type>
<path>s3_disk/</path>
<endpoint>http://localhost:11111/test/common/</endpoint>
<endpoint>http://localhost:11111/test/s3/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk>
<s3_plain_disk>
<type>s3_plain</type>
<endpoint>http://localhost:11111/test/s3_plain/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
</s3_plain_disk>
<s3_cache>
<type>cache</type>
<disk>s3_disk</disk>

View File

@@ -172,7 +172,7 @@ def test_incorrect_usage(cluster):
assert "Table is read-only" in result
result = node2.query_and_get_error("OPTIMIZE TABLE test0 FINAL")
assert "Only read-only operations are supported" in result
assert "Table is in readonly mode due to static storage" in result
node2.query("DROP TABLE test0 SYNC")

View File

@@ -0,0 +1,30 @@
data after INSERT 1
data after ATTACH 1
Files before DETACH TABLE
all_1_1_0
backups/ordinary_default/data/ordinary_default/data/all_1_1_0:
primary.cidx
serialization.json
metadata_version.txt
default_compression_codec.txt
data.bin
data.cmrk3
count.txt
columns.txt
checksums.txt
Files after DETACH TABLE
all_1_1_0
backups/ordinary_default/data/ordinary_default/data/all_1_1_0:
primary.cidx
serialization.json
metadata_version.txt
default_compression_codec.txt
data.bin
data.cmrk3
count.txt
columns.txt
checksums.txt

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-random-settings, no-random-merge-tree-settings
# Tag no-fasttest: requires S3
# Tag no-random-settings, no-random-merge-tree-settings: to avoid creating extra files like serialization.json; this test is too exotic anyway
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# config for clickhouse-disks (to check leftovers)
config="${BASH_SOURCE[0]/.sh/.yml}"
# ATTACH from s3_plain only works with an Ordinary database (its data path is fixed, without a table UUID)
new_database="ordinary_$CLICKHOUSE_DATABASE"
$CLICKHOUSE_CLIENT --allow_deprecated_database_ordinary=1 -q "create database $new_database engine=Ordinary"
CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT/--database=$CLICKHOUSE_DATABASE/--database=$new_database}
CLICKHOUSE_DATABASE="$new_database"
$CLICKHOUSE_CLIENT -nm -q "
drop table if exists data;
create table data (key Int) engine=MergeTree() order by key;
insert into data values (1);
select 'data after INSERT', count() from data;
"
# suppress output
$CLICKHOUSE_CLIENT -q "backup table data to S3('http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', 'test', 'testtest')" > /dev/null
$CLICKHOUSE_CLIENT -nm -q "
drop table data;
attach table data (key Int) engine=MergeTree() order by key
settings
max_suspicious_broken_parts=0,
disk=disk(type=s3_plain,
endpoint='http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE',
access_key_id='test',
secret_access_key='testtest');
select 'data after ATTACH', count() from data;
insert into data values (1); -- { serverError TABLE_IS_READ_ONLY }
optimize table data final; -- { serverError TABLE_IS_READ_ONLY }
"
path=$($CLICKHOUSE_CLIENT -q "SELECT replace(data_paths[1], 's3_plain', '') FROM system.tables WHERE database = '$CLICKHOUSE_DATABASE' AND table = 'data'")
# trim / to fix "Unable to parse ExceptionName: XMinioInvalidObjectName Message: Object name contains unsupported characters."
path=${path%/}
echo "Files before DETACH TABLE"
clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "${path:?}" | tail -n+2
$CLICKHOUSE_CLIENT -q "detach table data"
echo "Files after DETACH TABLE"
clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "$path" | tail -n+2
# the metadata file of the detached table is left behind, hence the forced recursive drop
$CLICKHOUSE_CLIENT --force_remove_data_recursively_on_drop=1 -q "drop database if exists $CLICKHOUSE_DATABASE"

View File

@@ -0,0 +1,7 @@
storage_configuration:
disks:
s3_plain_disk:
type: s3_plain
endpoint: http://localhost:11111/test/s3_plain/
access_key_id: clickhouse
secret_access_key: clickhouse

View File

@@ -0,0 +1,30 @@
data after INSERT 1
data after ATTACH 1
Files before DETACH TABLE
all_X_X_X
backups/ordinary_default/data/ordinary_default/data_read/all_X_X_X:
primary.cidx
serialization.json
metadata_version.txt
default_compression_codec.txt
data.bin
data.cmrk3
count.txt
columns.txt
checksums.txt
Files after DETACH TABLE
all_X_X_X
backups/ordinary_default/data/ordinary_default/data_read/all_X_X_X:
primary.cidx
serialization.json
metadata_version.txt
default_compression_codec.txt
data.bin
data.cmrk3
count.txt
columns.txt
checksums.txt

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Tags: no-fasttest, no-random-settings, no-random-merge-tree-settings
# Tag no-fasttest: requires S3
# Tag no-random-settings, no-random-merge-tree-settings: to avoid creating extra files like serialization.json; this test is too exotic anyway
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
config="${BASH_SOURCE[0]/.sh/.yml}"
# ATTACH from s3_plain only works with an Ordinary database (its data path is fixed, without a table UUID)
new_database="ordinary_$CLICKHOUSE_DATABASE"
$CLICKHOUSE_CLIENT --allow_deprecated_database_ordinary=1 -q "create database $new_database engine=Ordinary"
CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT/--database=$CLICKHOUSE_DATABASE/--database=$new_database}
CLICKHOUSE_DATABASE="$new_database"
$CLICKHOUSE_CLIENT -nm -q "
drop table if exists data_read;
drop table if exists data_write;
create table data_write (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'write') order by key;
create table data_read (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'read') order by key;
insert into data_write values (1);
system sync replica data_read;
select 'data after INSERT', count() from data_read;
"
# suppress output
$CLICKHOUSE_CLIENT -q "backup table data_read to S3('http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE', 'test', 'testtest')" > /dev/null
$CLICKHOUSE_CLIENT -nm -q "
drop table data_read;
attach table data_read (key Int) engine=ReplicatedMergeTree('/tables/{database}/data', 'read') order by key
settings
max_suspicious_broken_parts=0,
disk=disk(type=s3_plain,
endpoint='http://localhost:11111/test/s3_plain/backups/$CLICKHOUSE_DATABASE',
access_key_id='test',
secret_access_key='testtest');
select 'data after ATTACH', count() from data_read;
insert into data_read values (1); -- { serverError TABLE_IS_READ_ONLY }
optimize table data_read final; -- { serverError TABLE_IS_READ_ONLY }
system sync replica data_read; -- { serverError TABLE_IS_READ_ONLY }
"
path=$($CLICKHOUSE_CLIENT -q "SELECT replace(data_paths[1], 's3_plain', '') FROM system.tables WHERE database = '$CLICKHOUSE_DATABASE' AND table = 'data_read'")
# trim / to fix "Unable to parse ExceptionName: XMinioInvalidObjectName Message: Object name contains unsupported characters."
path=${path%/}
echo "Files before DETACH TABLE"
# sed to match any part, since in case of fault injection part name may not be all_0_0_0 but all_1_1_0
clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "${path:?}" | tail -n+2 | sed 's/all_[^_]*_[^_]*_0/all_X_X_X/g'
$CLICKHOUSE_CLIENT -nm -q "
detach table data_read;
detach table data_write;
"
echo "Files after DETACH TABLE"
clickhouse-disks -C "$config" --disk s3_plain_disk list --recursive "$path" | tail -n+2 | sed 's/all_[^_]*_[^_]*_0/all_X_X_X/g'
# the metadata files of the detached tables are left behind, hence the forced recursive drop
$CLICKHOUSE_CLIENT --force_remove_data_recursively_on_drop=1 -q "drop database if exists $CLICKHOUSE_DATABASE"

View File

@@ -0,0 +1,7 @@
storage_configuration:
disks:
s3_plain_disk:
type: s3_plain
endpoint: http://localhost:11111/test/s3_plain/
access_key_id: clickhouse
secret_access_key: clickhouse