Merge branch 'master' into power8

mergify[bot] 2022-04-24 22:17:21 +00:00 committed by GitHub
commit 196e34eb36
75 changed files with 1423 additions and 445 deletions

.github/workflows/codeql.yml

@@ -0,0 +1,79 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"
"on":
# push:
# branches: [ master ]
# pull_request:
# # The branches below must be a subset of the branches above
# branches: [ master ]
schedule:
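# Run the scheduled analysis every six hours, at minute 0.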
- cron: '0 */6 * * *'
workflow_dispatch:
env:
CC: clang-14
CXX: clang++-14
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: ['cpp']
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
# Learn more about CodeQL language support at https://git.io/codeql-language-support
steps:
- name: Checkout repository
uses: actions/checkout@v3
with:
submodules: 'true'
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
# - name: Autobuild
# uses: github/codeql-action/autobuild@v2
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
- name: Build
run: |
sudo apt-get install -yq git cmake python ninja-build
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
mkdir build
cd build
cmake -DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1 ..
ninja
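# Drop the vendored third-party sources after the build, presumably to free disk space before analysis.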
rm -rf ../contrib
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v2


@@ -5,8 +5,9 @@ sidebar_label: "ClickHouse Adopters"
# ClickHouse Adopters {#clickhouse-adopters}
!!! warning "Disclaimer"
The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality. We'd appreciate it if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), but please make sure you won't have any NDA issues by doing so. Providing updates with publications from other companies is also useful.
:::note Disclaimer
The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality. We'd appreciate it if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/zh/introduction/adopters.md), but please make sure you won't have any NDA issues by doing so. Providing updates with publications from other companies is also useful.
:::
| Company | Industry | Use case | Cluster size | (Un)compressed data size<abbr title="of single replica"><sup>\*</sup></abbr> | Reference |
|---------|----------|----------|--------------|------------------------------------------------------------------------------|-----------|


@@ -1507,8 +1507,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
}
#if defined(OS_LINUX)
auto tasks_stats_provider = TasksStatsCounters::findBestAvailableProvider();
if (tasks_stats_provider == TasksStatsCounters::MetricsProvider::None)
if (!TasksStatsCounters::checkIfAvailable())
{
LOG_INFO(log, "It looks like this system does not have procfs mounted at /proc location,"
" neither clickhouse-server process has CAP_NET_ADMIN capability."
@@ -1519,10 +1518,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
" It also doesn't work if you run clickhouse-server inside network namespace as it happens in some containers.",
executable_path);
}
else
{
LOG_INFO(log, "Tasks stats provider: {}", TasksStatsCounters::metricsProviderString(tasks_stats_provider));
}
if (!hasLinuxCapability(CAP_SYS_NICE))
{


@@ -265,24 +265,26 @@ void TaskStatsInfoGetter::getStat(::taskstats & out_stats, pid_t tid) const
{
NetlinkMessage answer = query(netlink_socket_fd, taskstats_family_id, tid, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID, &tid, sizeof(tid));
const NetlinkMessage::Attribute * attr = &answer.payload.attribute;
if (attr->header.nla_type != TASKSTATS_TYPE_AGGR_PID)
throw Exception("Expected TASKSTATS_TYPE_AGGR_PID", ErrorCodes::NETLINK_ERROR);
for (const NetlinkMessage::Attribute * attr = &answer.payload.attribute;
attr < answer.end();
attr = attr->next())
{
if (attr->header.nla_type == TASKSTATS_TYPE_AGGR_TGID || attr->header.nla_type == TASKSTATS_TYPE_AGGR_PID)
{
for (const NetlinkMessage::Attribute * nested_attr = reinterpret_cast<const NetlinkMessage::Attribute *>(attr->payload);
nested_attr < attr->next();
nested_attr = nested_attr->next())
{
if (nested_attr->header.nla_type == TASKSTATS_TYPE_STATS)
{
out_stats = unalignedLoad<::taskstats>(nested_attr->payload);
return;
}
}
}
}
/// TASKSTATS_TYPE_AGGR_PID
const NetlinkMessage::Attribute * nested_attr = reinterpret_cast<const NetlinkMessage::Attribute *>(attr->payload);
if (nested_attr->header.nla_type != TASKSTATS_TYPE_PID)
throw Exception("Expected TASKSTATS_TYPE_PID", ErrorCodes::NETLINK_ERROR);
if (nested_attr == nested_attr->next())
throw Exception("No TASKSTATS_TYPE_STATS packet after TASKSTATS_TYPE_PID", ErrorCodes::NETLINK_ERROR);
nested_attr = nested_attr->next();
if (nested_attr->header.nla_type != TASKSTATS_TYPE_STATS)
throw Exception("Expected TASKSTATS_TYPE_STATS", ErrorCodes::NETLINK_ERROR);
out_stats = unalignedLoad<::taskstats>(nested_attr->payload);
if (attr->next() != answer.end())
throw Exception("Unexpected end of response", ErrorCodes::NETLINK_ERROR);
throw Exception("There is no TASKSTATS_TYPE_STATS attribute in the Netlink response", ErrorCodes::NETLINK_ERROR);
}
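The loop above walks a variable-length attribute list using the NetlinkMessage::Attribute helpers next() and end(). For reference, a minimal self-contained sketch of the same traversal over the raw nlattr headers from <linux/netlink.h> (the buffer is assumed to come from a recv() on a netlink socket; bounds checks are included because attribute lengths are kernel-supplied):

#include <linux/netlink.h>
#include <cstddef>
#include <cstdio>

// Walk a buffer of netlink attributes. Each attribute starts with an nlattr
// header giving its type and total length; the next attribute begins at the
// 4-byte-aligned offset NLA_ALIGN(nla_len) past the current one.
void walkAttributes(const char * buf, size_t len)
{
    size_t offset = 0;
    while (offset + sizeof(nlattr) <= len)
    {
        const auto * attr = reinterpret_cast<const nlattr *>(buf + offset);
        if (attr->nla_len < sizeof(nlattr) || offset + attr->nla_len > len)
            break; // malformed attribute: stop instead of reading past the buffer
        std::printf("type=%u len=%u\n",
                    static_cast<unsigned>(attr->nla_type),
                    static_cast<unsigned>(attr->nla_len));
        offset += NLA_ALIGN(attr->nla_len);
    }
}

int main()
{
    // Two fabricated attributes packed the way the kernel would send them.
    struct { nlattr h1; unsigned int v1; nlattr h2; unsigned short v2, pad; } buf =
        {{8, 1}, 42, {6, 2}, 7, 0};
    walkAttributes(reinterpret_cast<const char *>(&buf), sizeof(buf));
}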


@@ -67,20 +67,6 @@ namespace ProfileEvents
namespace DB
{
const char * TasksStatsCounters::metricsProviderString(MetricsProvider provider)
{
switch (provider)
{
case MetricsProvider::None:
return "none";
case MetricsProvider::Procfs:
return "procfs";
case MetricsProvider::Netlink:
return "netlink";
}
__builtin_unreachable();
}
bool TasksStatsCounters::checkIfAvailable()
{
return findBestAvailableProvider() != MetricsProvider::None;


@@ -176,17 +176,7 @@ extern PerfEventsCounters current_thread_counters;
class TasksStatsCounters
{
public:
enum class MetricsProvider
{
None,
Procfs,
Netlink,
};
static const char * metricsProviderString(MetricsProvider provider);
static bool checkIfAvailable();
static MetricsProvider findBestAvailableProvider();
static std::unique_ptr<TasksStatsCounters> create(UInt64 tid);
void reset();
@@ -196,8 +186,16 @@ private:
::taskstats stats; //-V730_NOINIT
std::function<::taskstats()> stats_getter;
enum class MetricsProvider
{
None,
Procfs,
Netlink
};
explicit TasksStatsCounters(UInt64 tid, MetricsProvider provider);
static MetricsProvider findBestAvailableProvider();
static void incrementProfileEvents(const ::taskstats & prev, const ::taskstats & curr, ProfileEvents::Counters & profile_events);
};


@@ -309,23 +309,26 @@ void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3)
DiskDecorator::removeSharedFile(path, keep_s3);
}
void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3)
void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep)
{
if (cache_disk->exists(path))
cache_disk->removeSharedRecursive(path, keep_s3);
DiskDecorator::removeSharedRecursive(path, keep_s3);
cache_disk->removeSharedRecursive(path, keep_all, files_to_keep);
DiskDecorator::removeSharedRecursive(path, keep_all, files_to_keep);
}
void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3)
void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep)
{
for (const auto & file : files)
{
if (cache_disk->exists(file.path))
cache_disk->removeSharedFile(file.path, keep_s3);
{
bool keep_file = keep_all || files_to_keep.contains(fs::path(file.path).filename());
cache_disk->removeSharedFile(file.path, keep_file);
}
}
DiskDecorator::removeSharedFiles(files, keep_s3);
DiskDecorator::removeSharedFiles(files, keep_all, files_to_keep);
}
void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path)


@@ -47,8 +47,9 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
void removeSharedRecursive(const String & path, bool keep_s3) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3) override;
void removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep) override;
void createHardLink(const String & src_path, const String & dst_path) override;
ReservationPtr reserve(UInt64 bytes) override;


@@ -108,6 +108,11 @@ void DiskDecorator::copy(const String & from_path, const std::shared_ptr<IDisk>
delegate->copy(from_path, to_disk, to_path);
}
void DiskDecorator::copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir)
{
delegate->copyDirectoryContent(from_dir, to_disk, to_dir);
}
void DiskDecorator::listFiles(const String & path, std::vector<String> & file_names)
{
delegate->listFiles(path, file_names);
@@ -151,14 +156,14 @@ void DiskDecorator::removeSharedFile(const String & path, bool keep_s3)
delegate->removeSharedFile(path, keep_s3);
}
void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
delegate->removeSharedFiles(files, keep_in_remote_fs);
delegate->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3)
void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
delegate->removeSharedRecursive(path, keep_s3);
delegate->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp)


@@ -33,6 +33,7 @@ public:
void moveFile(const String & from_path, const String & to_path) override;
void replaceFile(const String & from_path, const String & to_path) override;
void copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
@@ -52,8 +53,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
void removeSharedRecursive(const String & path, bool keep_s3) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override;
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;


@@ -249,6 +249,36 @@ void DiskEncrypted::copy(const String & from_path, const std::shared_ptr<IDisk>
copyThroughBuffers(from_path, to_disk, to_path);
}
void DiskEncrypted::copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir)
{
/// Check if we can copy the file without deciphering.
if (isSameDiskType(*this, *to_disk))
{
/// Disk type is the same, check if the key is the same too.
if (auto * to_disk_enc = typeid_cast<DiskEncrypted *>(to_disk.get()))
{
auto from_settings = current_settings.get();
auto to_settings = to_disk_enc->current_settings.get();
if (from_settings->keys == to_settings->keys)
{
/// Keys are the same so we can simply copy the encrypted file.
auto wrapped_from_path = wrappedPath(from_dir);
auto to_delegate = to_disk_enc->delegate;
auto wrapped_to_path = to_disk_enc->wrappedPath(to_dir);
delegate->copyDirectoryContent(wrapped_from_path, to_delegate, wrapped_to_path);
return;
}
}
}
if (!to_disk->exists(to_dir))
to_disk->createDirectories(to_dir);
/// Copy the file through buffers with deciphering.
copyThroughBuffers(from_dir, to_disk, to_dir);
}
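The branch above takes the fast path only when the destination is also an encrypted disk holding the same keys; every other combination falls back to decrypt-and-re-encrypt through buffers. A toy sketch of that dispatch, using plain dynamic_cast in place of ClickHouse's typeid_cast and a single int standing in for the key set:

#include <memory>

struct IDisk { virtual ~IDisk() = default; };
struct DiskEncrypted : IDisk { int key = 0; };

// Ciphertext can be copied byte-for-byte only if both sides are encrypted
// disks sharing the same key; anything else must go through plaintext buffers.
bool canCopyCiphertext(const DiskEncrypted & from, const std::shared_ptr<IDisk> & to)
{
    if (const auto * to_enc = dynamic_cast<const DiskEncrypted *>(to.get()))
        return to_enc->key == from.key;
    return false;
}

int main()
{
    DiskEncrypted src; src.key = 7;
    auto same = std::make_shared<DiskEncrypted>(); same->key = 7;
    auto other = std::make_shared<DiskEncrypted>(); other->key = 9;
    return canCopyCiphertext(src, same) && !canCopyCiphertext(src, other) ? 0 : 1;
}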
std::unique_ptr<ReadBufferFromFileBase> DiskEncrypted::readFile(
const String & path,
const ReadSettings & settings,


@@ -117,6 +117,8 @@ public:
void copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
const ReadSettings & settings,
@@ -159,10 +161,23 @@ public:
delegate->removeSharedFile(wrapped_path, flag);
}
void removeSharedRecursive(const String & path, bool flag) override
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedRecursive(wrapped_path, flag);
delegate->removeSharedRecursive(wrapped_path, keep_all_batch_data, file_names_remove_metadata_only);
}
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate->removeSharedFileIfExists(wrapped_path, keep);
else
delegate->removeSharedFile(wrapped_path, keep);
}
}
void removeSharedFileIfExists(const String & path, bool flag) override


@@ -440,6 +440,14 @@ void DiskLocal::copy(const String & from_path, const std::shared_ptr<IDisk> & to
copyThroughBuffers(from_path, to_disk, to_path); /// Base implementation.
}
void DiskLocal::copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir)
{
if (isSameDiskType(*this, *to_disk))
fs::copy(from_dir, to_dir, fs::copy_options::recursive | fs::copy_options::overwrite_existing); /// Use more optimal way.
else
copyThroughBuffers(from_dir, to_disk, to_dir); /// Base implementation.
}
SyncGuardPtr DiskLocal::getDirectorySyncGuard(const String & path) const
{
return std::make_unique<LocalDirectorySyncGuard>(fs::path(disk_path) / path);
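The same-disk-type branch in copyDirectoryContent above is a direct call into std::filesystem. A standalone illustration of that call (the paths are made up for the example):

#include <filesystem>

namespace fs = std::filesystem;

int main()
{
    fs::create_directories("/tmp/dst");
    // Recursively copy directory contents, overwriting files that already
    // exist at the destination: the "more optimal way" DiskLocal takes when
    // both source and destination are local disks.
    fs::copy("/tmp/src", "/tmp/dst",
             fs::copy_options::recursive | fs::copy_options::overwrite_existing);
}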


@@ -68,6 +68,8 @@ public:
void copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(


@@ -200,6 +200,12 @@ void DiskRestartProxy::copy(const String & from_path, const std::shared_ptr<IDis
DiskDecorator::copy(from_path, to_disk, to_path);
}
void DiskRestartProxy::copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir)
{
ReadLock lock (mutex);
DiskDecorator::copyDirectoryContent(from_dir, to_disk, to_dir);
}
void DiskRestartProxy::listFiles(const String & path, std::vector<String> & file_names)
{
ReadLock lock (mutex);
@@ -251,16 +257,16 @@ void DiskRestartProxy::removeSharedFile(const String & path, bool keep_s3)
DiskDecorator::removeSharedFile(path, keep_s3);
}
void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
DiskDecorator::removeSharedFiles(files, keep_in_remote_fs);
DiskDecorator::removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_s3)
void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
DiskDecorator::removeSharedRecursive(path, keep_s3);
DiskDecorator::removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestamp & timestamp)


@@ -42,6 +42,7 @@ public:
void moveFile(const String & from_path, const String & to_path) override;
void replaceFile(const String & from_path, const String & to_path) override;
void copy(const String & from_path, const DiskPtr & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
@@ -54,8 +55,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override;
void removeSharedRecursive(const String & path, bool keep_s3) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;


@@ -165,10 +165,10 @@ std::unique_ptr<ReadBufferFromFileBase> DiskWebServer::readFile(const String & p
auto remote_path = fs_path.parent_path() / (escapeForFileName(fs_path.stem()) + fs_path.extension().string());
remote_path = remote_path.string().substr(url.size());
RemoteMetadata meta(path, remote_path);
meta.remote_fs_objects.emplace_back(remote_path, iter->second.size);
std::vector<BlobPathWithSize> blobs_to_read;
blobs_to_read.emplace_back(remote_path, iter->second.size);
auto web_impl = std::make_unique<ReadBufferFromWebServerGather>(url, meta.remote_fs_root_path, meta.remote_fs_objects, getContext(), read_settings);
auto web_impl = std::make_unique<ReadBufferFromWebServerGather>(url, path, blobs_to_read, getContext(), read_settings);
if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{


@@ -139,7 +139,7 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName());
}
void removeSharedRecursive(const String &, bool) override
void removeSharedRecursive(const String &, bool, const NameSet &) override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName());
}


@@ -20,13 +20,13 @@ bool IDisk::isDirectoryEmpty(const String & path)
return !iterateDirectory(path)->isValid();
}
void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, const String & to_path)
void IDisk::copyFile(const String & from_file_path, IDisk & to_disk, const String & to_file_path)
{
LOG_DEBUG(&Poco::Logger::get("IDisk"), "Copying from {} (path: {}) {} to {} (path: {}) {}.",
from_disk.getName(), from_disk.getPath(), from_path, to_disk.getName(), to_disk.getPath(), to_path);
getName(), getPath(), from_file_path, to_disk.getName(), to_disk.getPath(), to_file_path);
auto in = from_disk.readFile(from_path);
auto out = to_disk.writeFile(to_path);
auto in = readFile(from_file_path);
auto out = to_disk.writeFile(to_file_path);
copyData(*in, *out);
out->finalize();
}
@@ -34,7 +34,7 @@ void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, cons
using ResultsCollector = std::vector<std::future<void>>;
void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_path, Executor & exec, ResultsCollector & results)
void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_path, Executor & exec, ResultsCollector & results, bool copy_root_dir)
{
if (from_disk.isFile(from_path))
{
@@ -42,28 +42,32 @@ void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_p
[&from_disk, from_path, &to_disk, to_path]()
{
setThreadName("DiskCopier");
DB::copyFile(from_disk, from_path, to_disk, fs::path(to_path) / fileName(from_path));
from_disk.copyFile(from_path, to_disk, fs::path(to_path) / fileName(from_path));
});
results.push_back(std::move(result));
}
else
{
fs::path dir_name = fs::path(from_path).parent_path().filename();
fs::path dest(fs::path(to_path) / dir_name);
to_disk.createDirectories(dest);
fs::path dest(to_path);
if (copy_root_dir)
{
fs::path dir_name = fs::path(from_path).parent_path().filename();
dest /= dir_name;
to_disk.createDirectories(dest);
}
for (auto it = from_disk.iterateDirectory(from_path); it->isValid(); it->next())
asyncCopy(from_disk, it->path(), to_disk, dest, exec, results);
asyncCopy(from_disk, it->path(), to_disk, dest, exec, results, true);
}
}
void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path)
void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path, bool copy_root_dir)
{
auto & exec = to_disk->getExecutor();
ResultsCollector results;
asyncCopy(*this, from_path, *to_disk, to_path, exec, results);
asyncCopy(*this, from_path, *to_disk, to_path, exec, results, copy_root_dir);
for (auto & result : results)
result.wait();
@@ -73,7 +77,16 @@ void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr<I
void IDisk::copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path)
{
copyThroughBuffers(from_path, to_disk, to_path);
copyThroughBuffers(from_path, to_disk, to_path, true);
}
void IDisk::copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir)
{
if (!to_disk->exists(to_dir))
to_disk->createDirectories(to_dir);
copyThroughBuffers(from_dir, to_disk, to_dir, false);
}
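The only difference between copy() and the new copyDirectoryContent() is whether the root directory itself is recreated under the destination (the copy_root_dir flag). A compilable std::filesystem analogue of the two behaviours (the function names here are illustrative, not part of the IDisk API):

#include <filesystem>

namespace fs = std::filesystem;

// copy()-like: recreate the source directory itself under `to`.
void copyWithRootDir(const fs::path & from, const fs::path & to)
{
    fs::create_directories(to / from.filename());
    fs::copy(from, to / from.filename(), fs::copy_options::recursive);
}

// copyDirectoryContent()-like: copy only what is inside `from` into `to`.
void copyContentOnly(const fs::path & from, const fs::path & to)
{
    fs::create_directories(to);
    fs::copy(from, to, fs::copy_options::recursive);
}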
void IDisk::truncateFile(const String &, size_t)


@@ -160,6 +160,12 @@ public:
/// Recursively copy data containing at `from_path` to `to_path` located at `to_disk`.
virtual void copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path);
/// Recursively copy files from from_dir to to_dir. Create to_dir if not exists.
virtual void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir);
/// Copy file `from_file_path` to `to_file_path` located at `to_disk`.
virtual void copyFile(const String & from_file_path, IDisk & to_disk, const String & to_file_path);
/// List files at `path` and add their names to `file_names`
virtual void listFiles(const String & path, std::vector<String> & file_names) = 0;
@@ -192,17 +198,18 @@ public:
/// Remove file. Throws exception if file doesn't exists or if directory is not empty.
/// Differs from removeFile for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
virtual void removeSharedFile(const String & path, bool) { removeFile(path); }
virtual void removeSharedFile(const String & path, bool /* keep_shared_data */) { removeFile(path); }
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); }
/// Second bool param is a flag to remove (false) or keep (true) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
virtual void removeSharedRecursive(const String & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) { removeRecursive(path); }
/// Remove file or directory if it exists.
/// Differs from removeFileIfExists for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
virtual void removeSharedFileIfExists(const String & path, bool) { removeFileIfExists(path); }
virtual void removeSharedFileIfExists(const String & path, bool /* keep_shared_data */) { removeFileIfExists(path); }
virtual String getCacheBasePath() const { return ""; }
@@ -237,14 +244,17 @@ public:
/// Batch request to remove multiple files.
/// May be much faster for blob storage.
virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
for (const auto & file : files)
{
bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
removeSharedFileIfExists(file.path, keep_in_remote_fs);
removeSharedFileIfExists(file.path, keep_file);
else
removeSharedFile(file.path, keep_in_remote_fs);
removeSharedFile(file.path, keep_file);
}
}
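The per-file decision above boils down to a single predicate: keep the shared data when the whole batch is kept, or when the file's name appears in the metadata-only set. A self-contained sketch, with std::unordered_set<std::string> standing in for NameSet:

#include <filesystem>
#include <string>
#include <unordered_set>

namespace fs = std::filesystem;

// Mirrors the keep_file computation above. Note that only the file *name*
// (the last path component) is matched, not the full path.
bool keepSharedData(const std::string & path, bool keep_all_batch_data,
                    const std::unordered_set<std::string> & file_names_remove_metadata_only)
{
    return keep_all_batch_data
        || file_names_remove_metadata_only.contains(fs::path(path).filename().string());
}

int main()
{
    const std::unordered_set<std::string> keep_names{"checksums.txt"};
    // true: the name matches, so the remote blob is kept (metadata-only removal)
    bool a = keepSharedData("store/123/checksums.txt", false, keep_names);
    // false: no match and the batch is not kept, so the blob is deleted too
    bool b = keepSharedData("store/123/data.bin", false, keep_names);
    return a && !b ? 0 : 1;
}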
@@ -343,7 +353,7 @@ protected:
/// Base implementation of the function copy().
/// It just opens two files, reads data by portions from the first file, and writes it to the second one.
/// A derived class may override copy() to provide a faster implementation.
void copyThroughBuffers(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path);
void copyThroughBuffers(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path, bool copy_root_dir = true);
private:
std::unique_ptr<Executor> executor;


@@ -45,7 +45,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadata(const String
return result;
}
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
@@ -55,7 +54,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const St
return result;
}
IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
@@ -64,6 +62,17 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const
return result;
}
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
result.load();
if (updater(result))
result.save(sync);
metadata_disk_->removeFile(metadata_file_path_);
return result;
}
IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadataIfNotExists(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, bool overwrite)
{
@@ -154,7 +163,8 @@ IDiskRemote::Metadata::Metadata(
const String & remote_fs_root_path_,
DiskPtr metadata_disk_,
const String & metadata_file_path_)
: RemoteMetadata(remote_fs_root_path_, metadata_file_path_)
: remote_fs_root_path(remote_fs_root_path_)
, metadata_file_path(metadata_file_path_)
, metadata_disk(metadata_disk_)
, total_size(0), ref_count(0)
{
@@ -230,6 +240,12 @@ IDiskRemote::Metadata IDiskRemote::readUpdateAndStoreMetadata(const String & pat
}
IDiskRemote::Metadata IDiskRemote::readUpdateStoreMetadataAndRemove(const String & path, bool sync, IDiskRemote::MetadataUpdater updater)
{
std::unique_lock lock(metadata_mutex);
return Metadata::readUpdateStoreMetadataAndRemove(remote_fs_root_path, metadata_disk, path, sync, updater);
}
IDiskRemote::Metadata IDiskRemote::readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, IDiskRemote::MetadataUpdater updater)
{
if (mode == WriteMode::Rewrite || !metadata_disk->exists(path))
@@ -270,7 +286,7 @@ std::unordered_map<String, String> IDiskRemote::getSerializedMetadata(const std:
return metadatas;
}
void IDiskRemote::removeMetadata(const String & path, std::vector<std::string> & paths_to_remove)
void IDiskRemote::removeMetadata(const String & path, std::vector<String> & paths_to_remove)
{
LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path));
@@ -288,6 +304,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector<std::string> &
{
for (const auto & [remote_fs_object_path, _] : metadata.remote_fs_objects)
{
paths_to_remove.push_back(remote_fs_root_path + remote_fs_object_path);
if (cache)
@@ -307,8 +324,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector<std::string> &
return true;
};
readUpdateAndStoreMetadata(path, false, metadata_updater);
metadata_disk->removeFile(path);
readUpdateStoreMetadataAndRemove(path, false, metadata_updater);
/// If there is no references - delete content from remote FS.
}
catch (const Exception & e)
@@ -327,13 +343,13 @@ void IDiskRemote::removeMetadata(const String & path, std::vector<std::string> &
}
void IDiskRemote::removeMetadataRecursive(const String & path, std::vector<String> & paths_to_remove)
void IDiskRemote::removeMetadataRecursive(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove)
{
checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks.
if (metadata_disk->isFile(path))
{
removeMetadata(path, paths_to_remove);
removeMetadata(path, paths_to_remove[path]);
}
else
{
@@ -522,27 +538,43 @@ void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_meta
}
}
void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only)
void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::vector<String> paths_to_remove;
std::unordered_map<String, std::vector<String>> paths_to_remove;
for (const auto & file : files)
{
bool skip = file.if_exists && !metadata_disk->exists(file.path);
if (!skip)
removeMetadata(file.path, paths_to_remove);
removeMetadata(file.path, paths_to_remove[file.path]);
}
if (!delete_metadata_only)
removeFromRemoteFS(paths_to_remove);
if (!keep_all_batch_data)
{
std::vector<String> remove_from_remote;
for (auto && [path, remote_paths] : paths_to_remove)
{
if (!file_names_remove_metadata_only.contains(fs::path(path).filename()))
remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
}
removeFromRemoteFS(remove_from_remote);
}
}
void IDiskRemote::removeSharedRecursive(const String & path, bool delete_metadata_only)
void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::vector<String> paths_to_remove;
std::unordered_map<String, std::vector<String>> paths_to_remove;
removeMetadataRecursive(path, paths_to_remove);
if (!delete_metadata_only)
removeFromRemoteFS(paths_to_remove);
if (!keep_all_batch_data)
{
std::vector<String> remove_from_remote;
for (auto && [local_path, remote_paths] : paths_to_remove)
{
if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename()))
remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
}
removeFromRemoteFS(remove_from_remote);
}
}
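Both removal paths above now share the same post-processing: remote paths are grouped per local metadata file, and a whole group is excluded from remote deletion when its file name is marked metadata-only. A compact sketch of that flattening step:

#include <filesystem>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace fs = std::filesystem;

// Flatten the per-file groups into one list of remote paths to delete,
// skipping groups whose local file name is listed as metadata-only.
std::vector<std::string> collectRemotePaths(
    const std::unordered_map<std::string, std::vector<std::string>> & paths_to_remove,
    const std::unordered_set<std::string> & file_names_remove_metadata_only)
{
    std::vector<std::string> remove_from_remote;
    for (const auto & [local_path, remote_paths] : paths_to_remove)
        if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename().string()))
            remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
    return remove_from_remote;
}

int main()
{
    std::unordered_map<std::string, std::vector<std::string>> groups{
        {"part_0/checksums.txt", {"abc/r0"}},
        {"part_0/data.bin", {"abc/r1", "abc/r2"}}};
    auto to_delete = collectRemotePaths(groups, {"checksums.txt"});
    return to_delete.size() == 2 ? 0 : 1; // only data.bin's blobs are deleted
}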


@@ -78,6 +78,8 @@ public:
Metadata readMetadata(const String & path) const;
Metadata readMetadataUnlocked(const String & path, std::shared_lock<std::shared_mutex> &) const;
Metadata readUpdateAndStoreMetadata(const String & path, bool sync, MetadataUpdater updater);
Metadata readUpdateStoreMetadataAndRemove(const String & path, bool sync, MetadataUpdater updater);
Metadata readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, MetadataUpdater updater);
Metadata createAndStoreMetadata(const String & path, bool sync);
@@ -107,15 +109,16 @@ public:
void removeFileIfExists(const String & path) override { removeSharedFileIfExists(path, false); }
void removeRecursive(const String & path) override { removeSharedRecursive(path, false); }
void removeRecursive(const String & path) override { removeSharedRecursive(path, false, {}); }
void removeSharedFile(const String & path, bool delete_metadata_only) override;
void removeSharedFileIfExists(const String & path, bool delete_metadata_only) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void removeSharedRecursive(const String & path, bool delete_metadata_only) override;
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
@@ -172,7 +175,7 @@ protected:
private:
void removeMetadata(const String & path, std::vector<String> & paths_to_remove);
void removeMetadataRecursive(const String & path, std::vector<String> & paths_to_remove);
void removeMetadataRecursive(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove);
bool tryReserve(UInt64 bytes);
@@ -184,10 +187,18 @@ private:
using RemoteDiskPtr = std::shared_ptr<IDiskRemote>;
/// Remote FS (S3, HDFS) metadata file layout:
/// FS objects, their number and total size of all FS objects.
/// Each FS object represents a file path in remote FS and its size.
/// Minimum info, required to be passed to ReadIndirectBufferFromRemoteFS<T>
struct RemoteMetadata
struct IDiskRemote::Metadata
{
using Updater = std::function<bool(IDiskRemote::Metadata & metadata)>;
/// Metadata file version.
static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1;
static constexpr UInt32 VERSION_RELATIVE_PATHS = 2;
static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3;
/// Remote FS objects paths and their sizes.
std::vector<BlobPathWithSize> remote_fs_objects;
@@ -197,22 +208,6 @@ struct RemoteMetadata
/// Relative path to metadata file on local FS.
const String metadata_file_path;
RemoteMetadata(const String & remote_fs_root_path_, const String & metadata_file_path_)
: remote_fs_root_path(remote_fs_root_path_), metadata_file_path(metadata_file_path_) {}
};
/// Remote FS (S3, HDFS) metadata file layout:
/// FS objects, their number and total size of all FS objects.
/// Each FS object represents a file path in remote FS and its size.
struct IDiskRemote::Metadata : RemoteMetadata
{
using Updater = std::function<bool(IDiskRemote::Metadata & metadata)>;
/// Metadata file version.
static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1;
static constexpr UInt32 VERSION_RELATIVE_PATHS = 2;
static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3;
DiskPtr metadata_disk;
/// Total size of all remote FS (S3, HDFS) objects.
@@ -236,6 +231,7 @@ struct IDiskRemote::Metadata : RemoteMetadata
static Metadata readMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_);
static Metadata readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);
static Metadata readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);
static Metadata createAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync);
static Metadata createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);


@@ -755,7 +755,7 @@ void DiskS3::restore()
bool cleanup_s3 = information.source_bucket != bucket || information.source_path != remote_fs_root_path;
for (const auto & root : data_roots)
if (exists(root))
removeSharedRecursive(root + '/', !cleanup_s3);
removeSharedRecursive(root + '/', !cleanup_s3, {});
restoreFiles(information);
restoreFileOperations(information);


@@ -12,6 +12,7 @@
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTAsterisk.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTColumnsTransformers.h>
#include <Parsers/ASTQualifiedAsterisk.h>
#include <Parsers/ParserTablesInSelectQuery.h>
#include <Parsers/ExpressionListParsers.h>
@@ -81,6 +82,7 @@ public:
/// By default should_add_column_predicate returns true for any column name
void addTableColumns(
const String & table_name,
ASTs & columns,
ShouldAddColumnPredicate should_add_column_predicate = [](const String &) { return true; })
{
auto it = table_columns.find(table_name);
@@ -105,7 +107,7 @@ public:
else
identifier = std::make_shared<ASTIdentifier>(std::vector<String>{it->first, column.name});
new_select_expression_list->children.emplace_back(std::move(identifier));
columns.emplace_back(std::move(identifier));
}
}
}
@@ -129,14 +131,18 @@ private:
for (const auto & child : node.children)
{
if (child->as<ASTAsterisk>())
ASTs columns;
if (const auto * asterisk = child->as<ASTAsterisk>())
{
has_asterisks = true;
for (auto & table_name : data.tables_order)
data.addTableColumns(table_name);
data.addTableColumns(table_name, columns);
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else if (child->as<ASTQualifiedAsterisk>())
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
has_asterisks = true;
@@ -144,17 +150,44 @@ private:
throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);
auto & identifier = child->children[0]->as<ASTTableIdentifier &>();
data.addTableColumns(identifier.name());
data.addTableColumns(identifier.name(), columns);
// QualifiedAsterisk's transformers start to appear at child 1
for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
{
IASTColumnsTransformer::transform(*it, columns);
}
}
else if (auto * columns_matcher = child->as<ASTColumnsMatcher>())
else if (const auto * columns_list_matcher = child->as<ASTColumnsListMatcher>())
{
has_asterisks = true;
for (const auto & ident : columns_list_matcher->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : columns_list_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else if (const auto * columns_regexp_matcher = child->as<ASTColumnsRegexpMatcher>())
{
has_asterisks = true;
for (auto & table_name : data.tables_order)
data.addTableColumns(table_name, [&](const String & column_name) { return columns_matcher->isColumnMatching(column_name); });
data.addTableColumns(
table_name,
columns,
[&](const String & column_name) { return columns_regexp_matcher->isColumnMatching(column_name); });
for (const auto & transformer : columns_regexp_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else
data.new_select_expression_list->children.push_back(child);
data.new_select_expression_list->children.insert(
data.new_select_expression_list->children.end(),
std::make_move_iterator(columns.begin()),
std::make_move_iterator(columns.end()));
}
if (!has_asterisks)
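Each matcher branch above first collects its expansion into a local columns vector, then splices it onto the select list in one call. A minimal sketch of that splice; wrapping the iterators in std::make_move_iterator moves the shared_ptr nodes instead of copying them, so no reference counts are touched:

#include <iterator>
#include <memory>
#include <string>
#include <vector>

int main()
{
    std::vector<std::shared_ptr<std::string>> select_list, columns;
    columns.push_back(std::make_shared<std::string>("t1.a"));
    columns.push_back(std::make_shared<std::string>("t1.b"));

    // Splice the expanded columns onto the end of the select list by moving
    // each element; `columns` is left holding empty shared_ptrs.
    select_list.insert(select_list.end(),
                       std::make_move_iterator(columns.begin()),
                       std::make_move_iterator(columns.end()));
    return select_list.size() == 2 ? 0 : 1;
}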


@@ -96,8 +96,8 @@ void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & sele
size_t alias_index = 0;
for (auto & ref_select : temp_select_query->refSelect()->children)
{
if (!ref_select->as<ASTAsterisk>() && !ref_select->as<ASTQualifiedAsterisk>() && !ref_select->as<ASTColumnsMatcher>() &&
!ref_select->as<ASTIdentifier>())
if (!ref_select->as<ASTAsterisk>() && !ref_select->as<ASTQualifiedAsterisk>() && !ref_select->as<ASTColumnsListMatcher>()
&& !ref_select->as<ASTColumnsRegexpMatcher>() && !ref_select->as<ASTIdentifier>())
{
if (const auto & alias = ref_select->tryGetAlias(); alias.empty())
ref_select->setAlias("--predicate_optimizer_" + toString(alias_index++));


@@ -196,7 +196,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
bool has_asterisk = false;
for (const auto & child : node.children)
{
if (child->as<ASTAsterisk>() || child->as<ASTColumnsMatcher>())
if (child->as<ASTAsterisk>() || child->as<ASTColumnsListMatcher>() || child->as<ASTColumnsRegexpMatcher>())
{
if (tables_with_columns.empty())
throw Exception("An asterisk cannot be replaced with empty columns.", ErrorCodes::LOGICAL_ERROR);
@@ -229,47 +229,40 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
for (const auto & column : *cols)
{
if (first_table || !data.join_using_columns.contains(column.name))
{
addIdentifier(columns, table.table, column.name);
}
}
}
first_table = false;
}
for (const auto & transformer : asterisk->children)
{
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * asterisk_pattern = child->as<ASTColumnsMatcher>())
{
if (asterisk_pattern->column_list)
{
for (const auto & ident : asterisk_pattern->column_list->children)
columns.emplace_back(ident->clone());
}
else
{
bool first_table = true;
for (const auto & table : tables_with_columns)
{
for (const auto & column : table.columns)
{
if (asterisk_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.contains(column.name)))
{
addIdentifier(columns, table.table, column.name);
}
}
first_table = false;
}
}
// ColumnsMatcher's transformers start to appear at child 1
for (auto it = asterisk_pattern->children.begin() + 1; it != asterisk_pattern->children.end(); ++it)
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else if (auto * asterisk_column_list = child->as<ASTColumnsListMatcher>())
{
for (const auto & ident : asterisk_column_list->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : asterisk_column_list->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else if (const auto * asterisk_regexp_pattern = child->as<ASTColumnsRegexpMatcher>())
{
bool first_table = true;
for (const auto & table : tables_with_columns)
{
IASTColumnsTransformer::transform(*it, columns);
for (const auto & column : table.columns)
{
if (asterisk_regexp_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name)))
{
addIdentifier(columns, table.table, column.name);
}
}
first_table = false;
}
for (const auto & transformer : asterisk_regexp_pattern->children)
IASTColumnsTransformer::transform(transformer, columns);
}
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
@@ -280,12 +273,11 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
if (ident_db_and_name.satisfies(table.table, true))
{
for (const auto & column : table.columns)
{
addIdentifier(columns, table.table, column.name);
}
break;
}
}
// QualifiedAsterisk's transformers start to appear at child 1
for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
{


@@ -21,6 +21,7 @@
#include <Interpreters/Context.h>
#include <Interpreters/ExternalDictionariesLoader.h>
#include <Interpreters/GatherFunctionQuantileVisitor.h>
#include <Interpreters/UserDefinedExecutableFunctionFactory.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
@@ -138,10 +139,18 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context)
continue;
}
}
else if (!function_factory.get(function->name, context)->isInjective({}))
else
{
++i;
continue;
FunctionOverloadResolverPtr function_builder = UserDefinedExecutableFunctionFactory::instance().tryGet(function->name, context);
if (!function_builder)
function_builder = function_factory.get(function->name, context);
if (!function_builder->isInjective({}))
{
++i;
continue;
}
}
/// copy shared pointer to args in order to ensure lifetime


@@ -17,6 +17,8 @@ void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const { ostr.write('*');
void ASTAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << "*";
/// Format column transformers
for (const auto & child : children)
{
settings.ostr << ' ';


@@ -1,64 +1,117 @@
#include "ASTColumnsMatcher.h"
#include <Parsers/ASTColumnsMatcher.h>
#include <IO/Operators.h>
#include <IO/WriteHelpers.h>
#include <Common/quoteString.h>
#include <re2/re2.h>
#include <Common/SipHash.h>
#include <IO/Operators.h>
#include <Common/quoteString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_COMPILE_REGEXP;
}
ASTPtr ASTColumnsMatcher::clone() const
ASTPtr ASTColumnsRegexpMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsMatcher>(*this);
auto clone = std::make_shared<ASTColumnsRegexpMatcher>(*this);
clone->cloneChildren();
return clone;
}
void ASTColumnsMatcher::appendColumnName(WriteBuffer & ostr) const { writeString(original_pattern, ostr); }
void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
{
writeCString("COLUMNS(", ostr);
writeQuotedString(original_pattern, ostr);
writeChar(')', ostr);
}
void ASTColumnsMatcher::updateTreeHashImpl(SipHash & hash_state) const
void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(original_pattern.size());
hash_state.update(original_pattern);
IAST::updateTreeHashImpl(hash_state);
}
void ASTColumnsMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
if (column_list)
{
frame.expression_list_prepend_whitespace = false;
column_list->formatImpl(settings, state, frame);
}
else
settings.ostr << quoteString(original_pattern);
settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
/// Format column transformers
for (const auto & child : children)
{
settings.ostr << ' ';
(*it)->formatImpl(settings, state, frame);
child->formatImpl(settings, state, frame);
}
}
void ASTColumnsMatcher::setPattern(String pattern)
void ASTColumnsRegexpMatcher::setPattern(String pattern)
{
original_pattern = std::move(pattern);
column_matcher = std::make_shared<RE2>(original_pattern, RE2::Quiet);
if (!column_matcher->ok())
throw DB::Exception("COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(), DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
throw DB::Exception(
"COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(),
DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
}
bool ASTColumnsMatcher::isColumnMatching(const String & column_name) const
bool ASTColumnsRegexpMatcher::isColumnMatching(const String & column_name) const
{
return RE2::PartialMatch(column_name, *column_matcher);
}
ASTPtr ASTColumnsListMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsListMatcher>(*this);
clone->column_list = column_list->clone();
clone->cloneChildren();
return clone;
}
void ASTColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
column_list->updateTreeHash(hash_state);
IAST::updateTreeHashImpl(hash_state);
}
void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
{
writeCString("COLUMNS(", ostr);
for (auto it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
if (it != column_list->children.begin())
writeCString(", ", ostr);
(*it)->appendColumnName(ostr);
}
writeChar(')', ostr);
}
void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
if (it != column_list->children.begin())
{
settings.ostr << ", ";
}
(*it)->formatImpl(settings, state, frame);
}
settings.ostr << ")";
/// Format column transformers
for (const auto & child : children)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
}
}
}
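The updateTreeHashImpl overrides above hash the pattern's length before its bytes. The length prefix keeps consecutive variable-length fields unambiguous; a toy illustration with a generic hash-combine standing in for SipHash:

#include <cstddef>
#include <functional>
#include <string>

// Combine a string into a running hash state, length first. Without the
// length, the field sequences ("ab", "c") and ("a", "bc") would contribute
// identical byte streams and collide by construction.
void updateWithString(std::size_t & state, const std::string & s)
{
    auto mix = [&state](std::size_t v) { state ^= v + 0x9e3779b9 + (state << 6) + (state >> 2); };
    mix(std::hash<std::size_t>{}(s.size()));
    mix(std::hash<std::string>{}(s));
}

int main()
{
    std::size_t h1 = 0, h2 = 0;
    updateWithString(h1, "ab"); updateWithString(h1, "c");
    updateWithString(h2, "a");  updateWithString(h2, "bc");
    return h1 != h2 ? 0 : 1; // the length prefix keeps these distinct
}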


@@ -2,10 +2,9 @@
#include <Parsers/IAST.h>
namespace re2
{
class RE2;
class RE2;
}
@@ -14,21 +13,13 @@ namespace DB
class WriteBuffer;
namespace ErrorCodes
{
}
struct AsteriskSemantic;
struct AsteriskSemanticImpl;
/** SELECT COLUMNS('regexp') is expanded to multiple columns like * (asterisk).
* Optional transformers can be attached to further manipulate these expanded columns.
*/
class ASTColumnsMatcher : public IAST
class ASTColumnsRegexpMatcher : public IAST
{
public:
String getID(char) const override { return "ColumnsMatcher"; }
String getID(char) const override { return "ColumnsRegexpMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
@@ -36,17 +27,26 @@ public:
bool isColumnMatching(const String & column_name) const;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr column_list;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
private:
std::shared_ptr<re2::RE2> column_matcher;
String original_pattern;
std::shared_ptr<AsteriskSemanticImpl> semantic; /// pimpl
};
friend struct AsteriskSemantic;
/// Same as the above but use a list of column names to do matching.
class ASTColumnsListMatcher : public IAST
{
public:
String getID(char) const override { return "ColumnsListMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr column_list;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};


@@ -105,6 +105,49 @@ void ASTColumnsApplyTransformer::transform(ASTs & nodes) const
}
}
void ASTColumnsApplyTransformer::appendColumnName(WriteBuffer & ostr) const
{
writeCString("APPLY ", ostr);
if (!column_name_prefix.empty())
writeChar('(', ostr);
if (lambda)
lambda->appendColumnName(ostr);
else
{
writeString(func_name, ostr);
if (parameters)
parameters->appendColumnName(ostr);
}
if (!column_name_prefix.empty())
{
writeCString(", '", ostr);
writeString(column_name_prefix, ostr);
writeCString("')", ostr);
}
}
void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(func_name.size());
hash_state.update(func_name);
if (parameters)
parameters->updateTreeHashImpl(hash_state);
if (lambda)
lambda->updateTreeHashImpl(hash_state);
hash_state.update(lambda_arg.size());
hash_state.update(lambda_arg);
hash_state.update(column_name_prefix.size());
hash_state.update(column_name_prefix);
IAST::updateTreeHashImpl(hash_state);
}
void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "EXCEPT" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : "");
@@ -128,6 +171,38 @@ void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, Fo
settings.ostr << ")";
}
void ASTColumnsExceptTransformer::appendColumnName(WriteBuffer & ostr) const
{
writeCString("EXCEPT ", ostr);
if (is_strict)
writeCString("STRICT ", ostr);
if (children.size() > 1)
writeChar('(', ostr);
for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
{
if (it != children.begin())
writeCString(", ", ostr);
(*it)->appendColumnName(ostr);
}
if (!original_pattern.empty())
writeQuotedString(original_pattern, ostr);
if (children.size() > 1)
writeChar(')', ostr);
}
void ASTColumnsExceptTransformer::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(is_strict);
hash_state.update(original_pattern.size());
hash_state.update(original_pattern);
IAST::updateTreeHashImpl(hash_state);
}
void ASTColumnsExceptTransformer::transform(ASTs & nodes) const
{
std::set<String> expected_columns;
@@ -201,6 +276,21 @@ void ASTColumnsReplaceTransformer::Replacement::formatImpl(
settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS " << (settings.hilite ? hilite_none : "") << backQuoteIfNeed(name);
}
void ASTColumnsReplaceTransformer::Replacement::appendColumnName(WriteBuffer & ostr) const
{
expr->appendColumnName(ostr);
writeCString(" AS ", ostr);
writeProbablyBackQuotedString(name, ostr);
}
void ASTColumnsReplaceTransformer::Replacement::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(name.size());
hash_state.update(name);
expr->updateTreeHashImpl(hash_state);
IAST::updateTreeHashImpl(hash_state);
}
void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "REPLACE" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : "");
@@ -211,9 +301,8 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F
for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
{
if (it != children.begin())
{
settings.ostr << ", ";
}
(*it)->formatImpl(settings, state, frame);
}
@@ -221,6 +310,32 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F
settings.ostr << ")";
}
void ASTColumnsReplaceTransformer::appendColumnName(WriteBuffer & ostr) const
{
writeCString("REPLACE ", ostr);
if (is_strict)
writeCString("STRICT ", ostr);
if (children.size() > 1)
writeChar('(', ostr);
for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
{
if (it != children.begin())
writeCString(", ", ostr);
(*it)->appendColumnName(ostr);
}
if (children.size() > 1)
writeChar(')', ostr);
}
void ASTColumnsReplaceTransformer::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(is_strict);
IAST::updateTreeHashImpl(hash_state);
}
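The appendColumnName overrides added above all share one serialization shape: the keyword, an optional STRICT, then the children joined by ", " and parenthesized only when there is more than one. A small sketch of that shape (plain std::string in place of WriteBuffer):

#include <iostream>
#include <string>
#include <vector>

// Sketch of the serialization shape shared by the EXCEPT/REPLACE
// appendColumnName overrides above.
std::string transformerColumnName(const std::string & keyword, bool is_strict,
                                  const std::vector<std::string> & children)
{
    std::string out = keyword + ' ';
    if (is_strict)
        out += "STRICT ";
    if (children.size() > 1)
        out += '(';
    for (size_t i = 0; i < children.size(); ++i)
    {
        if (i > 0)
            out += ", ";
        out += children[i];
    }
    if (children.size() > 1)
        out += ')';
    return out;
}

int main()
{
    std::cout << transformerColumnName("REPLACE", true, {"x + 1 AS x", "y AS z"}) << '\n';
    // prints: REPLACE STRICT (x + 1 AS x, y AS z)
}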
void ASTColumnsReplaceTransformer::replaceChildren(ASTPtr & node, const ASTPtr & replacement, const String & name)
{
for (auto & child : node->children)


@@ -30,6 +30,8 @@ public:
return res;
}
void transform(ASTs & nodes) const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
// Case 1 APPLY (quantile(0.9))
String func_name;
@@ -59,6 +61,8 @@ public:
void transform(ASTs & nodes) const override;
void setPattern(String pattern);
bool isColumnMatching(const String & column_name) const;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@ -76,12 +80,13 @@ public:
ASTPtr clone() const override
{
auto replacement = std::make_shared<Replacement>(*this);
replacement->children.clear();
replacement->expr = expr->clone();
replacement->children.push_back(replacement->expr);
return replacement;
}
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
String name;
ASTPtr expr;
@ -98,6 +103,8 @@ public:
return clone;
}
void transform(ASTs & nodes) const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;


@ -17,6 +17,8 @@ void ASTQualifiedAsterisk::formatImpl(const FormatSettings & settings, FormatSta
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".*";
/// Format column transformers
for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
{
settings.ostr << ' ';


@ -1934,16 +1934,18 @@ bool ParserColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expect
return false;
++pos;
auto res = std::make_shared<ASTColumnsMatcher>();
ASTPtr res;
if (column_list)
{
res->column_list = column_list;
res->children.push_back(res->column_list);
auto list_matcher = std::make_shared<ASTColumnsListMatcher>();
list_matcher->column_list = column_list;
res = list_matcher;
}
else
{
res->setPattern(regex_node->as<ASTLiteral &>().value.get<String>());
res->children.push_back(regex_node);
auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
regexp_matcher->setPattern(regex_node->as<ASTLiteral &>().value.get<String>());
res = regexp_matcher;
}
ParserColumnsTransformers transformers_p(allowed_transformers);
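
For reference, a sketch of the two query shapes this parser now maps to distinct AST nodes (table and column names hypothetical):

// SELECT COLUMNS(a, b) FROM t;  -- column-list form -> ASTColumnsListMatcher
// SELECT COLUMNS('^c') FROM t;  -- regexp form      -> ASTColumnsRegexpMatcher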


@ -534,6 +534,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
{
if (e.code() != ErrorCodes::S3_ERROR && e.code() != ErrorCodes::ZERO_COPY_REPLICATION_ERROR)
throw;
LOG_WARNING(log, fmt::runtime(e.message() + " Will retry fetching part without zero-copy."));
/// Try again but without zero-copy
return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
@ -826,7 +827,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
/// NOTE The is_cancelled flag also makes sense to check every time you read over the network,
/// performing a poll with a not very large timeout.
/// And now we check it only between read chunks (in the `copyData` function).
disk->removeSharedRecursive(part_download_path, true);
disk->removeSharedRecursive(part_download_path, true, {});
throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);
}
@ -850,7 +851,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
new_data_part->modification_time = time(nullptr);
new_data_part->loadColumnsChecksumsIndexes(true, false);
data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true);
data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true, {});
LOG_DEBUG(log, "Download of part {} unique id {} metadata onto disk {} finished.",
part_name, part_id, disk->getName());


@ -504,10 +504,8 @@ void IMergeTreeDataPart::removeIfNeeded()
if (parent_part)
{
std::optional<bool> keep_shared_data = keepSharedDataInDecoupledStorage();
if (!keep_shared_data.has_value())
return;
projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data);
auto [can_remove, _] = canRemovePart();
projectionRemove(parent_part->getFullRelativePath(), !can_remove);
}
else
remove();
@ -1514,8 +1512,6 @@ try
SyncGuardPtr sync_guard;
if (storage.getSettings()->fsync_part_directory)
sync_guard = volume->getDisk()->getDirectorySyncGuard(to);
storage.lockSharedData(*this);
}
catch (...)
{
@ -1530,21 +1526,13 @@ catch (...)
throw;
}
void IMergeTreeDataPart::cleanupOldName(const String & old_part_name) const
{
if (name == old_part_name)
return;
storage.unlockSharedData(*this, old_part_name);
}
std::optional<bool> IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const
std::pair<bool, NameSet> IMergeTreeDataPart::canRemovePart() const
{
/// NOTE: It's needed for zero-copy replication
if (force_keep_shared_data)
return true;
return std::make_pair(false, NameSet{});
return !storage.unlockSharedData(*this);
return storage.unlockSharedData(*this);
}
void IMergeTreeDataPart::initializePartMetadataManager()
@ -1564,9 +1552,7 @@ void IMergeTreeDataPart::remove() const
assert(assertHasValidVersionMetadata());
part_is_probably_removed_from_disk = true;
std::optional<bool> keep_shared_data = keepSharedDataInDecoupledStorage();
if (!keep_shared_data.has_value())
return;
auto [can_remove, files_not_to_remove] = canRemovePart();
if (!isStoredOnDisk())
return;
@ -1577,7 +1563,7 @@ void IMergeTreeDataPart::remove() const
if (isProjectionPart())
{
LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name);
projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data);
projectionRemove(parent_part->getFullRelativePath(), !can_remove);
return;
}
@ -1609,7 +1595,7 @@ void IMergeTreeDataPart::remove() const
LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to));
try
{
disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
catch (...)
{
@ -1636,7 +1622,9 @@ void IMergeTreeDataPart::remove() const
std::unordered_set<String> projection_directories;
for (const auto & [p_name, projection_part] : projection_parts)
{
projection_part->projectionRemove(to, *keep_shared_data);
/// NOTE: projections are currently unsupported with zero-copy replication.
/// TODO: fix it.
projection_part->projectionRemove(to, !can_remove);
projection_directories.emplace(p_name + ".proj");
}
@ -1644,7 +1632,7 @@ void IMergeTreeDataPart::remove() const
if (checksums.empty())
{
/// If the part is not completely written, we cannot use fast path by listing files.
disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
else
{
@ -1673,16 +1661,15 @@ void IMergeTreeDataPart::remove() const
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true);
disk->removeSharedFiles(request, *keep_shared_data);
disk->removeSharedFiles(request, !can_remove, files_not_to_remove);
disk->removeDirectory(to);
}
catch (...)
{
/// Recursive directory removal does many excessive "stat" syscalls under the hood.
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
}
}
@ -1703,7 +1690,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
"Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: checksums.txt is missing",
fullPath(disk, to));
/// If the part is not completely written, we cannot use fast path by listing files.
disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {});
}
else
{
@ -1727,8 +1714,8 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true);
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
disk->removeSharedFiles(request, keep_shared_data);
disk->removeSharedRecursive(to, keep_shared_data);
disk->removeSharedFiles(request, keep_shared_data, {});
disk->removeSharedRecursive(to, keep_shared_data, {});
}
catch (...)
{
@ -1736,7 +1723,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {});
}
}
}


@ -105,6 +105,8 @@ public:
virtual bool isStoredOnRemoteDisk() const = 0;
virtual bool isStoredOnRemoteDiskWithZeroCopySupport() const = 0;
virtual bool supportsVerticalMerge() const { return false; }
/// NOTE: Returns zeros if column files are not found in checksums.
@ -354,9 +356,6 @@ public:
/// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly
virtual void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const;
/// Cleanup shared locks made with old name after part renaming
virtual void cleanupOldName(const String & old_part_name) const;
/// Makes clone of a part in detached/ directory via hard links
virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const;
@ -515,7 +514,17 @@ protected:
String getRelativePathForDetachedPart(const String & prefix) const;
std::optional<bool> keepSharedDataInDecoupledStorage() const;
/// Checks that the part can actually be removed from disk.
/// In the ordinary scenario it always returns true, but with
/// zero-copy replication the part can still be held by other replicas.
///
/// If the method returns false, then only the part's metadata in
/// local storage can be removed, leaving the data in the remote FS untouched.
///
/// If the method returns true, then files can actually be removed from remote
/// storage, excluding the files in the second returned element.
/// They can be hardlinks to some newer parts.
std::pair<bool, NameSet> canRemovePart() const;
void initializePartMetadataManager();
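
A standalone sketch of the decision this pair encodes, assuming only the semantics documented above (plain C++; the inputs are hypothetical):

#include <iostream>
#include <set>
#include <string>
#include <utility>

using NameSet = std::set<std::string>;

// First element: whether the remote data may be deleted at all.
// Second element: files that must survive because newer parts hardlink them.
std::pair<bool, NameSet> can_remove_part(bool held_by_other_replica, NameSet hardlinked)
{
    if (held_by_other_replica)
        return {false, {}};
    return {true, std::move(hardlinked)};
}

int main()
{
    auto [can_remove, files_not_to_remove] = can_remove_part(false, {"data.bin"});
    std::cout << can_remove << ' ' << files_not_to_remove.size() << '\n'; // 1 1
}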


@ -1575,8 +1575,36 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
if (!lock.try_lock())
return res;
bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
if (need_remove_parts_in_order)
{
bool has_zero_copy_disk = false;
for (const auto & disk : getDisks())
{
if (disk->supportZeroCopyReplication())
{
has_zero_copy_disk = true;
break;
}
}
need_remove_parts_in_order = has_zero_copy_disk;
}
time_t now = time(nullptr);
std::vector<DataPartIteratorByStateAndInfo> parts_to_delete;
std::vector<MergeTreePartInfo> skipped_parts;
auto has_skipped_mutation_parent = [&skipped_parts, need_remove_parts_in_order] (const DataPartPtr & part)
{
if (!need_remove_parts_in_order)
return false;
for (const auto & part_info : skipped_parts)
if (part->info.isMutationChildOf(part_info))
return true;
return false;
};
{
auto parts_lock = lockParts();
@ -1588,21 +1616,31 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
/// Do not remove outdated part if it may be visible for some transaction
if (!part->version.canBeRemoved())
{
skipped_parts.push_back(part->info);
continue;
}
auto part_remove_time = part->remove_time.load(std::memory_order_relaxed);
/// Grab only parts that are not used by anyone (SELECTs for example).
if (!part.unique())
{
skipped_parts.push_back(part->info);
continue;
}
if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds())
if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds() && !has_skipped_mutation_parent(part))
|| force
|| isInMemoryPart(part) /// Remove in-memory parts immediately to not store excessive data in RAM
|| (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately))
{
parts_to_delete.emplace_back(it);
}
else
{
skipped_parts.push_back(part->info);
}
}
res.reserve(parts_to_delete.size());
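
A standalone sketch of the skip rule above: once a part is skipped, every mutation descendant of it is skipped too, so zero-copy parts are removed in order (the predicate is stubbed and the part names are hypothetical):

#include <iostream>
#include <string>
#include <vector>

struct Part { std::string name; bool removable; int mutation; };

// Stands in for part->info.isMutationChildOf(skipped_info).
bool is_mutation_child_of(const Part & part, const Part & parent) { return part.mutation >= parent.mutation; }

int main()
{
    std::vector<Part> parts{{"all_1_1_0_2", false, 2}, {"all_1_1_0_4", true, 4}};
    std::vector<Part> skipped, to_delete;
    for (const auto & part : parts)
    {
        bool parent_skipped = false;
        for (const auto & s : skipped)
            parent_skipped = parent_skipped || is_mutation_child_of(part, s);
        if (part.removable && !parent_skipped)
            to_delete.push_back(part);
        else
            skipped.push_back(part); // descendants must wait for a later pass
    }
    std::cout << to_delete.size() << '\n'; // 0: the child waits for its skipped parent
}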
@ -2663,8 +2701,6 @@ bool MergeTreeData::renameTempPartAndReplace(
MergeTreePartInfo part_info = part->info;
String part_name;
String old_part_name = part->name;
if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock))
{
if (part->partition.value != existing_part_in_partition->partition.value)
@ -2786,9 +2822,6 @@ bool MergeTreeData::renameTempPartAndReplace(
out_covered_parts->emplace_back(std::move(covered_part));
}
/// Cleanup shared locks made with old name
part->cleanupOldName(old_part_name);
return true;
}
@ -3333,11 +3366,10 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy)
/// We do not check allow_remote_fs_zero_copy_replication here because data may be shared
/// when allow_remote_fs_zero_copy_replication turned on and off again
original_active_part->force_keep_shared_data = false;
if (original_active_part->volume->getDisk()->supportZeroCopyReplication() &&
part_copy->volume->getDisk()->supportZeroCopyReplication() &&
part_copy->isStoredOnRemoteDiskWithZeroCopySupport() &&
original_active_part->getUniqueId() == part_copy->getUniqueId())
{
/// May be when several volumes use the same S3/HDFS storage
@ -3354,6 +3386,10 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy)
ssize_t diff_rows = part_copy->rows_count - original_active_part->rows_count;
increaseDataVolume(diff_bytes, diff_rows, /* parts= */ 0);
/// Part moves are non-replicated operations, so we take the lock here.
/// All other locks are taken in StorageReplicatedMergeTree
lockSharedData(*part_copy);
auto disk = original_active_part->volume->getDisk();
String marker_path = fs::path(original_active_part->getFullRelativePath()) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME;
try
@ -5701,7 +5737,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
const String & tmp_part_prefix,
const MergeTreePartInfo & dst_part_info,
const StorageMetadataPtr & metadata_snapshot,
const MergeTreeTransactionPtr & txn)
const MergeTreeTransactionPtr & txn,
HardlinkedFiles * hardlinked_files,
bool copy_instead_of_hardlink)
{
/// Check that the storage policy contains the disk where the src_part is located.
bool does_storage_policy_allow_same_disk = false;
@ -5739,14 +5777,32 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
src_part_path = fs::path(src_relative_data_path) / flushed_part_path / "";
}
LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path));
localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false);
String with_copy;
if (copy_instead_of_hardlink)
with_copy = " (copying data)";
LOG_DEBUG(log, "Cloning part {} to {}{}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path), with_copy);
localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false, {}, /* copy_instead_of_hardlinks */ copy_instead_of_hardlink);
disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME);
auto single_disk_volume = std::make_shared<SingleDiskVolume>(disk->getName(), disk, 0);
auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
if (!copy_instead_of_hardlink && hardlinked_files)
{
hardlinked_files->source_part_name = src_part->name;
hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID();
for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next())
{
if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME)
hardlinked_files->hardlinks_from_source_part.insert(it->name());
}
}
/// We should write version metadata on part creation to distinguish it from parts that were created without transaction.
TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID;
dst_data_part->version.setCreationTID(tid, nullptr);
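
A standalone sketch of the hardlink bookkeeping above, using std::filesystem in place of the IDisk interface; the marker file names here are assumptions for illustration, not the real constants:

#include <filesystem>
#include <iostream>
#include <set>
#include <string>

int main()
{
    namespace fs = std::filesystem;
    // Hypothetical service files excluded from hardlink tracking.
    const std::set<std::string> service_files{"delete-on-destroy.txt", "txn_version.txt"};

    std::set<std::string> hardlinks_from_source_part;
    for (const auto & entry : fs::directory_iterator(".")) // the source part directory in reality
    {
        const std::string file_name = entry.path().filename().string();
        if (!service_files.count(file_name))
            hardlinks_from_source_part.insert(file_name);
    }
    std::cout << hardlinks_from_source_part.size() << '\n';
}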


@ -769,9 +769,21 @@ public:
MergeTreeData & checkStructureAndGetMergeTreeData(const StoragePtr & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const;
MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const;
struct HardlinkedFiles
{
/// Shared table uuid where hardlinks live
std::string source_table_shared_id;
/// Hardlinked from part
std::string source_part_name;
/// Hardlinked files list
NameSet hardlinks_from_source_part;
};
MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk(
const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info,
const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn);
const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix,
const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot,
const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files,
bool copy_instead_of_hardlink);
virtual std::vector<MergeTreeMutationStatus> getMutationsStatus() const = 0;
@ -939,16 +951,14 @@ public:
bool scheduleDataMovingJob(BackgroundJobsAssignee & assignee);
bool areBackgroundMovesNeeded() const;
/// Lock part in zookeeper for shared data in several nodes
/// Overridden in StorageReplicatedMergeTree
virtual void lockSharedData(const IMergeTreeDataPart &, bool = false) const {} /// NOLINT
virtual void lockSharedData(const IMergeTreeDataPart &, bool = false, std::optional<HardlinkedFiles> = {}) const {} /// NOLINT
/// Unlock shared data part in zookeeper
/// Overridden in StorageReplicatedMergeTree
virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; }
/// Remove lock with old name for shared data part after rename
virtual bool unlockSharedData(const IMergeTreeDataPart &, const String &) const { return true; }
virtual std::pair<bool, NameSet> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, NameSet{}); }
/// Fetch part only if some replica has it on shared storage like S3
/// Overridden in StorageReplicatedMergeTree
@ -958,6 +968,8 @@ public:
/// Remove local files and remote files if needed
virtual bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed);
virtual String getTableSharedID() const { return ""; }
/// Store metadata for replicated tables
/// Do nothing for non-replicated tables
virtual void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const;


@ -187,6 +187,11 @@ bool MergeTreeDataPartCompact::isStoredOnRemoteDisk() const
return volume->getDisk()->isRemote();
}
bool MergeTreeDataPartCompact::isStoredOnRemoteDiskWithZeroCopySupport() const
{
return volume->getDisk()->supportZeroCopyReplication();
}
MergeTreeDataPartCompact::~MergeTreeDataPartCompact()
{
removeIfNeeded();


@ -58,6 +58,8 @@ public:
bool isStoredOnRemoteDisk() const override;
bool isStoredOnRemoteDiskWithZeroCopySupport() const override;
bool hasColumnFiles(const NameAndTypePair & column) const override;
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return DATA_FILE_NAME; }


@ -45,6 +45,7 @@ public:
bool isStoredOnDisk() const override { return false; }
bool isStoredOnRemoteDisk() const override { return false; }
bool isStoredOnRemoteDiskWithZeroCopySupport() const override { return false; }
bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.getNameInStorage()); }
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; }
void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override;


@ -146,6 +146,11 @@ bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const
return volume->getDisk()->isRemote();
}
bool MergeTreeDataPartWide::isStoredOnRemoteDiskWithZeroCopySupport() const
{
return volume->getDisk()->supportZeroCopyReplication();
}
MergeTreeDataPartWide::~MergeTreeDataPartWide()
{
removeIfNeeded();


@ -52,6 +52,8 @@ public:
bool isStoredOnRemoteDisk() const override;
bool isStoredOnRemoteDiskWithZeroCopySupport() const override;
bool supportsVerticalMerge() const override { return true; }
String getFileNameForColumn(const NameAndTypePair & column) const override;


@ -72,6 +72,16 @@ struct MergeTreePartInfo
&& strictly_contains_block_range;
}
/// The part was created by a mutation of the parent_candidate part
bool isMutationChildOf(const MergeTreePartInfo & parent_candidate) const
{
return partition_id == parent_candidate.partition_id
&& min_block == parent_candidate.min_block
&& max_block == parent_candidate.max_block
&& level == parent_candidate.level
&& mutation >= parent_candidate.mutation;
}
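
A worked example, assuming the standard <partition>_<min_block>_<max_block>_<level>_<mutation> part naming:

// all_1_5_2_7 is a mutation child of all_1_5_2_4:
//   same partition ("all"), same block range (1..5), same level (2), mutation 7 >= 4.
// all_1_5_3_7 is not a child: the level differs.
// all_1_6_2_7 is not a child: the block range differs.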
/// Return part mutation version, if part wasn't mutated return zero
Int64 getMutationVersion() const
{


@ -214,10 +214,14 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt
LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path));
disk->removeRecursive(fs::path(path_to_clone) / relative_path / "");
}
disk->createDirectories(path_to_clone);
bool is_fetched = data->tryToFetchIfShared(*part, disk, fs::path(path_to_clone) / part->name);
if (!is_fetched)
{
LOG_INFO(log, "Part {} was not fetched, we are the first who move it to another disk, so we will copy it", part->name);
part->volume->getDisk()->copy(fs::path(data->getRelativeDataPath()) / relative_path / "", disk, path_to_clone);
}
part->volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
}
else


@ -100,8 +100,6 @@ void MergedBlockOutputStream::Finalizer::Impl::finish()
if (sync)
file->sync();
}
part->storage.lockSharedData(*part);
}
MergedBlockOutputStream::Finalizer::~Finalizer()


@ -115,7 +115,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare()
String dummy;
if (!storage.findReplicaHavingCoveringPart(entry.new_part_name, true, dummy).empty())
{
LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download merged part", entry.new_part_name);
LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download mutated part", entry.new_part_name);
return PrepareResult{
.prepared_successfully = false,
.need_to_check_missing_part_in_fetch = true,
@ -127,7 +127,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare()
if (!zero_copy_lock)
{
LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and fetch merged part", entry.new_part_name);
LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and mutated merged part", entry.new_part_name);
return PrepareResult{
.prepared_successfully = false,
.need_to_check_missing_part_in_fetch = false,
@ -175,7 +175,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit
try
{
storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part);
storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part, mutate_task->getHardlinkedFiles());
}
catch (const Exception & e)
{


@ -481,7 +481,6 @@ void finalizeMutatedPart(
MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), part_path));
new_data_part->default_codec = codec;
new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk();
new_data_part->storage.lockSharedData(*new_data_part);
}
}
@ -549,6 +548,8 @@ struct MutationContext
ExecuteTTLType execute_ttl_type{ExecuteTTLType::NONE};
MergeTreeTransactionPtr txn;
MergeTreeData::HardlinkedFiles hardlinked_files;
};
using MutationContextPtr = std::shared_ptr<MutationContext>;
@ -1071,6 +1072,7 @@ private:
ctx->new_data_part->version.setCreationTID(tid, nullptr);
ctx->new_data_part->storeVersionMetadata();
NameSet hardlinked_files;
/// Create hardlinks for unchanged files
for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next())
{
@ -1084,10 +1086,12 @@ private:
{
return rename_pair.first == file_name;
});
if (rename_it != ctx->files_to_rename.end())
{
if (rename_it->second.empty())
continue;
destination += rename_it->second;
}
else
@ -1095,8 +1099,13 @@ private:
destination += it->name();
}
if (!ctx->disk->isDirectory(it->path()))
{
ctx->disk->createHardLink(it->path(), destination);
hardlinked_files.insert(it->name());
}
else if (!endsWith(it->name(), ".tmp_proj")) // ignore projection tmp merge dir
{
// it's a projection part directory
@ -1105,10 +1114,18 @@ private:
{
String p_destination = fs::path(destination) / p_it->name();
ctx->disk->createHardLink(p_it->path(), p_destination);
hardlinked_files.insert(p_it->name());
}
}
}
/// Tracking of hardlinked files is required for zero-copy replication.
/// We don't remove them when we delete the last copy of the source part
/// because the new part may still use them.
ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID();
ctx->hardlinked_files.source_part_name = ctx->source_part->name;
ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
(*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns();
ctx->new_data_part->checksums = ctx->source_part->checksums;
@ -1283,7 +1300,7 @@ bool MutateTask::prepare()
storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading)))
{
LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation);
promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn));
promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files, false));
return false;
}
else
@ -1374,7 +1391,7 @@ bool MutateTask::prepare()
&& ctx->files_to_rename.empty())
{
LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {} (optimized)", ctx->source_part->name, ctx->future_part->part_info.mutation);
promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn));
promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files, false));
return false;
}
@ -1384,5 +1401,10 @@ bool MutateTask::prepare()
return true;
}
const MergeTreeData::HardlinkedFiles & MutateTask::getHardlinkedFiles() const
{
return ctx->hardlinked_files;
}
}


@ -13,7 +13,7 @@ namespace DB
class MutateTask;
using MutateTaskPtr = std::shared_ptr<MutateTask>;\
using MutateTaskPtr = std::shared_ptr<MutateTask>;
class MergeTreeDataMergerMutator;
@ -44,6 +44,8 @@ public:
return promise.get_future();
}
const MergeTreeData::HardlinkedFiles & getHardlinkedFiles() const;
private:
bool prepare();
@ -56,7 +58,6 @@ private:
State state{State::NEED_PREPARE};
std::promise<MergeTreeData::MutableDataPartPtr> promise;
std::shared_ptr<MutationContext> ctx;


@ -314,8 +314,6 @@ void ReplicatedMergeTreeSink::commitPart(
bool is_already_existing_part = false;
String old_part_name = part->name;
while (true)
{
/// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
@ -499,6 +497,8 @@ void ReplicatedMergeTreeSink::commitPart(
part->name);
}
storage.lockSharedData(*part, false, {});
Coordination::Responses responses;
Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
@ -553,11 +553,13 @@ void ReplicatedMergeTreeSink::commitPart(
}
else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
{
storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
}
else
{
storage.unlockSharedData(*part);
/// NOTE: We could be here if the node with the quorum existed, but was quickly removed.
transaction.rollback();
throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
@ -567,12 +569,14 @@ void ReplicatedMergeTreeSink::commitPart(
}
else if (Coordination::isHardwareError(multi_code))
{
storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
+ Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
}
else
{
storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
+ Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
@ -595,9 +599,6 @@ void ReplicatedMergeTreeSink::commitPart(
waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value);
}
/// Cleanup shared locks made with old name
part->cleanupOldName(old_part_name);
}
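
The commit path above takes the shared-data lock before the zookeeper multi-request and releases it on every failure branch. A hedged sketch of the same invariant expressed with a scope guard; this is an alternative formulation, not how the diff implements it:

#include <functional>
#include <iostream>
#include <stdexcept>
#include <utility>

struct ScopeUnlock
{
    explicit ScopeUnlock(std::function<void()> f) : unlock(std::move(f)) {}
    ~ScopeUnlock() { if (armed) unlock(); }
    void disarm() { armed = false; }
private:
    std::function<void()> unlock;
    bool armed = true;
};

void commit_sketch(const std::function<void()> & lock, std::function<void()> unlock, const std::function<bool()> & zk_multi)
{
    lock();
    ScopeUnlock guard(std::move(unlock));
    if (!zk_multi())
        throw std::runtime_error("zookeeper commit failed"); // the guard unlocks
    guard.disarm(); // success: the lock must outlive this scope
}

int main()
{
    try
    {
        commit_sketch([] { std::cout << "lock\n"; },
                      [] { std::cout << "unlock\n"; },
                      [] { return false; });
    }
    catch (const std::exception & e)
    {
        std::cout << e.what() << '\n'; // printed after "lock" and "unlock"
    }
}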
void ReplicatedMergeTreeSink::onStart()


@ -13,9 +13,13 @@ namespace ErrorCodes
extern const int DIRECTORY_ALREADY_EXISTS;
}
namespace
{
static void localBackupImpl(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, size_t level,
std::optional<size_t> max_level)
void localBackupImpl(
const DiskPtr & disk, const String & source_path,
const String & destination_path, bool make_source_readonly, size_t level,
std::optional<size_t> max_level)
{
if (max_level && level > *max_level)
return;
@ -43,8 +47,6 @@ static void localBackupImpl(const DiskPtr & disk, const String & source_path, co
}
}
namespace
{
class CleanupOnFail
{
public:
@ -81,7 +83,10 @@ private:
};
}
void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, std::optional<size_t> max_level)
void localBackup(
const DiskPtr & disk, const String & source_path,
const String & destination_path, bool make_source_readonly,
std::optional<size_t> max_level, bool copy_instead_of_hardlinks)
{
if (disk->exists(destination_path) && !disk->isDirectoryEmpty(destination_path))
{
@ -101,7 +106,10 @@ void localBackup(const DiskPtr & disk, const String & source_path, const String
{
try
{
localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level);
if (copy_instead_of_hardlinks)
disk->copyDirectoryContent(source_path, disk, destination_path);
else
localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level);
}
catch (const DB::ErrnoException & e)
{


@ -20,6 +20,6 @@ namespace DB
* If max_level is specified, then only files whose depth relative to source_path is less than or equal to max_level will be copied.
* So, if max_level=0, only direct file children are copied.
*/
void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional<size_t> max_level = {});
void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional<size_t> max_level = {}, bool copy_instead_of_hardlinks = false);
}
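
A hedged call-site fragment for the extended signature (not a complete program; the disk and paths are hypothetical): keep the source writable, do not limit recursion depth, and copy bytes instead of hardlinking:

localBackup(disk, "store/123/all_1_1_0/", "tmp_clone_all_1_1_0/",
            /* make_source_readonly */ false,
            /* max_level */ {},
            /* copy_instead_of_hardlinks */ true);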


@ -1583,7 +1583,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con
Int64 temp_index = insert_increment.get();
MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level);
auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction());
auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {}, false);
dst_parts.emplace_back(std::move(dst_part));
}
@ -1669,7 +1669,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const
Int64 temp_index = insert_increment.get();
MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level);
auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction());
auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {}, false);
dst_parts.emplace_back(std::move(dst_part));
}


@ -75,6 +75,7 @@
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <ctime>
@ -1446,15 +1447,18 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
}
MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAndCommit(Transaction & transaction,
const DataPartPtr & part)
const DataPartPtr & part, std::optional<MergeTreeData::HardlinkedFiles> hardlinked_files)
{
auto zookeeper = getZooKeeper();
while (true)
{
Coordination::Requests ops;
NameSet absent_part_paths_on_replicas;
lockSharedData(*part, false, hardlinked_files);
/// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`.
checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas);
@ -1493,7 +1497,10 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd
LOG_INFO(log, "The part {} on a replica suddenly appeared, will recheck checksums", e.getPathForFirstFailedOp());
}
else
{
unlockSharedData(*part);
throw;
}
}
}
}
@ -1933,6 +1940,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
entry.znode_name, entry_replace.drop_range_part_name, entry_replace.new_part_names.size(),
entry_replace.from_database, entry_replace.from_table);
auto metadata_snapshot = getInMemoryMetadataPtr();
auto storage_settings_ptr = getSettings();
MergeTreePartInfo drop_range = MergeTreePartInfo::fromPartName(entry_replace.drop_range_part_name, format_version);
/// Range with only one block has special meaning: it's ATTACH PARTITION or MOVE PARTITION, so there is no drop range
@ -1985,6 +1993,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
/// A replica that will be used to fetch part
String replica;
MergeTreeData::HardlinkedFiles hardlinked_files;
};
using PartDescriptionPtr = std::shared_ptr<PartDescription>;
@ -2083,6 +2093,14 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
continue;
}
bool avoid_copy_local_part = storage_settings_ptr->allow_remote_fs_zero_copy_replication && src_part->isStoredOnRemoteDiskWithZeroCopySupport();
if (avoid_copy_local_part)
{
LOG_DEBUG(log, "Avoid copy local part {} from table {} because of zero-copy replication", part_desc->src_part_name, source_table_id.getNameForLogs());
continue;
}
String checksum_hex = src_part->checksums.getTotalChecksumHex();
if (checksum_hex != part_desc->checksum_hex)
@ -2190,16 +2208,17 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
static const String TMP_PREFIX = "tmp_replace_from_";
std::vector<MergeTreeData::HardlinkedFiles> hardlinked_files_for_parts;
auto obtain_part = [&] (PartDescriptionPtr & part_desc)
{
if (part_desc->src_table_part)
{
if (part_desc->checksum_hex != part_desc->src_table_part->checksums.getTotalChecksumHex())
throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED);
part_desc->res_part = cloneAndLoadDataPartOnSameDisk(
part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR);
part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files, false);
}
else if (!part_desc->replica.empty())
{
@ -2246,8 +2265,11 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
{
renameTempPartAndReplace(part_desc->res_part, NO_TRANSACTION_RAW, nullptr, &transaction);
getCommitPartOps(ops, part_desc->res_part);
lockSharedData(*part_desc->res_part, false, part_desc->hardlinked_files);
}
if (!ops.empty())
zookeeper->multi(ops);
@ -2274,6 +2296,10 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
catch (...)
{
PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
for (const auto & res_part : res_parts)
unlockSharedData(*res_part);
throw;
}
@ -3818,7 +3844,7 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo &
bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot,
const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_)
const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared)
{
auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper();
const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
@ -3926,12 +3952,13 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
InterserverCredentialsPtr credentials;
std::optional<CurrentlySubmergingEmergingTagger> tagger_ptr;
std::function<MutableDataPartPtr()> get_part;
MergeTreeData::HardlinkedFiles hardlinked_files;
if (part_to_clone)
{
get_part = [&, part_to_clone]()
{
return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR);
return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, false);
};
}
else
@ -3964,7 +3991,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
to_detached,
"",
&tagger_ptr,
true);
try_fetch_shared);
};
}
@ -3977,7 +4004,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
Transaction transaction(*this, NO_TRANSACTION_RAW);
renameTempPartAndReplace(part, NO_TRANSACTION_RAW, nullptr, &transaction);
replaced_parts = checkPartChecksumsAndCommit(transaction, part);
replaced_parts = checkPartChecksumsAndCommit(transaction, part, hardlinked_files);
/** If a quorum is tracked for this part, you must update it.
* If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method.
@ -5716,7 +5743,7 @@ void StorageReplicatedMergeTree::fetchPartition(
try
{
/// part name, metadata, part_path, true, 0, zookeeper
if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper))
if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper, /* try_fetch_shared = */ false))
throw Exception(ErrorCodes::UNFINISHED, "Failed to fetch part {} from {}", part_name, from_);
}
catch (const DB::Exception & e)
@ -5854,7 +5881,7 @@ void StorageReplicatedMergeTree::fetchPartition(
try
{
fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper);
fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper, /* try_fetch_shared = */ false);
}
catch (const DB::Exception & e)
{
@ -6332,6 +6359,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
/// First argument is true, because we possibly will add new data to current table.
auto lock1 = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
auto lock2 = source_table->lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
auto storage_settings_ptr = getSettings();
auto source_metadata_snapshot = source_table->getInMemoryMetadataPtr();
auto metadata_snapshot = getInMemoryMetadataPtr();
@ -6383,6 +6411,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
assert(replace == !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range));
String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range);
std::vector<MergeTreeData::HardlinkedFiles> hardlinked_files_for_parts;
for (const auto & src_part : src_all_parts)
{
@ -6413,13 +6442,19 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
UInt64 index = lock->getNumber();
MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level);
auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR);
MergeTreeData::HardlinkedFiles hardlinked_files;
bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication
&& src_part->isStoredOnRemoteDiskWithZeroCopySupport();
auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink);
src_parts.emplace_back(src_part);
dst_parts.emplace_back(dst_part);
ephemeral_locks.emplace_back(std::move(*lock));
block_id_paths.emplace_back(block_id_path);
part_checksums.emplace_back(hash_hex);
hardlinked_files_for_parts.emplace_back(hardlinked_files);
}
ReplicatedMergeTreeLogEntryData entry;
@ -6477,6 +6512,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, data_parts_lock);
}
for (size_t i = 0; i < dst_parts.size(); ++i)
lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]);
Coordination::Error code = zookeeper->tryMulti(ops, op_results);
if (code == Coordination::Error::ZOK)
delimiting_block_lock->assumeUnlocked();
@ -6504,6 +6542,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
for (const auto & dst_part : dst_parts)
unlockSharedData(*dst_part);
throw;
}
@ -6536,6 +6577,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
{
auto lock1 = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
auto lock2 = dest_table->lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout);
auto storage_settings_ptr = getSettings();
auto dest_table_storage = std::dynamic_pointer_cast<StorageReplicatedMergeTree>(dest_table);
if (!dest_table_storage)
@ -6605,6 +6647,8 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
String dest_alter_partition_version_path = dest_table_storage->zookeeper_path + "/alter_partition_version";
Coordination::Stat dest_alter_partition_version_stat;
zookeeper->get(dest_alter_partition_version_path, &dest_alter_partition_version_stat);
std::vector<MergeTreeData::HardlinkedFiles> hardlinked_files_for_parts;
for (const auto & src_part : src_all_parts)
{
if (!dest_table_storage->canReplacePartition(src_part))
@ -6624,13 +6668,20 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
UInt64 index = lock->getNumber();
MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level);
auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR);
MergeTreeData::HardlinkedFiles hardlinked_files;
bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication
&& src_part->isStoredOnRemoteDiskWithZeroCopySupport();
auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink);
src_parts.emplace_back(src_part);
dst_parts.emplace_back(dst_part);
ephemeral_locks.emplace_back(std::move(*lock));
block_id_paths.emplace_back(block_id_path);
part_checksums.emplace_back(hash_hex);
hardlinked_files_for_parts.emplace_back(hardlinked_files);
}
ReplicatedMergeTreeLogEntryData entry_delete;
@ -6697,6 +6748,9 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
for (MutableDataPartPtr & part : dst_parts)
dest_table_storage->renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, lock);
for (size_t i = 0; i < dst_parts.size(); ++i)
dest_table_storage->lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]);
Coordination::Error code = zookeeper->tryMulti(ops, op_results);
if (code == Coordination::Error::ZBADVERSION)
continue;
@ -6712,6 +6766,10 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
for (const auto & dst_part : dst_parts)
dest_table_storage->unlockSharedData(*dst_part);
throw;
}
@ -7305,7 +7363,9 @@ void StorageReplicatedMergeTree::createTableSharedID()
void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const
{
if (!disk || !disk->supportZeroCopyReplication())
auto settings = getSettings();
if (!disk || !disk->supportZeroCopyReplication() || !settings->allow_remote_fs_zero_copy_replication)
return;
zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
@ -7327,9 +7387,11 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
}
}
void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const
void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional<HardlinkedFiles> hardlinked_files) const
{
if (!part.volume || !part.isStoredOnDisk())
auto settings = getSettings();
if (!part.volume || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication)
return;
DiskPtr disk = part.volume->getDisk();
@ -7343,33 +7405,42 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
String id = part.getUniqueId();
boost::replace_all(id, "/", "_");
Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(),
Strings zc_zookeeper_paths = getZeroCopyPartPath(
*getSettings(), disk->getType(), getTableSharedID(),
part.name, zookeeper_path);
String path_to_set_hardlinked_files;
NameSet hardlinks;
if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty())
{
path_to_set_hardlinked_files = getZeroCopyPartPath(
*getSettings(), disk->getType(), hardlinked_files->source_table_shared_id,
hardlinked_files->source_part_name, zookeeper_path)[0];
hardlinks = hardlinked_files->hardlinks_from_source_part;
}
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name;
LOG_TRACE(log, "Set zookeeper persistent lock {}", zookeeper_node);
createZeroCopyLockNode(zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, replace_existing_lock);
createZeroCopyLockNode(
zookeeper, zookeeper_node, zkutil::CreateMode::Persistent,
replace_existing_lock, path_to_set_hardlinked_files, hardlinks);
}
}
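
For orientation, an illustrative sketch of the zero-copy lock layout the code above assumes; the root prefix varies with configuration, and the names are hypothetical:

// <zero-copy root>/<table_shared_id>/<part_name>   <- node value: "\n"-separated list of
//     /<part_unique_id>                               hardlinked files not to remove
//         /<replica_name>                          <- one persistent lock node per replica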
bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
{
return unlockSharedData(part, part.name);
}
bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const
std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
{
if (!part.volume || !part.isStoredOnDisk())
return true;
return std::make_pair(true, NameSet{});
DiskPtr disk = part.volume->getDisk();
if (!disk || !disk->supportZeroCopyReplication())
return true;
return std::make_pair(true, NameSet{});
/// If part is temporary refcount file may be absent
auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK;
@ -7377,20 +7448,20 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par
{
auto ref_count = disk->getRefCount(ref_count_path);
if (ref_count > 0) /// Keep part shard info for frozen backups
return false;
return std::make_pair(false, NameSet{});
}
else
{
/// Temporary part with some absent file cannot be locked in shared mode
return true;
return std::make_pair(true, NameSet{});
}
return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, getZooKeeper(), *getSettings(), log,
return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log,
zookeeper_path);
}
bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name,
std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
String part_id, const String & table_uuid, const String & part_name,
const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings,
Poco::Logger * logger, const String & zookeeper_path_old)
{
@ -7399,17 +7470,28 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri
Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old);
bool part_has_no_more_locks = true;
NameSet files_not_to_remove;
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
String files_not_to_remove_str;
zookeeper_ptr->tryGet(zc_zookeeper_path, files_not_to_remove_str);
files_not_to_remove.clear();
if (!files_not_to_remove_str.empty())
boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
/// Delete our replica node for part from zookeeper (we are not interested in it anymore)
String zookeeper_part_replica_node = fs::path(zookeeper_part_uniq_node) / replica_name_;
LOG_TRACE(logger, "Remove zookeeper lock {}", zookeeper_part_replica_node);
LOG_TRACE(logger, "Remove zookeeper lock {} for part {}", zookeeper_part_replica_node, part_name);
zookeeper_ptr->tryRemove(zookeeper_part_replica_node);
if (auto ec = zookeeper_ptr->tryRemove(zookeeper_part_replica_node); ec != Coordination::Error::ZOK && ec != Coordination::Error::ZNONODE)
{
throw zkutil::KeeperException(ec, zookeeper_part_replica_node);
}
/// Check, maybe we were the last replica and can remove part forever
Strings children;
@ -7417,37 +7499,66 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri
if (!children.empty())
{
LOG_TRACE(logger, "Found zookeper locks for {}", zookeeper_part_uniq_node);
LOG_TRACE(logger, "Found {} ({}) zookeper locks for {}", zookeeper_part_uniq_node, children.size(), fmt::join(children, ", "));
part_has_no_more_locks = false;
continue;
}
auto error_code = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node);
LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_uniq_node, error_code != Coordination::Error::ZNOTEMPTY);
if (error_code == Coordination::Error::ZOK)
{
LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} with id {}", zookeeper_part_uniq_node, part_name, part_id);
}
else if (error_code == Coordination::Error::ZNOTEMPTY)
{
LOG_TRACE(logger, "Cannot remove last parent zookeeper lock {} for part {} with id {}, another replica locked part concurrently", zookeeper_part_uniq_node, part_name, part_id);
}
else if (error_code == Coordination::Error::ZNONODE)
{
LOG_TRACE(logger, "Node with parent zookeeper lock {} for part {} with id {} doesn't exist", zookeeper_part_uniq_node, part_name, part_id);
}
else
{
throw zkutil::KeeperException(error_code, zookeeper_part_uniq_node);
}
/// Even when we have a lock with the same part name but a different uniq id, we can remove files on S3
children.clear();
String zookeeper_part_node = fs::path(zookeeper_part_uniq_node).parent_path();
zookeeper_ptr->tryGetChildren(zookeeper_part_node, children);
if (children.empty())
{
/// Cleanup after removing the last uniq node
error_code = zookeeper_ptr->tryRemove(zookeeper_part_node);
LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_node, error_code != Coordination::Error::ZNOTEMPTY);
if (error_code == Coordination::Error::ZOK)
{
LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} (part is finally unlocked)", zookeeper_part_uniq_node, part_name);
}
else if (error_code == Coordination::Error::ZNOTEMPTY)
{
LOG_TRACE(logger, "Cannot remove last parent zookeeper lock {} for part {}, another replica locked part concurrently", zookeeper_part_uniq_node, part_name);
}
else if (error_code == Coordination::Error::ZNONODE)
{
LOG_TRACE(logger, "Node with parent zookeeper lock {} for part {} doesn't exist (part was unlocked before)", zookeeper_part_uniq_node, part_name);
}
else
{
throw zkutil::KeeperException(error_code, zookeeper_part_uniq_node);
}
}
else
{
LOG_TRACE(logger, "Can't remove parent zookeeper lock {} : {}", zookeeper_part_node, children.size());
for (auto & c : children)
{
LOG_TRACE(logger, "Child node {}", c);
}
LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) were concurrently created",
zookeeper_part_node, part_name, children.size(), fmt::join(children, ", "));
}
}
return part_has_no_more_locks;
return std::make_pair(part_has_no_more_locks, files_not_to_remove);
}
@ -7780,6 +7891,8 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to create empty part {}, but it replaces existing parts {}.", lost_part_name, fmt::join(part_names, ", "));
}
lockSharedData(*new_data_part, false, {});
while (true)
{
@ -7827,6 +7940,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
}
catch (const Exception & ex)
{
unlockSharedData(*new_data_part);
LOG_WARNING(log, "Cannot commit empty part {} with error {}", lost_part_name, ex.displayText());
return false;
}
@ -7837,11 +7951,14 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
}
void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock)
void StorageReplicatedMergeTree::createZeroCopyLockNode(
const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode,
bool replace_existing_lock, const String & path_to_set_hardlinked_files, const NameSet & hardlinked_files)
{
/// In a rare case another replica can remove the path between createAncestors and createIfNotExists,
/// so we make up to 5 attempts
bool created = false;
for (int attempts = 5; attempts > 0; --attempts)
{
try
@ -7857,25 +7974,67 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP
Coordination::Requests ops;
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
{
std::string data = boost::algorithm::join(hardlinked_files, "\n");
/// List of files used to detect hardlinks; path_to_set_hardlinked_files
/// is the path to the source part's zero-copy node. During part removal,
/// hardlinked files will be left behind for the source part.
ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
}
Coordination::Responses responses;
auto error = zookeeper->tryMulti(ops, responses);
if (error == Coordination::Error::ZOK)
{
created = true;
break;
}
else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent)
{
throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node);
}
}
else
{
auto error = zookeeper->tryCreate(zookeeper_node, "", mode);
Coordination::Requests ops;
if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
{
std::string data = boost::algorithm::join(hardlinked_files, "\n");
/// List of files used to detect hardlinks. path_to_set_hardlinked_files is
/// the path to the source part's zero-copy node. During part removal the
/// hardlinked files will be kept for the source part.
ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
}
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
Coordination::Responses responses;
auto error = zookeeper->tryMulti(ops, responses);
if (error == Coordination::Error::ZOK || error == Coordination::Error::ZNODEEXISTS)
{
created = true;
break;
}
else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent)
{
/// Ephemeral locks are used during fetches, so if the parent node was removed we cannot do anything
throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node);
}
}
}
catch (const zkutil::KeeperException & e)
{
if (e.code == Coordination::Error::ZNONODE)
continue;
throw;
}
}
if (!created)
{
String mode_str = mode == zkutil::CreateMode::Persistent ? "persistent" : "ephemeral";
throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create {} zero copy lock {} because part was unlocked from zookeeper", mode_str, zookeeper_node);
}
}
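
The hardlink list travels through ZooKeeper as a plain newline-joined string (the boost::algorithm::join(hardlinked_files, "\n") calls above). A minimal sketch of that round trip, assuming only the standard library; encodeHardlinks and decodeHardlinks are illustrative names, not the actual ClickHouse helpers.

#include <iostream>
#include <set>
#include <sstream>
#include <string>

// Join the file names with '\n', matching the data written to the source
// part's zero-copy node in createZeroCopyLockNode above.
std::string encodeHardlinks(const std::set<std::string> & files)
{
    std::string data;
    for (const auto & file : files)
    {
        if (!data.empty())
            data += '\n';
        data += file;
    }
    return data;
}

// Split the stored node data back into a set of file names.
std::set<std::string> decodeHardlinks(const std::string & data)
{
    std::set<std::string> files;
    std::istringstream in(data);
    for (std::string line; std::getline(in, line);)
        if (!line.empty())
            files.insert(line);
    return files;
}

int main()
{
    const std::set<std::string> files{"columns.txt", "count.txt", "primary.idx"};
    const std::string data = encodeHardlinks(files);
    std::cout << (decodeHardlinks(data) == files ? "round trip ok" : "mismatch") << '\n';
}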
@ -8000,6 +8159,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
bool keep_shared = false;
zkutil::ZooKeeperPtr zookeeper = getZooKeeper();
NameSet files_not_to_remove;
fs::path checksums = fs::path(path) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK;
if (disk->exists(checksums))
@ -8007,15 +8167,18 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
if (disk->getRefCount(checksums) == 0)
{
String id = disk->getUniqueId(checksums);
keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
bool can_remove = false;
std::tie(can_remove, files_not_to_remove) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log,
detached_zookeeper_path);
keep_shared = !can_remove;
}
else
keep_shared = true;
}
disk->removeSharedRecursive(path, keep_shared);
disk->removeSharedRecursive(path, keep_shared, files_not_to_remove);
return keep_shared;
}

View File

@ -236,22 +236,19 @@ public:
bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path);
/// Lock part in zookeeper so that its shared data can be used by several nodes
void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const override;
void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional<HardlinkedFiles> hardlinked_files) const override;
void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const;
/// Unlock shared data part in zookeeper
/// Returns a pair: whether the data was unlocked (i.e. it is no longer used by another node)
/// and the set of hardlinked files that must not be removed from disk
bool unlockSharedData(const IMergeTreeDataPart & part) const override;
/// Remove lock with old name for shared data part after rename
bool unlockSharedData(const IMergeTreeDataPart & part, const String & name) const override;
std::pair<bool, NameSet> unlockSharedData(const IMergeTreeDataPart & part) const override;
/// Unlock shared data part in zookeeper by part id
/// Returns a pair: whether the data was unlocked (i.e. it is no longer used by another node)
/// and the set of hardlinked files that must not be removed from disk
static bool unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
static std::pair<bool, NameSet> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger,
const String & zookeeper_path_old);
@ -286,7 +283,7 @@ public:
String getZooKeeperName() const { return zookeeper_name; }
/// Return table id, common for different replicas
String getTableSharedID() const;
String getTableSharedID() const override;
static String getDefaultZooKeeperName() { return default_zookeeper_name; }
@ -479,7 +476,7 @@ private:
String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const;
/// Accepts a PreActive part, atomically checks its checksums against those on other replicas and commits the part
DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part);
DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part, std::optional<HardlinkedFiles> hardlinked_files = {});
bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override;
@ -624,7 +621,8 @@ private:
const String & replica_path,
bool to_detached,
size_t quorum,
zkutil::ZooKeeper::Ptr zookeeper_ = nullptr);
zkutil::ZooKeeper::Ptr zookeeper_ = nullptr,
bool try_fetch_shared = true);
/** Download the specified part from the specified replica.
* Used to replace a local part with the same s3-shared part in hybrid storage.
@ -766,7 +764,10 @@ private:
static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old);
static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false);
static void createZeroCopyLockNode(
const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node,
int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false,
const String & path_to_set_hardlinked_files = "", const NameSet & hardlinked_files = {});
bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override;

View File

@ -10,10 +10,10 @@
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, ' *tranio\\.ru/spain/*/commercial/*') settings max_threads=5]]></query>
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiSearchAllPositions(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_10m_single]]></query>
<query><![CDATA[select count(multiMatchAny(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select countIf(position(URL, 'yandex') > 0), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select countIf(multiSearchAllPositions(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select countIf(match(URL, 'yandex|google')) FROM hits_10m_single]]></query>
<query><![CDATA[select countIf(multiMatchAny(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearchAny(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>

View File

@ -1,9 +0,0 @@
<test>
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<settings><max_threads>1</max_threads></settings>
<!-- FIXME this should have been an EXPLAIN test -->
<query>SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1</query>
</test>

View File

@ -1,27 +1,11 @@
<test>
<settings>
<max_threads>10</max_threads>
</settings>
<substitutions>
<substitution>
<name>items</name>
<values>
<value>1000</value>
<value>100000</value>
<value>10000000</value>
</values>
</substitution>
</substitutions>
<create_query>CREATE TABLE test (`id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory</create_query>
<fill_query>INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM system.numbers_mt LIMIT 10000000</fill_query>
<query>SELECT groupArraySorted(100000)(id, value) FROM test</query>
<query>SELECT groupArraySorted(100000)(text, value) FROM test</query>
<query>SELECT groupArraySorted(100000)((id, text), value) FROM test</query>
<query>SELECT groupArraySorted(100000)(text) FROM test</query>
<create_query>CREATE TABLE test ( `id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory</create_query>
<fill_query>INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM numbers({items})</fill_query>
<query>SELECT groupArraySorted(10)(id, value) FROM test</query>
<query>SELECT groupArraySorted(10)(text, value) FROM test</query>
<query>SELECT groupArraySorted(10)((id, text), value) FROM test</query>
<query>SELECT groupArraySorted(10)(text) FROM test</query>
<query>SELECT groupArraySorted(10000)(id, value) FROM test</query>
<query>SELECT groupArraySorted(10000)(text, value) FROM test</query>
<query>SELECT groupArraySorted(10000)((id, text), value) FROM test</query>
<query>SELECT groupArraySorted(10000)(text) FROM test</query>
<drop_query>DROP TABLE IF EXISTS test</drop_query>
</test>

View File

@ -1,6 +1,7 @@
<test>
<settings>
<max_insert_threads>8</max_insert_threads>
<allow_experimental_projection_optimization>0</allow_experimental_projection_optimization>
</settings>
<create_query>
@ -15,10 +16,10 @@
ORDER BY (d, n)
</create_query>
<fill_query>insert into a select '2000-01-01', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)</fill_query>
<fill_query>insert into a select '2000-01-02', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)</fill_query>
<fill_query>insert into a select '2000-01-03', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)</fill_query>
<fill_query>insert into a select '2000-01-04', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)</fill_query>
<fill_query>OPTIMIZE TABLE a FINAL</fill_query>

View File

@ -1,29 +1,37 @@
<test>
<preconditions>
<table_exists>hits_10m_single</table_exists>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<settings>
<max_size_to_preallocate_for_aggregation>1000000000</max_size_to_preallocate_for_aggregation>
</settings>
<query>SELECT number FROM numbers(5000000) GROUP BY number FORMAT Null</query>
<query>SELECT number FROM numbers(10000000) GROUP BY number FORMAT Null</query>
<query short="1">SELECT number FROM numbers_mt(500000) GROUP BY number FORMAT Null</query>
<query short="1">SELECT number FROM numbers_mt(1000000) GROUP BY number FORMAT Null</query>
<query>SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null</query>
<query>SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null</query>
<query>WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 FORMAT Null</query>
<query>SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12</query>
<query>SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12</query>
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<query>SELECT WatchID FROM hits_10m_single GROUP BY WatchID FORMAT Null</query>
<query>SELECT WatchID FROM hits_100m_single GROUP BY WatchID FORMAT Null</query>
<query>SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_10m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10</query>
<query>SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_100m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10</query>
<query>SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT min(MobilePhoneModel) FROM hits_10m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null</query>
<query>SELECT min(MobilePhoneModel) FROM hits_100m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null</query>
<substitutions>
<substitution>
<name>table_name</name>
<values>
<value>hits_100m_single</value>
</values>
</substitution>
<substitution>
<name>numbers_threshold_value</name>
<values>
<value>5000000</value>
<value>10000000</value>
</values>
</substitution>
</substitutions>
<query>SELECT number FROM numbers({numbers_threshold_value}) GROUP BY number FORMAT Null</query>
<query>SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null</query>
<query>WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers({numbers_threshold_value}) GROUP BY k1, k2 FORMAT Null</query>
<query>SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12</query>
<query>SELECT WatchID FROM {table_name} GROUP BY WatchID FORMAT Null</query>
<query>SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table_name} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10</query>
<query>SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table_name} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10</query>
<query>SELECT min(MobilePhoneModel) FROM {table_name} WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null</query>
</test>

View File

@ -6,31 +6,21 @@
<query_plan_filter_push_down>0</query_plan_filter_push_down>
</settings>
<fill_query>INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0</fill_query>
<query short='1' tag='ANY LEFT'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT KEY'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT ON'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT IN'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0</query>
<query tag='INNER'>SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='INNER KEY'>SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='INNER ON'>SELECT COUNT() FROM ints l INNER JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='INNER IN'>SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0</query>
<fill_query>INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0</fill_query>
<fill_query>INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0</fill_query>
<query tag='LEFT'>SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='LEFT KEY'>SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='LEFT ON'>SELECT COUNT() FROM ints l LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0</query>
<query tag='LEFT IN'>SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT KEY (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT ON (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query short='1' tag='ANY LEFT IN (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='ANY LEFT (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='ANY LEFT KEY (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='ANY LEFT ON (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='ANY LEFT IN (noopt)'>SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='INNER (noopt)'>SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>
<query tag='INNER KEY (noopt)'>SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0</query>

View File

@ -0,0 +1,3 @@
10000
10000
10000

View File

@ -0,0 +1,32 @@
DROP TABLE IF EXISTS partslost_0;
DROP TABLE IF EXISTS partslost_1;
DROP TABLE IF EXISTS partslost_2;
CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000;
ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANULARITY 3;
ALTER TABLE partslost_0 MATERIALIZE INDEX idx;
-- In the worst case this doesn't check anything, but at least it's not flaky
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
ALTER TABLE partslost_0 DROP INDEX idx;
select count() from partslost_0;
select count() from partslost_1;
select count() from partslost_2;
DROP TABLE IF EXISTS partslost_0;
DROP TABLE IF EXISTS partslost_1;
DROP TABLE IF EXISTS partslost_2;

View File

@ -0,0 +1,64 @@
DROP TABLE IF EXISTS github_events;
CREATE TABLE github_events
(
`file_time` DateTime,
`event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
`actor_login` LowCardinality(String),
`repo_name` LowCardinality(String),
`created_at` DateTime,
`updated_at` DateTime,
`action` Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
`comment_id` UInt64,
`body` String,
`path` String,
`position` Int32,
`line` Int32,
`ref` LowCardinality(String),
`ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
`creator_user_login` LowCardinality(String),
`number` UInt32,
`title` String,
`labels` Array(LowCardinality(String)),
`state` Enum8('none' = 0, 'open' = 1, 'closed' = 2),
`locked` UInt8,
`assignee` LowCardinality(String),
`assignees` Array(LowCardinality(String)),
`comments` UInt32,
`author_association` Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
`closed_at` DateTime,
`merged_at` DateTime,
`merge_commit_sha` String,
`requested_reviewers` Array(LowCardinality(String)),
`requested_teams` Array(LowCardinality(String)),
`head_ref` LowCardinality(String),
`head_sha` String,
`base_ref` LowCardinality(String),
`base_sha` String,
`merged` UInt8,
`mergeable` UInt8,
`rebaseable` UInt8,
`mergeable_state` Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
`merged_by` LowCardinality(String),
`review_comments` UInt32,
`maintainer_can_modify` UInt8,
`commits` UInt32,
`additions` UInt32,
`deletions` UInt32,
`changed_files` UInt32,
`diff_hunk` String,
`original_position` UInt32,
`commit_id` String,
`original_commit_id` String,
`push_size` UInt32,
`push_distinct_size` UInt32,
`member_login` LowCardinality(String),
`release_tag_name` String,
`release_name` String,
`review_state` Enum8('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
)
ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at);
with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name FORMAT TabSeparatedWithNamesAndTypes; -- { serverError 47 }
DROP TABLE github_events;

View File

@ -0,0 +1,5 @@
1
1
1
1
1

View File

@ -0,0 +1,82 @@
DROP TABLE IF EXISTS replace_partition_source;
DROP TABLE IF EXISTS replace_partition_dest1;
DROP TABLE IF EXISTS replace_partition_dest1_2;
DROP TABLE IF EXISTS replace_partition_dest2;
DROP TABLE IF EXISTS replace_partition_dest2_2;
CREATE TABLE replace_partition_source
(
key UInt64
)
ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/source', '1')
PARTITION BY key
ORDER BY tuple();
INSERT INTO replace_partition_source VALUES (1);
CREATE TABLE replace_partition_dest1
(
key UInt64
)
ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '1')
PARTITION BY key
ORDER BY tuple();
CREATE TABLE replace_partition_dest1_2
(
key UInt64
)
ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '2')
PARTITION BY key
ORDER BY tuple();
CREATE TABLE replace_partition_dest2
(
key UInt64
)
ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '1')
PARTITION BY key
ORDER BY tuple();
CREATE TABLE replace_partition_dest2_2
(
key UInt64
)
ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '2')
PARTITION BY key
ORDER BY tuple();
ALTER TABLE replace_partition_dest1 REPLACE PARTITION 1 FROM replace_partition_source;
ALTER TABLE replace_partition_dest2 REPLACE PARTITION 1 FROM replace_partition_source;
OPTIMIZE TABLE replace_partition_source FINAL;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
OPTIMIZE TABLE replace_partition_dest1_2 FINAL;
OPTIMIZE TABLE replace_partition_dest2_2 FINAL;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT sleep(3) FORMAT Null;
SELECT * FROM replace_partition_source;
SELECT * FROM replace_partition_dest1;
SELECT * FROM replace_partition_dest2;
SELECT * FROM replace_partition_dest1_2;
SELECT * FROM replace_partition_dest2_2;
--DROP TABLE IF EXISTS replace_partition_source;
--DROP TABLE IF EXISTS replace_partition_dest1;
--DROP TABLE IF EXISTS replace_partition_dest1_2;
--DROP TABLE IF EXISTS replace_partition_dest2;
--DROP TABLE IF EXISTS replace_partition_dest2_2;

View File

@ -0,0 +1,10 @@
0
2
4
6
8
10
12
14
16
18

View File

@ -0,0 +1 @@
SELECT test_function(number, number) as a FROM numbers(10) GROUP BY a ORDER BY a;

View File

@ -0,0 +1,8 @@
(Expression)
ExpressionTransform
(SettingQuotaAndLimits)
(ReadFromStorage)
AggregatingTransform
StrictResize
ExpressionTransform
SourceFromSingleChunk 0 → 1

View File

@ -0,0 +1 @@
EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR URL = '???' OR 1;

View File

@ -95,6 +95,8 @@ Results for AWS instance type im4gn.8xlarge are from <b>Ananth Gundabattula</b>
Results for AWS instance type im4gn.16xlarge are from <b>Ananth Gundabattula</b> (Darwinium).<br/>
Results for AWS instance type i3.2xlarge are from <b>Ananth Gundabattula</b> (Darwinium).<br/>
Results for 2x EPYC 7702 on ZFS mirror NVME are from <b>Alibek A</b>.<br/>
Results for Intel 11th Gen Core i9-11900KF are from <b>Tim Xian</b>.<br/>
Results for AWS instance type m5a.4xlarge are from <b>Daniel Chimeno</b>.<br/>
</p>
</div>
</div>

View File

@ -0,0 +1,55 @@
[
{
"system": "AWS m5a.4xlarge",
"system_full": "AWS m5a.4xlarge 16 vCPU, 512 GiB RAM, EBS GP3",
"time": "2020-08-13 00:00:00",
"kind": "cloud",
"result":
[
[0.002, 0.002, 0.002],
[0.028, 0.017, 0.016],
[0.074, 0.040, 0.042],
[0.758, 0.061, 0.062],
[1.380, 0.151, 0.147],
[2.204, 0.394, 0.385],
[0.002, 0.002, 0.002],
[0.019, 0.018, 0.018],
[1.385, 0.751, 0.740],
[1.754, 0.873, 0.870],
[0.817, 0.222, 0.218],
[1.219, 0.268, 0.263],
[2.303, 1.092, 1.057],
[3.892, 1.431, 1.420],
[2.388, 1.199, 1.188],
[1.439, 1.329, 1.339],
[5.108, 3.816, 3.799],
[3.549, 2.273, 2.271],
[9.976, 7.023, 7.043],
[0.706, 0.068, 0.066],
[19.187, 1.101, 1.076],
[21.210, 1.315, 1.276],
[39.984, 3.065, 3.043],
[32.102, 1.328, 1.295],
[4.599, 0.341, 0.343],
[1.762, 0.294, 0.287],
[4.786, 0.345, 0.338],
[18.596, 1.190, 1.175],
[15.549, 1.973, 1.931],
[3.643, 3.628, 3.625],
[3.658, 0.781, 0.777],
[9.786, 1.383, 1.381],
[11.912, 10.221, 10.227],
[20.664, 5.942, 5.954],
[20.707, 6.092, 5.920],
[1.847, 1.758, 1.756],
[0.292, 0.221, 0.216],
[0.109, 0.086, 0.085],
[0.118, 0.084, 0.074],
[0.542, 0.452, 0.445],
[0.060, 0.026, 0.025],
[0.031, 0.019, 0.020],
[0.007, 0.005, 0.007]
]
}
]

View File

@ -0,0 +1,54 @@
[
{
"system": "Intel(R) 11th Gen Core i9-11900KF",
"system_full": "Intel(R) 11th Gen Core i9-11900KF, 64 GB RAM, 1 TB SSD",
"time": "2022-03-27 04:04:00",
"kind": "server",
"result":
[
[0.001, 0.001, 0.001],
[0.010, 0.007, 0.006],
[0.030, 0.022, 0.020],
[0.100, 0.033, 0.035],
[0.114, 0.079, 0.074],
[0.241, 0.209, 0.225],
[0.002, 0.001, 0.001],
[0.008, 0.007, 0.007],
[0.565, 0.519, 0.511],
[0.629, 0.590, 0.599],
[0.159, 0.130, 0.134],
[0.190, 0.149, 0.150],
[0.976, 0.927, 0.915],
[1.273, 1.208, 1.199],
[1.086, 1.044, 1.041],
[1.229, 1.196, 1.200],
[3.206, 3.491, 3.213],
[1.841, 1.774, 1.809],
[5.919, 5.897, 5.799],
[0.104, 0.039, 0.037],
[1.176, 0.639, 0.694],
[1.407, 0.814, 0.825],
[2.984, 2.391, 2.121],
[2.100, 0.770, 0.716],
[0.342, 0.220, 0.211],
[0.222, 0.211, 0.189],
[0.346, 0.222, 0.224],
[1.272, 0.832, 0.822],
[1.507, 1.306, 1.282],
[3.619, 3.573, 3.597],
[0.761, 0.695, 0.703],
[1.375, 1.217, 1.229],
[8.576, 9.686, 9.070],
[5.634, 5.699, 5.801],
[6.090, 5.789, 5.797],
[1.996, 2.057, 1.946],
[0.119, 0.105, 0.112],
[0.049, 0.040, 0.040],
[0.048, 0.038, 0.038],
[0.261, 0.237, 0.231],
[0.029, 0.013, 0.014],
[0.017, 0.013, 0.011],
[0.003, 0.002, 0.003]
]
}
]