From eb35d17e72b531ac56d552869716e825923a0ef6 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 27 May 2024 18:19:37 +0200 Subject: [PATCH 01/95] trigger the error --- tests/integration/test_checking_s3_blobs_paranoid/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_checking_s3_blobs_paranoid/test.py b/tests/integration/test_checking_s3_blobs_paranoid/test.py index a7fe02b16de..661c367ebe8 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/test.py +++ b/tests/integration/test_checking_s3_blobs_paranoid/test.py @@ -532,11 +532,12 @@ def test_query_is_canceled_with_inf_retries(cluster, broken_s3): SELECT * FROM system.numbers - LIMIT 1000000 + LIMIT 1000000000 SETTINGS s3_max_single_part_upload_size=100, s3_min_upload_part_size=10000, - s3_check_objects_after_upload=0 + s3_check_objects_after_upload=0, + s3_max_inflight_parts_for_one_file=1000 """, query_id=insert_query_id, ) From 86089a3a105a842cc1e9993e48db239469fd325b Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 28 May 2024 00:09:39 +0200 Subject: [PATCH 02/95] make write buffers cancelable --- src/IO/WriteBuffer.cpp | 2 +- src/IO/WriteBuffer.h | 27 +++++++++++++++++ src/IO/WriteBufferFromS3.cpp | 15 +++++++++- src/IO/WriteBufferFromS3.h | 2 ++ src/Storages/MergeTree/MergeTreeSink.cpp | 4 +++ src/Storages/MergeTree/MergeTreeSink.h | 1 + .../StorageObjectStorageSink.cpp | 30 +++++++++---------- .../ObjectStorage/StorageObjectStorageSink.h | 5 ++-- src/Storages/StorageFile.cpp | 19 ++++++++---- src/Storages/StorageLog.cpp | 10 +++++++ src/Storages/StorageStripeLog.cpp | 3 ++ src/Storages/StorageURL.cpp | 30 +++++++++---------- src/Storages/StorageURL.h | 6 ++-- 13 files changed, 112 insertions(+), 42 deletions(-) diff --git a/src/IO/WriteBuffer.cpp b/src/IO/WriteBuffer.cpp index bcc7445486e..dfcf8432afb 100644 --- a/src/IO/WriteBuffer.cpp +++ b/src/IO/WriteBuffer.cpp @@ -11,7 +11,7 @@ namespace DB WriteBuffer::~WriteBuffer() { // That destructor could be call with finalized=false in case of exceptions - if (count() > 0 && !finalized) + if (count() > 0 && !finalized && !canceled) { /// It is totally OK to destroy instance without finalization when an exception occurs /// However it is suspicious to destroy instance without finalization at the green path diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 1ceb938e454..5e7e74035e3 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -121,6 +121,9 @@ public: if (finalized) return; + if (canceled) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot finalize buffer after cancelation."); + LockMemoryExceptionInThread lock(VariableContext::Global); try { @@ -135,6 +138,25 @@ public: } } + void cancel() + { + if (canceled || finalized) + return; + + LockMemoryExceptionInThread lock(VariableContext::Global); + try + { + cancelImpl(); + canceled = true; + } + catch (...) + { + pos = working_buffer.begin(); + canceled = true; + throw; + } + } + /// Wait for data to be reliably written. Mainly, call fsync for fd. /// May be called after finalize() if needed. virtual void sync() @@ -150,7 +172,12 @@ protected: next(); } + virtual void cancelImpl() + { + } + bool finalized = false; + bool canceled = false; private: /** Write the data in the buffer (from the beginning of the buffer to the current position). 
diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index ff18a77f09f..52d7d2bae97 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -225,6 +225,11 @@ void WriteBufferFromS3::finalizeImpl() } } +void WriteBufferFromS3::cancelImpl() +{ + tryToAbortMultipartUpload(); +} + String WriteBufferFromS3::getVerboseLogDetails() const { String multipart_upload_details; @@ -265,8 +270,16 @@ WriteBufferFromS3::~WriteBufferFromS3() { LOG_TRACE(limitedLog, "Close WriteBufferFromS3. {}.", getShortLogDetails()); + if (canceled) + LOG_INFO( + log, + "WriteBufferFromS3 was canceled." + "The file might not be written to S3. " + "{}.", + getVerboseLogDetails()); + /// That destructor could be call with finalized=false in case of exceptions - if (!finalized) + if (!finalized && !canceled) { LOG_INFO( log, diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index fbfec3588fa..d70bf18cd9f 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -54,6 +54,8 @@ private: /// Receives response from the server after sending all data. void finalizeImpl() override; + void cancelImpl() override; + String getVerboseLogDetails() const; String getShortLogDetails() const; diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index b7dede3cb00..05751e0fa6f 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -54,6 +54,10 @@ void MergeTreeSink::onFinish() finishDelayedChunk(); } +void MergeTreeSink::onCancel() +{ +} + void MergeTreeSink::consume(Chunk chunk) { if (num_blocks_processed > 0) diff --git a/src/Storages/MergeTree/MergeTreeSink.h b/src/Storages/MergeTree/MergeTreeSink.h index 07ab3850df2..cf6715a3415 100644 --- a/src/Storages/MergeTree/MergeTreeSink.h +++ b/src/Storages/MergeTree/MergeTreeSink.h @@ -28,6 +28,7 @@ public: void consume(Chunk chunk) override; void onStart() override; void onFinish() override; + void onCancel() override; private: StorageMergeTree & storage; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 0a3cf19a590..decc84c687b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -50,31 +50,25 @@ void StorageObjectStorageSink::consume(Chunk chunk) void StorageObjectStorageSink::onCancel() { std::lock_guard lock(cancel_mutex); - finalize(); + cancelBuffers(); cancelled = true; + releaseBuffers(); } -void StorageObjectStorageSink::onException(std::exception_ptr exception) +void StorageObjectStorageSink::onException(std::exception_ptr) { std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization. - release(); - } + cancelBuffers(); + releaseBuffers(); } void StorageObjectStorageSink::onFinish() { std::lock_guard lock(cancel_mutex); - finalize(); + finalizeBuffers(); } -void StorageObjectStorageSink::finalize() +void StorageObjectStorageSink::finalizeBuffers() { if (!writer) return; @@ -88,17 +82,23 @@ void StorageObjectStorageSink::finalize() catch (...) { /// Stop ParallelFormattingOutputFormat correctly. 
- release(); + releaseBuffers(); throw; } } -void StorageObjectStorageSink::release() +void StorageObjectStorageSink::releaseBuffers() { writer.reset(); write_buf.reset(); } +void StorageObjectStorageSink::cancelBuffers() +{ + writer->cancel(); + write_buf->cancel(); +} + PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ObjectStoragePtr object_storage_, ConfigurationPtr configuration_, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index 45cf83d606f..e0081193686 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -35,8 +35,9 @@ private: bool cancelled = false; std::mutex cancel_mutex; - void finalize(); - void release(); + void finalizeBuffers(); + void releaseBuffers(); + void cancelBuffers(); }; class PartitionedStorageObjectStorageSink : public PartitionedSink diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 51bcc64bceb..21abd6a8c5c 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1780,8 +1780,9 @@ public: void onCancel() override { std::lock_guard cancel_lock(cancel_mutex); - finalize(); + cancelBuffers(); cancelled = true; + releaseBuffers(); } void onException(std::exception_ptr exception) override @@ -1794,18 +1795,18 @@ public: catch (...) { /// An exception context is needed to proper delete write buffers without finalization - release(); + releaseBuffers(); } } void onFinish() override { std::lock_guard cancel_lock(cancel_mutex); - finalize(); + finalizeBuffers(); } private: - void finalize() + void finalizeBuffers() { if (!writer) return; @@ -1819,17 +1820,23 @@ private: catch (...) { /// Stop ParallelFormattingOutputFormat correctly. - release(); + releaseBuffers(); throw; } } - void release() + void releaseBuffers() { writer.reset(); write_buf->finalize(); } + void cancelBuffers() + { + writer->cancel(); + write_buf->cancel(); + } + StorageMetadataPtr metadata_snapshot; String table_name_for_log; diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 08e0526550d..de0324d7998 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -322,6 +322,10 @@ public: /// Rollback partial writes. /// No more writing. + for (auto & [_, stream] : streams) + { + stream.cancel(); + } streams.clear(); /// Truncate files to the older sizes. @@ -373,6 +377,12 @@ private: plain->next(); plain->finalize(); } + + void cancel() + { + compressed.cancel(); + plain->cancel(); + } }; using FileStreams = std::map; diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index f0c5103d657..d3471f4f170 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -207,7 +207,10 @@ public: /// Rollback partial writes. /// No more writing. + data_out->cancel(); data_out.reset(); + + data_out_compressed->cancel(); data_out_compressed.reset(); /// Truncate files to the older sizes. 
diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 272f771194d..814e12126b7 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -570,31 +570,25 @@ void StorageURLSink::consume(Chunk chunk) void StorageURLSink::onCancel() { std::lock_guard lock(cancel_mutex); - finalize(); + cancelBuffers(); cancelled = true; + releaseBuffers(); } -void StorageURLSink::onException(std::exception_ptr exception) +void StorageURLSink::onException(std::exception_ptr) { std::lock_guard lock(cancel_mutex); - try - { - std::rethrow_exception(exception); - } - catch (...) - { - /// An exception context is needed to proper delete write buffers without finalization - release(); - } + cancelBuffers(); + releaseBuffers(); } void StorageURLSink::onFinish() { std::lock_guard lock(cancel_mutex); - finalize(); + finalizeBuffers(); } -void StorageURLSink::finalize() +void StorageURLSink::finalizeBuffers() { if (!writer) return; @@ -608,17 +602,23 @@ void StorageURLSink::finalize() catch (...) { /// Stop ParallelFormattingOutputFormat correctly. - release(); + releaseBuffers(); throw; } } -void StorageURLSink::release() +void StorageURLSink::releaseBuffers() { writer.reset(); write_buf->finalize(); } +void StorageURLSink::cancelBuffers() +{ + writer->cancel(); + write_buf->cancel(); +} + class PartitionedStorageURLSink : public PartitionedSink { public: diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index f550ccb2bc4..3090f8db12e 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -257,8 +257,10 @@ public: void onFinish() override; private: - void finalize(); - void release(); + void finalizeBuffers(); + void releaseBuffers(); + void cancelBuffers(); + std::unique_ptr write_buf; OutputFormatPtr writer; std::mutex cancel_mutex; From 0fad110d9e30ad202394474de7ae86f9c70b8977 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 28 May 2024 15:20:45 +0200 Subject: [PATCH 03/95] fix typo --- src/IO/WriteBuffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 5e7e74035e3..4d57ea9565e 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -122,7 +122,7 @@ public: return; if (canceled) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot finalize buffer after cancelation."); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot finalize buffer after cancellation."); LockMemoryExceptionInThread lock(VariableContext::Global); try From 6771737c23f819407f14759a6b4a08c7a9cd05d3 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 28 May 2024 16:54:18 +0200 Subject: [PATCH 04/95] rm d-tor CompressedWriteBuffer, adjust WriteBufferFromFile --- src/Compression/CompressedWriteBuffer.cpp | 5 ----- src/Compression/CompressedWriteBuffer.h | 2 -- src/IO/WriteBufferFromFile.cpp | 4 +++- src/IO/WriteBufferFromFileDescriptor.cpp | 3 ++- src/Storages/MergeTree/MergeTask.cpp | 4 ++-- src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp | 2 ++ src/Storages/StorageStripeLog.cpp | 6 ++---- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index f16330332ab..a82d3c6c7a9 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -57,11 +57,6 @@ void CompressedWriteBuffer::nextImpl() } } -CompressedWriteBuffer::~CompressedWriteBuffer() -{ - finalize(); -} - CompressedWriteBuffer::CompressedWriteBuffer(WriteBuffer & out_, 
CompressionCodecPtr codec_, size_t buf_size) : BufferWithOwnMemory(buf_size), out(out_), codec(std::move(codec_)) { diff --git a/src/Compression/CompressedWriteBuffer.h b/src/Compression/CompressedWriteBuffer.h index 6ae1fbee9cc..30d3003c4bc 100644 --- a/src/Compression/CompressedWriteBuffer.h +++ b/src/Compression/CompressedWriteBuffer.h @@ -21,8 +21,6 @@ public: CompressionCodecPtr codec_ = CompressionCodecFactory::instance().getDefaultCodec(), size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); - ~CompressedWriteBuffer() override; - /// The amount of compressed data size_t getCompressedBytes() { diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index 0ca6c26f08c..c9b40503a87 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -77,7 +77,9 @@ WriteBufferFromFile::~WriteBufferFromFile() if (fd < 0) return; - finalize(); + if (!canceled) + finalize(); + int err = ::close(fd); /// Everything except for EBADF should be ignored in dtor, since all of /// others (EINTR/EIO/ENOSPC/EDQUOT) could be possible during writing to diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index 813ef0deab9..cb04f9e6556 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -105,7 +105,8 @@ WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor() { - finalize(); + if (!canceled) + finalize(); } void WriteBufferFromFileDescriptor::finalizeImpl() diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index e43b6c615b3..63dd25c7cdd 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -513,9 +513,9 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const global_ctx->merge_list_element_ptr->columns_written = global_ctx->merging_column_names.size(); global_ctx->merge_list_element_ptr->progress.store(ctx->column_sizes->keyColumnsWeight(), std::memory_order_relaxed); - ctx->rows_sources_write_buf->next(); - ctx->rows_sources_uncompressed_write_buf->next(); /// Ensure data has written to disk. + ctx->rows_sources_write_buf->finalize(); + ctx->rows_sources_uncompressed_write_buf->finalize(); ctx->rows_sources_uncompressed_write_buf->finalize(); size_t rows_sources_count = ctx->rows_sources_write_buf->count(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp index f33f4293023..b327480fa92 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp @@ -245,6 +245,8 @@ void MergeTreeDataPartChecksums::write(WriteBuffer & to) const writeBinaryLittleEndian(sum.uncompressed_hash, out); } } + + out.finalize(); } void MergeTreeDataPartChecksums::addFile(const String & file_name, UInt64 file_size, MergeTreeDataPartChecksum::uint128 file_hash) diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index d3471f4f170..8df87d6290f 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -236,8 +236,7 @@ public: if (done) return; - data_out->next(); - data_out_compressed->next(); + data_out->finalize(); data_out_compressed->finalize(); /// Save the new indices. 
@@ -497,8 +496,7 @@ void StorageStripeLog::saveIndices(const WriteLock & /* already locked for writi for (size_t i = start; i != num_indices; ++i) indices.blocks[i].write(*index_out); - index_out->next(); - index_out_compressed->next(); + index_out->finalize(); index_out_compressed->finalize(); num_indices_saved = num_indices; From ef3a2fef0115547985e5205ab981aa526ccd163d Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 29 May 2024 16:45:32 +0200 Subject: [PATCH 05/95] work with tests --- src/Client/Connection.cpp | 32 ++++++++++++++++++++--- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 6 +++++ src/Compression/CompressedWriteBuffer.cpp | 6 +++++ src/Compression/CompressedWriteBuffer.h | 2 ++ src/IO/WriteBufferFromPocoSocket.cpp | 22 ++++++++-------- src/IO/WriteBufferFromPocoSocket.h | 2 +- src/Server/KeeperTCPHandler.cpp | 6 +++++ src/Server/TCPHandler.h | 6 +++++ src/Storages/StorageSet.cpp | 3 +-- 9 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 19cd8cc4ee5..aaceb231077 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -37,6 +38,7 @@ #include #include +#include "Core/Types.h" #include "config.h" #if USE_SSL @@ -68,7 +70,11 @@ namespace ErrorCodes extern const int EMPTY_DATA_PASSED; } -Connection::~Connection() = default; +Connection::~Connection() +{ + if (connected) + disconnect(); +} Connection::Connection(const String & host_, UInt16 port_, const String & default_database_, @@ -257,13 +263,30 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { - maybe_compressed_out = nullptr; in = nullptr; last_input_packet_type.reset(); std::exception_ptr finalize_exception; + + try + { + // finalize() can write and throw an exception. + maybe_compressed_out->finalize(); + } + catch (...) + { + /// Don't throw an exception here, it will leave Connection in invalid state. + finalize_exception = std::current_exception(); + + if (out) + { + out->cancel(); + out = nullptr; + } + } + maybe_compressed_out = nullptr; + try { - // finalize() can write to socket and throw an exception. 
if (out) out->finalize(); } @@ -276,6 +299,7 @@ void Connection::disconnect() if (socket) socket->close(); + socket = nullptr; connected = false; nonce.reset(); @@ -767,6 +791,8 @@ void Connection::sendQuery( } maybe_compressed_in.reset(); + if (maybe_compressed_out && maybe_compressed_out != out) + maybe_compressed_out->cancel(); maybe_compressed_out.reset(); block_in.reset(); block_logs_in.reset(); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index ed7498b1ac9..8f421b14fe4 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -334,6 +334,12 @@ ZooKeeper::~ZooKeeper() { tryLogCurrentException(log); } + + if (compressed_out) + compressed_out->cancel(); + + if (out) + out->cancel(); } diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index a82d3c6c7a9..5ad76271d62 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -62,4 +62,10 @@ CompressedWriteBuffer::CompressedWriteBuffer(WriteBuffer & out_, CompressionCode { } +// CompressedWriteBuffer::~CompressedWriteBuffer() +// { +// finalize(); +// } + + } diff --git a/src/Compression/CompressedWriteBuffer.h b/src/Compression/CompressedWriteBuffer.h index 30d3003c4bc..27d357eaace 100644 --- a/src/Compression/CompressedWriteBuffer.h +++ b/src/Compression/CompressedWriteBuffer.h @@ -21,6 +21,8 @@ public: CompressionCodecPtr codec_ = CompressionCodecFactory::instance().getDefaultCodec(), size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); + // ~CompressedWriteBuffer() override; + /// The amount of compressed data size_t getCompressedBytes() { diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 10d9fd131cd..5240e3048b0 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -193,16 +193,16 @@ WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_ write_event = write_event_; } -WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket() -{ - try - { - finalize(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } -} +// WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket() +// { +// // try +// // { +// // finalize(); +// // } +// // catch (...) 
+// // { +// // tryLogCurrentException(__PRETTY_FUNCTION__); +// // } +// } } diff --git a/src/IO/WriteBufferFromPocoSocket.h b/src/IO/WriteBufferFromPocoSocket.h index 1f69dfc466c..cce6a62d674 100644 --- a/src/IO/WriteBufferFromPocoSocket.h +++ b/src/IO/WriteBufferFromPocoSocket.h @@ -19,7 +19,7 @@ public: explicit WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); explicit WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, const ProfileEvents::Event & write_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); - ~WriteBufferFromPocoSocket() override; + //~WriteBufferFromPocoSocket() override; void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index 6709cd298e5..2cf64aad5e3 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -693,6 +693,12 @@ void KeeperTCPHandler::resetStats() KeeperTCPHandler::~KeeperTCPHandler() { + if (compressed_out) + compressed_out->cancel(); + + if (out) + out->cancel(); + KeeperTCPHandler::unregisterConnection(this); } diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 191617f1905..610b88d3280 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -20,6 +20,7 @@ #include #include +#include "Core/Types.h" #include "IServer.h" #include "Interpreters/AsynchronousInsertQueue.h" #include "Server/TCPProtocolStackData.h" @@ -121,6 +122,11 @@ struct QueryState void reset() { + if (maybe_compressed_out && maybe_compressed_out.unique()) + { + maybe_compressed_out->cancel(); + } + *this = QueryState(); } diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index 205a90423bf..b5f4b73eb90 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -97,8 +97,7 @@ void SetOrJoinSink::onFinish() if (persistent) { backup_stream.flush(); - compressed_backup_buf.next(); - backup_buf->next(); + compressed_backup_buf.finalize(); backup_buf->finalize(); table.disk->replaceFile(fs::path(backup_tmp_path) / backup_file_name, fs::path(backup_path) / backup_file_name); From 99ce2985d48a4ca032651657dbed7e240a94a10f Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Thu, 30 May 2024 17:01:28 +0200 Subject: [PATCH 06/95] work with tests --- src/Compression/CompressedWriteBuffer.cpp | 9 +++++---- src/Compression/CompressedWriteBuffer.h | 2 +- src/IO/WriteBufferFromPocoSocket.cpp | 23 ++++++++++++----------- src/IO/WriteBufferFromPocoSocket.h | 2 +- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index 5ad76271d62..83c9fbc9573 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -62,10 +62,11 @@ CompressedWriteBuffer::CompressedWriteBuffer(WriteBuffer & out_, CompressionCode { } -// CompressedWriteBuffer::~CompressedWriteBuffer() -// { -// finalize(); -// } +CompressedWriteBuffer::~CompressedWriteBuffer() +{ + if (!canceled) + finalize(); +} } diff --git a/src/Compression/CompressedWriteBuffer.h b/src/Compression/CompressedWriteBuffer.h index 27d357eaace..6ae1fbee9cc 100644 --- a/src/Compression/CompressedWriteBuffer.h +++ b/src/Compression/CompressedWriteBuffer.h @@ -21,7 +21,7 @@ public: CompressionCodecPtr codec_ = CompressionCodecFactory::instance().getDefaultCodec(), size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); - // ~CompressedWriteBuffer() override; + 
~CompressedWriteBuffer() override; /// The amount of compressed data size_t getCompressedBytes() diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index 5240e3048b0..5ed4dbdc787 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -193,16 +193,17 @@ WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_ write_event = write_event_; } -// WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket() -// { -// // try -// // { -// // finalize(); -// // } -// // catch (...) -// // { -// // tryLogCurrentException(__PRETTY_FUNCTION__); -// // } -// } +WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket() +{ + try + { + if (!canceled) + finalize(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} } diff --git a/src/IO/WriteBufferFromPocoSocket.h b/src/IO/WriteBufferFromPocoSocket.h index cce6a62d674..1f69dfc466c 100644 --- a/src/IO/WriteBufferFromPocoSocket.h +++ b/src/IO/WriteBufferFromPocoSocket.h @@ -19,7 +19,7 @@ public: explicit WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); explicit WriteBufferFromPocoSocket(Poco::Net::Socket & socket_, const ProfileEvents::Event & write_event_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); - //~WriteBufferFromPocoSocket() override; + ~WriteBufferFromPocoSocket() override; void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } From 8f4a7a7c09231f15118b0c378fc599e7b00706b7 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Thu, 30 May 2024 17:07:36 +0200 Subject: [PATCH 07/95] work with tests --- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 6 ------ src/Server/KeeperTCPHandler.cpp | 6 ------ src/Server/TCPHandler.h | 5 ----- 3 files changed, 17 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 8f421b14fe4..ed7498b1ac9 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -334,12 +334,6 @@ ZooKeeper::~ZooKeeper() { tryLogCurrentException(log); } - - if (compressed_out) - compressed_out->cancel(); - - if (out) - out->cancel(); } diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index 2cf64aad5e3..6709cd298e5 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -693,12 +693,6 @@ void KeeperTCPHandler::resetStats() KeeperTCPHandler::~KeeperTCPHandler() { - if (compressed_out) - compressed_out->cancel(); - - if (out) - out->cancel(); - KeeperTCPHandler::unregisterConnection(this); } diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 610b88d3280..27f8d710684 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -122,11 +122,6 @@ struct QueryState void reset() { - if (maybe_compressed_out && maybe_compressed_out.unique()) - { - maybe_compressed_out->cancel(); - } - *this = QueryState(); } From b287018949b13ed1393be03a3ceb2d30d1430f96 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Thu, 30 May 2024 18:37:19 +0200 Subject: [PATCH 08/95] work with tests --- src/Client/Connection.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index aaceb231077..c53b5bdaf81 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -270,7 +270,8 @@ void Connection::disconnect() try { // finalize() can write and throw an exception. 
- maybe_compressed_out->finalize(); + if (maybe_compressed_out) + maybe_compressed_out->finalize(); } catch (...) { From bc294ef51fca29f11d2b8b431fa5b3b2a262de27 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 10 Jun 2024 13:03:50 +0200 Subject: [PATCH 09/95] S3Queue small fixes --- src/Storages/S3Queue/S3QueueIFileMetadata.cpp | 47 +++++++++++--- src/Storages/S3Queue/S3QueueIFileMetadata.h | 3 +- src/Storages/S3Queue/S3QueueSettings.h | 2 +- src/Storages/S3Queue/S3QueueSource.cpp | 31 ++++++--- src/Storages/S3Queue/S3QueueSource.h | 10 ++- src/Storages/S3Queue/StorageS3Queue.cpp | 43 ++++++++++++- .../configs/merge_tree.xml | 5 ++ .../integration/test_storage_s3_queue/test.py | 64 +++++++++++++++++++ 8 files changed, 181 insertions(+), 24 deletions(-) create mode 100644 tests/integration/test_storage_s3_queue/configs/merge_tree.xml diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp index 6c4089115d4..fcf91555cb5 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp @@ -120,7 +120,14 @@ S3QueueIFileMetadata::~S3QueueIFileMetadata() { if (processing_id_version.has_value()) { - file_status->onFailed("Uncaught exception"); + if (file_status->getException().empty()) + { + if (std::current_exception()) + file_status->onFailed(getCurrentExceptionMessage(true)); + else + file_status->onFailed("Unprocessed exception"); + } + LOG_TEST(log, "Removing processing node in destructor for file: {}", path); try { @@ -227,7 +234,16 @@ void S3QueueIFileMetadata::setProcessed() ProfileEvents::increment(ProfileEvents::S3QueueProcessedFiles); file_status->onProcessed(); - setProcessedImpl(); + + try + { + setProcessedImpl(); + } + catch (...) + { + file_status->onFailed(getCurrentExceptionMessage(true)); + throw; + } processing_id.reset(); processing_id_version.reset(); @@ -235,7 +251,7 @@ void S3QueueIFileMetadata::setProcessed() LOG_TRACE(log, "Set file {} as processed (rows: {})", path, file_status->processed_rows); } -void S3QueueIFileMetadata::setFailed(const std::string & exception) +void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_retry_count) { LOG_TRACE(log, "Setting file {} as failed (exception: {}, path: {})", path, exception, failed_node_path); @@ -243,10 +259,25 @@ void S3QueueIFileMetadata::setFailed(const std::string & exception) file_status->onFailed(exception); node_metadata.last_exception = exception; - if (max_loading_retries == 0) - setFailedNonRetriable(); - else - setFailedRetriable(); + if (reduce_retry_count) + { + try + { + if (max_loading_retries == 0) + setFailedNonRetriable(); + else + setFailedRetriable(); + } + catch (...) + { + auto full_exception = fmt::format( + "First exception: {}, exception while setting file as failed: {}", + exception, getCurrentExceptionMessage(true)); + + file_status->onFailed(full_exception); + throw; + } + } processing_id.reset(); processing_id_version.reset(); @@ -296,6 +327,7 @@ void S3QueueIFileMetadata::setFailedRetriable() auto zk_client = getZooKeeper(); /// Extract the number of already done retries from node_hash.retriable node if it exists. 
+ Coordination::Requests requests; Coordination::Stat stat; std::string res; if (zk_client->tryGet(retrieable_failed_node_path, res, &stat)) @@ -308,7 +340,6 @@ void S3QueueIFileMetadata::setFailedRetriable() LOG_TRACE(log, "File `{}` failed to process, try {}/{}", path, node_metadata.retries, max_loading_retries); - Coordination::Requests requests; if (node_metadata.retries >= max_loading_retries) { /// File is no longer retriable. diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index e0b0d16cbcc..ac1e3724c57 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -54,13 +54,14 @@ public: bool setProcessing(); void setProcessed(); - void setFailed(const std::string & exception); + void setFailed(const std::string & exception, bool reduce_retry_count = true); virtual void setProcessedAtStartRequests( Coordination::Requests & requests, const zkutil::ZooKeeperPtr & zk_client) = 0; FileStatusPtr getFileStatus() { return file_status; } + const std::string & getPath() { return path; } struct NodeMetadata { diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index 4a92d99c411..b698eabfe24 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -19,7 +19,7 @@ class ASTStorage; 0) \ M(S3QueueAction, after_processing, S3QueueAction::KEEP, "Delete or keep file in S3 after successful processing", 0) \ M(String, keeper_path, "", "Zookeeper node path", 0) \ - M(UInt32, s3queue_loading_retries, 0, "Retry loading up to specified number of times", 0) \ + M(UInt32, s3queue_loading_retries, 10, "Retry loading up to specified number of times", 0) \ M(UInt32, s3queue_processing_threads_num, 1, "Number of processing threads", 0) \ M(UInt32, s3queue_enable_logging_to_s3queue_log, 1, "Enable logging to system table system.s3queue_log", 0) \ M(String, s3queue_last_processed_path, "", "For Ordered mode. Files that have lexicographically smaller file name are considered already processed", 0) \ diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index d8633037ed9..efdda7cd0ba 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -33,9 +33,9 @@ namespace ErrorCodes StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( const ObjectInfo & object_info, - Metadata::FileMetadataPtr processing_holder_) + Metadata::FileMetadataPtr file_metadata_) : ObjectInfo(object_info.relative_path, object_info.metadata) - , processing_holder(processing_holder_) + , file_metadata(file_metadata_) { } @@ -346,7 +346,7 @@ Chunk StorageS3QueueSource::generate() break; const auto * object_info = dynamic_cast(&reader.getObjectInfo()); - auto file_metadata = object_info->processing_holder; + auto file_metadata = object_info->file_metadata; auto file_status = file_metadata->getFileStatus(); if (isCancelled()) @@ -409,6 +409,8 @@ Chunk StorageS3QueueSource::generate() SCOPE_EXIT({ CurrentThread::get().attachProfileCountersScope(prev_scope); }); /// FIXME: if files are compressed, profile counters update does not work fully (s3 related counters are not saved). Why? 
+ started_files.push_back(file_metadata); + try { auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueuePullMicroseconds); @@ -430,16 +432,10 @@ Chunk StorageS3QueueSource::generate() { const auto message = getCurrentExceptionMessage(true); LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", path, message); - - file_metadata->setFailed(message); - appendLogElement(path, *file_status, processed_rows_from_file, false); throw; } - file_metadata->setProcessed(); - applyActionAfterProcessing(reader.getObjectInfo().relative_path); - appendLogElement(path, *file_status, processed_rows_from_file, true); file_status.reset(); processed_rows_from_file = 0; @@ -467,6 +463,23 @@ Chunk StorageS3QueueSource::generate() return {}; } +void StorageS3QueueSource::setProcessed() +{ + for (const auto & file_metadata : started_files) + { + file_metadata->setProcessed(); + applyActionAfterProcessing(file_metadata->getPath()); + } +} + +void StorageS3QueueSource::setFailed(const std::string & exception, bool reduce_retry_count) +{ + for (const auto & file_metadata : started_files) + { + file_metadata->setFailed(exception, reduce_retry_count); + } +} + void StorageS3QueueSource::applyActionAfterProcessing(const String & path) { switch (action) diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 6e098f8cb63..7d9f0aa7da7 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -35,9 +35,9 @@ public: { S3QueueObjectInfo( const ObjectInfo & object_info, - Metadata::FileMetadataPtr processing_holder_); + Metadata::FileMetadataPtr file_metadata_); - Metadata::FileMetadataPtr processing_holder; + Metadata::FileMetadataPtr file_metadata; }; class FileIterator : public StorageObjectStorageSource::IIterator @@ -102,6 +102,10 @@ public: Chunk generate() override; + void setProcessed(); + + void setFailed(const std::string & exception, bool reduce_retry_count); + private: const String name; const size_t processor_id; @@ -117,6 +121,8 @@ private: RemoveFileFunc remove_file_func; LoggerPtr log; + std::vector started_files; + ReaderHolder reader; std::future reader_future; std::atomic initialized{false}; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index afb75a21b21..276ef9ed14e 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,8 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int S3_ERROR; extern const int QUERY_NOT_ALLOWED; + extern const int MEMORY_LIMIT_EXCEEDED; + extern const int TOO_MANY_PARTS; } namespace @@ -95,6 +98,11 @@ namespace "Setting `s3queue_cleanup_interval_min_ms` ({}) must be less or equal to `s3queue_cleanup_interval_max_ms` ({})", s3queue_settings.s3queue_cleanup_interval_min_ms, s3queue_settings.s3queue_cleanup_interval_max_ms); } + + if (!s3queue_settings.s3queue_processing_threads_num.changed) + { + s3queue_settings.s3queue_processing_threads_num = std::max(getNumberOfPhysicalCPUCores(), 16); + } } } @@ -456,11 +464,16 @@ bool StorageS3Queue::streamToViews() auto read_from_format_info = prepareReadingFromFormat(block_io.pipeline.getHeader().getNames(), storage_snapshot, supportsSubsetOfColumns(s3queue_context)); Pipes pipes; + std::vector> sources; + pipes.reserve(s3queue_settings->s3queue_processing_threads_num); + 
sources.reserve(s3queue_settings->s3queue_processing_threads_num); + for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i) { auto source = createSource(i, read_from_format_info, file_iterator, DBMS_DEFAULT_BUFFER_SIZE, s3queue_context); - pipes.emplace_back(std::move(source)); + pipes.emplace_back(source); + sources.emplace_back(source); } auto pipe = Pipe::unitePipes(std::move(pipes)); @@ -471,8 +484,32 @@ bool StorageS3Queue::streamToViews() std::atomic_size_t rows = 0; block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); }); - CompletedPipelineExecutor executor(block_io.pipeline); - executor.execute(); + try + { + CompletedPipelineExecutor executor(block_io.pipeline); + executor.execute(); + } + catch (const Exception & e) + { + bool always_retriable_exception = e.code() == ErrorCodes::MEMORY_LIMIT_EXCEEDED + || e.code() == ErrorCodes::TOO_MANY_PARTS; + + /// May be we should just split errors into retriable and non-retriable, + /// and always retry retriable for any number of tried needed? (so deprecating s3queue_loading_retries setting) + + for (auto & source : sources) + source->setFailed(getCurrentExceptionMessage(true), /* reduce_retry_count */!always_retriable_exception); + throw; + } + catch (...) + { + for (auto & source : sources) + source->setFailed(getCurrentExceptionMessage(true), /* reduce_retry_count */true); + throw; + } + + for (auto & source : sources) + source->setProcessed(); return rows > 0; } diff --git a/tests/integration/test_storage_s3_queue/configs/merge_tree.xml b/tests/integration/test_storage_s3_queue/configs/merge_tree.xml new file mode 100644 index 00000000000..61eba8face7 --- /dev/null +++ b/tests/integration/test_storage_s3_queue/configs/merge_tree.xml @@ -0,0 +1,5 @@ + + + 0 + + diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index 66631c51b03..15e4cf18b7a 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -110,6 +110,17 @@ def started_cluster(): with_installed_binary=True, use_old_analyzer=True, ) + cluster.add_instance( + "instance_too_many_parts", + user_configs=["configs/users.xml"], + with_minio=True, + with_zookeeper=True, + main_configs=[ + "configs/s3queue_log.xml", + "configs/merge_tree.xml", + ], + stay_alive=True, + ) logging.info("Starting cluster...") cluster.start() @@ -1577,3 +1588,56 @@ def test_upgrade(started_cluster): node.restart_with_latest_version() assert expected_rows == get_count() + + +def test_exception_during_insert(started_cluster): + node = started_cluster.instances["instance_too_many_parts"] + + table_name = f"test_exception_during_insert" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + create_table( + started_cluster, + node, + table_name, + "unordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + }, + ) + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + create_mv(node, table_name, dst_table_name) + + node.wait_for_log_line( + "Failed to process data: Code: 252. 
DB::Exception: Too many parts" + ) + + time.sleep(2) + exception = node.query( + f"SELECT exception FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and notEmpty(exception)" + ) + assert "Too many parts" in exception + + node.replace_in_config( + "/etc/clickhouse-server/config.d/merge_tree.xml", + "parts_to_throw_insert>0", + "parts_to_throw_insert>10", + ) + node.restart_clickhouse() + + def get_count(): + return int(node.query(f"SELECT count() FROM {dst_table_name}")) + + expected_rows = 10 + for _ in range(20): + if expected_rows == get_count(): + break + time.sleep(1) + assert expected_rows == get_count() From fa9c0a2ab52e8498d1ae32c143efd07f78c69318 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 10 Jun 2024 22:08:50 +0200 Subject: [PATCH 10/95] fix tests --- src/Processors/IProcessor.cpp | 56 +++++++++++++++---- src/Processors/IProcessor.h | 8 +-- .../StorageObjectStorageSink.cpp | 8 ++- src/Storages/StorageFile.cpp | 6 +- src/Storages/StorageURL.cpp | 10 ++-- 5 files changed, 63 insertions(+), 25 deletions(-) diff --git a/src/Processors/IProcessor.cpp b/src/Processors/IProcessor.cpp index 8b160153733..6c21c85b766 100644 --- a/src/Processors/IProcessor.cpp +++ b/src/Processors/IProcessor.cpp @@ -1,21 +1,58 @@ #include #include +#include +#include +#include + namespace DB { +void IProcessor::cancel() +{ + + bool already_cancelled = is_cancelled.exchange(true, std::memory_order_acq_rel); + if (already_cancelled) + return; + + LOG_DEBUG(getLogger("IProcessor"), "cancel: {}", getName()); + onCancel(); +} + +String IProcessor::debug() const +{ + WriteBufferFromOwnString buf; + writeString(getName(), buf); + buf.write('\n'); + + writeString("inputs (hasData, isFinished):\n", buf); + for (const auto & port : inputs) + { + buf.write('\t'); + writeBoolText(port.hasData(), buf); + buf.write(' '); + writeBoolText(port.isFinished(), buf); + buf.write('\n'); + } + + writeString("outputs (hasData, isNeeded):\n", buf); + for (const auto & port : outputs) + { + buf.write('\t'); + writeBoolText(port.hasData(), buf); + buf.write(' '); + writeBoolText(port.isNeeded(), buf); + buf.write('\n'); + } + + buf.finalize(); + return buf.str(); +} + void IProcessor::dump() const { - std::cerr << getName() << "\n"; - - std::cerr << "inputs:\n"; - for (const auto & port : inputs) - std::cerr << "\t" << port.hasData() << " " << port.isFinished() << "\n"; - - std::cerr << "outputs:\n"; - for (const auto & port : outputs) - std::cerr << "\t" << port.hasData() << " " << port.isNeeded() << "\n"; + std::cerr << debug(); } @@ -41,4 +78,3 @@ std::string IProcessor::statusToName(Status status) } } - diff --git a/src/Processors/IProcessor.h b/src/Processors/IProcessor.h index 63f32d8deb7..6f779e7a8d4 100644 --- a/src/Processors/IProcessor.h +++ b/src/Processors/IProcessor.h @@ -238,12 +238,7 @@ public: /// In case if query was cancelled executor will wait till all processors finish their jobs. /// Generally, there is no reason to check this flag. However, it may be reasonable for long operations (e.g. i/o). bool isCancelled() const { return is_cancelled.load(std::memory_order_acquire); } - void cancel() - { - bool already_cancelled = is_cancelled.exchange(true, std::memory_order_acq_rel); - if (!already_cancelled) - onCancel(); - } + void cancel(); /// Additional method which is called in case if ports were updated while work() method. /// May be used to stop execution in rare cases. @@ -286,6 +281,7 @@ public: const auto & getOutputs() const { return outputs; } /// Debug output. 
+ String debug() const; void dump() const; /// Used to print pipeline. diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index decc84c687b..48eba3ef741 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -75,8 +75,8 @@ void StorageObjectStorageSink::finalizeBuffers() try { - writer->finalize(); writer->flush(); + writer->finalize(); write_buf->finalize(); } catch (...) @@ -95,8 +95,10 @@ void StorageObjectStorageSink::releaseBuffers() void StorageObjectStorageSink::cancelBuffers() { - writer->cancel(); - write_buf->cancel(); + if (writer) + writer->cancel(); + if (write_buf) + write_buf->cancel(); } PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 21abd6a8c5c..73b713530c9 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1833,8 +1833,10 @@ private: void cancelBuffers() { - writer->cancel(); - write_buf->cancel(); + if (writer) + writer->cancel(); + if (write_buf) + write_buf->cancel(); } StorageMetadataPtr metadata_snapshot; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 814e12126b7..c798883c4be 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -571,8 +571,8 @@ void StorageURLSink::onCancel() { std::lock_guard lock(cancel_mutex); cancelBuffers(); - cancelled = true; releaseBuffers(); + cancelled = true; } void StorageURLSink::onException(std::exception_ptr) @@ -610,13 +610,15 @@ void StorageURLSink::finalizeBuffers() void StorageURLSink::releaseBuffers() { writer.reset(); - write_buf->finalize(); + write_buf.reset(); } void StorageURLSink::cancelBuffers() { - writer->cancel(); - write_buf->cancel(); + if (writer) + writer->cancel(); + if (write_buf) + write_buf->cancel(); } class PartitionedStorageURLSink : public PartitionedSink From e4050334bb8e5bf928d644d0991b9b760fee93db Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 11 Jun 2024 13:59:22 +0200 Subject: [PATCH 11/95] fix StorageURLSink --- src/IO/WriteBufferDecorator.h | 5 +++++ src/Storages/StorageFile.cpp | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/IO/WriteBufferDecorator.h b/src/IO/WriteBufferDecorator.h index 88161f8d232..77f11424482 100644 --- a/src/IO/WriteBufferDecorator.h +++ b/src/IO/WriteBufferDecorator.h @@ -47,6 +47,11 @@ public: } } + void cancelImpl() override + { + out->cancel(); + } + WriteBuffer * getNestedBuffer() { return out; } protected: diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 884b754ee4e..60dd9f0e35e 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1790,8 +1790,8 @@ public: { std::lock_guard cancel_lock(cancel_mutex); cancelBuffers(); - cancelled = true; releaseBuffers(); + cancelled = true; } void onException(std::exception_ptr exception) override @@ -1837,7 +1837,7 @@ private: void releaseBuffers() { writer.reset(); - write_buf->finalize(); + write_buf.reset(); } void cancelBuffers() From 1e435eb353f47794b5f75348e4a90f28c1baf1a2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 11 Jun 2024 19:01:24 +0200 Subject: [PATCH 12/95] Fix tests --- src/Storages/S3Queue/S3QueueMetadata.cpp | 3 ++ src/Storages/S3Queue/S3QueueSource.cpp | 35 ++++++++++++++++-- src/Storages/S3Queue/S3QueueSource.h | 5 ++- src/Storages/S3Queue/StorageS3Queue.cpp | 36 
++++++++++++------- src/Storages/S3Queue/StorageS3Queue.h | 3 +- .../integration/test_storage_s3_queue/test.py | 10 ++++++ 6 files changed, 75 insertions(+), 17 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueMetadata.cpp b/src/Storages/S3Queue/S3QueueMetadata.cpp index 9c77bb2d24c..cb25d646468 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueMetadata.cpp @@ -133,6 +133,9 @@ S3QueueMetadata::S3QueueMetadata(const fs::path & zookeeper_path_, const S3Queue generateRescheduleInterval( settings.s3queue_cleanup_interval_min_ms, settings.s3queue_cleanup_interval_max_ms)); } + LOG_TRACE(log, "Mode: {}, buckets: {}, processing threads: {}, result buckets num: {}", + settings.mode.toString(), settings.s3queue_buckets, settings.s3queue_processing_threads_num, buckets_num); + } S3QueueMetadata::~S3QueueMetadata() diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index efdda7cd0ba..25520d64d2b 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -301,7 +301,8 @@ StorageS3QueueSource::StorageS3QueueSource( const std::atomic & table_is_being_dropped_, std::shared_ptr s3_queue_log_, const StorageID & storage_id_, - LoggerPtr log_) + LoggerPtr log_, + bool commit_once_processed_) : ISource(header_) , WithContext(context_) , name(std::move(name_)) @@ -314,6 +315,7 @@ StorageS3QueueSource::StorageS3QueueSource( , table_is_being_dropped(table_is_being_dropped_) , s3_queue_log(s3_queue_log_) , storage_id(storage_id_) + , commit_once_processed(commit_once_processed_) , remove_file_func(remove_file_func_) , log(log_) { @@ -337,6 +339,28 @@ void StorageS3QueueSource::lazyInitialize(size_t processor) } Chunk StorageS3QueueSource::generate() +{ + Chunk chunk; + try + { + chunk = generateImpl(); + } + catch (...) + { + if (commit_once_processed) + setFailed(getCurrentExceptionMessage(true), true); + + throw; + } + + if (!chunk && commit_once_processed) + { + setProcessed(); + } + return chunk; +} + +Chunk StorageS3QueueSource::generateImpl() { lazyInitialize(processor_id); @@ -409,8 +433,6 @@ Chunk StorageS3QueueSource::generate() SCOPE_EXIT({ CurrentThread::get().attachProfileCountersScope(prev_scope); }); /// FIXME: if files are compressed, profile counters update does not work fully (s3 related counters are not saved). Why? - started_files.push_back(file_metadata); - try { auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueuePullMicroseconds); @@ -432,13 +454,16 @@ Chunk StorageS3QueueSource::generate() { const auto message = getCurrentExceptionMessage(true); LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. 
Error: {} ", path, message); + file_status->onFailed(getCurrentExceptionMessage(true)); appendLogElement(path, *file_status, processed_rows_from_file, false); + started_files.push_back(file_metadata); throw; } appendLogElement(path, *file_status, processed_rows_from_file, true); file_status.reset(); processed_rows_from_file = 0; + started_files.push_back(file_metadata); if (shutdown_called) { @@ -465,6 +490,8 @@ Chunk StorageS3QueueSource::generate() void StorageS3QueueSource::setProcessed() { + LOG_TEST(log, "Having {} files to set as processed", started_files.size()); + for (const auto & file_metadata : started_files) { file_metadata->setProcessed(); @@ -474,6 +501,8 @@ void StorageS3QueueSource::setProcessed() void StorageS3QueueSource::setFailed(const std::string & exception, bool reduce_retry_count) { + LOG_TEST(log, "Having {} files to set as failed", started_files.size()); + for (const auto & file_metadata : started_files) { file_metadata->setFailed(exception, reduce_retry_count); diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 7d9f0aa7da7..ff868df2879 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -94,7 +94,8 @@ public: const std::atomic & table_is_being_dropped_, std::shared_ptr s3_queue_log_, const StorageID & storage_id_, - LoggerPtr log_); + LoggerPtr log_, + bool commit_once_processed_); static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); @@ -117,6 +118,7 @@ private: const std::atomic & table_is_being_dropped; const std::shared_ptr s3_queue_log; const StorageID storage_id; + const bool commit_once_processed; RemoveFileFunc remove_file_func; LoggerPtr log; @@ -130,6 +132,7 @@ private: S3QueueOrderedFileMetadata::BucketHolderPtr current_bucket_holder; + Chunk generateImpl(); void applyActionAfterProcessing(const String & path); void appendLogElement(const std::string & filename, S3QueueMetadata::FileStatus & file_status_, size_t processed_rows, bool processed); void lazyInitialize(size_t processor); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 276ef9ed14e..ec3f9baea41 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -74,7 +74,7 @@ namespace return zkutil::extractZooKeeperPath(result_zk_path, true); } - void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, bool is_attach) + void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, bool is_attach, const LoggerPtr & log) { if (!is_attach && !s3queue_settings.mode.changed) { @@ -82,11 +82,6 @@ namespace } /// In case !is_attach, we leave Ordered mode as default for compatibility. 
- if (!s3queue_settings.s3queue_processing_threads_num) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); - } - if (!s3queue_settings.s3queue_enable_logging_to_s3queue_log.changed) { s3queue_settings.s3queue_enable_logging_to_s3queue_log = settings.s3queue_enable_logging_to_s3queue_log; @@ -99,9 +94,15 @@ namespace s3queue_settings.s3queue_cleanup_interval_min_ms, s3queue_settings.s3queue_cleanup_interval_max_ms); } + if (!s3queue_settings.s3queue_processing_threads_num) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); + } + if (!s3queue_settings.s3queue_processing_threads_num.changed) { s3queue_settings.s3queue_processing_threads_num = std::max(getNumberOfPhysicalCPUCores(), 16); + LOG_TRACE(log, "Set `processing_threads_num` to {}", s3queue_settings.s3queue_processing_threads_num); } } } @@ -139,7 +140,7 @@ StorageS3Queue::StorageS3Queue( throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } - checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE); + checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE, log); object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); @@ -313,10 +314,12 @@ void ReadFromS3Queue::initializePipeline(QueryPipelineBuilder & pipeline, const createIterator(nullptr); for (size_t i = 0; i < adjusted_num_streams; ++i) pipes.emplace_back(storage->createSource( - i, + i/* processor_id */, info, iterator, - max_block_size, context)); + max_block_size, + context, + true/* commit_once_processed */)); auto pipe = Pipe::unitePipes(std::move(pipes)); if (pipe.empty()) @@ -333,7 +336,8 @@ std::shared_ptr StorageS3Queue::createSource( const ReadFromFormatInfo & info, std::shared_ptr file_iterator, size_t max_block_size, - ContextPtr local_context) + ContextPtr local_context, + bool commit_once_processed) { auto internal_source = std::make_unique( getName(), @@ -366,7 +370,8 @@ std::shared_ptr StorageS3Queue::createSource( table_is_being_dropped, s3_queue_log, getStorageID(), - log); + log, + commit_once_processed); } bool StorageS3Queue::hasDependencies(const StorageID & table_id) @@ -471,7 +476,14 @@ bool StorageS3Queue::streamToViews() for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i) { - auto source = createSource(i, read_from_format_info, file_iterator, DBMS_DEFAULT_BUFFER_SIZE, s3queue_context); + auto source = createSource( + i/* processor_id */, + read_from_format_info, + file_iterator, + DBMS_DEFAULT_BUFFER_SIZE, + s3queue_context, + false/* commit_once_processed */); + pipes.emplace_back(source); sources.emplace_back(source); } diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index ef83a1ccc25..f465fa92d1a 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -88,7 +88,8 @@ private: const ReadFromFormatInfo & info, std::shared_ptr file_iterator, size_t max_block_size, - ContextPtr local_context); + ContextPtr local_context, + bool commit_once_processed); bool hasDependencies(const StorageID & table_id); bool streamToViews(); diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index 15e4cf18b7a..656c25eb9b9 100644 
--- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -363,6 +363,7 @@ def test_direct_select_file(started_cluster, mode): files_path, additional_settings={ "keeper_path": keeper_path, + "s3queue_processing_threads_num": 1, }, ) @@ -390,6 +391,7 @@ def test_direct_select_file(started_cluster, mode): files_path, additional_settings={ "keeper_path": keeper_path, + "s3queue_processing_threads_num": 1, }, ) @@ -408,6 +410,7 @@ def test_direct_select_file(started_cluster, mode): files_path, additional_settings={ "keeper_path": keeper_path, + "s3queue_processing_threads_num": 1, }, ) @@ -793,6 +796,8 @@ def test_max_set_age(started_cluster): "s3queue_cleanup_interval_min_ms": 0, "s3queue_cleanup_interval_max_ms": 0, "s3queue_loading_retries": 0, + "s3queue_processing_threads_num": 1, + "s3queue_loading_retries": 0, }, ) create_mv(node, table_name, dst_table_name) @@ -872,6 +877,11 @@ def test_max_set_age(started_cluster): assert "Cannot parse input" in node.query( "SELECT exception FROM system.s3queue WHERE file_name ilike '%fff.csv'" ) + assert 1 == int( + node.query( + "SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv'" + ) + ) assert 1 == int( node.query( "SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv' AND notEmpty(exception)" From 4ef2f2b6760c1629e3f7e61689f15062173dd84a Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 12 Jun 2024 10:14:25 +0200 Subject: [PATCH 13/95] Fix test_upgrade --- src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index ec3f9baea41..4ba9f0ba677 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -99,7 +99,7 @@ namespace throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); } - if (!s3queue_settings.s3queue_processing_threads_num.changed) + if (!is_attach && !s3queue_settings.s3queue_processing_threads_num.changed) { s3queue_settings.s3queue_processing_threads_num = std::max(getNumberOfPhysicalCPUCores(), 16); LOG_TRACE(log, "Set `processing_threads_num` to {}", s3queue_settings.s3queue_processing_threads_num); From 0302741d50c3d6baf8ac64895dc74214bf1331c1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 12 Jun 2024 19:23:47 +0200 Subject: [PATCH 14/95] Support aliases in parametrized view function (only new analyzer) --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 4 +- src/Interpreters/Context.cpp | 8 ++- src/Interpreters/Context.h | 4 +- .../FunctionParameterValuesVisitor.cpp | 68 +++++++++++++------ src/Parsers/FunctionParameterValuesVisitor.h | 3 +- ...03167_parametrized_view_with_cte.reference | 2 + .../03167_parametrized_view_with_cte.sql | 4 ++ 7 files changed, 65 insertions(+), 28 deletions(-) create mode 100644 tests/queries/0_stateless/03167_parametrized_view_with_cte.reference create mode 100644 tests/queries/0_stateless/03167_parametrized_view_with_cte.sql diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 5e5ecaaa93a..0bf61bb508b 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -4505,7 +4505,9 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, table_name = table_identifier[1]; } - auto parametrized_view_storage = 
scope_context->getQueryContext()->buildParametrizedViewStorage(function_ast, database_name, table_name); + auto parametrized_view_storage = scope_context->getQueryContext()->buildParametrizedViewStorage( + function_ast, database_name, table_name, scope.aliases); + if (parametrized_view_storage) { auto fake_table_node = std::make_shared(parametrized_view_storage, scope_context); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 67ea069d46d..7fb1b0d0374 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -1871,7 +1872,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, ScopeAliases{}); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; @@ -2072,7 +2073,8 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const } -StoragePtr Context::buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name) +StoragePtr Context::buildParametrizedViewStorage( + const ASTPtr & table_expression, const String & database_name, const String & table_name, const ScopeAliases & scope_aliases) { if (table_name.empty()) return nullptr; @@ -2085,7 +2087,7 @@ StoragePtr Context::buildParametrizedViewStorage(const ASTPtr & table_expression return nullptr; auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, scope_aliases); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 68f37377926..5c0f96a04a9 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -147,6 +147,7 @@ class ServerType; template class MergeTreeBackgroundExecutor; class AsyncLoader; +struct ScopeAliases; struct TemporaryTableHolder; using TemporaryTablesMapping = std::map>; @@ -743,7 +744,8 @@ public: /// Overload for the new analyzer. Structure inference is performed in QueryAnalysisPass. 
StoragePtr executeTableFunction(const ASTPtr & table_expression, const TableFunctionPtr & table_function_ptr); - StoragePtr buildParametrizedViewStorage(const ASTPtr & table_expression, const String & database_name, const String & table_name); + StoragePtr buildParametrizedViewStorage( + const ASTPtr & table_expression, const String & database_name, const String & table_name, const ScopeAliases & scope_aliases); void addViewSource(const StoragePtr & storage); StoragePtr getViewSource() const; diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index 3692a4c73e5..44bf36c2526 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -4,9 +4,14 @@ #include #include #include -#include +#include #include +#include +#include +#include +#include #include +#include namespace DB @@ -20,8 +25,8 @@ namespace ErrorCodes class FunctionParameterValuesVisitor { public: - explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_) - : parameter_values(parameter_values_) + explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, const ScopeAliases & aliases_) + : parameter_values(parameter_values_), aliases(aliases_) { } @@ -29,12 +34,45 @@ public: { if (const auto * function = ast->as()) visitFunction(*function); + for (const auto & child : ast->children) visit(child); } private: NameToNameMap & parameter_values; + const ScopeAliases & aliases; + + std::string tryGetParameterValueAsString(const ASTPtr & ast) + { + if (const auto * literal = ast->as()) + { + return convertFieldToString(literal->value); + } + else if (const auto * value_identifier = ast->as()) + { + auto it = aliases.alias_name_to_expression_node_before_group_by.find(value_identifier->name()); + if (it != aliases.alias_name_to_expression_node_before_group_by.end()) + { + return tryGetParameterValueAsString(it->second->toAST()); + } + } + else if (const auto * function = ast->as()) + { + if (isFunctionCast(function)) + { + const auto * cast_expression = assert_cast(function->arguments.get()); + if (cast_expression->children.size() != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function CAST must have exactly two arguments"); + + if (const auto * cast_literal = cast_expression->children[0]->as()) + { + return convertFieldToString(cast_literal->value); + } + } + } + return ""; + } void visitFunction(const ASTFunction & parameter_function) { @@ -48,31 +86,17 @@ private: if (const auto * identifier = expression_list->children[0]->as()) { - if (const auto * literal = expression_list->children[1]->as()) - { - parameter_values[identifier->name()] = convertFieldToString(literal->value); - } - else if (const auto * function = expression_list->children[1]->as()) - { - if (isFunctionCast(function)) - { - const auto * cast_expression = assert_cast(function->arguments.get()); - if (cast_expression->children.size() != 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function CAST must have exactly two arguments"); - if (const auto * cast_literal = cast_expression->children[0]->as()) - { - parameter_values[identifier->name()] = convertFieldToString(cast_literal->value); - } - } - } + auto value_str = tryGetParameterValueAsString(expression_list->children[1]); + if (!value_str.empty()) + parameter_values[identifier->name()] = value_str; } } }; -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast) +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, 
const ScopeAliases & scope_aliases) { NameToNameMap parameter_values; - FunctionParameterValuesVisitor(parameter_values).visit(ast); + FunctionParameterValuesVisitor(parameter_values, scope_aliases).visit(ast); return parameter_values; } diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index e6ce0e42d06..01ce79a2a76 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -6,8 +6,9 @@ namespace DB { +struct ScopeAliases; /// Find parameters in a query parameter values and collect them into map. -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast); +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ScopeAliases & scope_aliases); } diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference new file mode 100644 index 00000000000..90afb158f23 --- /dev/null +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference @@ -0,0 +1,2 @@ +OK +123 diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql new file mode 100644 index 00000000000..433f4ed040b --- /dev/null +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql @@ -0,0 +1,4 @@ +SET allow_experimental_analyzer=1; +CREATE OR REPLACE VIEW param_test AS SELECT {test_str:String} as s_result; +WITH 'OK' AS s SELECT * FROM param_test(test_str=s); +WITH CAST(123, String) AS s SELECT * FROM param_test(test_str=s); From b48a20735fe8ada2b02684a283331627b264a2e6 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 14 Jun 2024 13:26:23 +0200 Subject: [PATCH 15/95] Mask account key for azureBlobStorage & azureBlobStorageCluster functions --- .../FunctionSecretArgumentsFinderAST.h | 44 +++++++++++++++++++ .../test_mask_sensitive_info/test.py | 17 +++++++ 2 files changed, 61 insertions(+) diff --git a/src/Parsers/FunctionSecretArgumentsFinderAST.h b/src/Parsers/FunctionSecretArgumentsFinderAST.h index 348b2ca9e3a..537c65f829f 100644 --- a/src/Parsers/FunctionSecretArgumentsFinderAST.h +++ b/src/Parsers/FunctionSecretArgumentsFinderAST.h @@ -82,6 +82,16 @@ private: /// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', ...) findS3FunctionSecretArguments(/* is_cluster_function= */ true); } + else if (function.name == "azureBlobStorage") + { + /// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) + findAzureBlobStorageFunctionSecretArguments(/* is_cluster_function= */ false); + } + else if (function.name == "azureBlobStorageCluster") + { + /// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) + findAzureBlobStorageFunctionSecretArguments(/* is_cluster_function= */ true); + } else if ((function.name == "remote") || (function.name == "remoteSecure")) { /// remote('addresses_expr', 'db', 'table', 'user', 'password', ...) @@ -169,6 +179,40 @@ private: markSecretArgument(url_arg_idx + 2); } + void findAzureBlobStorageFunctionSecretArguments(bool is_cluster_function) + { + /// azureBlobStorage('cluster_name', 'conn_string/storage_account_url', ...) has 'conn_string/storage_account_url' as its second argument. + size_t url_arg_idx = is_cluster_function ? 
1 : 0; + + if (!is_cluster_function && isNamedCollectionName(0)) + { + /// azureBlobStorage(named_collection, ..., account_key = 'account_key', ...) + findSecretNamedArgument("account_key", 1); + return; + } + + /// We should check other arguments first because we don't need to do any replacement in case storage_account_url is not used + /// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) + /// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) + size_t count = excludeS3OrURLNestedMaps(); + if ((url_arg_idx + 4 <= count) && (count <= url_arg_idx + 7)) + { + String second_arg; + if (tryGetStringFromArgument(url_arg_idx + 3, &second_arg)) + { + if (boost::iequals(second_arg, "NOSIGN")) + return; /// The argument after 'url' is "NOSIGN". + + if (second_arg == "auto" || KnownFormatNames::instance().exists(second_arg)) + return; /// The argument after 'url' is a format: s3('url', 'format', ...) + } + } + + /// We're going to replace 'account_key' with '[HIDDEN]' if account_key is used in the signature + if (url_arg_idx + 4 < count) + markSecretArgument(url_arg_idx + 4); + } + void findURLSecretArguments() { if (!isNamedCollectionName(0)) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 38cbf8c1aed..28e15fe0602 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -11,6 +11,7 @@ node = cluster.add_instance( ], user_configs=["configs/users.xml"], with_zookeeper=True, + with_azurite=True, ) @@ -327,6 +328,10 @@ def test_create_database(): def test_table_functions(): password = new_password() + azure_conn_string = cluster.env_variables['AZURITE_CONNECTION_STRING'] + azure_storage_account_url = cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL'] + azure_account_name = "devstoreaccount1" + azure_account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" table_functions = [ f"mysql('mysql80:3306', 'mysql_db', 'mysql_table', 'mysql_user', '{password}')", @@ -365,6 +370,12 @@ def test_table_functions(): f"deltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", "DNS_ERROR", ), + f"azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_1.csv', '{azure_account_name}', '{azure_account_key}')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_3.csv', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", ] def make_test_case(i): @@ -438,6 +449,12 @@ def test_table_functions(): "CREATE TABLE tablefunc30 (x int) AS s3('http://minio1:9001/root/data/test9.csv.gz', 'NOSIGN', 'CSV')", "CREATE TABLE tablefunc31 (`x` int) AS s3('http://minio1:9001/root/data/test10.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE tablefunc32 (`x` int) AS 
deltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", + f"CREATE TABLE tablefunc33 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", + f"CREATE TABLE tablefunc34 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_1.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc35 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE tablefunc36 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_3.csv', 'CSV')", + f"CREATE TABLE tablefunc37 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", ], must_not_contain=[password], ) From 234f7e0d135882b92c46da78ba0da864a613095c Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 14 Jun 2024 11:38:18 +0000 Subject: [PATCH 16/95] Automatic style fix --- tests/integration/test_mask_sensitive_info/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 28e15fe0602..965d6518164 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -328,8 +328,8 @@ def test_create_database(): def test_table_functions(): password = new_password() - azure_conn_string = cluster.env_variables['AZURITE_CONNECTION_STRING'] - azure_storage_account_url = cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL'] + azure_conn_string = cluster.env_variables["AZURITE_CONNECTION_STRING"] + azure_storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] azure_account_name = "devstoreaccount1" azure_account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" From 366e801ceaa7392b2fc441984972217668aeffc5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 14:24:06 +0200 Subject: [PATCH 17/95] Commit by limit --- src/Storages/S3Queue/S3QueueSettings.h | 3 + src/Storages/S3Queue/S3QueueSource.cpp | 109 ++++++++++++++---- src/Storages/S3Queue/S3QueueSource.h | 18 ++- src/Storages/S3Queue/StorageS3Queue.cpp | 109 +++++++++--------- src/Storages/System/StorageSystemS3Queue.cpp | 6 +- .../integration/test_storage_s3_queue/test.py | 100 ++++++++++++++++ 6 files changed, 262 insertions(+), 83 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index b698eabfe24..ebf57ce81cc 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -31,6 +31,9 @@ class ASTStorage; M(UInt32, s3queue_cleanup_interval_min_ms, 60000, "For unordered mode. Polling backoff min for cleanup", 0) \ M(UInt32, s3queue_cleanup_interval_max_ms, 60000, "For unordered mode. 
Polling backoff max for cleanup", 0) \ M(UInt32, s3queue_buckets, 0, "Number of buckets for Ordered mode parallel processing", 0) \ + M(UInt32, s3queue_max_processed_files_before_commit, 100, "Number of files which can be processed before being committed to keeper", 0) \ + M(UInt32, s3queue_max_processed_rows_before_commit, 0, "Number of rows which can be processed before being committed to keeper", 0) \ + M(UInt32, s3queue_max_processed_bytes_before_commit, 0, "Number of bytes which can be processed before being committed to keeper", 0) \ #define LIST_OF_S3QUEUE_SETTINGS(M, ALIAS) \ S3QUEUE_RELATED_SETTINGS(M, ALIAS) \ diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 25520d64d2b..654cdb31fe9 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -52,6 +52,14 @@ StorageS3QueueSource::FileIterator::FileIterator( { } +bool StorageS3QueueSource::FileIterator::isFinished() const +{ + return iterator_finished + && listed_keys_cache.end() == std::find_if( + listed_keys_cache.begin(), listed_keys_cache.end(), + [](const auto & v) { return !v.second.keys.empty(); }); +} + size_t StorageS3QueueSource::FileIterator::estimatedKeysCount() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method estimateKeysCount is not implemented"); @@ -302,6 +310,9 @@ StorageS3QueueSource::StorageS3QueueSource( std::shared_ptr s3_queue_log_, const StorageID & storage_id_, LoggerPtr log_, + size_t max_processed_files_before_commit_, + size_t max_processed_rows_before_commit_, + size_t max_processed_bytes_before_commit_, bool commit_once_processed_) : ISource(header_) , WithContext(context_) @@ -315,6 +326,9 @@ StorageS3QueueSource::StorageS3QueueSource( , table_is_being_dropped(table_is_being_dropped_) , s3_queue_log(s3_queue_log_) , storage_id(storage_id_) + , max_processed_files_before_commit(max_processed_files_before_commit_) + , max_processed_rows_before_commit(max_processed_rows_before_commit_) + , max_processed_bytes_before_commit(max_processed_bytes_before_commit_) , commit_once_processed(commit_once_processed_) , remove_file_func(remove_file_func_) , log(log_) @@ -348,14 +362,14 @@ Chunk StorageS3QueueSource::generate() catch (...) { if (commit_once_processed) - setFailed(getCurrentExceptionMessage(true), true); + commit(false, getCurrentExceptionMessage(true)); throw; } if (!chunk && commit_once_processed) { - setProcessed(); + commit(true); } return chunk; } @@ -444,6 +458,8 @@ Chunk StorageS3QueueSource::generateImpl() file_status->processed_rows += chunk.getNumRows(); processed_rows_from_file += chunk.getNumRows(); + total_processed_rows += chunk.getNumRows(); + total_processed_bytes += chunk.bytes(); VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( chunk, requested_virtual_columns, path, reader.getObjectInfo().metadata->size_bytes); @@ -454,20 +470,64 @@ Chunk StorageS3QueueSource::generateImpl() { const auto message = getCurrentExceptionMessage(true); LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. 
Error: {} ", path, message); - file_status->onFailed(getCurrentExceptionMessage(true)); + appendLogElement(path, *file_status, processed_rows_from_file, false); - started_files.push_back(file_metadata); + + failed_files.push_back(file_metadata); + file_status->onFailed(getCurrentExceptionMessage(true)); + + if (processed_rows_from_file == 0) + { + /// If we did not process any rows from the failed file, + /// commit all previosly processed files, + /// not to loose the work already done. + return {}; + } + throw; } appendLogElement(path, *file_status, processed_rows_from_file, true); + file_status.reset(); processed_rows_from_file = 0; - started_files.push_back(file_metadata); + processed_files.push_back(file_metadata); + + if (processed_files.size() == max_processed_files_before_commit) + { + LOG_TRACE(log, "Number of max processed files before commit reached " + "(rows: {}, bytes: {}, files: {})", + total_processed_rows, total_processed_bytes, processed_files.size()); + break; + } + + bool rows_or_bytes_limit_reached = false; + if (total_processed_rows == max_processed_rows_before_commit) + { + LOG_TRACE(log, "Number of max processed rows before commit reached " + "(rows: {}, bytes: {}, files: {})", + total_processed_rows, total_processed_bytes, processed_files.size()); + + rows_or_bytes_limit_reached = true; + } + else if (total_processed_bytes == max_processed_bytes_before_commit) + { + LOG_TRACE(log, "Number of max processed bytes before commit reached " + "(rows: {}, bytes: {}, files: {})", + total_processed_rows, total_processed_bytes, processed_files.size()); + + rows_or_bytes_limit_reached = true; + } + + if (rows_or_bytes_limit_reached && reader_future.valid()) + { + LOG_TRACE(log, "Rows or bytes limit reached, but we have one more file scheduled already, " + "will process it despite the limit"); + } if (shutdown_called) { - LOG_INFO(log, "Shutdown was called, stopping sync"); + LOG_TRACE(log, "Shutdown was called, stopping sync"); break; } @@ -479,33 +539,38 @@ Chunk StorageS3QueueSource::generateImpl() file_status = files_metadata->getFileStatus(reader.getObjectInfo().getPath()); - /// Even if task is finished the thread may be not freed in pool. - /// So wait until it will be freed before scheduling a new task. - internal_source->create_reader_pool->wait(); - reader_future = internal_source->createReaderAsync(processor_id); + if (!rows_or_bytes_limit_reached && processed_files.size() + 1 < max_processed_files_before_commit) + { + /// Even if task is finished the thread may be not freed in pool. + /// So wait until it will be freed before scheduling a new task. + internal_source->create_reader_pool->wait(); + reader_future = internal_source->createReaderAsync(processor_id); + } } return {}; } -void StorageS3QueueSource::setProcessed() +void StorageS3QueueSource::commit(bool success, const std::string & exception) { - LOG_TEST(log, "Having {} files to set as processed", started_files.size()); + LOG_TEST(log, "Having {} files to set as {}", processed_files.size(), success ? 
"Processed" : "Failed"); - for (const auto & file_metadata : started_files) + for (const auto & file_metadata : processed_files) { - file_metadata->setProcessed(); - applyActionAfterProcessing(file_metadata->getPath()); + if (success) + { + file_metadata->setProcessed(); + applyActionAfterProcessing(file_metadata->getPath()); + } + else + file_metadata->setFailed(exception, /* reduce_retry_count */false); } -} -void StorageS3QueueSource::setFailed(const std::string & exception, bool reduce_retry_count) -{ - LOG_TEST(log, "Having {} files to set as failed", started_files.size()); - - for (const auto & file_metadata : started_files) + for (const auto & file_metadata : failed_files) { - file_metadata->setFailed(exception, reduce_retry_count); + /// `exception` from commit args is from insertion to storage. + /// Here we do not used it as failed_files were not inserted into storage, but skipped. + file_metadata->setFailed(file_metadata->getFileStatus()->getException(), /* reduce_retry_count */true); } } diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index ff868df2879..e5b5fa89f9c 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -49,6 +49,8 @@ public: std::atomic & shutdown_called_, LoggerPtr logger_); + bool isFinished() const; + /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) @@ -95,6 +97,9 @@ public: std::shared_ptr s3_queue_log_, const StorageID & storage_id_, LoggerPtr log_, + size_t max_processed_files_before_commit_, + size_t max_processed_rows_before_commit_, + size_t max_processed_bytes_before_commit_, bool commit_once_processed_); static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); @@ -103,9 +108,7 @@ public: Chunk generate() override; - void setProcessed(); - - void setFailed(const std::string & exception, bool reduce_retry_count); + void commit(bool success, const std::string & exception = {}); private: const String name; @@ -118,17 +121,24 @@ private: const std::atomic & table_is_being_dropped; const std::shared_ptr s3_queue_log; const StorageID storage_id; + const size_t max_processed_files_before_commit; + const size_t max_processed_rows_before_commit; + const size_t max_processed_bytes_before_commit; const bool commit_once_processed; RemoveFileFunc remove_file_func; LoggerPtr log; - std::vector started_files; + std::vector processed_files; + std::vector failed_files; ReaderHolder reader; std::future reader_future; std::atomic initialized{false}; + size_t processed_rows_from_file = 0; + size_t total_processed_rows = 0; + size_t total_processed_bytes = 0; S3QueueOrderedFileMetadata::BucketHolderPtr current_bucket_holder; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 4ba9f0ba677..e3a98f29ae8 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -371,6 +371,9 @@ std::shared_ptr StorageS3Queue::createSource( s3_queue_log, getStorageID(), log, + s3queue_settings->s3queue_max_processed_files_before_commit, + s3queue_settings->s3queue_max_processed_rows_before_commit, + s3queue_settings->s3queue_max_processed_bytes_before_commit, commit_once_processed); } @@ -446,84 +449,80 @@ void StorageS3Queue::threadFunc() bool StorageS3Queue::streamToViews() { + // Create a stream for each consumer and join them in a union stream + // Only insert into 
dependent views and expect that input blocks contain virtual columns + auto table_id = getStorageID(); auto table = DatabaseCatalog::instance().getTable(table_id, getContext()); if (!table) throw Exception(ErrorCodes::LOGICAL_ERROR, "Engine table {} doesn't exist.", table_id.getNameForLogs()); - auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr(), getContext()); - - // Create an INSERT query for streaming data auto insert = std::make_shared(); insert->table_id = table_id; + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr(), getContext()); auto s3queue_context = Context::createCopy(getContext()); s3queue_context->makeQueryContext(); - // Create a stream for each consumer and join them in a union stream - // Only insert into dependent views and expect that input blocks contain virtual columns - InterpreterInsertQuery interpreter(insert, s3queue_context, false, true, true); - auto block_io = interpreter.execute(); auto file_iterator = createFileIterator(s3queue_context, nullptr); + size_t total_rows = 0; - auto read_from_format_info = prepareReadingFromFormat(block_io.pipeline.getHeader().getNames(), storage_snapshot, supportsSubsetOfColumns(s3queue_context)); - - Pipes pipes; - std::vector> sources; - - pipes.reserve(s3queue_settings->s3queue_processing_threads_num); - sources.reserve(s3queue_settings->s3queue_processing_threads_num); - - for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i) + while (!file_iterator->isFinished()) { - auto source = createSource( - i/* processor_id */, - read_from_format_info, - file_iterator, - DBMS_DEFAULT_BUFFER_SIZE, - s3queue_context, - false/* commit_once_processed */); + InterpreterInsertQuery interpreter(insert, s3queue_context, false, true, true); + auto block_io = interpreter.execute(); + auto read_from_format_info = prepareReadingFromFormat( + block_io.pipeline.getHeader().getNames(), + storage_snapshot, + supportsSubsetOfColumns(s3queue_context)); - pipes.emplace_back(source); - sources.emplace_back(source); - } - auto pipe = Pipe::unitePipes(std::move(pipes)); + Pipes pipes; + std::vector> sources; - block_io.pipeline.complete(std::move(pipe)); - block_io.pipeline.setNumThreads(s3queue_settings->s3queue_processing_threads_num); - block_io.pipeline.setConcurrencyControl(s3queue_context->getSettingsRef().use_concurrency_control); + pipes.reserve(s3queue_settings->s3queue_processing_threads_num); + sources.reserve(s3queue_settings->s3queue_processing_threads_num); - std::atomic_size_t rows = 0; - block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); }); + for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i) + { + auto source = createSource( + i/* processor_id */, + read_from_format_info, + file_iterator, + DBMS_DEFAULT_BUFFER_SIZE, + s3queue_context, + false/* commit_once_processed */); - try - { - CompletedPipelineExecutor executor(block_io.pipeline); - executor.execute(); - } - catch (const Exception & e) - { - bool always_retriable_exception = e.code() == ErrorCodes::MEMORY_LIMIT_EXCEEDED - || e.code() == ErrorCodes::TOO_MANY_PARTS; + pipes.emplace_back(source); + sources.emplace_back(source); + } + auto pipe = Pipe::unitePipes(std::move(pipes)); - /// May be we should just split errors into retriable and non-retriable, - /// and always retry retriable for any number of tried needed? 
(so deprecating s3queue_loading_retries setting) + block_io.pipeline.complete(std::move(pipe)); + block_io.pipeline.setNumThreads(s3queue_settings->s3queue_processing_threads_num); + block_io.pipeline.setConcurrencyControl(s3queue_context->getSettingsRef().use_concurrency_control); + + std::atomic_size_t rows = 0; + block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); }); + + try + { + CompletedPipelineExecutor executor(block_io.pipeline); + executor.execute(); + } + catch (...) + { + for (auto & source : sources) + source->commit(/* success */false, getCurrentExceptionMessage(true)); + throw; + } for (auto & source : sources) - source->setFailed(getCurrentExceptionMessage(true), /* reduce_retry_count */!always_retriable_exception); - throw; - } - catch (...) - { - for (auto & source : sources) - source->setFailed(getCurrentExceptionMessage(true), /* reduce_retry_count */true); - throw; + source->commit(/* success */true); + + total_rows += rows; } - for (auto & source : sources) - source->setProcessed(); - - return rows > 0; + return total_rows > 0; } zkutil::ZooKeeperPtr StorageS3Queue::getZooKeeper() const diff --git a/src/Storages/System/StorageSystemS3Queue.cpp b/src/Storages/System/StorageSystemS3Queue.cpp index 637182067f2..131570709d0 100644 --- a/src/Storages/System/StorageSystemS3Queue.cpp +++ b/src/Storages/System/StorageSystemS3Queue.cpp @@ -26,6 +26,7 @@ ColumnsDescription StorageSystemS3Queue::getColumnsDescription() return ColumnsDescription { {"zookeeper_path", std::make_shared(), "Path in zookeeper to S3Queue metadata"}, + {"file_path", std::make_shared(), "File path of a file which is being processed by S3Queue"}, {"file_name", std::make_shared(), "File name of a file which is being processed by S3Queue"}, {"rows_processed", std::make_shared(), "Currently processed number of rows"}, {"status", std::make_shared(), "Status of processing: Processed, Processing, Failed"}, @@ -45,11 +46,12 @@ void StorageSystemS3Queue::fillData(MutableColumns & res_columns, ContextPtr, co { for (const auto & [zookeeper_path, metadata] : S3QueueMetadataFactory::instance().getAll()) { - for (const auto & [file_name, file_status] : metadata->getFileStatuses()) + for (const auto & [file_path, file_status] : metadata->getFileStatuses()) { size_t i = 0; res_columns[i++]->insert(zookeeper_path); - res_columns[i++]->insert(file_name); + res_columns[i++]->insert(file_path); + res_columns[i++]->insert(std::filesystem::path(file_path).filename().string()); res_columns[i++]->insert(file_status->processed_rows.load()); res_columns[i++]->insert(magic_enum::enum_name(file_status->state.load())); diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index 656c25eb9b9..1da7a084779 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -1651,3 +1651,103 @@ def test_exception_during_insert(started_cluster): break time.sleep(1) assert expected_rows == get_count() + + +def test_commit_on_limit(started_cluster): + node = started_cluster.instances["instance"] + + table_name = f"test_commit_on_limit" + dst_table_name = f"{table_name}_dst" + keeper_path = f"/clickhouse/test_{table_name}" + files_path = f"{table_name}_data" + files_to_generate = 10 + + create_table( + started_cluster, + node, + table_name, + "ordered", + files_path, + additional_settings={ + "keeper_path": keeper_path, + "s3queue_processing_threads_num": 1, + 
"s3queue_loading_retries": 1, + "s3queue_max_processed_files_before_commit": 10, + }, + ) + total_values = generate_random_files( + started_cluster, files_path, files_to_generate, start_ind=0, row_num=1 + ) + + incorrect_values = [ + ["failed", 1, 1], + ] + incorrect_values_csv = ( + "\n".join((",".join(map(str, row)) for row in incorrect_values)) + "\n" + ).encode() + + correct_values = [ + [1, 1, 1], + ] + correct_values_csv = ( + "\n".join((",".join(map(str, row)) for row in correct_values)) + "\n" + ).encode() + + put_s3_file_content( + started_cluster, f"{files_path}/test_99.csv", correct_values_csv + ) + put_s3_file_content( + started_cluster, f"{files_path}/test_999.csv", correct_values_csv + ) + put_s3_file_content( + started_cluster, f"{files_path}/test_9999.csv", incorrect_values_csv + ) + put_s3_file_content( + started_cluster, f"{files_path}/test_99999.csv", correct_values_csv + ) + put_s3_file_content( + started_cluster, f"{files_path}/test_999999.csv", correct_values_csv + ) + + create_mv(node, table_name, dst_table_name) + + def get_processed_files(): + return ( + node.query( + f"SELECT file_name FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_processed > 0 " + ) + .strip() + .split("\n") + ) + + def get_failed_files(): + return ( + node.query( + f"SELECT file_name FROM system.s3queue WHERE zookeeper_path ilike '%{table_name}%' and status = 'Failed'" + ) + .strip() + .split("\n") + ) + + for _ in range(30): + if "test_999999.csv" in get_processed_files(): + break + time.sleep(1) + assert "test_999999.csv" in get_processed_files() + + assert 1 == int( + node.query( + "SELECT value FROM system.events WHERE name = 'S3QueueFailedFiles' SETTINGS system_events_show_zero_values=1" + ) + ) + + expected_processed = ["test_" + str(i) + ".csv" for i in range(files_to_generate)] + processed = get_processed_files() + for value in expected_processed: + assert value in processed + + expected_failed = ["test_9999.csv"] + failed = get_failed_files() + for value in expected_failed: + assert value not in processed + assert value in failed From f1b40623adb770f5ed0cdf46e7860ad30ef3ed30 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 14:37:35 +0200 Subject: [PATCH 18/95] Allow to flush and commit by timeout --- src/Storages/S3Queue/S3QueueSettings.h | 1 + src/Storages/S3Queue/S3QueueSource.cpp | 27 ++++++++++++++++++------- src/Storages/S3Queue/S3QueueSource.h | 4 ++++ src/Storages/S3Queue/StorageS3Queue.cpp | 1 + 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index ebf57ce81cc..681713e8378 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -34,6 +34,7 @@ class ASTStorage; M(UInt32, s3queue_max_processed_files_before_commit, 100, "Number of files which can be processed before being committed to keeper", 0) \ M(UInt32, s3queue_max_processed_rows_before_commit, 0, "Number of rows which can be processed before being committed to keeper", 0) \ M(UInt32, s3queue_max_processed_bytes_before_commit, 0, "Number of bytes which can be processed before being committed to keeper", 0) \ + M(UInt32, s3queue_max_processing_time_sec_before_commit, 0, "Timeout in seconds after which to commit files committed to keeper", 0) \ #define LIST_OF_S3QUEUE_SETTINGS(M, ALIAS) \ S3QUEUE_RELATED_SETTINGS(M, ALIAS) \ diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 
654cdb31fe9..3d6da6d5477 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -313,6 +313,7 @@ StorageS3QueueSource::StorageS3QueueSource( size_t max_processed_files_before_commit_, size_t max_processed_rows_before_commit_, size_t max_processed_bytes_before_commit_, + size_t max_processing_time_sec_before_commit_, bool commit_once_processed_) : ISource(header_) , WithContext(context_) @@ -329,6 +330,7 @@ StorageS3QueueSource::StorageS3QueueSource( , max_processed_files_before_commit(max_processed_files_before_commit_) , max_processed_rows_before_commit(max_processed_rows_before_commit_) , max_processed_bytes_before_commit(max_processed_bytes_before_commit_) + , max_processing_time_sec_before_commit(max_processing_time_sec_before_commit_) , commit_once_processed(commit_once_processed_) , remove_file_func(remove_file_func_) , log(log_) @@ -501,25 +503,36 @@ Chunk StorageS3QueueSource::generateImpl() break; } - bool rows_or_bytes_limit_reached = false; - if (total_processed_rows == max_processed_rows_before_commit) + bool rows_or_bytes_or_time_limit_reached = false; + if (max_processed_rows_before_commit + && total_processed_rows == max_processed_rows_before_commit) { LOG_TRACE(log, "Number of max processed rows before commit reached " "(rows: {}, bytes: {}, files: {})", total_processed_rows, total_processed_bytes, processed_files.size()); - rows_or_bytes_limit_reached = true; + rows_or_bytes_or_time_limit_reached = true; } - else if (total_processed_bytes == max_processed_bytes_before_commit) + else if (max_processed_bytes_before_commit + && total_processed_bytes == max_processed_bytes_before_commit) { LOG_TRACE(log, "Number of max processed bytes before commit reached " "(rows: {}, bytes: {}, files: {})", total_processed_rows, total_processed_bytes, processed_files.size()); - rows_or_bytes_limit_reached = true; + rows_or_bytes_or_time_limit_reached = true; + } + else if (max_processing_time_sec_before_commit + && total_stopwatch.elapsedSeconds() >= max_processing_time_sec_before_commit) + { + LOG_TRACE(log, "Max processing time before commit reached " + "(rows: {}, bytes: {}, files: {})", + total_processed_rows, total_processed_bytes, processed_files.size()); + + rows_or_bytes_or_time_limit_reached = true; } - if (rows_or_bytes_limit_reached && reader_future.valid()) + if (rows_or_bytes_or_time_limit_reached && reader_future.valid()) { LOG_TRACE(log, "Rows or bytes limit reached, but we have one more file scheduled already, " "will process it despite the limit"); @@ -539,7 +552,7 @@ Chunk StorageS3QueueSource::generateImpl() file_status = files_metadata->getFileStatus(reader.getObjectInfo().getPath()); - if (!rows_or_bytes_limit_reached && processed_files.size() + 1 < max_processed_files_before_commit) + if (!rows_or_bytes_or_time_limit_reached && processed_files.size() + 1 < max_processed_files_before_commit) { /// Even if task is finished the thread may be not freed in pool. /// So wait until it will be freed before scheduling a new task. 
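The four thresholds introduced by "Commit by limit" and "Allow to flush and commit by timeout" are ordinary S3Queue table settings. A minimal usage sketch in SQL, assuming the DDL shape used by the integration tests: the table name, column list, endpoint URL and keeper path below are placeholders, and the threshold values are illustrative only (0 keeps a limit disabled, matching the defaults declared above).

CREATE TABLE s3queue_example (column1 UInt32, column2 UInt32, column3 UInt32)
ENGINE = S3Queue('http://minio1:9001/root/data/*', 'CSV')
SETTINGS
    mode = 'ordered',
    keeper_path = '/clickhouse/s3queue_example',
    -- commit progress to Keeper once the first of these limits is reached:
    s3queue_max_processed_files_before_commit = 10,
    s3queue_max_processed_rows_before_commit = 1000,
    s3queue_max_processed_bytes_before_commit = 1048576,
    s3queue_max_processing_time_sec_before_commit = 60;

With such settings the source stops pulling new files as soon as one of the limits is hit, so the already processed batch can be committed to Keeper instead of waiting for the whole listing to finish.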
diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index e5b5fa89f9c..2e2e1b8f5f7 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -100,6 +100,7 @@ public: size_t max_processed_files_before_commit_, size_t max_processed_rows_before_commit_, size_t max_processed_bytes_before_commit_, + size_t max_processing_time_sec_before_commit_, bool commit_once_processed_); static Block getHeader(Block sample_block, const std::vector & requested_virtual_columns); @@ -124,6 +125,7 @@ private: const size_t max_processed_files_before_commit; const size_t max_processed_rows_before_commit; const size_t max_processed_bytes_before_commit; + const size_t max_processing_time_sec_before_commit; const bool commit_once_processed; RemoveFileFunc remove_file_func; @@ -140,6 +142,8 @@ private: size_t total_processed_rows = 0; size_t total_processed_bytes = 0; + Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE}; + S3QueueOrderedFileMetadata::BucketHolderPtr current_bucket_holder; Chunk generateImpl(); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index e3a98f29ae8..660cd6dd73e 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -374,6 +374,7 @@ std::shared_ptr StorageS3Queue::createSource( s3queue_settings->s3queue_max_processed_files_before_commit, s3queue_settings->s3queue_max_processed_rows_before_commit, s3queue_settings->s3queue_max_processed_bytes_before_commit, + s3queue_settings->s3queue_max_processing_time_sec_before_commit, commit_once_processed); } From d688a1e9824786efc8253f414fa8abd17686d62a Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 14:49:53 +0200 Subject: [PATCH 19/95] Fxi --- src/Storages/S3Queue/S3QueueSource.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 3d6da6d5477..b56cb6881fe 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -532,8 +532,11 @@ Chunk StorageS3QueueSource::generateImpl() rows_or_bytes_or_time_limit_reached = true; } - if (rows_or_bytes_or_time_limit_reached && reader_future.valid()) + if (rows_or_bytes_or_time_limit_reached) { + if (!reader_future.valid()) + break; + LOG_TRACE(log, "Rows or bytes limit reached, but we have one more file scheduled already, " "will process it despite the limit"); } From d70e39b586690de78316f7bf725fc4804530b5d9 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 15:31:11 +0200 Subject: [PATCH 20/95] Fix style check --- src/Storages/S3Queue/S3QueueSource.cpp | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index b56cb6881fe..2530628dd77 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -481,7 +481,7 @@ Chunk StorageS3QueueSource::generateImpl() if (processed_rows_from_file == 0) { /// If we did not process any rows from the failed file, - /// commit all previosly processed files, + /// commit all previously processed files, /// not to loose the work already done. 
return {}; } diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 660cd6dd73e..ed0b9464b1b 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -48,8 +48,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int S3_ERROR; extern const int QUERY_NOT_ALLOWED; - extern const int MEMORY_LIMIT_EXCEEDED; - extern const int TOO_MANY_PARTS; } namespace From 81e5bfa689f1d306d906d3dff92475307c661bad Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 16:04:52 +0200 Subject: [PATCH 21/95] Move analyzeFunctionParamValues into QueryAnalyzer.cpp --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 6 +++++- src/Interpreters/Context.cpp | 9 +++------ src/Interpreters/Context.h | 3 +-- src/Parsers/FunctionParameterValuesVisitor.cpp | 15 +++++++++------ src/Parsers/FunctionParameterValuesVisitor.h | 2 +- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 0bf61bb508b..dde89250a53 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -4505,8 +4506,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, table_name = table_identifier[1]; } + NameToNameMap param_values = analyzeFunctionParamValues(function_ast, &scope.aliases); auto parametrized_view_storage = scope_context->getQueryContext()->buildParametrizedViewStorage( - function_ast, database_name, table_name, scope.aliases); + database_name, + table_name, + param_values); if (parametrized_view_storage) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 7fb1b0d0374..9e4ae99fc0a 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -1872,7 +1871,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, ScopeAliases{}); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, nullptr); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; @@ -2073,8 +2072,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const } -StoragePtr Context::buildParametrizedViewStorage( - const ASTPtr & table_expression, const String & database_name, const String & table_name, const ScopeAliases & scope_aliases) +StoragePtr Context::buildParametrizedViewStorage(const String & database_name, const String & table_name, const NameToNameMap & param_values) { if (table_name.empty()) return nullptr; @@ -2087,8 +2085,7 @@ StoragePtr Context::buildParametrizedViewStorage( return nullptr; auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, scope_aliases); - StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); + StorageView::replaceQueryParametersIfParametrizedView(query, param_values); ASTCreateQuery create; create.select = query->as(); diff 
--git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 5c0f96a04a9..7e3bb8d8e72 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -744,8 +744,7 @@ public: /// Overload for the new analyzer. Structure inference is performed in QueryAnalysisPass. StoragePtr executeTableFunction(const ASTPtr & table_expression, const TableFunctionPtr & table_function_ptr); - StoragePtr buildParametrizedViewStorage( - const ASTPtr & table_expression, const String & database_name, const String & table_name, const ScopeAliases & scope_aliases); + StoragePtr buildParametrizedViewStorage(const String & database_name, const String & table_name, const NameToNameMap & param_values); void addViewSource(const StoragePtr & storage); StoragePtr getViewSource() const; diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index 44bf36c2526..1b629351b26 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes class FunctionParameterValuesVisitor { public: - explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, const ScopeAliases & aliases_) + explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, const ScopeAliases * aliases_) : parameter_values(parameter_values_), aliases(aliases_) { } @@ -41,7 +41,7 @@ public: private: NameToNameMap & parameter_values; - const ScopeAliases & aliases; + const ScopeAliases * aliases; std::string tryGetParameterValueAsString(const ASTPtr & ast) { @@ -51,10 +51,13 @@ private: } else if (const auto * value_identifier = ast->as()) { - auto it = aliases.alias_name_to_expression_node_before_group_by.find(value_identifier->name()); - if (it != aliases.alias_name_to_expression_node_before_group_by.end()) + if (aliases) { - return tryGetParameterValueAsString(it->second->toAST()); + auto it = aliases->alias_name_to_expression_node_before_group_by.find(value_identifier->name()); + if (it != aliases->alias_name_to_expression_node_before_group_by.end()) + { + return tryGetParameterValueAsString(it->second->toAST()); + } } } else if (const auto * function = ast->as()) @@ -93,7 +96,7 @@ private: } }; -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ScopeAliases & scope_aliases) +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ScopeAliases * scope_aliases) { NameToNameMap parameter_values; FunctionParameterValuesVisitor(parameter_values, scope_aliases).visit(ast); diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index 01ce79a2a76..9f19c68a852 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -9,6 +9,6 @@ namespace DB struct ScopeAliases; /// Find parameters in a query parameter values and collect them into map. 
-NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ScopeAliases & scope_aliases); +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ScopeAliases * scope_aliases = nullptr); } From 3f968a8aad49421ed40b8b2c5ce295b0504d56a1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 17:28:16 +0200 Subject: [PATCH 22/95] Fix tests --- .../StorageObjectStorageSource.h | 1 + src/Storages/S3Queue/S3QueueIFileMetadata.cpp | 14 ++--- src/Storages/S3Queue/S3QueueIFileMetadata.h | 1 + .../S3Queue/S3QueueOrderedFileMetadata.cpp | 7 ++- src/Storages/S3Queue/S3QueueSource.cpp | 51 +++++++++++++++++-- src/Storages/S3Queue/S3QueueSource.h | 5 ++ .../S3Queue/S3QueueUnorderedFileMetadata.cpp | 7 ++- src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- 8 files changed, 74 insertions(+), 14 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index fd7c7aa7102..da4d33fc0a8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -101,6 +101,7 @@ protected: const PullingPipelineExecutor * operator->() const { return reader.get(); } const ObjectInfo & getObjectInfo() const { return *object_info; } + ObjectInfoPtr getObjectInfoPtr() const { return object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } private: diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp index fcf91555cb5..44362ab1110 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp @@ -233,7 +233,7 @@ void S3QueueIFileMetadata::setProcessed() LOG_TRACE(log, "Setting file {} as processed (path: {})", path, processed_node_path); ProfileEvents::increment(ProfileEvents::S3QueueProcessedFiles); - file_status->onProcessed(); + chassert(file_status->state == FileStatus::State::Processed); try { @@ -253,10 +253,11 @@ void S3QueueIFileMetadata::setProcessed() void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_retry_count) { - LOG_TRACE(log, "Setting file {} as failed (exception: {}, path: {})", path, exception, failed_node_path); + LOG_TRACE(log, "Setting file {} as failed (path: {}, reduce retry count: {}, exception: {})", + path, failed_node_path, reduce_retry_count, exception); ProfileEvents::increment(ProfileEvents::S3QueueFailedFiles); - file_status->onFailed(exception); + chassert(file_status->state == FileStatus::State::Failed); node_metadata.last_exception = exception; if (reduce_retry_count) @@ -330,15 +331,16 @@ void S3QueueIFileMetadata::setFailedRetriable() Coordination::Requests requests; Coordination::Stat stat; std::string res; - if (zk_client->tryGet(retrieable_failed_node_path, res, &stat)) + bool has_failed_before = zk_client->tryGet(retrieable_failed_node_path, res, &stat); + if (has_failed_before) { auto failed_node_metadata = NodeMetadata::fromString(res); node_metadata.retries = failed_node_metadata.retries + 1; file_status->retries = node_metadata.retries; } - LOG_TRACE(log, "File `{}` failed to process, try {}/{}", - path, node_metadata.retries, max_loading_retries); + LOG_TRACE(log, "File `{}` failed to process, try {}/{}, retries node exists: {} (failed node path: {})", + path, node_metadata.retries, max_loading_retries, has_failed_before, failed_node_path); if (node_metadata.retries >= max_loading_retries) { diff --git 
a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index ac1e3724c57..6d00aec5fd8 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -62,6 +62,7 @@ public: FileStatusPtr getFileStatus() { return file_status; } const std::string & getPath() { return path; } + size_t getMaxTries() const { return max_loading_retries; } struct NodeMetadata { diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp index bac87c95cc9..7c922c0c0a3 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp @@ -384,8 +384,11 @@ void S3QueueOrderedFileMetadata::setProcessedImpl() auto code = zk_client->tryMulti(requests, responses); if (code == Coordination::Error::ZOK) { - if (max_loading_retries) - zk_client->tryRemove(failed_node_path + ".retriable", -1); + if (max_loading_retries + && zk_client->tryRemove(failed_node_path + ".retriable", -1) == Coordination::Error::ZOK) + { + LOG_TEST(log, "Removed node {}.retriable", failed_node_path); + } return; } diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 2530628dd77..1bd91674ecf 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -57,7 +57,8 @@ bool StorageS3QueueSource::FileIterator::isFinished() const return iterator_finished && listed_keys_cache.end() == std::find_if( listed_keys_cache.begin(), listed_keys_cache.end(), - [](const auto & v) { return !v.second.keys.empty(); }); + [](const auto & v) { return !v.second.keys.empty(); }) + && objects_to_retry.empty(); } size_t StorageS3QueueSource::FileIterator::estimatedKeysCount() @@ -73,9 +74,21 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl while (!shutdown_called) { if (metadata->useBucketsForProcessing()) + { std::tie(object_info, bucket_info) = getNextKeyFromAcquiredBucket(processor); + } else - object_info = glob_iterator->next(processor); + { + if (objects_to_retry.empty()) + { + object_info = glob_iterator->next(processor); + } + else + { + object_info = objects_to_retry.front(); + objects_to_retry.pop_front(); + } + } if (!object_info) return {}; @@ -93,6 +106,20 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl return {}; } +void StorageS3QueueSource::FileIterator::returnForRetry(ObjectInfoPtr object_info) +{ + if (metadata->useBucketsForProcessing()) + { + const auto bucket = metadata->getBucketForPath(object_info->relative_path); + listed_keys_cache[bucket].keys.emplace_front(object_info); + } + else + { + objects_to_retry.push_back(object_info); + } +} + + std::pair StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processor) { @@ -383,7 +410,10 @@ Chunk StorageS3QueueSource::generateImpl() while (true) { if (!reader) + { + LOG_TEST(log, "No reader"); break; + } const auto * object_info = dynamic_cast(&reader.getObjectInfo()); auto file_metadata = object_info->file_metadata; @@ -408,6 +438,7 @@ Chunk StorageS3QueueSource::generateImpl() appendLogElement(reader.getObjectInfo().getPath(), *file_status, processed_rows_from_file, false); } + LOG_TEST(log, "Query is cancelled"); break; } @@ -415,6 +446,8 @@ Chunk StorageS3QueueSource::generateImpl() if (shutdown_called) { + LOG_TEST(log, "Shutdown called"); + if (processed_rows_from_file == 0) break; @@ -480,6 +513,12 @@ Chunk 
StorageS3QueueSource::generateImpl() if (processed_rows_from_file == 0) { + auto * file_iterator = dynamic_cast(internal_source->file_iterator.get()); + chassert(file_iterator); + + if (file_status->retries < file_metadata->getMaxTries()) + file_iterator->returnForRetry(reader.getObjectInfoPtr()); + /// If we did not process any rows from the failed file, /// commit all previously processed files, /// not to loose the work already done. @@ -491,7 +530,9 @@ Chunk StorageS3QueueSource::generateImpl() appendLogElement(path, *file_status, processed_rows_from_file, true); + file_status->onProcessed(); file_status.reset(); + processed_rows_from_file = 0; processed_files.push_back(file_metadata); @@ -551,7 +592,10 @@ Chunk StorageS3QueueSource::generateImpl() reader = reader_future.get(); if (!reader) + { + LOG_TEST(log, "Reader finished"); break; + } file_status = files_metadata->getFileStatus(reader.getObjectInfo().getPath()); @@ -569,7 +613,8 @@ Chunk StorageS3QueueSource::generateImpl() void StorageS3QueueSource::commit(bool success, const std::string & exception) { - LOG_TEST(log, "Having {} files to set as {}", processed_files.size(), success ? "Processed" : "Failed"); + LOG_TEST(log, "Having {} files to set as {}, failed files: {}", + processed_files.size(), success ? "Processed" : "Failed", failed_files.size()); for (const auto & file_metadata : processed_files) { diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 2e2e1b8f5f7..12ba6953fcc 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -58,6 +58,8 @@ public: size_t estimatedKeysCount() override; + void returnForRetry(ObjectInfoPtr object_info); + private: using Bucket = S3QueueMetadata::Bucket; using Processor = S3QueueMetadata::Processor; @@ -79,6 +81,9 @@ public: bool iterator_finished = false; std::unordered_map bucket_holders; + /// Only for processing without buckets. 
+ std::deque objects_to_retry; + std::pair getNextKeyFromAcquiredBucket(size_t processor); }; diff --git a/src/Storages/S3Queue/S3QueueUnorderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueUnorderedFileMetadata.cpp index c61e9557fc2..2f7c238cd4d 100644 --- a/src/Storages/S3Queue/S3QueueUnorderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueUnorderedFileMetadata.cpp @@ -130,8 +130,11 @@ void S3QueueUnorderedFileMetadata::setProcessedImpl() const auto code = zk_client->tryMulti(requests, responses); if (code == Coordination::Error::ZOK) { - if (max_loading_retries) - zk_client->tryRemove(failed_node_path + ".retriable", -1); + if (max_loading_retries + && zk_client->tryRemove(failed_node_path + ".retriable", -1) == Coordination::Error::ZOK) + { + LOG_TEST(log, "Removed node {}.retriable", failed_node_path); + } LOG_TRACE(log, "Moved file `{}` to processed (node path: {})", path, processed_node_path); return; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index ed0b9464b1b..112807f4b9f 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -466,7 +466,7 @@ bool StorageS3Queue::streamToViews() auto file_iterator = createFileIterator(s3queue_context, nullptr); size_t total_rows = 0; - while (!file_iterator->isFinished()) + while (!shutdown_called && !file_iterator->isFinished()) { InterpreterInsertQuery interpreter(insert, s3queue_context, false, true, true); auto block_io = interpreter.execute(); From 5e213095a88a59a30ef3fe48397986d4cca69410 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 14 Jun 2024 17:37:01 +0200 Subject: [PATCH 23/95] Nicer --- .../ObjectStorage/StorageObjectStorageSource.cpp | 10 +++++----- .../ObjectStorage/StorageObjectStorageSource.h | 3 +-- src/Storages/S3Queue/S3QueueSource.cpp | 12 ++++++------ 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index b31d0f8a92e..aface19307e 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -193,19 +193,19 @@ Chunk StorageObjectStorageSource::generate() progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); const auto & object_info = reader.getObjectInfo(); - const auto & filename = object_info.getFileName(); - chassert(object_info.metadata); + const auto & filename = object_info->getFileName(); + chassert(object_info->metadata); VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, - getUniqueStoragePathIdentifier(*configuration, reader.getObjectInfo(), false), - object_info.metadata->size_bytes, &filename); + getUniqueStoragePathIdentifier(*configuration, *object_info, false), + object_info->metadata->size_bytes, &filename); return chunk; } if (reader.getInputFormat() && getContext()->getSettingsRef().use_cache_for_count_from_files) - addNumRowsToCache(reader.getObjectInfo(), total_rows_in_file); + addNumRowsToCache(*reader.getObjectInfo(), total_rows_in_file); total_rows_in_file = 0; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index da4d33fc0a8..1b7c3b9be3f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -100,8 +100,7 @@ protected: PullingPipelineExecutor * operator->() { return reader.get(); } const PullingPipelineExecutor * operator->() const { return reader.get(); } - const ObjectInfo & getObjectInfo() const { return *object_info; } - ObjectInfoPtr getObjectInfoPtr() const { return object_info; } + ObjectInfoPtr getObjectInfo() const { return object_info; } const IInputFormat * getInputFormat() const { return dynamic_cast(source.get()); } private: diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 1bd91674ecf..72f2f158e33 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -415,7 +415,7 @@ Chunk StorageS3QueueSource::generateImpl() break; } - const auto * object_info = dynamic_cast(&reader.getObjectInfo()); + const auto * object_info = dynamic_cast(reader.getObjectInfo().get()); auto file_metadata = object_info->file_metadata; auto file_status = file_metadata->getFileStatus(); @@ -435,14 +435,14 @@ Chunk StorageS3QueueSource::generateImpl() object_info->relative_path, getCurrentExceptionMessage(true)); } - appendLogElement(reader.getObjectInfo().getPath(), *file_status, processed_rows_from_file, false); + appendLogElement(reader.getObjectInfo()->getPath(), *file_status, processed_rows_from_file, false); } LOG_TEST(log, "Query is cancelled"); break; } - const auto & path = reader.getObjectInfo().getPath(); + const auto & path = reader.getObjectInfo()->getPath(); if (shutdown_called) { @@ -497,7 +497,7 @@ Chunk StorageS3QueueSource::generateImpl() total_processed_bytes += chunk.bytes(); VirtualColumnUtils::addRequestedPathFileAndSizeVirtualsToChunk( - chunk, requested_virtual_columns, path, reader.getObjectInfo().metadata->size_bytes); + chunk, requested_virtual_columns, path, reader.getObjectInfo()->metadata->size_bytes); return chunk; } } @@ -517,7 +517,7 @@ Chunk StorageS3QueueSource::generateImpl() chassert(file_iterator); if (file_status->retries < file_metadata->getMaxTries()) - file_iterator->returnForRetry(reader.getObjectInfoPtr()); + file_iterator->returnForRetry(reader.getObjectInfo()); /// If we did not process any rows from the failed file, /// commit all previously processed files, @@ -597,7 +597,7 @@ Chunk StorageS3QueueSource::generateImpl() break; } - file_status = 
files_metadata->getFileStatus(reader.getObjectInfo().getPath()); + file_status = files_metadata->getFileStatus(reader.getObjectInfo()->getPath()); if (!rows_or_bytes_or_time_limit_reached && processed_files.size() + 1 < max_processed_files_before_commit) { From 99167d3b87ad348094dee68b1ab90d6b0879caa9 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 14 Jun 2024 18:32:12 +0200 Subject: [PATCH 24/95] fixing tests --- src/Client/Connection.cpp | 2 +- src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp | 3 ++- src/Storages/ObjectStorage/StorageObjectStorageSink.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index c53b5bdaf81..c5b8ebfa88e 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -73,7 +73,7 @@ namespace ErrorCodes Connection::~Connection() { if (connected) - disconnect(); + Connection::disconnect(); } Connection::Connection(const String & host_, UInt16 port_, diff --git a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp index 2c14b38ce01..5abfa878525 100644 --- a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp @@ -147,7 +147,8 @@ void WriteBufferFromHDFS::finalizeImpl() WriteBufferFromHDFS::~WriteBufferFromHDFS() { - finalize(); + if (!canceled) + finalize(); } } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index 48eba3ef741..22ba537f3b8 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -51,8 +51,8 @@ void StorageObjectStorageSink::onCancel() { std::lock_guard lock(cancel_mutex); cancelBuffers(); - cancelled = true; releaseBuffers(); + cancelled = true; } void StorageObjectStorageSink::onException(std::exception_ptr) From 0f2c2cc9bfd4be6007a8f8ed2e4a44a44680d47e Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 17 Jun 2024 17:11:17 +0200 Subject: [PATCH 25/95] Fix tests --- src/Storages/S3Queue/S3QueueIFileMetadata.cpp | 6 ++-- src/Storages/S3Queue/S3QueueIFileMetadata.h | 2 +- src/Storages/S3Queue/S3QueueSource.cpp | 35 +++++++++++++------ src/Storages/S3Queue/S3QueueSource.h | 3 +- src/Storages/S3Queue/StorageS3Queue.cpp | 15 ++++++-- .../integration/test_storage_s3_queue/test.py | 8 ++--- 6 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp index 44362ab1110..1e2daa97639 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp @@ -251,13 +251,15 @@ void S3QueueIFileMetadata::setProcessed() LOG_TRACE(log, "Set file {} as processed (rows: {})", path, file_status->processed_rows); } -void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_retry_count) +void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_retry_count, bool overwrite_status) { LOG_TRACE(log, "Setting file {} as failed (path: {}, reduce retry count: {}, exception: {})", path, failed_node_path, reduce_retry_count, exception); ProfileEvents::increment(ProfileEvents::S3QueueFailedFiles); - chassert(file_status->state == FileStatus::State::Failed); + if (overwrite_status || file_status->state != FileStatus::State::Failed) + file_status->onFailed(exception); + node_metadata.last_exception = exception; if 
(reduce_retry_count) diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index 6d00aec5fd8..7b978e8c580 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -54,7 +54,7 @@ public: bool setProcessing(); void setProcessed(); - void setFailed(const std::string & exception, bool reduce_retry_count = true); + void setFailed(const std::string & exception, bool reduce_retry_count, bool overwrite_status); virtual void setProcessedAtStartRequests( Coordination::Requests & requests, diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 6bb37113762..65ee929b2fb 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -54,6 +54,7 @@ StorageS3QueueSource::FileIterator::FileIterator( bool StorageS3QueueSource::FileIterator::isFinished() const { + LOG_TEST(log, "Iterator finished: {}, objects to retry: {}", iterator_finished, objects_to_retry.size()); return iterator_finished && listed_keys_cache.end() == std::find_if( listed_keys_cache.begin(), listed_keys_cache.end(), @@ -82,6 +83,8 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl if (objects_to_retry.empty()) { object_info = glob_iterator->next(processor); + if (!object_info) + iterator_finished = true; } else { @@ -91,7 +94,10 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl } if (!object_info) + { + LOG_TEST(log, "No object left"); return {}; + } if (shutdown_called) { @@ -108,6 +114,7 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl void StorageS3QueueSource::FileIterator::returnForRetry(ObjectInfoPtr object_info) { + chassert(object_info); if (metadata->useBucketsForProcessing()) { const auto bucket = metadata->getBucketForPath(object_info->relative_path); @@ -374,10 +381,13 @@ void StorageS3QueueSource::lazyInitialize(size_t processor) if (initialized) return; + LOG_TEST(log, "Initializing a new reader"); + internal_source->lazyInitialize(processor); reader = std::move(internal_source->reader); if (reader) reader_future = std::move(internal_source->reader_future); + initialized = true; } @@ -427,7 +437,7 @@ Chunk StorageS3QueueSource::generateImpl() { try { - file_metadata->setFailed("Cancelled"); + file_metadata->setFailed("Cancelled", /* reduce_retry_count */true, /* overwrite_status */false); } catch (...) { @@ -459,7 +469,7 @@ Chunk StorageS3QueueSource::generateImpl() try { - file_metadata->setFailed("Table is dropped"); + file_metadata->setFailed("Table is dropped", /* reduce_retry_count */true, /* overwrite_status */false); } catch (...) { @@ -511,10 +521,9 @@ Chunk StorageS3QueueSource::generateImpl() const auto message = getCurrentExceptionMessage(true); LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", path, message); - appendLogElement(path, *file_status, processed_rows_from_file, false); - - failed_files.push_back(file_metadata); + failed_during_read_files.push_back(file_metadata); file_status->onFailed(getCurrentExceptionMessage(true)); + appendLogElement(path, *file_status, processed_rows_from_file, false); if (processed_rows_from_file == 0) { @@ -619,7 +628,7 @@ Chunk StorageS3QueueSource::generateImpl() void StorageS3QueueSource::commit(bool success, const std::string & exception) { LOG_TEST(log, "Having {} files to set as {}, failed files: {}", - processed_files.size(), success ? 
"Processed" : "Failed", failed_files.size()); + processed_files.size(), success ? "Processed" : "Failed", failed_during_read_files.size()); for (const auto & file_metadata : processed_files) { @@ -629,14 +638,20 @@ void StorageS3QueueSource::commit(bool success, const std::string & exception) applyActionAfterProcessing(file_metadata->getPath()); } else - file_metadata->setFailed(exception, /* reduce_retry_count */false); + file_metadata->setFailed( + exception, + /* reduce_retry_count */false, + /* overwrite_status */true); } - for (const auto & file_metadata : failed_files) + for (const auto & file_metadata : failed_during_read_files) { /// `exception` from commit args is from insertion to storage. - /// Here we do not used it as failed_files were not inserted into storage, but skipped. - file_metadata->setFailed(file_metadata->getFileStatus()->getException(), /* reduce_retry_count */true); + /// Here we do not used it as failed_during_read_files were not inserted into storage, but skipped. + file_metadata->setFailed( + file_metadata->getFileStatus()->getException(), + /* reduce_retry_count */true, + /* overwrite_status */false); } } diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 12ba6953fcc..a59f1d5cb05 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -85,6 +85,7 @@ public: std::deque objects_to_retry; std::pair getNextKeyFromAcquiredBucket(size_t processor); + bool hasKeysForProcessor(const Processor & processor) const; }; StorageS3QueueSource( @@ -137,7 +138,7 @@ private: LoggerPtr log; std::vector processed_files; - std::vector failed_files; + std::vector failed_during_read_files; ReaderHolder reader; std::future reader_future; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 112807f4b9f..b00d912d9b5 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -72,7 +72,12 @@ namespace return zkutil::extractZooKeeperPath(result_zk_path, true); } - void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, bool is_attach, const LoggerPtr & log) + void checkAndAdjustSettings( + S3QueueSettings & s3queue_settings, + const Settings & settings, + bool is_attach, + const LoggerPtr & log, + ASTStorage * engine_args) { if (!is_attach && !s3queue_settings.mode.changed) { @@ -100,6 +105,10 @@ namespace if (!is_attach && !s3queue_settings.s3queue_processing_threads_num.changed) { s3queue_settings.s3queue_processing_threads_num = std::max(getNumberOfPhysicalCPUCores(), 16); + engine_args->settings->as()->changes.insertSetting( + "s3queue_processing_threads_num", + s3queue_settings.s3queue_processing_threads_num.value); + LOG_TRACE(log, "Set `processing_threads_num` to {}", s3queue_settings.s3queue_processing_threads_num); } } @@ -114,7 +123,7 @@ StorageS3Queue::StorageS3Queue( const String & comment, ContextPtr context_, std::optional format_settings_, - ASTStorage * /* engine_args */, + ASTStorage * engine_args, LoadingStrictnessLevel mode) : IStorage(table_id_) , WithContext(context_) @@ -138,7 +147,7 @@ StorageS3Queue::StorageS3Queue( throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } - checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE, log); + checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE, log, 
engine_args); object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index 1da7a084779..bd424d9cc0e 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -793,8 +793,8 @@ def test_max_set_age(started_cluster): additional_settings={ "keeper_path": keeper_path, "s3queue_tracked_file_ttl_sec": max_age, - "s3queue_cleanup_interval_min_ms": 0, - "s3queue_cleanup_interval_max_ms": 0, + "s3queue_cleanup_interval_min_ms": max_age / 3, + "s3queue_cleanup_interval_max_ms": max_age / 3, "s3queue_loading_retries": 0, "s3queue_processing_threads_num": 1, "s3queue_loading_retries": 0, @@ -822,7 +822,7 @@ def test_max_set_age(started_cluster): assert expected_rows == get_count() assert 10 == int(node.query(f"SELECT uniq(_path) from {dst_table_name}")) - time.sleep(max_age + 1) + time.sleep(max_age + 5) expected_rows = 20 @@ -1671,7 +1671,7 @@ def test_commit_on_limit(started_cluster): additional_settings={ "keeper_path": keeper_path, "s3queue_processing_threads_num": 1, - "s3queue_loading_retries": 1, + "s3queue_loading_retries": 0, "s3queue_max_processed_files_before_commit": 10, }, ) From e77c0ad029b6843c8b08d4d008772027b3d42f13 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 17 Jun 2024 22:28:28 +0200 Subject: [PATCH 26/95] Support inline expressions --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 24 ++++++++++--- src/Interpreters/Context.cpp | 4 +-- .../FunctionParameterValuesVisitor.cpp | 34 ++++++++++++------- src/Parsers/FunctionParameterValuesVisitor.h | 10 +++++- ...03167_parametrized_view_with_cte.reference | 3 ++ .../03167_parametrized_view_with_cte.sql | 3 ++ 6 files changed, 58 insertions(+), 20 deletions(-) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index dde89250a53..71c7860c13c 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -3496,7 +3497,8 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi * * 4. If node has alias, update its value in scope alias map. Deregister alias from expression_aliases_in_resolve_process. 
*/ -ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool ignore_alias) +ProjectionNames QueryAnalyzer::resolveExpressionNode( + QueryTreeNodePtr & node, IdentifierResolveScope & scope, bool allow_lambda_expression, bool allow_table_expression, bool ignore_alias) { checkStackSize(); @@ -4506,11 +4508,25 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, table_name = table_identifier[1]; } - NameToNameMap param_values = analyzeFunctionParamValues(function_ast, &scope.aliases); - auto parametrized_view_storage = scope_context->getQueryContext()->buildParametrizedViewStorage( + auto context = scope_context->getQueryContext(); + auto param_values_result = analyzeFunctionParamValues(function_ast, context, &scope.aliases); + + for (const auto & [param_name, alias] : param_values_result.unresolved_values) + { + auto it = scope.aliases.alias_name_to_expression_node_before_group_by.find(alias); + if (it != scope.aliases.alias_name_to_expression_node_before_group_by.end()) + { + auto res = resolveExpressionNode(it->second, scope, false, false); + auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(it->second->toAST(), context); + auto resolved_value_str = convertFieldToString(resolved_value->as()->value); + param_values_result.resolved_values.emplace(param_name, std::move(resolved_value_str)); + } + } + + auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, - param_values); + param_values_result.resolved_values); if (parametrized_view_storage) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 6804ef9e5e4..b606d6a3628 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1871,7 +1871,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext(), nullptr); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext(), nullptr).resolved_values; StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; @@ -2085,7 +2085,7 @@ StoragePtr Context::buildParametrizedViewStorage(const String & database_name, c return nullptr; auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - StorageView::replaceQueryParametersIfParametrizedView(query, getQueryContext(), param_values); + StorageView::replaceQueryParametersIfParametrizedView(query, param_values); ASTCreateQuery create; create.select = query->as(); diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index dd3e536aadf..2b4f92d39c8 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -25,8 +25,13 @@ namespace ErrorCodes class FunctionParameterValuesVisitor { public: - explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, ContextPtr context_, const ScopeAliases * aliases_) - : parameter_values(parameter_values_), aliases(aliases_), context(context_) + explicit FunctionParameterValuesVisitor( + ParamValuesAnalyzeResult & 
result_, + ContextPtr context_, + const ScopeAliases * aliases_) + : result(result_) + , aliases(aliases_) + , context(context_) { } @@ -40,11 +45,11 @@ public: } private: - NameToNameMap & parameter_values; + ParamValuesAnalyzeResult & result; const ScopeAliases * aliases; ContextPtr context; - std::string tryGetParameterValueAsString(const ASTPtr & ast) + std::string tryGetParameterValueAsString(const std::string & param_name, const ASTPtr & ast) { if (const auto * literal = ast->as()) { @@ -57,7 +62,10 @@ private: auto it = aliases->alias_name_to_expression_node_before_group_by.find(value_identifier->name()); if (it != aliases->alias_name_to_expression_node_before_group_by.end()) { - return tryGetParameterValueAsString(it->second->toAST()); + auto value_str = tryGetParameterValueAsString(param_name, it->second->toAST()); + if (value_str.empty()) + result.unresolved_values.emplace(param_name, value_identifier->name()); + return value_str; } } } @@ -76,8 +84,8 @@ private: } else { - ASTPtr res = evaluateConstantExpressionOrIdentifierAsLiteral(expression_list->children[1], context); - parameter_values[identifier->name()] = convertFieldToString(res->as()->value); + ASTPtr res = evaluateConstantExpressionOrIdentifierAsLiteral(ast, context); + return convertFieldToString(res->as()->value); } } return ""; @@ -95,18 +103,18 @@ private: if (const auto * identifier = expression_list->children[0]->as()) { - auto value_str = tryGetParameterValueAsString(expression_list->children[1]); + auto value_str = tryGetParameterValueAsString(identifier->name(), expression_list->children[1]); if (!value_str.empty()) - parameter_values[identifier->name()] = value_str; + result.resolved_values[identifier->name()] = value_str; } } }; -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases) +ParamValuesAnalyzeResult analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases) { - NameToNameMap parameter_values; - FunctionParameterValuesVisitor(parameter_values, context, scope_aliases).visit(ast); - return parameter_values; + ParamValuesAnalyzeResult result; + FunctionParameterValuesVisitor(result, context, scope_aliases).visit(ast); + return result; } diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index 7193066ee76..89d4fe4e18b 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -10,6 +10,14 @@ namespace DB struct ScopeAliases; /// Find parameters in a query parameter values and collect them into map. 
-NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, ContextPtr context, const ScopeAliases * scope_aliases = nullptr); +struct ParamValuesAnalyzeResult +{ + /// Param name -> resolved param value + NameToNameMap resolved_values; + /// Pram name -> alias + NameToNameMap unresolved_values; +}; + +ParamValuesAnalyzeResult analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases = nullptr); } diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference index 90afb158f23..951910bbe74 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference @@ -1,2 +1,5 @@ OK 123 +123 +123 +123 diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql index 433f4ed040b..c64200ee8ff 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql @@ -1,4 +1,7 @@ SET allow_experimental_analyzer=1; CREATE OR REPLACE VIEW param_test AS SELECT {test_str:String} as s_result; WITH 'OK' AS s SELECT * FROM param_test(test_str=s); +WITH (SELECT 123) AS s SELECT * FROM param_test(test_str=s); +WITH (SELECT 100 + 20 + 3) AS s SELECT * FROM param_test(test_str=s); +WITH (SELECT number FROM numbers(123, 1)) AS s SELECT * FROM param_test(test_str=s); WITH CAST(123, String) AS s SELECT * FROM param_test(test_str=s); From 517c1138113220e5a4318c07806759911896234d Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 18 Jun 2024 01:19:34 +0200 Subject: [PATCH 27/95] remove debug print --- src/Processors/IProcessor.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/IProcessor.cpp b/src/Processors/IProcessor.cpp index 01275785389..f403aca2280 100644 --- a/src/Processors/IProcessor.cpp +++ b/src/Processors/IProcessor.cpp @@ -16,7 +16,6 @@ void IProcessor::cancel() if (already_cancelled) return; - LOG_DEBUG(getLogger("IProcessor"), "cancel: {}", getName()); onCancel(); } From 660e7f8dd2f62f56c2731f6149e6a85d13fb8bc4 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 18 Jun 2024 11:05:00 +0200 Subject: [PATCH 28/95] Better --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 23 ++++++++++++++----- src/Interpreters/Context.cpp | 2 +- .../FunctionParameterValuesVisitor.cpp | 15 +++++------- src/Parsers/FunctionParameterValuesVisitor.h | 6 ++--- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 71c7860c13c..e9f5a55dba9 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -4511,22 +4511,33 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, auto context = scope_context->getQueryContext(); auto param_values_result = analyzeFunctionParamValues(function_ast, context, &scope.aliases); - for (const auto & [param_name, alias] : param_values_result.unresolved_values) + for (const auto & [param_name, alias] : param_values_result.unresolved_param_aliases) { auto it = scope.aliases.alias_name_to_expression_node_before_group_by.find(alias); if (it != scope.aliases.alias_name_to_expression_node_before_group_by.end()) { - auto res = resolveExpressionNode(it->second, scope, false, false); - auto resolved_value = 
evaluateConstantExpressionOrIdentifierAsLiteral(it->second->toAST(), context); - auto resolved_value_str = convertFieldToString(resolved_value->as()->value); - param_values_result.resolved_values.emplace(param_name, std::move(resolved_value_str)); + std::string resolved_value_str; + try + { + resolveExpressionNode(it->second, scope, /* allow_lambda_expression */false, /* allow_table_expression */false); + auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(it->second->toAST(), context); + resolved_value_str = convertFieldToString(resolved_value->as()->value); + } + catch (...) + { + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Failed to resolve alias ({}) value for parameter {} for parametrized view function: {}. Error: {}", + alias, param_name, it->second->formatASTForErrorMessage(), getCurrentExceptionMessage(true)); + } + param_values_result.resolved_param_values.emplace(param_name, std::move(resolved_value_str)); } } auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, - param_values_result.resolved_values); + param_values_result.resolved_param_values); if (parametrized_view_storage) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b606d6a3628..e5d1fc5248d 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1871,7 +1871,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext(), nullptr).resolved_values; + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext(), nullptr).resolved_param_values; StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index 2b4f92d39c8..452ca20ab97 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -4,10 +4,7 @@ #include #include #include -#include #include -#include -#include #include #include #include @@ -49,7 +46,7 @@ private: const ScopeAliases * aliases; ContextPtr context; - std::string tryGetParameterValueAsString(const std::string & param_name, const ASTPtr & ast) + std::optional tryGetParameterValueAsString(const std::string & param_name, const ASTPtr & ast) { if (const auto * literal = ast->as()) { @@ -63,8 +60,8 @@ private: if (it != aliases->alias_name_to_expression_node_before_group_by.end()) { auto value_str = tryGetParameterValueAsString(param_name, it->second->toAST()); - if (value_str.empty()) - result.unresolved_values.emplace(param_name, value_identifier->name()); + if (!value_str.has_value()) + result.unresolved_param_aliases.emplace(param_name, value_identifier->name()); return value_str; } } @@ -88,7 +85,7 @@ private: return convertFieldToString(res->as()->value); } } - return ""; + return std::nullopt; } void visitFunction(const ASTFunction & parameter_function) @@ -104,8 +101,8 @@ private: if (const auto * identifier = expression_list->children[0]->as()) { auto value_str = tryGetParameterValueAsString(identifier->name(), expression_list->children[1]); - if (!value_str.empty()) - result.resolved_values[identifier->name()] = value_str; + 
if (value_str.has_value()) + result.resolved_param_values[identifier->name()] = value_str.value(); } } }; diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index 89d4fe4e18b..50dca8d86de 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -13,9 +13,9 @@ struct ScopeAliases; struct ParamValuesAnalyzeResult { /// Param name -> resolved param value - NameToNameMap resolved_values; - /// Pram name -> alias - NameToNameMap unresolved_values; + NameToNameMap resolved_param_values; + /// Param name -> alias + NameToNameMap unresolved_param_aliases; }; ParamValuesAnalyzeResult analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases = nullptr); From bd9241dabe53f6919d84e4373d59060513640d6b Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 18 Jun 2024 12:10:32 +0200 Subject: [PATCH 29/95] Add log message --- .../S3Queue/S3QueueOrderedFileMetadata.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp index 7c922c0c0a3..79b39f5f641 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp @@ -61,7 +61,9 @@ void S3QueueOrderedFileMetadata::BucketHolder::release() return; released = true; - LOG_TEST(getLogger("S3QueueBucketHolder"), "Releasing bucket {}", bucket_info->bucket); + + LOG_TEST(getLogger("S3QueueBucketHolder"), "Releasing bucket {}, version {}", + bucket_info->bucket, bucket_info->bucket_version); Coordination::Requests requests; /// Check that bucket lock version has not changed @@ -72,6 +74,16 @@ void S3QueueOrderedFileMetadata::BucketHolder::release() Coordination::Responses responses; const auto code = zk_client->tryMulti(requests, responses); + + if (code == Coordination::Error::ZOK) + LOG_TEST(getLogger("S3QueueBucketHolder"), "Released bucket {}, version {}", + bucket_info->bucket, bucket_info->bucket_version); + else + LOG_TRACE(getLogger("S3QueueBucketHolder"), + "Failed to released bucket {}, version {}: {}. 
" + "This is normal if keeper session expired.", + bucket_info->bucket, bucket_info->bucket_version, code); + zkutil::KeeperMultiException::check(code, requests, responses); } From 1c415479f0288f7f9ef120e88747acb9a88350c7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 18 Jun 2024 18:23:42 +0200 Subject: [PATCH 30/95] Do not release buckets until the files are commited --- .../S3Queue/S3QueueOrderedFileMetadata.h | 4 + src/Storages/S3Queue/S3QueueSource.cpp | 74 +++++++++++-------- src/Storages/S3Queue/S3QueueSource.h | 64 ++++++++-------- src/Storages/S3Queue/StorageS3Queue.cpp | 3 + .../integration/test_storage_s3_queue/test.py | 2 +- 5 files changed, 86 insertions(+), 61 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h index 698ec0f54cc..6d495a63c51 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h @@ -86,12 +86,16 @@ struct S3QueueOrderedFileMetadata::BucketHolder Bucket getBucket() const { return bucket_info->bucket; } BucketInfoPtr getBucketInfo() const { return bucket_info; } + void setFinished() { finished = true; } + bool isFinished() const { return finished; } + void release(); private: BucketInfoPtr bucket_info; const zkutil::ZooKeeperPtr zk_client; bool released = false; + bool finished = false; }; } diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 65ee929b2fb..0809860e3a5 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -33,7 +33,7 @@ namespace ErrorCodes StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( const ObjectInfo & object_info, - Metadata::FileMetadataPtr file_metadata_) + S3QueueMetadata::FileMetadataPtr file_metadata_) : ObjectInfo(object_info.relative_path, object_info.metadata) , file_metadata(file_metadata_) { @@ -41,7 +41,7 @@ StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( StorageS3QueueSource::FileIterator::FileIterator( std::shared_ptr metadata_, - std::unique_ptr glob_iterator_, + std::unique_ptr glob_iterator_, std::atomic & shutdown_called_, LoggerPtr logger_) : StorageObjectStorageSource::IIterator("S3QueueIterator") @@ -67,9 +67,9 @@ size_t StorageS3QueueSource::FileIterator::estimatedKeysCount() throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method estimateKeysCount is not implemented"); } -StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl(size_t processor) +StorageS3QueueSource::Source::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl(size_t processor) { - ObjectInfoPtr object_info; + Source::ObjectInfoPtr object_info; S3QueueOrderedFileMetadata::BucketInfoPtr bucket_info; while (!shutdown_called) @@ -112,7 +112,7 @@ StorageS3QueueSource::ObjectInfoPtr StorageS3QueueSource::FileIterator::nextImpl return {}; } -void StorageS3QueueSource::FileIterator::returnForRetry(ObjectInfoPtr object_info) +void StorageS3QueueSource::FileIterator::returnForRetry(Source::ObjectInfoPtr object_info) { chassert(object_info); if (metadata->useBucketsForProcessing()) @@ -126,20 +126,30 @@ void StorageS3QueueSource::FileIterator::returnForRetry(ObjectInfoPtr object_inf } } +void StorageS3QueueSource::FileIterator::releaseHoldBuckets() +{ + for (const auto & [_, holders] : bucket_holders) + for (const auto & bucket_holder : holders) + bucket_holder->release(); +} -std::pair +std::pair StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processor) { 
/// We need this lock to maintain consistency between listing s3 directory /// and getting/putting result into listed_keys_cache. std::lock_guard lock(buckets_mutex); - auto bucket_holder_it = bucket_holders.emplace(processor, nullptr).first; + auto bucket_holder_it = bucket_holders.emplace(processor, std::vector{}).first; + BucketHolder * current_bucket_holder = bucket_holder_it->second.empty() || bucket_holder_it->second.back()->isFinished() + ? nullptr + : bucket_holder_it->second.back().get(); + auto current_processor = toString(processor); LOG_TEST( log, "Current processor: {}, acquired bucket: {}", - processor, bucket_holder_it->second ? toString(bucket_holder_it->second->getBucket()) : "None"); + processor, current_bucket_holder ? toString(current_bucket_holder->getBucket()) : "None"); while (true) { @@ -148,9 +158,9 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo /// In case it is already acquired, they put the key into listed_keys_cache, /// so that the thread who acquired the bucket will be able to see /// those keys without the need to list s3 directory once again. - if (bucket_holder_it->second) + if (current_bucket_holder) { - const auto bucket = bucket_holder_it->second->getBucket(); + const auto bucket = current_bucket_holder->getBucket(); auto it = listed_keys_cache.find(bucket); if (it != listed_keys_cache.end()) { @@ -183,7 +193,7 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo LOG_TEST(log, "Current bucket: {}, will process file: {}", bucket, object_info->getFileName()); - return std::pair{object_info, bucket_holder_it->second->getBucketInfo()}; + return std::pair{object_info, current_bucket_holder->getBucketInfo()}; } LOG_TEST(log, "Cache of bucket {} is empty", bucket); @@ -198,9 +208,9 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo if (iterator_finished) { - /// Bucket is fully processed - release the bucket. - bucket_holder_it->second->release(); - bucket_holder_it->second.reset(); + /// Bucket is fully processed, but we will release it later + /// - once we write and commit files via commit() method. + current_bucket_holder->setFinished(); } } /// If processing thread has already acquired some bucket @@ -209,8 +219,10 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo /// because one processing thread can acquire only one bucket at a time. /// Once a thread is finished with its acquired bucket, it checks listed_keys_cache /// to see if there are keys from buckets not acquired by anyone. 
- if (!bucket_holder_it->second) + if (!current_bucket_holder) { + LOG_TEST(log, "Checking caches keys: {}", listed_keys_cache.size()); + for (auto it = listed_keys_cache.begin(); it != listed_keys_cache.end();) { auto & [bucket, bucket_info] = *it; @@ -235,8 +247,8 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo continue; } - bucket_holder_it->second = metadata->tryAcquireBucket(bucket, current_processor); - if (!bucket_holder_it->second) + auto acquired_bucket = metadata->tryAcquireBucket(bucket, current_processor); + if (!acquired_bucket) { LOG_TEST(log, "Bucket {} is already locked for processing (keys: {})", bucket, bucket_keys.size()); @@ -244,6 +256,9 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo continue; } + bucket_holder_it->second.push_back(acquired_bucket); + current_bucket_holder = bucket_holder_it->second.back().get(); + bucket_processor = current_processor; /// Take the key from the front, the order is important. @@ -253,7 +268,7 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo LOG_TEST(log, "Acquired bucket: {}, will process file: {}", bucket, object_info->getFileName()); - return std::pair{object_info, bucket_holder_it->second->getBucketInfo()}; + return std::pair{object_info, current_bucket_holder->getBucketInfo()}; } } @@ -271,12 +286,12 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo LOG_TEST(log, "Found next file: {}, bucket: {}, current bucket: {}, cached_keys: {}", object_info->getFileName(), bucket, - bucket_holder_it->second ? toString(bucket_holder_it->second->getBucket()) : "None", + current_bucket_holder ? toString(current_bucket_holder->getBucket()) : "None", bucket_cache.keys.size()); - if (bucket_holder_it->second) + if (current_bucket_holder) { - if (bucket_holder_it->second->getBucket() != bucket) + if (current_bucket_holder->getBucket() != bucket) { /// Acquired bucket differs from object's bucket, /// put it into bucket's cache and continue. @@ -284,13 +299,16 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo continue; } /// Bucket is already acquired, process the file. 
- return std::pair{object_info, bucket_holder_it->second->getBucketInfo()}; + return std::pair{object_info, current_bucket_holder->getBucketInfo()}; } else { - bucket_holder_it->second = metadata->tryAcquireBucket(bucket, current_processor); - if (bucket_holder_it->second) + auto acquired_bucket = metadata->tryAcquireBucket(bucket, current_processor); + if (acquired_bucket) { + bucket_holder_it->second.push_back(acquired_bucket); + current_bucket_holder = bucket_holder_it->second.back().get(); + bucket_cache.processor = current_processor; if (!bucket_cache.keys.empty()) { @@ -300,7 +318,7 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo object_info = bucket_cache.keys.front(); bucket_cache.keys.pop_front(); } - return std::pair{object_info, bucket_holder_it->second->getBucketInfo()}; + return std::pair{object_info, current_bucket_holder->getBucketInfo()}; } else { @@ -312,12 +330,6 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo } else { - if (bucket_holder_it->second) - { - bucket_holder_it->second->release(); - bucket_holder_it->second.reset(); - } - LOG_TEST(log, "Reached the end of file iterator"); iterator_finished = true; diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index a59f1d5cb05..c4788e755f4 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -20,24 +20,18 @@ class StorageS3QueueSource : public ISource, WithContext { public: using Storage = StorageObjectStorage; - using ConfigurationPtr = Storage::ConfigurationPtr; - using GlobIterator = StorageObjectStorageSource::GlobIterator; - using ZooKeeperGetter = std::function; + using Source = StorageObjectStorageSource; using RemoveFileFunc = std::function; - using FileStatusPtr = S3QueueMetadata::FileStatusPtr; - using ReaderHolder = StorageObjectStorageSource::ReaderHolder; - using Metadata = S3QueueMetadata; - using ObjectInfo = StorageObjectStorageSource::ObjectInfo; - using ObjectInfoPtr = std::shared_ptr; - using ObjectInfos = std::vector; + using BucketHolderPtr = S3QueueOrderedFileMetadata::BucketHolderPtr; + using BucketHolder = S3QueueOrderedFileMetadata::BucketHolder; - struct S3QueueObjectInfo : public ObjectInfo + struct S3QueueObjectInfo : public Source::ObjectInfo { S3QueueObjectInfo( - const ObjectInfo & object_info, - Metadata::FileMetadataPtr file_metadata_); + const Source::ObjectInfo & object_info, + S3QueueMetadata::FileMetadataPtr file_metadata_); - Metadata::FileMetadataPtr file_metadata; + S3QueueMetadata::FileMetadataPtr file_metadata; }; class FileIterator : public StorageObjectStorageSource::IIterator @@ -45,7 +39,7 @@ public: public: FileIterator( std::shared_ptr metadata_, - std::unique_ptr glob_iterator_, + std::unique_ptr glob_iterator_, std::atomic & shutdown_called_, LoggerPtr logger_); @@ -54,37 +48,51 @@ public: /// Note: /// List results in s3 are always returned in UTF-8 binary order. /// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html) - ObjectInfoPtr nextImpl(size_t processor) override; + Source::ObjectInfoPtr nextImpl(size_t processor) override; size_t estimatedKeysCount() override; - void returnForRetry(ObjectInfoPtr object_info); + /// If the key was taken from iterator via next() call, + /// we might later want to return it back for retrying. + void returnForRetry(Source::ObjectInfoPtr object_info); + + /// Release hold buckets. 
+ /// In fact, they will anyway be release in destructors of BucketHolder, + /// but we anyway release it explicitly, + /// because we want to be able to rethrow exceptions if they might happen. + void releaseHoldBuckets(); private: using Bucket = S3QueueMetadata::Bucket; using Processor = S3QueueMetadata::Processor; const std::shared_ptr metadata; - const std::unique_ptr glob_iterator; + const std::unique_ptr glob_iterator; std::atomic & shutdown_called; std::mutex mutex; LoggerPtr log; - std::mutex buckets_mutex; struct ListedKeys { - std::deque keys; + std::deque keys; std::optional processor; }; - std::unordered_map listed_keys_cache; + /// A cache of keys which were iterated via glob_iterator, but not taken for processing. + std::unordered_map listed_keys_cache TSA_GUARDED_BY(buckets_mutex); + + /// We store a vector of holders, because we cannot release them until processed files are commited. + std::unordered_map> bucket_holders TSA_GUARDED_BY(buckets_mutex); + /// Protects bucket_holders. + std::mutex buckets_mutex; + + /// Is glob_iterator finished? bool iterator_finished = false; - std::unordered_map bucket_holders; /// Only for processing without buckets. - std::deque objects_to_retry; + std::deque objects_to_retry; - std::pair getNextKeyFromAcquiredBucket(size_t processor); + std::pair getNextKeyFromAcquiredBucket(size_t processor); bool hasKeysForProcessor(const Processor & processor) const; }; @@ -137,11 +145,11 @@ private: RemoveFileFunc remove_file_func; LoggerPtr log; - std::vector processed_files; - std::vector failed_during_read_files; + std::vector processed_files; + std::vector failed_during_read_files; - ReaderHolder reader; - std::future reader_future; + Source::ReaderHolder reader; + std::future reader_future; std::atomic initialized{false}; size_t processed_rows_from_file = 0; @@ -150,8 +158,6 @@ private: Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE}; - S3QueueOrderedFileMetadata::BucketHolderPtr current_bucket_holder; - Chunk generateImpl(); void applyActionAfterProcessing(const String & path); void appendLogElement(const std::string & filename, S3QueueMetadata::FileStatus & file_status_, size_t processed_rows, bool processed); diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index b00d912d9b5..70bc7bdbe91 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -521,12 +521,15 @@ bool StorageS3Queue::streamToViews() { for (auto & source : sources) source->commit(/* success */false, getCurrentExceptionMessage(true)); + + file_iterator->releaseHoldBuckets(); throw; } for (auto & source : sources) source->commit(/* success */true); + file_iterator->releaseHoldBuckets(); total_rows += rows; } diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py index bd424d9cc0e..c3c2ef0d803 100644 --- a/tests/integration/test_storage_s3_queue/test.py +++ b/tests/integration/test_storage_s3_queue/test.py @@ -1305,7 +1305,7 @@ def test_shards_distributed(started_cluster, mode, processing_threads): def get_count(node, table_name): return int(run_query(node, f"SELECT count() FROM {table_name}")) - for _ in range(10): + for _ in range(30): if ( get_count(node, dst_table_name) + get_count(node_2, dst_table_name) ) == total_rows: From eb0b500bb03105e804e47161f1dd2216f7ab3434 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 18 Jun 2024 22:46:48 +0200 Subject: [PATCH 31/95] Fix typo 
--- src/Storages/S3Queue/S3QueueSource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index c4788e755f4..839288a43e9 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -81,7 +81,7 @@ public: /// A cache of keys which were iterated via glob_iterator, but not taken for processing. std::unordered_map listed_keys_cache TSA_GUARDED_BY(buckets_mutex); - /// We store a vector of holders, because we cannot release them until processed files are commited. + /// We store a vector of holders, because we cannot release them until processed files are committed. std::unordered_map> bucket_holders TSA_GUARDED_BY(buckets_mutex); /// Protects bucket_holders. std::mutex buckets_mutex; From 96072954c12462aa5892cfbd3ebfb2ede5b7b988 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 19 Jun 2024 13:29:57 +0200 Subject: [PATCH 32/95] Fix build --- src/Storages/S3Queue/S3QueueSource.cpp | 6 +++--- src/Storages/S3Queue/S3QueueSource.h | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 0809860e3a5..64619446882 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -32,9 +32,9 @@ namespace ErrorCodes } StorageS3QueueSource::S3QueueObjectInfo::S3QueueObjectInfo( - const ObjectInfo & object_info, + const Source::ObjectInfo & object_info, S3QueueMetadata::FileMetadataPtr file_metadata_) - : ObjectInfo(object_info.relative_path, object_info.metadata) + : Source::ObjectInfo(object_info.relative_path, object_info.metadata) , file_metadata(file_metadata_) { } @@ -138,7 +138,7 @@ StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processo { /// We need this lock to maintain consistency between listing s3 directory /// and getting/putting result into listed_keys_cache. - std::lock_guard lock(buckets_mutex); + std::lock_guard lock(mutex); auto bucket_holder_it = bucket_holders.emplace(processor, std::vector{}).first; BucketHolder * current_bucket_holder = bucket_holder_it->second.empty() || bucket_holder_it->second.back()->isFinished() diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index c4788e755f4..cd67c076992 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -79,12 +79,10 @@ public: std::optional processor; }; /// A cache of keys which were iterated via glob_iterator, but not taken for processing. - std::unordered_map listed_keys_cache TSA_GUARDED_BY(buckets_mutex); + std::unordered_map listed_keys_cache; /// We store a vector of holders, because we cannot release them until processed files are commited. - std::unordered_map> bucket_holders TSA_GUARDED_BY(buckets_mutex); - /// Protects bucket_holders. - std::mutex buckets_mutex; + std::unordered_map> bucket_holders; /// Is glob_iterator finished? 
bool iterator_finished = false; From cb8b823867c83f503e35e77c70d0ea882695d8f4 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 19 Jun 2024 14:58:59 +0200 Subject: [PATCH 33/95] unmute the test --- tests/integration/test_checking_s3_blobs_paranoid/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_checking_s3_blobs_paranoid/test.py b/tests/integration/test_checking_s3_blobs_paranoid/test.py index e917417c173..1ed70e20b79 100644 --- a/tests/integration/test_checking_s3_blobs_paranoid/test.py +++ b/tests/integration/test_checking_s3_blobs_paranoid/test.py @@ -512,7 +512,6 @@ def test_when_s3_connection_reset_by_peer_at_create_mpu_retried( ), error -@pytest.mark.skip(reason="test is flaky, waiting ClickHouse/issues/64451") def test_query_is_canceled_with_inf_retries(cluster, broken_s3): node = cluster.instances["node_with_inf_s3_retries"] From 7f37de6ed2cfe52c6812dfdbe3e59e9e544e1603 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 19 Jun 2024 14:59:30 +0200 Subject: [PATCH 34/95] mute exceptions in dtor for Connection --- src/Client/Connection.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index c5b8ebfa88e..4b210c22642 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -72,8 +72,14 @@ namespace ErrorCodes Connection::~Connection() { - if (connected) - Connection::disconnect(); + try{ + if (connected) + Connection::disconnect(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } Connection::Connection(const String & host_, UInt16 port_, From 1b55ba32e229be248d0fcaf5e0ad998ce0ee07c0 Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Wed, 19 Jun 2024 14:59:52 +0200 Subject: [PATCH 35/95] Update src/Client/Connection.cpp Co-authored-by: Antonio Andelic --- src/Client/Connection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 4b210c22642..7d1fbe30752 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -38,7 +38,7 @@ #include #include -#include "Core/Types.h" +#include #include "config.h" #if USE_SSL From 53a15eaa83aa3fcd3840a648d0ac4af24aec80a1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 19 Jun 2024 18:13:30 +0200 Subject: [PATCH 36/95] Add a comment --- src/Storages/S3Queue/S3QueueSource.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index a4e5437ed11..4770b3134af 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -121,6 +121,8 @@ public: Chunk generate() override; + /// Commit files after insertion into storage finished. + /// `success` defines whether insertion was successful or not. 
void commit(bool success, const std::string & exception = {}); private: From 181eb79f3fa0ba216f0669a89f49ef34a99edc81 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 19 Jun 2024 18:45:59 +0200 Subject: [PATCH 37/95] Fix --- src/Storages/S3Queue/S3QueueIFileMetadata.cpp | 13 ++++++++++--- src/Storages/S3Queue/S3QueueIFileMetadata.h | 1 + src/Storages/S3Queue/S3QueueSource.cpp | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp index 1e2daa97639..41edaed58f8 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp @@ -35,6 +35,11 @@ namespace } } +void S3QueueIFileMetadata::FileStatus::setProcessingEndTime() +{ + processing_end_time = now(); +} + void S3QueueIFileMetadata::FileStatus::onProcessing() { state = FileStatus::State::Processing; @@ -44,13 +49,15 @@ void S3QueueIFileMetadata::FileStatus::onProcessing() void S3QueueIFileMetadata::FileStatus::onProcessed() { state = FileStatus::State::Processed; - processing_end_time = now(); + if (!processing_end_time) + setProcessingEndTime(); } void S3QueueIFileMetadata::FileStatus::onFailed(const std::string & exception) { state = FileStatus::State::Failed; - processing_end_time = now(); + if (!processing_end_time) + setProcessingEndTime(); std::lock_guard lock(last_exception_mutex); last_exception = exception; } @@ -233,7 +240,7 @@ void S3QueueIFileMetadata::setProcessed() LOG_TRACE(log, "Setting file {} as processed (path: {})", path, processed_node_path); ProfileEvents::increment(ProfileEvents::S3QueueProcessedFiles); - chassert(file_status->state == FileStatus::State::Processed); + file_status->onProcessed(); try { diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index 7b978e8c580..81026d34b05 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -19,6 +19,7 @@ public: None }; + void setProcessingEndTime(); void onProcessing(); void onProcessed(); void onFailed(const std::string & exception); diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 64619446882..c55a8447c2d 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -556,7 +556,7 @@ Chunk StorageS3QueueSource::generateImpl() appendLogElement(path, *file_status, processed_rows_from_file, true); - file_status->onProcessed(); + file_status->setProcessingEndTime(); file_status.reset(); processed_rows_from_file = 0; From 73c0a65d0a42f08d2fbcfc35064755b982d0e14b Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 19 Jun 2024 19:00:28 +0200 Subject: [PATCH 38/95] Rewrite visitor to analyzer --- .../Resolve/ParametrizedViewFunctionVisitor.h | 54 +++++++++++ src/Analyzer/Resolve/QueryAnalyzer.cpp | 35 ++----- src/Interpreters/Context.cpp | 2 +- .../FunctionParameterValuesVisitor.cpp | 92 +++++++------------ src/Parsers/FunctionParameterValuesVisitor.h | 11 +-- ...03167_parametrized_view_with_cte.reference | 1 - .../03167_parametrized_view_with_cte.sql | 2 +- 7 files changed, 97 insertions(+), 100 deletions(-) create mode 100644 src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h diff --git a/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h b/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h new file mode 100644 index 00000000000..1845cf22286 --- /dev/null +++ b/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h @@ 
-0,0 +1,54 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class ParametrizedViewFunctionParamsVisitor : public InDepthQueryTreeVisitor +{ +public: + ParametrizedViewFunctionParamsVisitor( + std::function resolve_node_, + const ContextPtr & context_) + : context(context_) + , resolve_node(resolve_node_) + { + } + + void visitImpl(QueryTreeNodePtr & node) + { + if (auto * function_node = node->as()) + { + if (function_node->getFunctionName() != "equals") + return; + + auto nodes = function_node->getArguments().getNodes(); + if (nodes.size() != 2) + return; + + if (auto * identifier_node = nodes[0]->as()) + { + resolve_node(nodes[1]); + auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(nodes[1]->toAST(), context); + auto resolved_value_str = convertFieldToString(resolved_value->as()->value); + params[identifier_node->getIdentifier().getFullName()] = resolved_value_str; + } + } + } + + bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr &) { return true; } + + NameToNameMap getParams() const { return params; } + +private: + NameToNameMap params; + const ContextPtr context; + std::function resolve_node; +}; +} diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index e9f5a55dba9..26fa3574865 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -25,7 +24,6 @@ #include #include #include -#include #include @@ -63,6 +61,7 @@ #include #include #include +#include namespace ProfileEvents { @@ -4509,35 +4508,19 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, } auto context = scope_context->getQueryContext(); - auto param_values_result = analyzeFunctionParamValues(function_ast, context, &scope.aliases); - - for (const auto & [param_name, alias] : param_values_result.unresolved_param_aliases) - { - auto it = scope.aliases.alias_name_to_expression_node_before_group_by.find(alias); - if (it != scope.aliases.alias_name_to_expression_node_before_group_by.end()) + auto params_visitor = ParametrizedViewFunctionParamsVisitor( + [&](QueryTreeNodePtr node) { - std::string resolved_value_str; - try - { - resolveExpressionNode(it->second, scope, /* allow_lambda_expression */false, /* allow_table_expression */false); - auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(it->second->toAST(), context); - resolved_value_str = convertFieldToString(resolved_value->as()->value); - } - catch (...) - { - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, - "Failed to resolve alias ({}) value for parameter {} for parametrized view function: {}. 
Error: {}", - alias, param_name, it->second->formatASTForErrorMessage(), getCurrentExceptionMessage(true)); - } - param_values_result.resolved_param_values.emplace(param_name, std::move(resolved_value_str)); - } - } + resolveExpressionNode(node, scope, /* allow_lambda_expression */true, /* allow_table_function */false); + }, + context); + + params_visitor.visit(table_function_node); auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, - param_values_result.resolved_param_values); + params_visitor.getParams()); if (parametrized_view_storage) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e5d1fc5248d..b714a0b26a7 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1871,7 +1871,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext(), nullptr).resolved_param_values; + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext()); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index 452ca20ab97..eaf28bbbc41 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -4,9 +4,8 @@ #include #include #include -#include -#include #include +#include #include #include @@ -22,13 +21,9 @@ namespace ErrorCodes class FunctionParameterValuesVisitor { public: - explicit FunctionParameterValuesVisitor( - ParamValuesAnalyzeResult & result_, - ContextPtr context_, - const ScopeAliases * aliases_) - : result(result_) - , aliases(aliases_) - , context(context_) + explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, ContextPtr context_) + : parameter_values(parameter_values_) + , context(context_) { } @@ -36,58 +31,14 @@ public: { if (const auto * function = ast->as()) visitFunction(*function); - for (const auto & child : ast->children) visit(child); } private: - ParamValuesAnalyzeResult & result; - const ScopeAliases * aliases; + NameToNameMap & parameter_values; ContextPtr context; - std::optional tryGetParameterValueAsString(const std::string & param_name, const ASTPtr & ast) - { - if (const auto * literal = ast->as()) - { - return convertFieldToString(literal->value); - } - else if (const auto * value_identifier = ast->as()) - { - if (aliases) - { - auto it = aliases->alias_name_to_expression_node_before_group_by.find(value_identifier->name()); - if (it != aliases->alias_name_to_expression_node_before_group_by.end()) - { - auto value_str = tryGetParameterValueAsString(param_name, it->second->toAST()); - if (!value_str.has_value()) - result.unresolved_param_aliases.emplace(param_name, value_identifier->name()); - return value_str; - } - } - } - else if (const auto * function = ast->as()) - { - if (isFunctionCast(function)) - { - const auto * cast_expression = assert_cast(function->arguments.get()); - if (cast_expression->children.size() != 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function CAST must have exactly two arguments"); - - if (const auto * cast_literal = 
cast_expression->children[0]->as()) - { - return convertFieldToString(cast_literal->value); - } - } - else - { - ASTPtr res = evaluateConstantExpressionOrIdentifierAsLiteral(ast, context); - return convertFieldToString(res->as()->value); - } - } - return std::nullopt; - } - void visitFunction(const ASTFunction & parameter_function) { if (parameter_function.name != "equals" && parameter_function.children.size() != 1) @@ -100,18 +51,37 @@ private: if (const auto * identifier = expression_list->children[0]->as()) { - auto value_str = tryGetParameterValueAsString(identifier->name(), expression_list->children[1]); - if (value_str.has_value()) - result.resolved_param_values[identifier->name()] = value_str.value(); + if (const auto * literal = expression_list->children[1]->as()) + { + parameter_values[identifier->name()] = convertFieldToString(literal->value); + } + else if (const auto * function = expression_list->children[1]->as()) + { + if (isFunctionCast(function)) + { + const auto * cast_expression = assert_cast(function->arguments.get()); + if (cast_expression->children.size() != 2) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function CAST must have exactly two arguments"); + if (const auto * cast_literal = cast_expression->children[0]->as()) + { + parameter_values[identifier->name()] = convertFieldToString(cast_literal->value); + } + } + else + { + ASTPtr res = evaluateConstantExpressionOrIdentifierAsLiteral(expression_list->children[1], context); + parameter_values[identifier->name()] = convertFieldToString(res->as()->value); + } + } } } }; -ParamValuesAnalyzeResult analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases) +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, ContextPtr context) { - ParamValuesAnalyzeResult result; - FunctionParameterValuesVisitor(result, context, scope_aliases).visit(ast); - return result; + NameToNameMap parameter_values; + FunctionParameterValuesVisitor(parameter_values, context).visit(ast); + return parameter_values; } diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index 50dca8d86de..8c2686dcc65 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -7,17 +7,8 @@ namespace DB { -struct ScopeAliases; /// Find parameters in a query parameter values and collect them into map. 
-struct ParamValuesAnalyzeResult -{ - /// Param name -> resolved param value - NameToNameMap resolved_param_values; - /// Param name -> alias - NameToNameMap unresolved_param_aliases; -}; - -ParamValuesAnalyzeResult analyzeFunctionParamValues(const ASTPtr & ast, const ContextPtr & context, const ScopeAliases * scope_aliases = nullptr); +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, ContextPtr context); } diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference index 951910bbe74..9c1bdda2e2c 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference @@ -2,4 +2,3 @@ OK 123 123 123 -123 diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql index c64200ee8ff..80eba7867cc 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql @@ -4,4 +4,4 @@ WITH 'OK' AS s SELECT * FROM param_test(test_str=s); WITH (SELECT 123) AS s SELECT * FROM param_test(test_str=s); WITH (SELECT 100 + 20 + 3) AS s SELECT * FROM param_test(test_str=s); WITH (SELECT number FROM numbers(123, 1)) AS s SELECT * FROM param_test(test_str=s); -WITH CAST(123, String) AS s SELECT * FROM param_test(test_str=s); +-- WITH CAST(123, String) AS s SELECT * FROM param_test(test_str=s); From 1f0405c77d3972d8330fa2c1ac95dc0dd8424ca3 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 19 Jun 2024 21:29:59 +0200 Subject: [PATCH 39/95] Fixed test --- tests/integration/test_storage_azure_blob_storage/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9f5aef1489c..fff786e3102 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -1217,7 +1217,7 @@ def test_filtering_by_file_or_path(cluster): node.query("SYSTEM FLUSH LOGS") result = node.query( - f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%select%azure%test_filter%' AND type='QueryFinish'" + f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query ilike '%select%azure%test_filter%' AND type='QueryFinish'" ) assert int(result) == 1 From c424fcd79f593fed9ce51a63ab66fcca41f71ab6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 20 Jun 2024 13:39:32 +0200 Subject: [PATCH 40/95] Fix --- src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h | 10 +++++----- src/Analyzer/Resolve/QueryAnalyzer.cpp | 7 ++++++- .../03167_parametrized_view_with_cte.reference | 1 + .../0_stateless/03167_parametrized_view_with_cte.sql | 2 +- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h b/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h index 1845cf22286..8b726853543 100644 --- a/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h +++ b/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h @@ -14,7 +14,7 @@ class ParametrizedViewFunctionParamsVisitor : public InDepthQueryTreeVisitor resolve_node_, + std::function resolve_node_, const ContextPtr & context_) : context(context_) , resolve_node(resolve_node_) @@ -34,8 +34,8 @@ public: if (auto * identifier_node = 
nodes[0]->as()) { - resolve_node(nodes[1]); - auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(nodes[1]->toAST(), context); + auto resolved_node = resolve_node(nodes[1]); + auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(resolved_node->toAST(), context); auto resolved_value_str = convertFieldToString(resolved_value->as()->value); params[identifier_node->getIdentifier().getFullName()] = resolved_value_str; } @@ -44,11 +44,11 @@ public: bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr &) { return true; } - NameToNameMap getParams() const { return params; } + const NameToNameMap & getParametersMap() const { return params; } private: NameToNameMap params; const ContextPtr context; - std::function resolve_node; + std::function resolve_node; }; } diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 26fa3574865..cd2ae38e386 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -4512,6 +4512,11 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, [&](QueryTreeNodePtr node) { resolveExpressionNode(node, scope, /* allow_lambda_expression */true, /* allow_table_function */false); + auto alias_node = scope.aliases.alias_name_to_expression_node->find(node->getAlias()); + if (alias_node != scope.aliases.alias_name_to_expression_node->end()) + return alias_node->second; + else + return node; }, context); @@ -4520,7 +4525,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, - params_visitor.getParams()); + params_visitor.getParametersMap()); if (parametrized_view_storage) { diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference index 9c1bdda2e2c..951910bbe74 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.reference @@ -2,3 +2,4 @@ OK 123 123 123 +123 diff --git a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql index 80eba7867cc..1ac5540047a 100644 --- a/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql +++ b/tests/queries/0_stateless/03167_parametrized_view_with_cte.sql @@ -4,4 +4,4 @@ WITH 'OK' AS s SELECT * FROM param_test(test_str=s); WITH (SELECT 123) AS s SELECT * FROM param_test(test_str=s); WITH (SELECT 100 + 20 + 3) AS s SELECT * FROM param_test(test_str=s); WITH (SELECT number FROM numbers(123, 1)) AS s SELECT * FROM param_test(test_str=s); --- WITH CAST(123, String) AS s SELECT * FROM param_test(test_str=s); +WITH CAST(123, 'String') AS s SELECT * FROM param_test(test_str=s); From 1248490f0d023ab7598d2f38e847e5efffd325cb Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 20 Jun 2024 14:51:49 +0200 Subject: [PATCH 41/95] Added more tests for different signatures --- .../test_mask_sensitive_info/test.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 965d6518164..c9895bdc2d9 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -371,11 +371,19 @@ def test_table_functions(): 
"DNS_ERROR", ), f"azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", - f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_1.csv', '{azure_account_name}', '{azure_account_key}')", - f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_3.csv', 'CSV')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')", + f"azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_2.csv', 'CSV', 'none', 'auto')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '{azure_account_key}')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", + f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_7.csv', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_8.csv', 'CSV', 'none')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV', 'none', 'auto')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_10.csv', '{azure_account_name}', '{azure_account_key}')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_11.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", ] def make_test_case(i): @@ -450,11 +458,19 @@ def test_table_functions(): "CREATE TABLE tablefunc31 (`x` int) AS s3('http://minio1:9001/root/data/test10.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE tablefunc32 (`x` int) AS deltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", f"CREATE TABLE tablefunc33 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')", - f"CREATE TABLE tablefunc34 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_1.csv', '{azure_account_name}', '[HIDDEN]')", - f"CREATE TABLE tablefunc35 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_2.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", - f"CREATE TABLE tablefunc36 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_3.csv', 'CSV')", - f"CREATE TABLE tablefunc37 (`x` int) AS 
azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]')", - f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE tablefunc34 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')", + f"CREATE TABLE tablefunc35 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_2.csv', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc36 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc37 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", + f"CREATE TABLE tablefunc39 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc40 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_7.csv', 'CSV')", + f"CREATE TABLE tablefunc41 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_8.csv', 'CSV', 'none')", + f"CREATE TABLE tablefunc42 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc43 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_10.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc44 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_11.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE tablefunc45 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", + f"CREATE TABLE tablefunc46 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", ], must_not_contain=[password], ) From a8924596ccfd2aac8adb925c7e59c4faf5fa1000 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 20 Jun 2024 17:21:30 +0200 Subject: [PATCH 42/95] Review fixes --- src/Storages/S3Queue/S3QueueIFileMetadata.h | 2 +- src/Storages/S3Queue/S3QueueSource.cpp | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index 81026d34b05..708d7a434d7 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -62,7 +62,7 @@ public: const zkutil::ZooKeeperPtr & zk_client) = 0; FileStatusPtr getFileStatus() { return file_status; } - const std::string & getPath() { return path; } + const std::string & getPath() const { return path; } size_t getMaxTries() const { return max_loading_retries; } struct NodeMetadata diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 
c55a8447c2d..fa682e123f8 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -56,9 +56,7 @@ bool StorageS3QueueSource::FileIterator::isFinished() const { LOG_TEST(log, "Iterator finished: {}, objects to retry: {}", iterator_finished, objects_to_retry.size()); return iterator_finished - && listed_keys_cache.end() == std::find_if( - listed_keys_cache.begin(), listed_keys_cache.end(), - [](const auto & v) { return !v.second.keys.empty(); }) + && std::all_of(listed_keys_cache.begin(), listed_keys_cache.end(), [](const auto & v) { return v.second.keys.empty(); }) && objects_to_retry.empty(); } @@ -547,7 +545,7 @@ Chunk StorageS3QueueSource::generateImpl() /// If we did not process any rows from the failed file, /// commit all previously processed files, - /// not to loose the work already done. + /// not to lose the work already done. return {}; } From 0b0f235a0d9b47b8a164862a48f1218fb7ef3a81 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 21 Jun 2024 16:20:57 +0200 Subject: [PATCH 43/95] Review fixes, fix race, add more logging to debug failing test --- src/Storages/S3Queue/S3QueueIFileMetadata.cpp | 10 ++++---- src/Storages/S3Queue/S3QueueIFileMetadata.h | 2 +- src/Storages/S3Queue/S3QueueMetadata.cpp | 2 +- .../S3Queue/S3QueueOrderedFileMetadata.cpp | 23 ++++++++++++------- .../S3Queue/S3QueueOrderedFileMetadata.h | 9 +++++--- src/Storages/S3Queue/S3QueueSource.cpp | 10 +++++--- src/Storages/S3Queue/S3QueueSource.h | 8 +++---- 7 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp index 41edaed58f8..6d550571f22 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.cpp @@ -258,16 +258,16 @@ void S3QueueIFileMetadata::setProcessed() LOG_TRACE(log, "Set file {} as processed (rows: {})", path, file_status->processed_rows); } -void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_retry_count, bool overwrite_status) +void S3QueueIFileMetadata::setFailed(const std::string & exception_message, bool reduce_retry_count, bool overwrite_status) { LOG_TRACE(log, "Setting file {} as failed (path: {}, reduce retry count: {}, exception: {})", - path, failed_node_path, reduce_retry_count, exception); + path, failed_node_path, reduce_retry_count, exception_message); ProfileEvents::increment(ProfileEvents::S3QueueFailedFiles); if (overwrite_status || file_status->state != FileStatus::State::Failed) - file_status->onFailed(exception); + file_status->onFailed(exception_message); - node_metadata.last_exception = exception; + node_metadata.last_exception = exception_message; if (reduce_retry_count) { @@ -282,7 +282,7 @@ void S3QueueIFileMetadata::setFailed(const std::string & exception, bool reduce_ { auto full_exception = fmt::format( "First exception: {}, exception while setting file as failed: {}", - exception, getCurrentExceptionMessage(true)); + exception_message, getCurrentExceptionMessage(true)); file_status->onFailed(full_exception); throw; diff --git a/src/Storages/S3Queue/S3QueueIFileMetadata.h b/src/Storages/S3Queue/S3QueueIFileMetadata.h index 708d7a434d7..6f40338a421 100644 --- a/src/Storages/S3Queue/S3QueueIFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueIFileMetadata.h @@ -55,7 +55,7 @@ public: bool setProcessing(); void setProcessed(); - void setFailed(const std::string & exception, bool reduce_retry_count, bool overwrite_status); + void setFailed(const std::string & 
exception_message, bool reduce_retry_count, bool overwrite_status); virtual void setProcessedAtStartRequests( Coordination::Requests & requests, diff --git a/src/Storages/S3Queue/S3QueueMetadata.cpp b/src/Storages/S3Queue/S3QueueMetadata.cpp index e390dcce2b7..734d0eed625 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueMetadata.cpp @@ -222,7 +222,7 @@ S3QueueMetadata::Bucket S3QueueMetadata::getBucketForPath(const std::string & pa S3QueueOrderedFileMetadata::BucketHolderPtr S3QueueMetadata::tryAcquireBucket(const Bucket & bucket, const Processor & processor) { - return S3QueueOrderedFileMetadata::tryAcquireBucket(zookeeper_path, bucket, processor); + return S3QueueOrderedFileMetadata::tryAcquireBucket(zookeeper_path, bucket, processor, log); } void S3QueueMetadata::initialize( diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp index 79b39f5f641..da1e394ef82 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp @@ -45,13 +45,15 @@ S3QueueOrderedFileMetadata::BucketHolder::BucketHolder( int bucket_version_, const std::string & bucket_lock_path_, const std::string & bucket_lock_id_path_, - zkutil::ZooKeeperPtr zk_client_) + zkutil::ZooKeeperPtr zk_client_, + LoggerPtr log_) : bucket_info(std::make_shared(BucketInfo{ .bucket = bucket_, .bucket_version = bucket_version_, .bucket_lock_path = bucket_lock_path_, .bucket_lock_id_path = bucket_lock_id_path_})) , zk_client(zk_client_) + , log(log_) { } @@ -62,7 +64,7 @@ void S3QueueOrderedFileMetadata::BucketHolder::release() released = true; - LOG_TEST(getLogger("S3QueueBucketHolder"), "Releasing bucket {}, version {}", + LOG_TEST(log, "Releasing bucket {}, version {}", bucket_info->bucket, bucket_info->bucket_version); Coordination::Requests requests; @@ -76,11 +78,11 @@ void S3QueueOrderedFileMetadata::BucketHolder::release() const auto code = zk_client->tryMulti(requests, responses); if (code == Coordination::Error::ZOK) - LOG_TEST(getLogger("S3QueueBucketHolder"), "Released bucket {}, version {}", + LOG_TEST(log, "Released bucket {}, version {}", bucket_info->bucket, bucket_info->bucket_version); else - LOG_TRACE(getLogger("S3QueueBucketHolder"), - "Failed to released bucket {}, version {}: {}. " + LOG_TRACE(log, + "Failed to release bucket {}, version {}: {}. 
" "This is normal if keeper session expired.", bucket_info->bucket, bucket_info->bucket_version, code); @@ -89,6 +91,9 @@ void S3QueueOrderedFileMetadata::BucketHolder::release() S3QueueOrderedFileMetadata::BucketHolder::~BucketHolder() { + if (!released) + LOG_TEST(log, "Releasing bucket ({}) holder in destructor", bucket_info->bucket); + try { release(); @@ -166,7 +171,8 @@ S3QueueOrderedFileMetadata::Bucket S3QueueOrderedFileMetadata::getBucketForPath( S3QueueOrderedFileMetadata::BucketHolderPtr S3QueueOrderedFileMetadata::tryAcquireBucket( const std::filesystem::path & zk_path, const Bucket & bucket, - const Processor & processor) + const Processor & processor, + LoggerPtr log_) { const auto zk_client = getZooKeeper(); const auto bucket_lock_path = zk_path / "buckets" / toString(bucket) / "lock"; @@ -195,7 +201,7 @@ S3QueueOrderedFileMetadata::BucketHolderPtr S3QueueOrderedFileMetadata::tryAcqui const auto bucket_lock_version = set_response->stat.version; LOG_TEST( - getLogger("S3QueueOrderedFileMetadata"), + log_, "Processor {} acquired bucket {} for processing (bucket lock version: {})", processor, bucket, bucket_lock_version); @@ -204,7 +210,8 @@ S3QueueOrderedFileMetadata::BucketHolderPtr S3QueueOrderedFileMetadata::tryAcqui bucket_lock_version, bucket_lock_path, bucket_lock_id_path, - zk_client); + zk_client, + log_); } if (code == Coordination::Error::ZNODEEXISTS) diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h index 6d495a63c51..82ca87e3251 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.h @@ -36,7 +36,8 @@ public: static BucketHolderPtr tryAcquireBucket( const std::filesystem::path & zk_path, const Bucket & bucket, - const Processor & processor); + const Processor & processor, + LoggerPtr log_); static S3QueueOrderedFileMetadata::Bucket getBucketForPath(const std::string & path, size_t buckets_num); @@ -72,14 +73,15 @@ private: bool ignore_if_exists); }; -struct S3QueueOrderedFileMetadata::BucketHolder +struct S3QueueOrderedFileMetadata::BucketHolder : private boost::noncopyable { BucketHolder( const Bucket & bucket_, int bucket_version_, const std::string & bucket_lock_path_, const std::string & bucket_lock_id_path_, - zkutil::ZooKeeperPtr zk_client_); + zkutil::ZooKeeperPtr zk_client_, + LoggerPtr log_); ~BucketHolder(); @@ -96,6 +98,7 @@ private: const zkutil::ZooKeeperPtr zk_client; bool released = false; bool finished = false; + LoggerPtr log; }; } diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index fa682e123f8..13b2af5b0df 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -126,9 +126,13 @@ void StorageS3QueueSource::FileIterator::returnForRetry(Source::ObjectInfoPtr ob void StorageS3QueueSource::FileIterator::releaseHoldBuckets() { - for (const auto & [_, holders] : bucket_holders) + for (const auto & [processor, holders] : bucket_holders) + { + LOG_TEST(log, "Releasing {} bucket holders for processor {}", holders.size(), processor); + for (const auto & bucket_holder : holders) bucket_holder->release(); + } } std::pair @@ -635,7 +639,7 @@ Chunk StorageS3QueueSource::generateImpl() return {}; } -void StorageS3QueueSource::commit(bool success, const std::string & exception) +void StorageS3QueueSource::commit(bool success, const std::string & exception_message) { LOG_TEST(log, "Having {} files to set as {}, failed files: {}", 
processed_files.size(), success ? "Processed" : "Failed", failed_during_read_files.size()); @@ -649,7 +653,7 @@ void StorageS3QueueSource::commit(bool success, const std::string & exception) } else file_metadata->setFailed( - exception, + exception_message, /* reduce_retry_count */false, /* overwrite_status */true); } diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index 4770b3134af..f628379f719 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -57,8 +57,8 @@ public: void returnForRetry(Source::ObjectInfoPtr object_info); /// Release hold buckets. - /// In fact, they will anyway be release in destructors of BucketHolder, - /// but we anyway release it explicitly, + /// In fact, they could be released in destructors of BucketHolder, + /// but we anyway try to release them explicitly, /// because we want to be able to rethrow exceptions if they might happen. void releaseHoldBuckets(); @@ -85,7 +85,7 @@ public: std::unordered_map> bucket_holders; /// Is glob_iterator finished? - bool iterator_finished = false; + std::atomic_bool iterator_finished = false; /// Only for processing without buckets. std::deque objects_to_retry; @@ -123,7 +123,7 @@ public: /// Commit files after insertion into storage finished. /// `success` defines whether insertion was successful or not. - void commit(bool success, const std::string & exception = {}); + void commit(bool success, const std::string & exception_message = {}); private: const String name; From 90231e302ed7dd285094a760dafaa7d300f20d18 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 21 Jun 2024 20:12:34 +0200 Subject: [PATCH 44/95] fix WriteBufferFromFileDecorator cancelation --- src/IO/WriteBuffer.cpp | 2 +- src/IO/WriteBuffer.h | 24 ++++++++++-------------- src/IO/WriteBufferDecorator.h | 2 +- src/IO/WriteBufferFromFileDecorator.cpp | 6 ++++++ src/IO/WriteBufferFromFileDecorator.h | 2 ++ src/IO/WriteBufferFromS3.cpp | 4 ++-- src/IO/WriteBufferFromS3.h | 4 ++-- 7 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/IO/WriteBuffer.cpp b/src/IO/WriteBuffer.cpp index dfcf8432afb..2ed14222ffc 100644 --- a/src/IO/WriteBuffer.cpp +++ b/src/IO/WriteBuffer.cpp @@ -20,7 +20,7 @@ WriteBuffer::~WriteBuffer() LoggerPtr log = getLogger("WriteBuffer"); LOG_ERROR( log, - "WriteBuffer is not finalized when destructor is called. " + "WriteBuffer is neither finalized nor canceled when destructor is called. " "No exceptions in flight are detected. " "The file might not be written at all or might be truncated. " "Stack trace: {}", diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 7a9046ded5d..8b429891567 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -59,6 +59,9 @@ public: */ pos = working_buffer.begin(); bytes += bytes_in_buffer; + + cancel(); + throw; } @@ -133,28 +136,21 @@ public: catch (...) { pos = working_buffer.begin(); - finalized = true; + + cancel(); + throw; } } - void cancel() + void cancel() noexcept { if (canceled || finalized) return; LockMemoryExceptionInThread lock(VariableContext::Global); - try - { - cancelImpl(); - canceled = true; - } - catch (...) - { - pos = working_buffer.begin(); - canceled = true; - throw; - } + cancelImpl(); + canceled = true; } /// Wait for data to be reliably written. Mainly, call fsync for fd. 
@@ -172,7 +168,7 @@ protected: next(); } - virtual void cancelImpl() + virtual void cancelImpl() noexcept { } diff --git a/src/IO/WriteBufferDecorator.h b/src/IO/WriteBufferDecorator.h index 77f11424482..109c2bd24e4 100644 --- a/src/IO/WriteBufferDecorator.h +++ b/src/IO/WriteBufferDecorator.h @@ -47,7 +47,7 @@ public: } } - void cancelImpl() override + void cancelImpl() noexcept override { out->cancel(); } diff --git a/src/IO/WriteBufferFromFileDecorator.cpp b/src/IO/WriteBufferFromFileDecorator.cpp index 0e4e5e13a86..b1e7d843d92 100644 --- a/src/IO/WriteBufferFromFileDecorator.cpp +++ b/src/IO/WriteBufferFromFileDecorator.cpp @@ -28,6 +28,12 @@ void WriteBufferFromFileDecorator::finalizeImpl() } } +void WriteBufferFromFileDecorator::cancelImpl() noexcept +{ + SwapHelper swap(*this, *impl); + impl->cancel(); +} + WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator() { /// It is not a mistake that swap is called here diff --git a/src/IO/WriteBufferFromFileDecorator.h b/src/IO/WriteBufferFromFileDecorator.h index 5344bb1425c..07f843986bb 100644 --- a/src/IO/WriteBufferFromFileDecorator.h +++ b/src/IO/WriteBufferFromFileDecorator.h @@ -24,6 +24,8 @@ public: protected: void finalizeImpl() override; + void cancelImpl() noexcept override; + std::unique_ptr impl; private: diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index bc7792ff138..7ac16c4c76b 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -224,7 +224,7 @@ void WriteBufferFromS3::finalizeImpl() } } -void WriteBufferFromS3::cancelImpl() +void WriteBufferFromS3::cancelImpl() noexcept { tryToAbortMultipartUpload(); } @@ -251,7 +251,7 @@ String WriteBufferFromS3::getShortLogDetails() const bucket, key, multipart_upload_details); } -void WriteBufferFromS3::tryToAbortMultipartUpload() +void WriteBufferFromS3::tryToAbortMultipartUpload() noexcept { try { diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 5f06a44e5f0..b026da607c5 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -54,7 +54,7 @@ private: /// Receives response from the server after sending all data. void finalizeImpl() override; - void cancelImpl() override; + void cancelImpl() noexcept override; String getVerboseLogDetails() const; String getShortLogDetails() const; @@ -73,7 +73,7 @@ private: void createMultipartUpload(); void completeMultipartUpload(); void abortMultipartUpload(); - void tryToAbortMultipartUpload(); + void tryToAbortMultipartUpload() noexcept; S3::PutObjectRequest getPutRequest(PartData & data); void makeSinglepartUpload(PartData && data); From a902d37615712e057559bf1510d81dc4a07fb033 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Fri, 21 Jun 2024 15:39:57 -0300 Subject: [PATCH 45/95] resolve ambiguity in MARK_CACHE_SIZE default settings --- programs/server/config.xml | 4 ++-- programs/server/config.yaml.example | 2 +- src/Core/Defines.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index 91066cd2859..05214ae20de 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -409,12 +409,12 @@ - 5368709120 - 5368709120 1000M 10 + + + + + + + + You should not lower this value. 
--> + - + + - 1000 + - 134217728 + - 10000 + + + + /var/lib/clickhouse/caches/ @@ -1642,14 +1650,6 @@ --> - - - 1073741824 - 1024 - 1048576 - 30000000 - - backups diff --git a/programs/server/config.yaml.example b/programs/server/config.yaml.example index 74b7d9d66b0..5d5499f876c 100644 --- a/programs/server/config.yaml.example +++ b/programs/server/config.yaml.example @@ -262,6 +262,9 @@ uncompressed_cache_size: 8589934592 # You should not lower this value. # mark_cache_size: 5368709120 +# For marks of secondary indices. +# index_mark_cache_size: 5368709120 + # If you enable the `min_bytes_to_use_mmap_io` setting, # the data in MergeTree tables can be read with mmap to avoid copying from kernel to userspace. # It makes sense only for large files and helps only if data reside in page cache. @@ -277,13 +280,20 @@ uncompressed_cache_size: 8589934592 # in query or server memory usage - because this memory can be discarded similar to OS page cache. # The cache is dropped (the files are closed) automatically on removal of old parts in MergeTree, # also it can be dropped manually by the SYSTEM DROP MMAP CACHE query. -mmap_cache_size: 1000 +# mmap_cache_size: 1024 # Cache size in bytes for compiled expressions. -compiled_expression_cache_size: 134217728 +# compiled_expression_cache_size: 134217728 # Cache size in elements for compiled expressions. -compiled_expression_cache_elements_size: 10000 +# compiled_expression_cache_elements_size: 10000 + +# Configuration for the query cache +# query_cache: +# max_size_in_bytes: 1073741824 +# max_entries: 1024 +# max_entry_size_in_bytes: 1048576 +# max_entry_size_in_rows: 30000000 # Path to data directory, with trailing slash. path: /var/lib/clickhouse/ diff --git a/src/Core/Defines.h b/src/Core/Defines.h index 526b27f99a5..6df335a9c8f 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -90,13 +90,13 @@ static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU"; static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB; static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO = 0.5l; static constexpr auto DEFAULT_MARK_CACHE_POLICY = "SLRU"; -static constexpr auto DEFAULT_MARK_CACHE_MAX_SIZE = 5120_MiB; +static constexpr auto DEFAULT_MARK_CACHE_MAX_SIZE = 5_GiB; static constexpr auto DEFAULT_MARK_CACHE_SIZE_RATIO = 0.5l; static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_POLICY = "SLRU"; static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_MAX_SIZE = 0; static constexpr auto DEFAULT_INDEX_UNCOMPRESSED_CACHE_SIZE_RATIO = 0.5; static constexpr auto DEFAULT_INDEX_MARK_CACHE_POLICY = "SLRU"; -static constexpr auto DEFAULT_INDEX_MARK_CACHE_MAX_SIZE = 5120_MiB; +static constexpr auto DEFAULT_INDEX_MARK_CACHE_MAX_SIZE = 5_GiB; static constexpr auto DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO = 0.3; static constexpr auto DEFAULT_MMAP_CACHE_MAX_SIZE = 1_KiB; /// chosen by rolling dice static constexpr auto DEFAULT_COMPILED_EXPRESSION_CACHE_MAX_SIZE = 128_MiB; From d7b56e112eab914568decc9c2d194917023b80f1 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Thu, 20 Jun 2024 11:24:20 +0800 Subject: [PATCH 53/95] support writing page index into parquet file --- src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Processors/Formats/Impl/ParquetBlockOutputFormat.cpp | 3 +++ 5 files changed, 7 insertions(+) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index fbab72446a0..6aa562a6853 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1160,6 
+1160,7 @@ class IColumn; M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \ + M(Bool, output_format_parquet_write_page_index, false, "Write page index into parquet files.") M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index fbc414d4f2f..0e621be494d 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -115,6 +115,7 @@ static const std::map chunks) parquet::WriterProperties::Builder builder; builder.version(getParquetVersion(format_settings)); builder.compression(getParquetCompression(format_settings.parquet.output_compression_method)); + // write page index is disabled by default. + if (format_settings.parquet.write_page_index) + builder.enable_write_page_index(); parquet::ArrowWriterProperties::Builder writer_props_builder; if (format_settings.parquet.output_compliant_nested_types) From ea4eb1f3881bf851a6bdac21262aab60f6fef3f7 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Thu, 20 Jun 2024 17:43:13 +0800 Subject: [PATCH 54/95] update --- docs/en/interfaces/formats.md | 1 + docs/en/operations/settings/settings-formats.md | 7 +++++++ src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 3 ++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index ffdd7e2ca25..a4dcc5d0492 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2169,6 +2169,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `lz4`. - [input_format_parquet_max_block_size](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Max block row size for parquet reader. Default value - `65409`. - [input_format_parquet_prefer_block_bytes](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_prefer_block_bytes) - Average block bytes output by parquet reader. Default value - `16744704`. +- [output_format_parquet_write_page_index](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Add a posibility to write page index into parquet files. Need to disable `output_format_parquet_use_custom_encoder` at present. Default value - `true`. 
## ParquetMetadata {data-format-parquet-metadata} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 670c9c6cbf1..ce98604f288 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1428,6 +1428,13 @@ Average block bytes output by parquet reader. Lowering the configuration in the Default value: `65409 * 256 = 16744704` +### output_format_parquet_write_page_index {#input_format_parquet_max_block_size} + +Could add page index into parquet files. To enable this, need set `output_format_parquet_use_custom_encoder`=`false` and +`output_format_parquet_write_page_index`=`true`. + +Enable by defautl. + ## Hive format settings {#hive-format-settings} ### input_format_hive_text_fields_delimiter {#input_format_hive_text_fields_delimiter} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 6aa562a6853..a08e5f31435 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1160,7 +1160,7 @@ class IColumn; M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \ - M(Bool, output_format_parquet_write_page_index, false, "Write page index into parquet files.") + M(Bool, output_format_parquet_write_page_index, true, "Add a posibility to write page index into parquet files.") \ M(String, output_format_avro_codec, "", "Compression codec used for output. 
Possible values: 'null', 'deflate', 'snappy', 'zstd'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 0e621be494d..113b527e371 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -86,6 +86,8 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static const std::map settings_changes_history = { + {"24.7", {{"output_format_parquet_write_page_index", false, true, "Add a posibility to write page index into parquet files."}, + }}, {"24.6", {{"materialize_skip_indexes_on_insert", true, true, "Added new setting to allow to disable materialization of skip indexes on insert"}, {"materialize_statistics_on_insert", true, true, "Added new setting to allow to disable materialization of statistics on insert"}, {"input_format_parquet_use_native_reader", false, false, "When reading Parquet files, to use native reader instead of arrow reader."}, @@ -115,7 +117,6 @@ static const std::map Date: Fri, 21 Jun 2024 08:59:06 +0800 Subject: [PATCH 55/95] fix typos --- docs/en/interfaces/formats.md | 2 +- docs/en/operations/settings/settings-formats.md | 2 +- src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a4dcc5d0492..a81a17e65d6 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -2169,7 +2169,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t - [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `lz4`. - [input_format_parquet_max_block_size](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Max block row size for parquet reader. Default value - `65409`. - [input_format_parquet_prefer_block_bytes](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_prefer_block_bytes) - Average block bytes output by parquet reader. Default value - `16744704`. -- [output_format_parquet_write_page_index](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Add a posibility to write page index into parquet files. Need to disable `output_format_parquet_use_custom_encoder` at present. Default value - `true`. +- [output_format_parquet_write_page_index](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_max_block_size) - Add a possibility to write page index into parquet files. Need to disable `output_format_parquet_use_custom_encoder` at present. Default value - `true`. ## ParquetMetadata {data-format-parquet-metadata} diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index ce98604f288..530023df5b7 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1433,7 +1433,7 @@ Default value: `65409 * 256 = 16744704` Could add page index into parquet files. 
To enable this, need set `output_format_parquet_use_custom_encoder`=`false` and `output_format_parquet_write_page_index`=`true`. -Enable by defautl. +Enable by default. ## Hive format settings {#hive-format-settings} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a08e5f31435..96a840bc376 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1160,7 +1160,7 @@ class IColumn; M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. Consider decreasing if you have columns with average values size above a few KBs.", 0) \ - M(Bool, output_format_parquet_write_page_index, true, "Add a posibility to write page index into parquet files.") \ + M(Bool, output_format_parquet_write_page_index, true, "Add a possibility to write page index into parquet files.") \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 113b527e371..fba6386b9bd 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -86,7 +86,7 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static const std::map settings_changes_history = { - {"24.7", {{"output_format_parquet_write_page_index", false, true, "Add a posibility to write page index into parquet files."}, + {"24.7", {{"output_format_parquet_write_page_index", false, true, "Add a possibility to write page index into parquet files."}, }}, {"24.6", {{"materialize_skip_indexes_on_insert", true, true, "Added new setting to allow to disable materialization of skip indexes on insert"}, {"materialize_statistics_on_insert", true, true, "Added new setting to allow to disable materialization of statistics on insert"}, From db0b438d53bee36163e848764319124d565fd7db Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Fri, 21 Jun 2024 16:11:28 +0800 Subject: [PATCH 56/95] fixed --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 96a840bc376..41878142bdc 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1160,7 +1160,7 @@ class IColumn; M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. 
Consider decreasing if you have columns with average values size above a few KBs.", 0) \ - M(Bool, output_format_parquet_write_page_index, true, "Add a possibility to write page index into parquet files.") \ + M(Bool, output_format_parquet_write_page_index, true, "Add a possibility to write page index into parquet files.", 0) \ M(String, output_format_avro_codec, "", "Compression codec used for output. Possible values: 'null', 'deflate', 'snappy', 'zstd'.", 0) \ M(UInt64, output_format_avro_sync_interval, 16 * 1024, "Sync interval in bytes.", 0) \ M(String, output_format_avro_string_column_pattern, "", "For Avro format: regexp of String columns to select as AVRO string.", 0) \ From cd6995e266d1c3fd3a59839a2fe86fc994f7c635 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 24 Jun 2024 12:17:29 +0200 Subject: [PATCH 57/95] Fix race --- src/Storages/S3Queue/S3QueueSource.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index 13b2af5b0df..a73a7c34a6a 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -74,10 +74,12 @@ StorageS3QueueSource::Source::ObjectInfoPtr StorageS3QueueSource::FileIterator:: { if (metadata->useBucketsForProcessing()) { + std::lock_guard lock(mutex); std::tie(object_info, bucket_info) = getNextKeyFromAcquiredBucket(processor); } else { + std::lock_guard lock(mutex); if (objects_to_retry.empty()) { object_info = glob_iterator->next(processor); @@ -138,10 +140,6 @@ void StorageS3QueueSource::FileIterator::releaseHoldBuckets() std::pair StorageS3QueueSource::FileIterator::getNextKeyFromAcquiredBucket(size_t processor) { - /// We need this lock to maintain consistency between listing s3 directory - /// and getting/putting result into listed_keys_cache. - std::lock_guard lock(mutex); - auto bucket_holder_it = bucket_holders.emplace(processor, std::vector{}).first; BucketHolder * current_bucket_holder = bucket_holder_it->second.empty() || bucket_holder_it->second.back()->isFinished() ? 
nullptr From 8bdbb8c383e7eabd81ab753c6f221b0ed1452b92 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 24 Jun 2024 13:54:37 +0200 Subject: [PATCH 58/95] fix WriteBufferFromFile --- src/IO/WriteBufferFromFile.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index 8da8eb96dfd..37b1161356f 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -112,7 +112,8 @@ void WriteBufferFromFile::close() if (fd < 0) return; - finalize(); + if (!canceled) + finalize(); if (0 != ::close(fd)) throw Exception(ErrorCodes::CANNOT_CLOSE_FILE, "Cannot close file"); From eb36efbfb4eecc2415d9e3c43841cc043070302d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 24 Jun 2024 16:43:14 +0200 Subject: [PATCH 59/95] Update implementation --- .../Resolve/ParametrizedViewFunctionVisitor.h | 54 ------------------- src/Analyzer/Resolve/QueryAnalyzer.cpp | 36 ++++++++----- 2 files changed, 22 insertions(+), 68 deletions(-) delete mode 100644 src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h diff --git a/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h b/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h deleted file mode 100644 index 8b726853543..00000000000 --- a/src/Analyzer/Resolve/ParametrizedViewFunctionVisitor.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -class ParametrizedViewFunctionParamsVisitor : public InDepthQueryTreeVisitor -{ -public: - ParametrizedViewFunctionParamsVisitor( - std::function resolve_node_, - const ContextPtr & context_) - : context(context_) - , resolve_node(resolve_node_) - { - } - - void visitImpl(QueryTreeNodePtr & node) - { - if (auto * function_node = node->as()) - { - if (function_node->getFunctionName() != "equals") - return; - - auto nodes = function_node->getArguments().getNodes(); - if (nodes.size() != 2) - return; - - if (auto * identifier_node = nodes[0]->as()) - { - auto resolved_node = resolve_node(nodes[1]); - auto resolved_value = evaluateConstantExpressionOrIdentifierAsLiteral(resolved_node->toAST(), context); - auto resolved_value_str = convertFieldToString(resolved_value->as()->value); - params[identifier_node->getIdentifier().getFullName()] = resolved_value_str; - } - } - } - - bool needChildVisit(const QueryTreeNodePtr &, const QueryTreeNodePtr &) { return true; } - - const NameToNameMap & getParametersMap() const { return params; } - -private: - NameToNameMap params; - const ContextPtr context; - std::function resolve_node; -}; -} diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index cd2ae38e386..f731e573135 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -61,7 +61,6 @@ #include #include #include -#include namespace ProfileEvents { @@ -4507,25 +4506,34 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, table_name = table_identifier[1]; } - auto context = scope_context->getQueryContext(); - auto params_visitor = ParametrizedViewFunctionParamsVisitor( - [&](QueryTreeNodePtr node) + /// Collect parametrized view arguments + NameToNameMap view_params; + for (const auto & argument : table_function_node_typed.getArguments()) + { + if (auto * arg_func = argument->as()) { - resolveExpressionNode(node, scope, /* allow_lambda_expression */true, /* allow_table_function */false); - auto alias_node = 
scope.aliases.alias_name_to_expression_node->find(node->getAlias()); - if (alias_node != scope.aliases.alias_name_to_expression_node->end()) - return alias_node->second; - else - return node; - }, - context); + if (arg_func->getFunctionName() != "equals") + continue; - params_visitor.visit(table_function_node); + auto nodes = arg_func->getArguments().getNodes(); + if (nodes.size() != 2) + return; + + if (auto * identifier_node = nodes[0]->as()) + { + resolveExpressionNode(nodes[1], scope, /* allow_lambda_expression */false, /* allow_table_function */false); + if (auto * constant = nodes[1]->as()) + { + view_params[identifier_node->getIdentifier().getFullName()] = constant->getValueStringRepresentation(); + } + } + } + } auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, - params_visitor.getParametersMap()); + view_params); if (parametrized_view_storage) { From 8f81dc49d3d1bdf349dedff52a512f520a3d3714 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 24 Jun 2024 16:50:45 +0200 Subject: [PATCH 60/95] fix d-tor in WriteBufferFromS3 --- src/IO/WriteBufferFromS3.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 7ac16c4c76b..3682e49b018 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -270,12 +270,15 @@ WriteBufferFromS3::~WriteBufferFromS3() LOG_TRACE(limitedLog, "Close WriteBufferFromS3. {}.", getShortLogDetails()); if (canceled) + { LOG_INFO( log, "WriteBufferFromS3 was canceled." "The file might not be written to S3. " "{}.", getVerboseLogDetails()); + return; + } /// That destructor could be call with finalized=false in case of exceptions if (!finalized && !canceled) From 13c2f7221dc39c5b51e39ddaec0b9d24754c12e7 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:25:06 +0200 Subject: [PATCH 61/95] Fix build --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index f731e573135..64ddf59089a 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -4530,6 +4530,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, } } + auto context = scope_context->getQueryContext(); auto parametrized_view_storage = context->buildParametrizedViewStorage( database_name, table_name, From 96a64620597696fbaada411966c983b28ee04056 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 24 Jun 2024 21:12:06 +0200 Subject: [PATCH 62/95] fix CascadeWriteBuffer --- .../gtest_cascade_and_memory_write_buffer.cpp | 2 +- src/IO/CascadeWriteBuffer.cpp | 16 +++++++++++++++- src/IO/CascadeWriteBuffer.h | 3 ++- src/IO/MemoryReadWriteBuffer.cpp | 2 +- src/IO/MemoryReadWriteBuffer.h | 7 ------- src/IO/WriteBuffer.h | 16 ++++++++++++++++ src/IO/WriteBufferFromVector.h | 3 ++- 7 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp b/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp index 23b783173c8..f5d34f7f70c 100644 --- a/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp +++ b/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp @@ -222,7 +222,7 @@ TEST(MemoryWriteBuffer, WriteAndReread) if (s > 1) { MemoryWriteBuffer buf(s - 1); - EXPECT_THROW(buf.write(data.data(), data.size()), 
MemoryWriteBuffer::CurrentBufferExhausted); + EXPECT_THROW(buf.write(data.data(), data.size()), WriteBuffer::CurrentBufferExhausted); buf.finalize(); } } diff --git a/src/IO/CascadeWriteBuffer.cpp b/src/IO/CascadeWriteBuffer.cpp index 91a42e77fdb..4542ffc88f7 100644 --- a/src/IO/CascadeWriteBuffer.cpp +++ b/src/IO/CascadeWriteBuffer.cpp @@ -36,7 +36,7 @@ void CascadeWriteBuffer::nextImpl() curr_buffer->position() = position(); curr_buffer->next(); } - catch (const MemoryWriteBuffer::CurrentBufferExhausted &) + catch (const WriteBuffer::CurrentBufferExhausted &) { if (curr_buffer_num < num_sources) { @@ -83,6 +83,20 @@ void CascadeWriteBuffer::finalizeImpl() } } +void CascadeWriteBuffer::cancelImpl() noexcept +{ + if (curr_buffer) + curr_buffer->position() = position(); + + for (auto & buf : prepared_sources) + { + if (buf) + { + buf->cancel(); + } + } +} + WriteBuffer * CascadeWriteBuffer::setNextBuffer() { if (first_lazy_source_num <= curr_buffer_num && curr_buffer_num < num_sources) diff --git a/src/IO/CascadeWriteBuffer.h b/src/IO/CascadeWriteBuffer.h index a003d11bd8a..7a8b11c6a87 100644 --- a/src/IO/CascadeWriteBuffer.h +++ b/src/IO/CascadeWriteBuffer.h @@ -16,7 +16,7 @@ namespace ErrorCodes * (lazy_sources contains not pointers themself, but their delayed constructors) * * Firtly, CascadeWriteBuffer redirects data to first buffer of the sequence - * If current WriteBuffer cannot receive data anymore, it throws special exception MemoryWriteBuffer::CurrentBufferExhausted in nextImpl() body, + * If current WriteBuffer cannot receive data anymore, it throws special exception WriteBuffer::CurrentBufferExhausted in nextImpl() body, * CascadeWriteBuffer prepare next buffer and continuously redirects data to it. * If there are no buffers anymore CascadeWriteBuffer throws an exception. * @@ -48,6 +48,7 @@ public: private: void finalizeImpl() override; + void cancelImpl() noexcept override; WriteBuffer * setNextBuffer(); diff --git a/src/IO/MemoryReadWriteBuffer.cpp b/src/IO/MemoryReadWriteBuffer.cpp index 1f4d350f083..c79ee1d6f58 100644 --- a/src/IO/MemoryReadWriteBuffer.cpp +++ b/src/IO/MemoryReadWriteBuffer.cpp @@ -112,7 +112,7 @@ void MemoryWriteBuffer::addChunk() if (0 == next_chunk_size) { set(position(), 0); - throw MemoryWriteBuffer::CurrentBufferExhausted(); + throw WriteBuffer::CurrentBufferExhausted(); } } diff --git a/src/IO/MemoryReadWriteBuffer.h b/src/IO/MemoryReadWriteBuffer.h index d7ca992aa44..feb1499d12f 100644 --- a/src/IO/MemoryReadWriteBuffer.h +++ b/src/IO/MemoryReadWriteBuffer.h @@ -16,13 +16,6 @@ namespace DB class MemoryWriteBuffer : public WriteBuffer, public IReadableWriteBuffer, boost::noncopyable, private Allocator { public: - /// Special exception to throw when the current WriteBuffer cannot receive data - class CurrentBufferExhausted : public std::exception - { - public: - const char * what() const noexcept override { return "MemoryWriteBuffer limit is exhausted"; } - }; - /// Use max_total_size_ = 0 for unlimited storage explicit MemoryWriteBuffer( size_t max_total_size_ = 0, diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 8b429891567..5b9381334a8 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -20,6 +20,7 @@ namespace ErrorCodes } + /** A simple abstract class for buffered data writing (char sequences) somewhere. * Unlike std::ostream, it provides access to the internal buffer, * and also allows you to manually manage the position inside the buffer. 
@@ -29,6 +30,14 @@ namespace ErrorCodes class WriteBuffer : public BufferBase { public: + /// Special exception to throw when the current WriteBuffer cannot receive data + /// It is used in MemoryWriteBuffer and CascadeWriteBuffer + class CurrentBufferExhausted : public std::exception + { + public: + const char * what() const noexcept override { return "WriteBuffer limit is exhausted"; } + }; + using BufferBase::set; using BufferBase::position; void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); } @@ -52,6 +61,13 @@ public: { nextImpl(); } + catch (const CurrentBufferExhausted &) + { + pos = working_buffer.begin(); + bytes += bytes_in_buffer; + + throw; + } catch (...) { /** If the nextImpl() call was unsuccessful, move the cursor to the beginning, diff --git a/src/IO/WriteBufferFromVector.h b/src/IO/WriteBufferFromVector.h index 1ea32af2968..17a329d401d 100644 --- a/src/IO/WriteBufferFromVector.h +++ b/src/IO/WriteBufferFromVector.h @@ -63,7 +63,8 @@ public: ~WriteBufferFromVector() override { - finalize(); + if (!canceled) + finalize(); } private: From 1ab53553657a12e6fbdbe6614e3b6be4908958d2 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 24 Jun 2024 20:16:19 +0100 Subject: [PATCH 63/95] fix --- src/Processors/Transforms/MergeJoinTransform.cpp | 3 ++- .../queries/0_stateless/03174_merge_join_bug.reference | 10 ++++++++++ tests/queries/0_stateless/03174_merge_join_bug.sql | 10 ++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03174_merge_join_bug.reference create mode 100644 tests/queries/0_stateless/03174_merge_join_bug.sql diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp index 159a3244fe9..8d3cae03369 100644 --- a/src/Processors/Transforms/MergeJoinTransform.cpp +++ b/src/Processors/Transforms/MergeJoinTransform.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include @@ -19,6 +18,7 @@ #include #include #include +#include #include @@ -260,6 +260,7 @@ void FullMergeJoinCursor::setChunk(Chunk && chunk) return; } + convertToFullIfSparse(chunk); current_chunk = std::move(chunk); cursor = SortCursorImpl(sample_block, current_chunk.getColumns(), desc); } diff --git a/tests/queries/0_stateless/03174_merge_join_bug.reference b/tests/queries/0_stateless/03174_merge_join_bug.reference new file mode 100644 index 00000000000..af98bcd6397 --- /dev/null +++ b/tests/queries/0_stateless/03174_merge_join_bug.reference @@ -0,0 +1,10 @@ +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 diff --git a/tests/queries/0_stateless/03174_merge_join_bug.sql b/tests/queries/0_stateless/03174_merge_join_bug.sql new file mode 100644 index 00000000000..026c352c2b3 --- /dev/null +++ b/tests/queries/0_stateless/03174_merge_join_bug.sql @@ -0,0 +1,10 @@ +-- Tags: no-random-settings + +-- https://github.com/ClickHouse/ClickHouse/issues/24395 +SET allow_experimental_analyzer=1, join_algorithm = 'full_sorting_merge'; +CREATE TABLE xxxx_yyy (key UInt32, key_b ALIAS key) ENGINE=MergeTree() ORDER BY key SETTINGS ratio_of_defaults_for_sparse_serialization=0.0; +INSERT INTO xxxx_yyy SELECT number FROM numbers(10); + +SELECT * +FROM xxxx_yyy AS a +INNER JOIN xxxx_yyy AS b ON a.key = b.key_b; From 8e0a0b0a759949d06652f93b40cd88b1104d02c2 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 24 Jun 2024 22:04:34 +0100 Subject: [PATCH 64/95] illusion of safety --- src/Processors/Transforms/MergeJoinTransform.cpp | 12 ++++++++++-- 
src/Processors/Transforms/MergeJoinTransform.h | 6 +----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp index 8d3cae03369..fb3b2faa9c5 100644 --- a/src/Processors/Transforms/MergeJoinTransform.cpp +++ b/src/Processors/Transforms/MergeJoinTransform.cpp @@ -40,7 +40,7 @@ FullMergeJoinCursorPtr createCursor(const Block & block, const Names & columns) desc.reserve(columns.size()); for (const auto & name : columns) desc.emplace_back(name); - return std::make_unique(materializeBlock(block), desc); + return std::make_unique(block, desc); } template @@ -234,9 +234,14 @@ void inline addMany(PaddedPODArray & left_or_right_map, size_t idx, size for (size_t i = 0; i < num; ++i) left_or_right_map.push_back(idx); } - } +FullMergeJoinCursor::FullMergeJoinCursor(const Block & sample_block_, const SortDescription & description_) + : sample_block(materializeBlock(sample_block_).cloneEmpty()), desc(description_) +{ +} + + const Chunk & FullMergeJoinCursor::getCurrent() const { return current_chunk; @@ -260,7 +265,10 @@ void FullMergeJoinCursor::setChunk(Chunk && chunk) return; } + // should match the structure of sample_block (after materialization) + convertToFullIfConst(chunk); convertToFullIfSparse(chunk); + current_chunk = std::move(chunk); cursor = SortCursorImpl(sample_block, current_chunk.getColumns(), desc); } diff --git a/src/Processors/Transforms/MergeJoinTransform.h b/src/Processors/Transforms/MergeJoinTransform.h index cf9331abd59..5ca6b076544 100644 --- a/src/Processors/Transforms/MergeJoinTransform.h +++ b/src/Processors/Transforms/MergeJoinTransform.h @@ -193,11 +193,7 @@ private: class FullMergeJoinCursor : boost::noncopyable { public: - explicit FullMergeJoinCursor(const Block & sample_block_, const SortDescription & description_) - : sample_block(sample_block_.cloneEmpty()) - , desc(description_) - { - } + explicit FullMergeJoinCursor(const Block & sample_block_, const SortDescription & description_); bool fullyCompleted() const; void setChunk(Chunk && chunk); From d66aedf3dbddcd2e9ca3fd7ef5c836bdd1af1d03 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 25 Jun 2024 02:52:42 +0000 Subject: [PATCH 65/95] Fix overflow in StorageWindowView --- src/Storages/WindowView/StorageWindowView.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index a6e7ce301a0..eea9f9ad38f 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -1068,9 +1068,10 @@ void StorageWindowView::threadFuncFireProc() if (max_watermark >= timestamp_now) clean_cache_task->schedule(); + UInt64 next_fire_ms = static_cast(next_fire_signal) * 1000; UInt64 timestamp_ms = static_cast(Poco::Timestamp().epochMicroseconds()) / 1000; if (!shutdown_called) - fire_task->scheduleAfter(std::max(UInt64(0), static_cast(next_fire_signal) * 1000 - timestamp_ms)); + fire_task->scheduleAfter(std::max(UInt64(0), next_fire_ms - std::min(next_fire_ms, timestamp_ms))); } void StorageWindowView::threadFuncFireEvent() From e4c84cf4672bebf9809e06d011889e84be429863 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 25 Jun 2024 08:31:24 +0000 Subject: [PATCH 66/95] Fix inconsistent AST formatting when a keyword is used as type name --- src/Parsers/ParserCreateQuery.h | 1 + src/Parsers/ParserDataType.cpp | 19 ++++++++++++++++++- 
...3168_inconsistent_ast_formatting.reference | 0 .../03168_inconsistent_ast_formatting.sql | 4 ++++ 4 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference create mode 100644 tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index 5f6df33176f..bb37491a366 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -213,6 +213,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E return res; }; + /// Keep this list of keywords in sync with ParserDataType::parseImpl(). if (!null_check_without_moving() && !s_default.checkWithoutMoving(pos, expected) && !s_materialized.checkWithoutMoving(pos, expected) diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index b5bc9f89990..ad33c7e4558 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -103,12 +104,28 @@ bool ParserDataType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return false; tryGetIdentifierNameInto(identifier, type_name); - /// Don't accept things like Array(`x.y`). + /// When parsing we accept quoted type names (e.g. `UInt64`), but when formatting we print them + /// unquoted (e.g. UInt64). This introduces problems when the string in the quotes is garbage: + /// * Array(`x.y`) -> Array(x.y) -> fails to parse + /// * `Null` -> Null -> parses as keyword instead of type name + /// Here we check for these cases and reject. if (!std::all_of(type_name.begin(), type_name.end(), [](char c) { return isWordCharASCII(c) || c == '$'; })) { expected.add(pos, "type name"); return false; } + /// Keywords that IParserColumnDeclaration recognizes before the type name. + /// E.g. reject CREATE TABLE a (x `Null`) because in "x Null" the Null would be parsed as + /// column attribute rather than type name. + { + String n = type_name; + boost::to_upper(n); + if (n == "NOT" || n == "NULL" || n == "DEFAULT" || n == "MATERIALIZED" || n == "EPHEMERAL" || n == "ALIAS" || n == "AUTO" || n == "PRIMARY" || n == "COMMENT" || n == "CODEC") + { + expected.add(pos, "type name"); + return false; + } + } String type_name_upper = Poco::toUpper(type_name); String type_name_suffix; diff --git a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql new file mode 100644 index 00000000000..d43d46d5b14 --- /dev/null +++ b/tests/queries/0_stateless/03168_inconsistent_ast_formatting.sql @@ -0,0 +1,4 @@ +create table a (x `Null`); -- { clientError SYNTAX_ERROR } +create table a (x f(`Null`)); -- { clientError SYNTAX_ERROR } +create table a (x Enum8(f(`Null`, 'World', 2))); -- { clientError SYNTAX_ERROR } +create table a (`value2` Enum8('Hello' = 1, equals(`Null`, 'World', 2), '!' 
= 3)); -- { clientError SYNTAX_ERROR } \ No newline at end of file From 8f8ca44102c31ea189e732cbb21235301a09ef6c Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 25 Jun 2024 11:58:52 +0200 Subject: [PATCH 67/95] Fix bug --- src/Storages/S3Queue/S3QueueSource.cpp | 18 +++++++++++++++--- src/Storages/S3Queue/S3QueueSource.h | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 4 ++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index a73a7c34a6a..e41d5917105 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -126,14 +126,26 @@ void StorageS3QueueSource::FileIterator::returnForRetry(Source::ObjectInfoPtr ob } } -void StorageS3QueueSource::FileIterator::releaseHoldBuckets() +void StorageS3QueueSource::FileIterator::releaseFinishedBuckets() { for (const auto & [processor, holders] : bucket_holders) { LOG_TEST(log, "Releasing {} bucket holders for processor {}", holders.size(), processor); - for (const auto & bucket_holder : holders) - bucket_holder->release(); + for (auto it = holders.begin(); it != holders.end(); ++it) + { + const auto & holder = *it; + if (!holder->isFinished()) + { + /// Only the last holder in the list of holders can be non-finished. + chassert(std::next(it) == holders.end()); + + /// Do not release non-finished bucket holder. We will continue processing it. + LOG_TEST(log, "Bucket {} is not finished yet, will not release it", holder->getBucketInfo()->bucket); + break; + } + holder->release(); + } } } diff --git a/src/Storages/S3Queue/S3QueueSource.h b/src/Storages/S3Queue/S3QueueSource.h index f628379f719..bfa1c358fa9 100644 --- a/src/Storages/S3Queue/S3QueueSource.h +++ b/src/Storages/S3Queue/S3QueueSource.h @@ -60,7 +60,7 @@ public: /// In fact, they could be released in destructors of BucketHolder, /// but we anyway try to release them explicitly, /// because we want to be able to rethrow exceptions if they might happen. - void releaseHoldBuckets(); + void releaseFinishedBuckets(); private: using Bucket = S3QueueMetadata::Bucket; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 70bc7bdbe91..b1253516f17 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -522,14 +522,14 @@ bool StorageS3Queue::streamToViews() for (auto & source : sources) source->commit(/* success */false, getCurrentExceptionMessage(true)); - file_iterator->releaseHoldBuckets(); + file_iterator->releaseFinishedBuckets(); throw; } for (auto & source : sources) source->commit(/* success */true); - file_iterator->releaseHoldBuckets(); + file_iterator->releaseFinishedBuckets(); total_rows += rows; } From 4b5b149167943d79e3c93211768e6ca1184b68b7 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 25 Jun 2024 12:13:50 +0200 Subject: [PATCH 68/95] fix style --- src/IO/WriteBuffer.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 5b9381334a8..cee033c420f 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -20,7 +20,6 @@ namespace ErrorCodes } - /** A simple abstract class for buffered data writing (char sequences) somewhere. * Unlike std::ostream, it provides access to the internal buffer, * and also allows you to manually manage the position inside the buffer. 
@@ -94,7 +93,6 @@ public: next(); } - void write(const char * from, size_t n) { if (finalized) From 7d23eb9f26697a83a1fc642f99babe41d50d7d25 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 25 Jun 2024 12:46:19 +0100 Subject: [PATCH 69/95] fix test --- tests/queries/0_stateless/03174_merge_join_bug.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03174_merge_join_bug.sql b/tests/queries/0_stateless/03174_merge_join_bug.sql index 026c352c2b3..9e31d0f7c1a 100644 --- a/tests/queries/0_stateless/03174_merge_join_bug.sql +++ b/tests/queries/0_stateless/03174_merge_join_bug.sql @@ -7,4 +7,5 @@ INSERT INTO xxxx_yyy SELECT number FROM numbers(10); SELECT * FROM xxxx_yyy AS a -INNER JOIN xxxx_yyy AS b ON a.key = b.key_b; +INNER JOIN xxxx_yyy AS b ON a.key = b.key_b +ORDER BY a.key; From 13a009684031924819fb8889e8a9e9c478eb5a21 Mon Sep 17 00:00:00 2001 From: Max K Date: Tue, 25 Jun 2024 13:11:04 +0200 Subject: [PATCH 70/95] update backport and release workflow to make report for limited number of builds --- .github/workflows/backport_branches.yml | 13 ++++++++----- .github/workflows/release_branches.yml | 15 +++++++++------ tests/ci/build_report_check.py | 5 ++++- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index df19b45aa8b..0dd46f2ce9d 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -163,11 +163,14 @@ jobs: # run report check for failed builds to indicate the CI error if: ${{ !cancelled() && needs.RunConfig.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'Builds') }} needs: [RunConfig, BuilderDebAarch64, BuilderDebAsan, BuilderDebDebug, BuilderDebRelease, BuilderDebTsan, BuilderBinDarwin, BuilderBinDarwinAarch64] - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Builds - runner_type: style-checker-aarch64 - data: ${{ needs.RunConfig.outputs.data }} + runs-on: [self-hosted, style-checker-aarch64] + steps: + - name: Check out repository code + uses: ClickHouse/checkout@v1 + - name: Builds report + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 ./build_report_check.py --reports package_release package_aarch64 package_asan package_tsan package_debug binary_darwin binary_darwin_aarch64 ############################################################################################ #################################### INSTALL PACKAGES ###################################### ############################################################################################ diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index cecc5f754aa..831d29238dc 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -179,12 +179,15 @@ jobs: Builds_Report: # run report check for failed builds to indicate the CI error if: ${{ !cancelled() && needs.RunConfig.result == 'success' && contains(fromJson(needs.RunConfig.outputs.data).jobs_data.jobs_to_do, 'Builds') }} - needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64, BuilderDebAsan, BuilderDebTsan, BuilderDebUBsan, BuilderDebMsan, BuilderDebDebug, BuilderBinDarwin, BuilderBinDarwinAarch64] - uses: ./.github/workflows/reusable_test.yml - with: - test_name: Builds - runner_type: style-checker-aarch64 - data: ${{ needs.RunConfig.outputs.data }} + needs: [RunConfig, BuilderDebRelease, BuilderDebAarch64, BuilderDebAsan, 
BuilderDebUBsan, BuilderDebMsan, BuilderDebTsan, BuilderDebDebug, BuilderBinDarwin, BuilderBinDarwinAarch64] + runs-on: [self-hosted, style-checker-aarch64] + steps: + - name: Check out repository code + uses: ClickHouse/checkout@v1 + - name: Builds report + run: | + cd "$GITHUB_WORKSPACE/tests/ci" + python3 ./build_report_check.py --reports package_release package_aarch64 package_asan package_msan package_ubsan package_tsan package_debug binary_darwin binary_darwin_aarch64 MarkReleaseReady: if: ${{ !failure() && !cancelled() }} needs: diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 664f6a7cbb9..04c8d12fc30 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -64,11 +64,14 @@ def main(): + ci_config["jobs_data"]["jobs_to_do"] ) builds_for_check = [job for job in CI.BuildNames if job in all_ci_jobs] - print(f"NOTE: following build reports will be checked: [{builds_for_check}]") + print("NOTE: builds for check taken from ci configuration") else: builds_for_check = parse_args().reports for job in builds_for_check: assert job in CI.BuildNames, "Builds must be known build job names" + print("NOTE: builds for check taken from input arguments") + + print(f"NOTE: following build reports will be checked: [{builds_for_check}]") required_builds = len(builds_for_check) missing_builds = 0 From 5c4beb03f36617e4a04508e8d1531309766cd69e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 25 Jun 2024 14:41:32 +0200 Subject: [PATCH 71/95] Try to fix tests --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 64ddf59089a..446a27ac40b 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -4524,7 +4526,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, resolveExpressionNode(nodes[1], scope, /* allow_lambda_expression */false, /* allow_table_function */false); if (auto * constant = nodes[1]->as()) { - view_params[identifier_node->getIdentifier().getFullName()] = constant->getValueStringRepresentation(); + view_params[identifier_node->getIdentifier().getFullName()] = convertFieldToString(constant->getValue()); } } } From f62f182d694c961fad5dcaae737604eb1d80c285 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:05:34 +0200 Subject: [PATCH 72/95] Update Context.h --- src/Interpreters/Context.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index a3157627703..4b56ee4a5ae 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -147,7 +147,6 @@ class ServerType; template class MergeTreeBackgroundExecutor; class AsyncLoader; -struct ScopeAliases; struct TemporaryTableHolder; using TemporaryTablesMapping = std::map>; From 9481b665cc15fcb30a182ceba7861d07b400ab47 Mon Sep 17 00:00:00 2001 From: Max K Date: Tue, 25 Jun 2024 15:22:01 +0200 Subject: [PATCH 73/95] CI: Single point of setting mergeable check status --- .github/workflows/backport_branches.yml | 3 + .github/workflows/merge_queue.yml | 15 ++-- .github/workflows/release_branches.yml | 3 + tests/ci/ci_definitions.py | 8 +- tests/ci/commit_status_helper.py | 99 +++++++++---------------- tests/ci/finish_check.py | 29 -------- tests/ci/merge_pr.py | 26 +++++-- tests/ci/pr_info.py 
| 31 ++++++++ tests/ci/s3_helper.py | 2 +- 9 files changed, 107 insertions(+), 109 deletions(-) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index e1980ec9ef2..234600c7463 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -273,5 +273,8 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" + # update mergeable check + python3 merge_pr.py --set-ci-status --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} + # update overall ci report python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} python3 merge_pr.py diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index cfa01b0e8f3..01685ee1f5a 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -96,20 +96,15 @@ jobs: stage: Tests_1 data: ${{ needs.RunConfig.outputs.data }} - ################################# Stage Final ################################# - # - FinishCheck: - if: ${{ !cancelled() }} + CheckReadyForMerge: + if: ${{ !cancelled() && needs.StyleCheck.result == 'success' }} + # Test_2 or Test_3 must not have jobs required for Mergeable check needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Tests_1] runs-on: [self-hosted, style-checker-aarch64] steps: - name: Check out repository code uses: ClickHouse/checkout@v1 - - name: Check sync status + - name: Check and set merge status run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 sync_pr.py --status - - name: Finish label - run: | - cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} + python3 merge_pr.py --set-ci-status --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 45eb7431bb4..ebe8195cddb 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -496,4 +496,7 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" + # update mergeable check + python3 merge_pr.py --set-ci-status --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} + # update overall ci report python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/tests/ci/ci_definitions.py b/tests/ci/ci_definitions.py index d60c67d318d..316f460a2b7 100644 --- a/tests/ci/ci_definitions.py +++ b/tests/ci/ci_definitions.py @@ -214,8 +214,12 @@ class StatusNames(metaclass=WithIter): class SyncState(metaclass=WithIter): - PENDING = "awaiting merge" - MERGE_FAILED = "merge failed" + PENDING = "awaiting sync" + # temporary state if GH does not know mergeable state + MERGE_UNKNOWN = "unknown state (might be auto recoverable)" + # changes cannot be pushed/merged to a sync branch + PUSH_FAILED = "push failed" + MERGE_CONFLICTS = "merge conflicts" TESTING = "awaiting test results" TESTS_FAILED = "tests failed" COMPLETED = "completed" diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 96cf700ed77..fdc9c002b66 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -18,8 +18,7 @@ from github.IssueComment import IssueComment from github.Repository import Repository from ci_config import CI -from env_helper import GITHUB_REPOSITORY, 
GITHUB_UPSTREAM_REPOSITORY, TEMP_PATH -from lambda_shared_package.lambda_shared.pr import Labels +from env_helper import GITHUB_REPOSITORY, TEMP_PATH from pr_info import PRInfo from report import ( ERROR, @@ -29,7 +28,6 @@ from report import ( StatusType, TestResult, TestResults, - get_status, get_worst_status, ) from s3_helper import S3Helper @@ -103,7 +101,12 @@ def post_commit_status( if i == RETRY - 1: raise ex time.sleep(i) - if pr_info: + if pr_info and check_name not in ( + CI.StatusNames.MERGEABLE, + CI.StatusNames.CI, + CI.StatusNames.PR_CHECK, + CI.StatusNames.SYNC, + ): status_updated = False for i in range(RETRY): try: @@ -157,6 +160,17 @@ def set_status_comment(commit: Commit, pr_info: PRInfo) -> None: gh.__requester = commit._requester # type:ignore #pylint:disable=protected-access repo = get_repo(gh) statuses = sorted(get_commit_filtered_statuses(commit), key=lambda x: x.context) + statuses = [ + status + for status in statuses + if status.context + not in ( + CI.StatusNames.MERGEABLE, + CI.StatusNames.CI, + CI.StatusNames.PR_CHECK, + CI.StatusNames.SYNC, + ) + ] if not statuses: return @@ -439,29 +453,10 @@ def set_mergeable_check( ) -def update_mergeable_check(commit: Commit, pr_info: PRInfo, check_name: str) -> None: - "check if the check_name in REQUIRED_CHECKS and then trigger update" - not_run = ( - pr_info.labels.intersection({Labels.SKIP_MERGEABLE_CHECK, Labels.RELEASE}) - or not CI.is_required(check_name) - or pr_info.release_pr - or pr_info.number == 0 - ) - - if not_run: - # Let's avoid unnecessary work - return - - logging.info("Update Mergeable Check by %s", check_name) - - statuses = get_commit_filtered_statuses(commit) - trigger_mergeable_check(commit, statuses) - - def trigger_mergeable_check( commit: Commit, statuses: CommitStatuses, - set_if_green: bool = False, + set_from_sync: bool = False, workflow_failed: bool = False, ) -> StatusType: """calculate and update StatusNames.MERGEABLE""" @@ -501,63 +496,43 @@ def trigger_mergeable_check( description = format_description(description) - if not set_if_green and state == SUCCESS: - # do not set green Mergeable Check status - pass - else: - if mergeable_status is None or mergeable_status.description != description: + if set_from_sync: + # update Mergeable Check from sync WF only if its status already present or its new status is not SUCCESS + # to avoid false-positives + if mergeable_status or state != SUCCESS: set_mergeable_check(commit, description, state) + elif mergeable_status is None or mergeable_status.description != description: + set_mergeable_check(commit, description, state) return state def update_upstream_sync_status( - upstream_pr_number: int, - sync_pr_number: int, - gh: Github, + pr_info: PRInfo, state: StatusType, - can_set_green_mergeable_status: bool = False, ) -> None: - upstream_repo = gh.get_repo(GITHUB_UPSTREAM_REPOSITORY) - upstream_pr = upstream_repo.get_pull(upstream_pr_number) - sync_repo = gh.get_repo(GITHUB_REPOSITORY) - sync_pr = sync_repo.get_pull(sync_pr_number) - # Find the commit that is in both repos, upstream and cloud - sync_commits = sync_pr.get_commits().reversed - upstream_commits = upstream_pr.get_commits().reversed - # Github objects are compared by _url attribute. 
We can't compare them directly and - # should compare commits by SHA1 - upstream_shas = [c.sha for c in upstream_commits] - logging.info("Commits in upstream PR:\n %s", ", ".join(upstream_shas)) - sync_shas = [c.sha for c in sync_commits] - logging.info("Commits in sync PR:\n %s", ", ".join(reversed(sync_shas))) + last_synced_upstream_commit = pr_info.get_latest_sync_commit() - # find latest synced commit - last_synced_upstream_commit = None - for commit in upstream_commits: - if commit.sha in sync_shas: - last_synced_upstream_commit = commit - break - - assert last_synced_upstream_commit - - sync_status = get_status(state) logging.info( - "Using commit %s to post the %s status `%s`: [%s]", + "Using commit [%s] to post the [%s] status [%s]", last_synced_upstream_commit.sha, - sync_status, + state, CI.StatusNames.SYNC, - "", ) + if state == SUCCESS: + description = CI.SyncState.COMPLETED + else: + description = CI.SyncState.TESTS_FAILED + post_commit_status( last_synced_upstream_commit, - sync_status, - "", + state, "", + description, CI.StatusNames.SYNC, ) trigger_mergeable_check( last_synced_upstream_commit, get_commit_filtered_statuses(last_synced_upstream_commit), - set_if_green=can_set_green_mergeable_status, + set_from_sync=True, ) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index bd83f875790..385caccc8cd 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -9,15 +9,10 @@ from commit_status_helper import ( get_commit, get_commit_filtered_statuses, post_commit_status, - set_mergeable_check, - trigger_mergeable_check, - update_upstream_sync_status, ) -from env_helper import GITHUB_REPOSITORY, GITHUB_UPSTREAM_REPOSITORY from get_robot_token import get_best_robot_token from pr_info import PRInfo from report import FAILURE, PENDING, SUCCESS, StatusType -from synchronizer_utils import SYNC_BRANCH_PREFIX def parse_args() -> argparse.Namespace: @@ -45,31 +40,7 @@ def main(): gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) - if pr_info.is_merge_queue: - # in MQ Mergeable check status must never be green if any failures in the workflow - if has_workflow_failures: - set_mergeable_check(commit, "workflow failed", FAILURE) - else: - # This must be the only place where green MCheck is set in the MQ (in the end of CI) to avoid early merge - set_mergeable_check(commit, "workflow passed", SUCCESS) - return - statuses = get_commit_filtered_statuses(commit) - state = trigger_mergeable_check(commit, statuses, set_if_green=True) - - # Process upstream StatusNames.SYNC - if ( - pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") - and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY - ): - upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) - update_upstream_sync_status( - upstream_pr_number, - pr_info.number, - gh, - state, - can_set_green_mergeable_status=True, - ) ci_running_statuses = [s for s in statuses if s.context == CI.StatusNames.CI] if not ci_running_statuses: diff --git a/tests/ci/merge_pr.py b/tests/ci/merge_pr.py index e1c7bf94ff5..37c08fc4efe 100644 --- a/tests/ci/merge_pr.py +++ b/tests/ci/merge_pr.py @@ -4,6 +4,7 @@ import argparse import logging +import sys from datetime import datetime from os import getenv from pprint import pformat @@ -17,11 +18,14 @@ from commit_status_helper import ( get_commit_filtered_statuses, get_commit, trigger_mergeable_check, + update_upstream_sync_status, ) from get_robot_token import get_best_robot_token from github_helper import GitHub, NamedUser, 
PullRequest, Repository from pr_info import PRInfo -from report import SUCCESS +from report import SUCCESS, FAILURE +from env_helper import GITHUB_UPSTREAM_REPOSITORY, GITHUB_REPOSITORY +from synchronizer_utils import SYNC_BRANCH_PREFIX # The team name for accepted approvals TEAM_NAME = getenv("GITHUB_TEAM_NAME", "core") @@ -243,17 +247,29 @@ def main(): repo = gh.get_repo(args.repo) if args.set_ci_status: - assert args.wf_status in ("failure", "success") + assert args.wf_status in (FAILURE, SUCCESS) # set mergeable check status and exit commit = get_commit(gh, args.pr_info.sha) statuses = get_commit_filtered_statuses(commit) - trigger_mergeable_check( + state = trigger_mergeable_check( commit, statuses, - set_if_green=True, workflow_failed=(args.wf_status != "success"), ) - return + + # Process upstream StatusNames.SYNC + pr_info = PRInfo() + if ( + pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") + and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY + ): + print("Updating upstream statuses") + update_upstream_sync_status(pr_info, state) + + if args.wf_status != "success": + # exit with 1 to rerun on workflow failed job restart + sys.exit(1) + sys.exit(0) # An ugly and not nice fix to patch the wrong organization URL, # see https://github.com/PyGithub/PyGithub/issues/2395#issuecomment-1378629710 diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index fb25a29cc57..a411fc4e8f6 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -13,8 +13,11 @@ from env_helper import ( GITHUB_REPOSITORY, GITHUB_RUN_URL, GITHUB_SERVER_URL, + GITHUB_UPSTREAM_REPOSITORY, ) from lambda_shared_package.lambda_shared.pr import Labels +from get_robot_token import get_best_robot_token +from github_helper import GitHub NeedsDataType = Dict[str, Dict[str, Union[str, Dict[str, str]]]] @@ -432,6 +435,34 @@ class PRInfo: return True return False + def get_latest_sync_commit(self): + gh = GitHub(get_best_robot_token(), per_page=100) + assert self.head_ref.startswith("sync-upstream/pr/") + assert self.repo_full_name != GITHUB_UPSTREAM_REPOSITORY + upstream_repo = gh.get_repo(GITHUB_UPSTREAM_REPOSITORY) + upstream_pr_number = int(self.head_ref.split("/pr/", maxsplit=1)[1]) + upstream_pr = upstream_repo.get_pull(upstream_pr_number) + sync_repo = gh.get_repo(GITHUB_REPOSITORY) + sync_pr = sync_repo.get_pull(self.number) + # Find the commit that is in both repos, upstream and cloud + sync_commits = sync_pr.get_commits().reversed + upstream_commits = upstream_pr.get_commits().reversed + # Github objects are compared by _url attribute. We can't compare them directly and + # should compare commits by SHA1 + upstream_shas = [c.sha for c in upstream_commits] + logging.info("Commits in upstream PR:\n %s", ", ".join(upstream_shas)) + sync_shas = [c.sha for c in sync_commits] + logging.info("Commits in sync PR:\n %s", ", ".join(reversed(sync_shas))) + + # find latest synced commit + last_synced_upstream_commit = None + for commit in upstream_commits: + if commit.sha in sync_shas: + last_synced_upstream_commit = commit + break + assert last_synced_upstream_commit + return last_synced_upstream_commit + class FakePRInfo: def __init__(self): diff --git a/tests/ci/s3_helper.py b/tests/ci/s3_helper.py index da0ccb2b74d..86656e6e7c0 100644 --- a/tests/ci/s3_helper.py +++ b/tests/ci/s3_helper.py @@ -104,7 +104,7 @@ class S3Helper: self.client.upload_file(file_path, bucket_name, s3_path, ExtraArgs=metadata) url = self.s3_url(bucket_name, s3_path) - logging.info("Upload %s to %s. 
Meta: %s", file_path, url, metadata) + logging.info("Upload %s to %s Meta: %s", file_path, url, metadata) return url def delete_file_from_s3(self, bucket_name: str, s3_path: str) -> None: From 52496c69e26607575b9288a277846e9ba197ae14 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 25 Jun 2024 15:31:21 +0200 Subject: [PATCH 74/95] Follow up fix --- src/Storages/S3Queue/S3QueueSource.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Storages/S3Queue/S3QueueSource.cpp b/src/Storages/S3Queue/S3QueueSource.cpp index e41d5917105..3a611ece51b 100644 --- a/src/Storages/S3Queue/S3QueueSource.cpp +++ b/src/Storages/S3Queue/S3QueueSource.cpp @@ -135,16 +135,24 @@ void StorageS3QueueSource::FileIterator::releaseFinishedBuckets() for (auto it = holders.begin(); it != holders.end(); ++it) { const auto & holder = *it; + const auto bucket = holder->getBucketInfo()->bucket; if (!holder->isFinished()) { /// Only the last holder in the list of holders can be non-finished. chassert(std::next(it) == holders.end()); /// Do not release non-finished bucket holder. We will continue processing it. - LOG_TEST(log, "Bucket {} is not finished yet, will not release it", holder->getBucketInfo()->bucket); + LOG_TEST(log, "Bucket {} is not finished yet, will not release it", bucket); break; } + + /// Release bucket lock. holder->release(); + + /// Reset bucket processor in cached state. + auto cached_info = listed_keys_cache.find(bucket); + if (cached_info != listed_keys_cache.end()) + cached_info->second.processor.reset(); } } } From 52489a25f57b94b7b175da151bc2575d245d034b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 25 Jun 2024 17:34:58 +0200 Subject: [PATCH 75/95] add Replicated db names to zk for introspection --- src/Databases/DatabaseReplicated.cpp | 15 ++++++++ src/Databases/DatabaseReplicated.h | 2 ++ ...859_replicated_db_name_zookeeper.reference | 2 ++ .../02859_replicated_db_name_zookeeper.sh | 34 +++++++++++++++++++ 4 files changed, 53 insertions(+) create mode 100644 tests/queries/0_stateless/02859_replicated_db_name_zookeeper.reference create mode 100755 tests/queries/0_stateless/02859_replicated_db_name_zookeeper.sh diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1368251e155..72725f4c58c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -65,6 +65,7 @@ static constexpr const char * REPLICATED_DATABASE_MARK = "DatabaseReplicated"; static constexpr const char * DROPPED_MARK = "DROPPED"; static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables"; static constexpr const char * BROKEN_REPLICATED_TABLES_SUFFIX = "_broken_replicated_tables"; +static constexpr const char * FIRST_REPLICA_DATABASE_NAME = "first_replica_database_name"; static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; @@ -465,6 +466,13 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessL return; } + /// If not exist, create a node with the database name for introspection. + /// Technically, the database may have different names on different replicas, but this is not a usual case and we only save the first one + auto db_name_path = fs::path(zookeeper_path) / FIRST_REPLICA_DATABASE_NAME; + auto error_code = current_zookeeper->trySet(db_name_path, getDatabaseName()); + if (error_code == Coordination::Error::ZNONODE) + current_zookeeper->create(db_name_path, getDatabaseName(), zkutil::CreateMode::Persistent); + is_readonly = false; } catch (...) 
@@ -1382,6 +1390,13 @@ void DatabaseReplicated::drop(ContextPtr context_) } } +void DatabaseReplicated::renameDatabase(ContextPtr query_context, const String & new_name) +{ + DatabaseAtomic::renameDatabase(query_context, new_name); + auto db_name_path = fs::path(zookeeper_path) / FIRST_REPLICA_DATABASE_NAME; + getZooKeeper()->set(db_name_path, getDatabaseName()); +} + void DatabaseReplicated::stopReplication() { if (ddl_worker) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 761d6b4b503..eab5b2ff931 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -86,6 +86,8 @@ public: std::vector tryGetAreReplicasActive(const ClusterPtr & cluster_) const; + void renameDatabase(ContextPtr query_context, const String & new_name) override; + friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.reference b/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.reference new file mode 100644 index 00000000000..e7d63a6add3 --- /dev/null +++ b/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.reference @@ -0,0 +1,2 @@ +rdb1_default 1 +rdb3_default 1 diff --git a/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.sh b/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.sh new file mode 100755 index 00000000000..3c14c569257 --- /dev/null +++ b/tests/queries/0_stateless/02859_replicated_db_name_zookeeper.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --distributed_ddl_output_mode=none -q "CREATE DATABASE rdb1_$CLICKHOUSE_DATABASE ON CLUSTER test_shard_localhost ENGINE=Replicated('/clickhouse/databases/{uuid}', '{shard}', '{replica}')"; +$CLICKHOUSE_CLIENT --distributed_ddl_output_mode=none -q "CREATE DATABASE rdb2_$CLICKHOUSE_DATABASE ON CLUSTER test_shard_localhost ENGINE=Replicated('/clickhouse/databases/{uuid}', '{shard}', '{replica}')"; +$CLICKHOUSE_CLIENT --distributed_ddl_output_mode=none -q "RENAME DATABASE rdb2_$CLICKHOUSE_DATABASE to rdb3_$CLICKHOUSE_DATABASE ON CLUSTER test_shard_localhost" + +$CLICKHOUSE_CLIENT -q " +SELECT + db_name, + t1.uuid = t2.uuid +FROM +( + WITH '/clickhouse/databases/' AS prefix + SELECT + toUUID(substr(path, length(prefix) + 1)) AS uuid, + value AS db_name + FROM system.zookeeper + WHERE (path IN ( + SELECT concat(path, name) + FROM system.zookeeper + WHERE path = prefix + )) AND (name = 'first_replica_database_name') +) AS t1 +INNER JOIN system.databases AS t2 USING (uuid) +WHERE db_name like '%$CLICKHOUSE_DATABASE%' +ORDER BY db_name +" + +$CLICKHOUSE_CLIENT -q "DROP DATABASE rdb1_$CLICKHOUSE_DATABASE" +$CLICKHOUSE_CLIENT -q "DROP DATABASE rdb3_$CLICKHOUSE_DATABASE" From 818867d1fcd8a7ff0f9bd5cb1a673fc0a9fde9d7 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 25 Jun 2024 20:33:24 +0200 Subject: [PATCH 76/95] Add missing function protocol --- .../sql-reference/functions/url-functions.md | 34 +++++++++++++++++++ .../aspell-ignore/en/aspell-dict.txt | 1 + 2 files changed, 35 insertions(+) diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 8b3e4f44840..76c0141ac8b 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -818,6 +818,40 @@ The same as above, but including query 
string and fragment. Example: `/top/news.html?page=2#comments`. +### protocol + +Extracts the protocol from a URL. + +**Syntax** + +```sql +protocol(url) +``` + +**Arguments** + +- `url` — URL to extract protocol from. [String](../data-types/string.md). + +**Returned value** + +- Protocol, or an empty string if it cannot be determined. [String](../data-types/string.md). + +**Example** + +Query: + +```sql +SELECT protocol('https://clickhouse.com/'); +``` + +Result: + +```response +┌─protocol('https://clickhouse.com/')─┐ +│ https │ +└─────────────────────────────────────┘ +``` + ### queryString Returns the query string without the initial question mark, `#` and everything after `#`. diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index f1fcd19ea4a..bd8d4450da0 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2257,6 +2257,7 @@ proleptic prometheus proportionsZTest proto +protocol protobuf protobufsingle proxied From 9dadcb2e6a4f56b5175a0361f2417e98cbe5e35a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 25 Jun 2024 11:37:37 -0700 Subject: [PATCH 77/95] oops, didn't notice the max Co-authored-by: Nikolay Degterinsky <43110995+evillique@users.noreply.github.com> --- src/Storages/WindowView/StorageWindowView.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index eea9f9ad38f..77e6ee9cb24 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -1071,7 +1071,7 @@ void StorageWindowView::threadFuncFireProc() UInt64 next_fire_ms = static_cast(next_fire_signal) * 1000; UInt64 timestamp_ms = static_cast(Poco::Timestamp().epochMicroseconds()) / 1000; if (!shutdown_called) - fire_task->scheduleAfter(std::max(UInt64(0), next_fire_ms - std::min(next_fire_ms, timestamp_ms))); + fire_task->scheduleAfter(next_fire_ms - std::min(next_fire_ms, timestamp_ms)); } void StorageWindowView::threadFuncFireEvent() From 432dab7aa3aa683039159276294f54a05b0d8bb0 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 25 Jun 2024 18:45:47 +0000 Subject: [PATCH 78/95] Fix tests --- .../01559_misplaced_codec_diagnostics.reference | 1 - .../0_stateless/01559_misplaced_codec_diagnostics.sh | 7 ------- .../0_stateless/01559_misplaced_codec_diagnostics.sql | 1 + .../02302_column_decl_null_before_defaul_value.sql | 2 +- 4 files changed, 2 insertions(+), 9 deletions(-) delete mode 100755 tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sh create mode 100644 tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sql diff --git a/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.reference b/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.reference index d5bdb816bf2..e69de29bb2d 100644 --- a/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.reference +++ b/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.reference @@ -1 +0,0 @@ -Unknown data type family: CODEC diff --git a/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sh b/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sh deleted file mode 100755 index 8a3242c7036..00000000000 --- a/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck 
source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -${CLICKHOUSE_CLIENT} --query "CREATE TABLE t (c CODEC(NONE)) ENGINE = Memory" 2>&1 | grep -oF 'Unknown data type family: CODEC' | uniq diff --git a/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sql b/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sql new file mode 100644 index 00000000000..ab1cfc89be1 --- /dev/null +++ b/tests/queries/0_stateless/01559_misplaced_codec_diagnostics.sql @@ -0,0 +1 @@ +CREATE TABLE t (c CODEC(NONE)) ENGINE = Memory -- { clientError SYNTAX_ERROR } \ No newline at end of file diff --git a/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql index 3825df1e557..a2c2fc7cba2 100644 --- a/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql +++ b/tests/queries/0_stateless/02302_column_decl_null_before_defaul_value.sql @@ -56,6 +56,6 @@ ALTER TABLE null_before ALTER COLUMN id TYPE INT NULL; -- { clientError SYNTAX_E select 'modify column, NULL modifier is not allowed'; DROP TABLE IF EXISTS null_before SYNC; CREATE TABLE null_before (id INT NOT NULL) ENGINE=MergeTree() ORDER BY tuple(); -ALTER TABLE null_before MODIFY COLUMN id NULL DEFAULT 1; -- { serverError UNKNOWN_TYPE } +ALTER TABLE null_before MODIFY COLUMN id NULL DEFAULT 1; -- { clientError SYNTAX_ERROR } DROP TABLE IF EXISTS null_before SYNC; From 5d09eb7025d381cbcd4e7bfcc2dd03a76d1f4c47 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 25 Jun 2024 19:47:49 +0000 Subject: [PATCH 79/95] Print slightly more information in 02982_aggregation_states_destruction --- .../0_stateless/02982_aggregation_states_destruction.reference | 2 +- .../queries/0_stateless/02982_aggregation_states_destruction.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02982_aggregation_states_destruction.reference b/tests/queries/0_stateless/02982_aggregation_states_destruction.reference index d00491fd7e5..72749c905a3 100644 --- a/tests/queries/0_stateless/02982_aggregation_states_destruction.reference +++ b/tests/queries/0_stateless/02982_aggregation_states_destruction.reference @@ -1 +1 @@ -1 +1 1 1 diff --git a/tests/queries/0_stateless/02982_aggregation_states_destruction.sh b/tests/queries/0_stateless/02982_aggregation_states_destruction.sh index 1c72cf2b8c1..263a4535c0e 100755 --- a/tests/queries/0_stateless/02982_aggregation_states_destruction.sh +++ b/tests/queries/0_stateless/02982_aggregation_states_destruction.sh @@ -11,4 +11,4 @@ $CLICKHOUSE_CLIENT --query_id $query_id --log_query_threads 1 --query="select nu $CLICKHOUSE_CLIENT -q "system flush logs;" -$CLICKHOUSE_CLIENT -q "select count() > 1 from system.query_thread_log where query_id = '$query_id' and current_database = currentDatabase() and thread_name = 'AggregDestruct';" +$CLICKHOUSE_CLIENT -q "select count() > 0, (countIf(thread_name = 'AggregDestruct') as aggs) > 0, aggs > 1 from system.query_thread_log where query_id = '$query_id' and current_database = currentDatabase();" From cfc004ee6f478716811227eaaf2caba5a81f8561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 25 Jun 2024 22:00:54 +0200 Subject: [PATCH 80/95] Fix crash in maxIntersections --- .../AggregateFunctionMaxIntersections.cpp | 3 ++- .../03196_max_intersections_arena_crash.reference | 8 ++++++++ .../0_stateless/03196_max_intersections_arena_crash.sql | 5 +++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 
100644 tests/queries/0_stateless/03196_max_intersections_arena_crash.reference create mode 100644 tests/queries/0_stateless/03196_max_intersections_arena_crash.sql diff --git a/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp b/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp index 05ed85a9004..6c26065a918 100644 --- a/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp +++ b/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp @@ -91,7 +91,8 @@ public: return std::make_shared>(); } - bool allocatesMemoryInArena() const override { return false; } + /// MaxIntersectionsData::Allocator uses the arena + bool allocatesMemoryInArena() const override { return true; } void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override { diff --git a/tests/queries/0_stateless/03196_max_intersections_arena_crash.reference b/tests/queries/0_stateless/03196_max_intersections_arena_crash.reference new file mode 100644 index 00000000000..049e7107258 --- /dev/null +++ b/tests/queries/0_stateless/03196_max_intersections_arena_crash.reference @@ -0,0 +1,8 @@ +1 3 3 +1 6 3 +2 5 3 +3 7 3 +1 3 2 +1 6 2 +2 5 2 +3 7 2 diff --git a/tests/queries/0_stateless/03196_max_intersections_arena_crash.sql b/tests/queries/0_stateless/03196_max_intersections_arena_crash.sql new file mode 100644 index 00000000000..b7269d7c4e2 --- /dev/null +++ b/tests/queries/0_stateless/03196_max_intersections_arena_crash.sql @@ -0,0 +1,5 @@ +DROP TABLE IF EXISTS my_events; +CREATE TABLE my_events (start UInt32, end UInt32) Engine = MergeTree ORDER BY tuple() + AS Select * FROM VALUES ('start UInt32, end UInt32', (1, 3), (1, 6), (2, 5), (3, 7)); +SELECT start, end, maxIntersections(start, end) OVER () FROM my_events; +SELECT start, end, maxIntersectionsPosition(start, end) OVER () FROM my_events; From 072eb144938ed61c7723be86dad3e6c4713ce5d9 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 25 Jun 2024 23:04:47 +0200 Subject: [PATCH 81/95] fixing HTTPHandler --- .../gtest_cascade_and_memory_write_buffer.cpp | 2 +- src/IO/CascadeWriteBuffer.cpp | 2 +- src/IO/MemoryReadWriteBuffer.cpp | 2 +- src/IO/MemoryReadWriteBuffer.h | 7 +++++ src/IO/WriteBuffer.cpp | 15 +++++++++++ src/IO/WriteBuffer.h | 27 +------------------ .../WriteBufferFromHTTPServerResponse.cpp | 3 ++- src/Server/HTTPHandler.cpp | 13 +++------ src/Server/HTTPHandler.h | 19 +++++++++++++ 9 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp b/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp index f5d34f7f70c..23b783173c8 100644 --- a/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp +++ b/src/Disks/tests/gtest_cascade_and_memory_write_buffer.cpp @@ -222,7 +222,7 @@ TEST(MemoryWriteBuffer, WriteAndReread) if (s > 1) { MemoryWriteBuffer buf(s - 1); - EXPECT_THROW(buf.write(data.data(), data.size()), WriteBuffer::CurrentBufferExhausted); + EXPECT_THROW(buf.write(data.data(), data.size()), MemoryWriteBuffer::CurrentBufferExhausted); buf.finalize(); } } diff --git a/src/IO/CascadeWriteBuffer.cpp b/src/IO/CascadeWriteBuffer.cpp index 4542ffc88f7..8b863cb253c 100644 --- a/src/IO/CascadeWriteBuffer.cpp +++ b/src/IO/CascadeWriteBuffer.cpp @@ -36,7 +36,7 @@ void CascadeWriteBuffer::nextImpl() curr_buffer->position() = position(); curr_buffer->next(); } - catch (const WriteBuffer::CurrentBufferExhausted &) + catch (const MemoryWriteBuffer::CurrentBufferExhausted &) { if (curr_buffer_num < num_sources) { diff 
--git a/src/IO/MemoryReadWriteBuffer.cpp b/src/IO/MemoryReadWriteBuffer.cpp index c79ee1d6f58..1f4d350f083 100644 --- a/src/IO/MemoryReadWriteBuffer.cpp +++ b/src/IO/MemoryReadWriteBuffer.cpp @@ -112,7 +112,7 @@ void MemoryWriteBuffer::addChunk() if (0 == next_chunk_size) { set(position(), 0); - throw WriteBuffer::CurrentBufferExhausted(); + throw MemoryWriteBuffer::CurrentBufferExhausted(); } } diff --git a/src/IO/MemoryReadWriteBuffer.h b/src/IO/MemoryReadWriteBuffer.h index feb1499d12f..a7d3e388cb3 100644 --- a/src/IO/MemoryReadWriteBuffer.h +++ b/src/IO/MemoryReadWriteBuffer.h @@ -16,6 +16,13 @@ namespace DB class MemoryWriteBuffer : public WriteBuffer, public IReadableWriteBuffer, boost::noncopyable, private Allocator { public: + /// Special exception to throw when the current MemoryWriteBuffer cannot receive data + class CurrentBufferExhausted : public std::exception + { + public: + const char * what() const noexcept override { return "WriteBuffer limit is exhausted"; } + }; + /// Use max_total_size_ = 0 for unlimited storage explicit MemoryWriteBuffer( size_t max_total_size_ = 0, diff --git a/src/IO/WriteBuffer.cpp b/src/IO/WriteBuffer.cpp index 2ed14222ffc..766a83ad7e8 100644 --- a/src/IO/WriteBuffer.cpp +++ b/src/IO/WriteBuffer.cpp @@ -30,4 +30,19 @@ WriteBuffer::~WriteBuffer() } } +void WriteBuffer::cancel() noexcept +{ + if (canceled || finalized) + return; + + LoggerPtr log = getLogger("WriteBuffer"); + LOG_INFO( + log, + "Cancel has been called. Stack trace: {}", + StackTrace().toString()); + + LockMemoryExceptionInThread lock(VariableContext::Global); + cancelImpl(); + canceled = true; +} } diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index cee033c420f..4759f96a235 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -29,14 +29,6 @@ namespace ErrorCodes class WriteBuffer : public BufferBase { public: - /// Special exception to throw when the current WriteBuffer cannot receive data - /// It is used in MemoryWriteBuffer and CascadeWriteBuffer - class CurrentBufferExhausted : public std::exception - { - public: - const char * what() const noexcept override { return "WriteBuffer limit is exhausted"; } - }; - using BufferBase::set; using BufferBase::position; void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); } @@ -60,13 +52,6 @@ public: { nextImpl(); } - catch (const CurrentBufferExhausted &) - { - pos = working_buffer.begin(); - bytes += bytes_in_buffer; - - throw; - } catch (...) { /** If the nextImpl() call was unsuccessful, move the cursor to the beginning, @@ -75,8 +60,6 @@ public: pos = working_buffer.begin(); bytes += bytes_in_buffer; - cancel(); - throw; } @@ -157,15 +140,7 @@ public: } } - void cancel() noexcept - { - if (canceled || finalized) - return; - - LockMemoryExceptionInThread lock(VariableContext::Global); - cancelImpl(); - canceled = true; - } + void cancel() noexcept; /// Wait for data to be reliably written. Mainly, call fsync for fd. /// May be called after finalize() if needed. diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 8098671a903..e2098b284bf 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -162,7 +162,8 @@ WriteBufferFromHTTPServerResponse::~WriteBufferFromHTTPServerResponse() { try { - finalize(); + if (!canceled) + finalize(); } catch (...) 
{ diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 26ec185f5ad..3241e22fa35 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -1027,14 +1027,7 @@ catch (...) { tryLogCurrentException(log, "Cannot send exception to client"); - try - { - used_output.finalize(); - } - catch (...) - { - tryLogCurrentException(log, "Cannot flush data to client (after sending exception)"); - } + used_output.cancel(); } void HTTPHandler::formatExceptionForClient(int exception_code, HTTPServerRequest & request, HTTPServerResponse & response, Output & used_output) @@ -1172,7 +1165,7 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse /// Check if exception was thrown in used_output.finalize(). /// In this case used_output can be in invalid state and we /// cannot write in it anymore. So, just log this exception. - if (used_output.isFinalized()) + if (used_output.isFinalized() || used_output.isCanceled()) { if (thread_trace_context) thread_trace_context->root_span.addAttribute("clickhouse.exception", "Cannot flush data to client"); @@ -1191,6 +1184,8 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse if (thread_trace_context) thread_trace_context->root_span.addAttribute(status); + + return; } used_output.finalize(); diff --git a/src/Server/HTTPHandler.h b/src/Server/HTTPHandler.h index c5551102f7a..c78c45826f0 100644 --- a/src/Server/HTTPHandler.h +++ b/src/Server/HTTPHandler.h @@ -78,6 +78,7 @@ private: WriteBuffer * out_maybe_delayed_and_compressed = nullptr; bool finalized = false; + bool canceled = false; bool exception_is_written = false; std::function exception_writer; @@ -99,6 +100,24 @@ private: out->finalize(); } + void cancel() + { + if (canceled) + return; + canceled = true; + + if (out_compressed_holder) + out_compressed_holder->cancel(); + if (out) + out->cancel(); + } + + + bool isCanceled() const + { + return canceled; + } + bool isFinalized() const { return finalized; From 340b5998d2d5d7cfe778fc28ad9b2ea41211c189 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 26 Jun 2024 00:13:21 +0200 Subject: [PATCH 82/95] Update DatabaseReplicated.cpp --- src/Databases/DatabaseReplicated.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 72725f4c58c..8779490c8c6 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -471,7 +471,7 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessL auto db_name_path = fs::path(zookeeper_path) / FIRST_REPLICA_DATABASE_NAME; auto error_code = current_zookeeper->trySet(db_name_path, getDatabaseName()); if (error_code == Coordination::Error::ZNONODE) - current_zookeeper->create(db_name_path, getDatabaseName(), zkutil::CreateMode::Persistent); + current_zookeeper->tryCreate(db_name_path, getDatabaseName(), zkutil::CreateMode::Persistent); is_readonly = false; } From bbd9c77b7c3b2432a9b2a0c61fd1b955e3b82c16 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 26 Jun 2024 12:56:53 +0200 Subject: [PATCH 83/95] rm debug print --- src/IO/WriteBuffer.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/IO/WriteBuffer.cpp b/src/IO/WriteBuffer.cpp index 766a83ad7e8..a86eb4ccea2 100644 --- a/src/IO/WriteBuffer.cpp +++ b/src/IO/WriteBuffer.cpp @@ -35,12 +35,6 @@ void WriteBuffer::cancel() noexcept if (canceled || finalized) return; - LoggerPtr log = getLogger("WriteBuffer"); - 
LOG_INFO( - log, - "Cancel has been called. Stack trace: {}", - StackTrace().toString()); - LockMemoryExceptionInThread lock(VariableContext::Global); cancelImpl(); canceled = true; From f49c194ad00ae530394a4725dece0b98011fc29e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 26 Jun 2024 13:48:35 +0200 Subject: [PATCH 84/95] Fix server restarts in performance tests --- tests/performance/scripts/compare.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/performance/scripts/compare.sh b/tests/performance/scripts/compare.sh index 9a0fb5b335c..cb56ab6c5bf 100755 --- a/tests/performance/scripts/compare.sh +++ b/tests/performance/scripts/compare.sh @@ -87,6 +87,7 @@ function configure --path db0 --user_files_path db0/user_files --top_level_domains_path "$(left_or_right right top_level_domains)" + --keeper_server.storage_path coordination0 --tcp_port $LEFT_SERVER_PORT ) left/clickhouse-server "${setup_left_server_opts[@]}" &> setup-server-log.log & @@ -113,8 +114,12 @@ function configure rm -r db0/preprocessed_configs ||: rm -r db0/{data,metadata}/system ||: rm db0/status ||: + cp -al db0/ left/db/ + cp -R coordination0 left/coordination + cp -al db0/ right/db/ + cp -R coordination0 right/coordination } function restart @@ -135,6 +140,7 @@ function restart --tcp_port $LEFT_SERVER_PORT --keeper_server.tcp_port $LEFT_SERVER_KEEPER_PORT --keeper_server.raft_configuration.server.port $LEFT_SERVER_KEEPER_RAFT_PORT + --keeper_server.storage_path left/coordination --zookeeper.node.port $LEFT_SERVER_KEEPER_PORT --interserver_http_port $LEFT_SERVER_INTERSERVER_PORT ) @@ -154,6 +160,7 @@ function restart --tcp_port $RIGHT_SERVER_PORT --keeper_server.tcp_port $RIGHT_SERVER_KEEPER_PORT --keeper_server.raft_configuration.server.port $RIGHT_SERVER_KEEPER_RAFT_PORT + --keeper_server.storage_path right/coordination --zookeeper.node.port $RIGHT_SERVER_KEEPER_PORT --interserver_http_port $RIGHT_SERVER_INTERSERVER_PORT ) From 7a5734749c6d4de64ffed1eaa2e8290103aafb27 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 26 Jun 2024 14:00:49 +0200 Subject: [PATCH 85/95] Fix logical error in fuzzer --- src/Analyzer/Resolve/QueryAnalyzer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Analyzer/Resolve/QueryAnalyzer.cpp b/src/Analyzer/Resolve/QueryAnalyzer.cpp index 446a27ac40b..c01929c737f 100644 --- a/src/Analyzer/Resolve/QueryAnalyzer.cpp +++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp @@ -4519,7 +4519,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, auto nodes = arg_func->getArguments().getNodes(); if (nodes.size() != 2) - return; + continue; if (auto * identifier_node = nodes[0]->as()) { From 2ec0ffd4f0829ae99f03286c9486dde10c911c00 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 26 Jun 2024 14:07:38 +0200 Subject: [PATCH 86/95] Addressed review comments --- .../FunctionSecretArgumentsFinderAST.h | 11 +++--- .../test_mask_sensitive_info/test.py | 36 +++++++++++-------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/Parsers/FunctionSecretArgumentsFinderAST.h b/src/Parsers/FunctionSecretArgumentsFinderAST.h index 537c65f829f..5b77485afb0 100644 --- a/src/Parsers/FunctionSecretArgumentsFinderAST.h +++ b/src/Parsers/FunctionSecretArgumentsFinderAST.h @@ -190,19 +190,22 @@ private: findSecretNamedArgument("account_key", 1); return; } + else if (is_cluster_function && isNamedCollectionName(1)) + { + /// azureBlobStorageCluster(cluster, named_collection, ..., account_key = 'account_key', ...) 
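+            /// The cluster name sits at argument index 0 and the named collection at index 1, so the named 'account_key' argument is searched starting from index 2 here.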
+ findSecretNamedArgument("account_key", 2); + return; + } /// We should check other arguments first because we don't need to do any replacement in case storage_account_url is not used /// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) /// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) - size_t count = excludeS3OrURLNestedMaps(); + size_t count = arguments->size(); if ((url_arg_idx + 4 <= count) && (count <= url_arg_idx + 7)) { String second_arg; if (tryGetStringFromArgument(url_arg_idx + 3, &second_arg)) { - if (boost::iequals(second_arg, "NOSIGN")) - return; /// The argument after 'url' is "NOSIGN". - if (second_arg == "auto" || KnownFormatNames::instance().exists(second_arg)) return; /// The argument after 'url' is a format: s3('url', 'format', ...) } diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index c9895bdc2d9..902d3800324 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -377,13 +377,17 @@ def test_table_functions(): f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", f"azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_7.csv', 'CSV')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_8.csv', 'CSV', 'none')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV', 'none', 'auto')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_10.csv', '{azure_account_name}', '{azure_account_key}')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_11.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", - f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", + f"azureBlobStorage(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"azureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_10.csv', 'CSV', 'none')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_11.csv', 'CSV', 'none', 'auto')", + 
f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '{azure_account_key}')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '{azure_account_key}', 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_14.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none')", + f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_15.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')", + f"azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')", + f"azureBlobStorageCluster('test_shard_localhost', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_17.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')", ] def make_test_case(i): @@ -464,13 +468,17 @@ def test_table_functions(): f"CREATE TABLE tablefunc37 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", f"CREATE TABLE tablefunc39 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", - f"CREATE TABLE tablefunc40 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_7.csv', 'CSV')", - f"CREATE TABLE tablefunc41 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_8.csv', 'CSV', 'none')", - f"CREATE TABLE tablefunc42 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV', 'none', 'auto')", - f"CREATE TABLE tablefunc43 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_10.csv', '{azure_account_name}', '[HIDDEN]')", - f"CREATE TABLE tablefunc44 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_11.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", - f"CREATE TABLE tablefunc45 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", - f"CREATE TABLE tablefunc46 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc40 (x int) AS azureBlobStorage(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')", + f"CREATE TABLE tablefunc41 (`x` int) AS azureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", + f"CREATE TABLE tablefunc42 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 
'test_simple_9.csv', 'CSV')", + f"CREATE TABLE tablefunc43 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_10.csv', 'CSV', 'none')", + f"CREATE TABLE tablefunc44 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_11.csv', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc45 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '[HIDDEN]')", + f"CREATE TABLE tablefunc46 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')", + f"CREATE TABLE tablefunc47 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_14.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')", + f"CREATE TABLE tablefunc48 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_15.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')", + f"CREATE TABLE tablefunc49 (x int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')", + f"CREATE TABLE tablefunc50 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_17.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')", ], must_not_contain=[password], ) From acbc3b3036497808153ffc0f223cbaee9ce6dbff Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 26 Jun 2024 14:36:48 +0200 Subject: [PATCH 87/95] Update 03002_part_log_rmt_fetch_mutate_error.sql --- .../0_stateless/03002_part_log_rmt_fetch_mutate_error.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03002_part_log_rmt_fetch_mutate_error.sql b/tests/queries/0_stateless/03002_part_log_rmt_fetch_mutate_error.sql index 34ba034f798..d8b5ebb3148 100644 --- a/tests/queries/0_stateless/03002_part_log_rmt_fetch_mutate_error.sql +++ b/tests/queries/0_stateless/03002_part_log_rmt_fetch_mutate_error.sql @@ -4,9 +4,9 @@ drop table if exists rmt_master; drop table if exists rmt_slave; -create table rmt_master (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', 'master') order by tuple() settings always_fetch_merged_part=0; +create table rmt_master (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', 'master') order by tuple() settings always_fetch_merged_part=0, old_parts_lifetime=600; -- prefer_fetch_merged_part_*_threshold=0, consider this table as a "slave" -create table rmt_slave (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', 'slave') order by tuple() settings prefer_fetch_merged_part_time_threshold=0, prefer_fetch_merged_part_size_threshold=0; +create table rmt_slave (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', 'slave') order by tuple() settings prefer_fetch_merged_part_time_threshold=0, prefer_fetch_merged_part_size_threshold=0, old_parts_lifetime=600; insert into rmt_master values (1); From b97ac72be05193467daa8959fd06c4c26fd45b33 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 26 Jun 2024 16:24:00 +0300 Subject: [PATCH 88/95] PostgreSQL source support cancel --- src/Processors/Sources/PostgreSQLSource.cpp | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Sources/PostgreSQLSource.cpp b/src/Processors/Sources/PostgreSQLSource.cpp index 4b828d6699c..f18c63ed385 100644 --- a/src/Processors/Sources/PostgreSQLSource.cpp +++ b/src/Processors/Sources/PostgreSQLSource.cpp @@ -120,7 +120,7 @@ Chunk PostgreSQLSource::generate() MutableColumns columns = description.sample_block.cloneEmptyColumns(); size_t num_rows = 0; - while (true) + while (!isCancelled()) { const std::vector * row{stream->read_row()}; From 3f351ec3faedb7f9e7a617a0fccc850934b084af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 26 Jun 2024 16:40:08 +0000 Subject: [PATCH 89/95] Try to fix flaky test --- tests/queries/0_stateless/02265_column_ttl.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02265_column_ttl.sql b/tests/queries/0_stateless/02265_column_ttl.sql index 16ae2da2a2b..ac64dd9457a 100644 --- a/tests/queries/0_stateless/02265_column_ttl.sql +++ b/tests/queries/0_stateless/02265_column_ttl.sql @@ -16,7 +16,8 @@ insert into ttl_02265 values ('2010-01-01', 2010, 'foo'); optimize table ttl_02265 final; -- after, 20100101_0_0_2 will not have ttl.txt, but will have value.bin optimize table ttl_02265 final; -system sync replica ttl_02265; +system sync replica ttl_02265 STRICT; +system sync replica ttl_02265_r2 STRICT; -- after detach/attach it will not have TTL in-memory, and will not have ttl.txt detach table ttl_02265; From 2e7c23b82c8aea141f6551c4ed0d78fa778d1e0d Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 26 Jun 2024 19:49:14 +0200 Subject: [PATCH 90/95] Try fix data race in TCPHandler --- src/Server/TCPHandler.cpp | 27 ++++++++++++++++----------- src/Server/TCPHandler.h | 7 ++++++- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 14a2bceebf1..b59fe2c1849 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -387,7 +387,7 @@ void TCPHandler::runImpl() query_scope.emplace(query_context, /* fatal_error_callback */ [this] { - std::lock_guard lock(fatal_error_mutex); + std::lock_guard lock(out_mutex); sendLogs(); }); @@ -475,7 +475,7 @@ void TCPHandler::runImpl() Stopwatch watch; CurrentMetrics::Increment callback_metric_increment(CurrentMetrics::ReadTaskRequestsSent); - std::lock_guard lock(task_callback_mutex); + std::scoped_lock lock(out_mutex, task_callback_mutex); if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED) return {}; @@ -491,7 +491,7 @@ void TCPHandler::runImpl() { Stopwatch watch; CurrentMetrics::Increment callback_metric_increment(CurrentMetrics::MergeTreeAllRangesAnnouncementsSent); - std::lock_guard lock(task_callback_mutex); + std::scoped_lock lock(out_mutex, task_callback_mutex); if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED) return; @@ -505,7 +505,7 @@ void TCPHandler::runImpl() { Stopwatch watch; CurrentMetrics::Increment callback_metric_increment(CurrentMetrics::MergeTreeReadTaskRequestsSent); - std::lock_guard lock(task_callback_mutex); + std::scoped_lock lock(out_mutex, task_callback_mutex); if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED) return std::nullopt; @@ -553,7 +553,7 @@ void TCPHandler::runImpl() { auto callback = [this]() { - std::scoped_lock lock(task_callback_mutex, fatal_error_mutex); + std::scoped_lock lock(out_mutex, task_callback_mutex); if (getQueryCancellationStatus() == CancellationStatus::FULLY_CANCELLED) return 
true; @@ -572,7 +572,7 @@ void TCPHandler::runImpl() finish_or_cancel(); - std::lock_guard lock(task_callback_mutex); + std::lock_guard lock(out_mutex); /// Send final progress after calling onFinish(), since it will update the progress. /// @@ -595,7 +595,7 @@ void TCPHandler::runImpl() break; { - std::lock_guard lock(task_callback_mutex); + std::lock_guard lock(out_mutex); sendLogs(); sendEndOfStream(); } @@ -1014,7 +1014,7 @@ void TCPHandler::processOrdinaryQuery() if (query_context->getSettingsRef().allow_experimental_query_deduplication) { - std::lock_guard lock(task_callback_mutex); + std::lock_guard lock(out_mutex); sendPartUUIDs(); } @@ -1024,13 +1024,13 @@ void TCPHandler::processOrdinaryQuery() if (header) { - std::lock_guard lock(task_callback_mutex); + std::lock_guard lock(out_mutex); sendData(header); } } /// Defer locking to cover a part of the scope below and everything after it - std::unique_lock progress_lock(task_callback_mutex, std::defer_lock); + std::unique_lock out_lock(out_mutex, std::defer_lock); { PullingAsyncPipelineExecutor executor(pipeline); @@ -1056,6 +1056,9 @@ void TCPHandler::processOrdinaryQuery() executor.cancelReading(); } + lock.unlock(); + out_lock.lock(); + if (after_send_progress.elapsed() / 1000 >= interactive_delay) { /// Some time passed and there is a progress. @@ -1071,12 +1074,14 @@ void TCPHandler::processOrdinaryQuery() if (!state.io.null_format) sendData(block); } + + out_lock.unlock(); } /// This lock wasn't acquired before and we make .lock() call here /// so everything under this line is covered even together /// with sendProgress() out of the scope - progress_lock.lock(); + out_lock.lock(); /** If data has run out, we will send the profiling data and total values to * the last zero block to be able to use diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 75e36836b63..7569a1272a6 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -225,8 +225,13 @@ private: std::optional nonce; String cluster; + /// `out_mutex` protects `out` (WriteBuffer). + /// So it is used for method sendData(), sendProgress(), sendLogs(), etc. + std::mutex out_mutex; + /// `task_callback_mutex` protects tasks callbacks. + /// Inside these callbacks we might also change cancellation status, + /// so it also protects cancellation status checks. std::mutex task_callback_mutex; - std::mutex fatal_error_mutex; /// At the moment, only one ongoing query in the connection is supported at a time. 
QueryState state; From 565665c4724e1f36db8ee7e433004a905cacced5 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 26 Jun 2024 23:00:33 +0200 Subject: [PATCH 91/95] remove confusing line --- tests/queries/0_stateless/03174_merge_join_bug.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03174_merge_join_bug.sql b/tests/queries/0_stateless/03174_merge_join_bug.sql index 9e31d0f7c1a..ab4cb6cd4a9 100644 --- a/tests/queries/0_stateless/03174_merge_join_bug.sql +++ b/tests/queries/0_stateless/03174_merge_join_bug.sql @@ -1,6 +1,5 @@ -- Tags: no-random-settings --- https://github.com/ClickHouse/ClickHouse/issues/24395 SET allow_experimental_analyzer=1, join_algorithm = 'full_sorting_merge'; CREATE TABLE xxxx_yyy (key UInt32, key_b ALIAS key) ENGINE=MergeTree() ORDER BY key SETTINGS ratio_of_defaults_for_sparse_serialization=0.0; INSERT INTO xxxx_yyy SELECT number FROM numbers(10); From 298f543f95f373d7267ff553c24656b7c527d437 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 27 Jun 2024 06:46:12 +0200 Subject: [PATCH 92/95] Update CHANGELOG.md --- CHANGELOG.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ddff2f7cdc5..c4935f88245 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,17 +29,17 @@ * Added `merge_workload` and `mutation_workload` settings to regulate how resources are utilized and shared between merges, mutations and other workloads. [#64061](https://github.com/ClickHouse/ClickHouse/pull/64061) ([Sergei Trifonov](https://github.com/serxa)). * Add support for comparing IPv4 and IPv6 types using the `=` operator. [#64292](https://github.com/ClickHouse/ClickHouse/pull/64292) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)). * Allow to store named collections in zookeeper. [#64574](https://github.com/ClickHouse/ClickHouse/pull/64574) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Support decimal arguments in binary math functions (pow(), atan2(), max2, min2(), hypot(). [#64582](https://github.com/ClickHouse/ClickHouse/pull/64582) ([Mikhail Gorshkov](https://github.com/mgorshkov)). +* Support decimal arguments in binary math functions (pow, atan2, max2, min2, hypot). [#64582](https://github.com/ClickHouse/ClickHouse/pull/64582) ([Mikhail Gorshkov](https://github.com/mgorshkov)). * Add support for index analysis over `hilbertEncode`. [#64662](https://github.com/ClickHouse/ClickHouse/pull/64662) ([Artem Mustafin](https://github.com/Artemmm91)). * Added SQL functions `parseReadableSize` (along with `OrNull` and `OrZero` variants). [#64742](https://github.com/ClickHouse/ClickHouse/pull/64742) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)). * Add server settings `max_table_num_to_throw` and `max_database_num_to_throw` to limit the number of databases or tables on `CREATE` queries. [#64781](https://github.com/ClickHouse/ClickHouse/pull/64781) ([Xu Jia](https://github.com/XuJia0210)). * Add _time virtual column to file alike storages (s3/file/hdfs/url/azureBlobStorage). [#64947](https://github.com/ClickHouse/ClickHouse/pull/64947) ([Ilya Golshtein](https://github.com/ilejn)). -* Introduced new functions `base64UrlEncode`, `base64UrlDecode` and `tryBase64UrlDecode`. [#64991](https://github.com/ClickHouse/ClickHouse/pull/64991) ([Mikhail Gorshkov](https://github.com/mgorshkov)). +* Introduced new functions `base64URLEncode`, `base64URLDecode` and `tryBase64URLDecode`. 
[#64991](https://github.com/ClickHouse/ClickHouse/pull/64991) ([Mikhail Gorshkov](https://github.com/mgorshkov)). * Add new function `editDistanceUTF8`, which calculates the [edit distance](https://en.wikipedia.org/wiki/Edit_distance) between two UTF8 strings. [#65269](https://github.com/ClickHouse/ClickHouse/pull/65269) ([LiuNeng](https://github.com/liuneng1994)). #### Performance Improvement * Add a native parquet reader, which can read parquet binary to ClickHouse Columns directly. It's controlled by the setting `input_format_parquet_use_native_reader` (disabled by default). [#60361](https://github.com/ClickHouse/ClickHouse/pull/60361) ([ZhiHong Zhang](https://github.com/copperybean)). -* Reduce the number of virtual function calls in ColumnNullable::size(). [#60556](https://github.com/ClickHouse/ClickHouse/pull/60556) ([HappenLee](https://github.com/HappenLee)). +* Reduce the number of virtual function calls in ColumnNullable::size. [#60556](https://github.com/ClickHouse/ClickHouse/pull/60556) ([HappenLee](https://github.com/HappenLee)). * Speedup `splitByRegexp` when the regular expression argument is a single-character. [#62696](https://github.com/ClickHouse/ClickHouse/pull/62696) ([Robert Schulze](https://github.com/rschu1ze)). * Speed up FixedHashTable by keeping track of the min and max keys used. This allows to reduce the number of cells that need to be verified. [#62746](https://github.com/ClickHouse/ClickHouse/pull/62746) ([Jiebin Sun](https://github.com/jiebinn)). * Optimize the resolution of in(LowCardinality, ConstantSet). [#64060](https://github.com/ClickHouse/ClickHouse/pull/64060) ([Zhiguo Zhou](https://github.com/ZhiguoZh)). @@ -51,7 +51,7 @@ * Improve function least/greatest for nullable numberic type arguments. [#64668](https://github.com/ClickHouse/ClickHouse/pull/64668) ([KevinyhZou](https://github.com/KevinyhZou)). * Allow merging two consequent `FilterSteps` of a query plan. This improves filter-push-down optimization if the filter condition can be pushed down from the parent step. [#64760](https://github.com/ClickHouse/ClickHouse/pull/64760) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Remove bad optimization in vertical final implementation and re-enable vertical final algorithm by default. [#64783](https://github.com/ClickHouse/ClickHouse/pull/64783) ([Duc Canh Le](https://github.com/canhld94)). -* Remove ALIAS nodes from the filter expression. This slightly improves performance for queries with `PREWHERE` (with new analyzer). [#64793](https://github.com/ClickHouse/ClickHouse/pull/64793) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Remove ALIAS nodes from the filter expression. This slightly improves performance for queries with `PREWHERE` (with the new analyzer). [#64793](https://github.com/ClickHouse/ClickHouse/pull/64793) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fix performance regression in cross join introduced in [#60459](https://github.com/ClickHouse/ClickHouse/issues/60459) (24.5). [#65243](https://github.com/ClickHouse/ClickHouse/pull/65243) ([Nikita Taranov](https://github.com/nickitat)). #### Improvement @@ -63,7 +63,7 @@ * Reduce the memory usage when using Azure object storage by using fixed memory allocation, avoiding the allocation of an extra buffer. [#63160](https://github.com/ClickHouse/ClickHouse/pull/63160) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)). * Several minor corner case fixes to proxy support & tunneling. 
[#63427](https://github.com/ClickHouse/ClickHouse/pull/63427) ([Arthur Passos](https://github.com/arthurpassos)). * Add `http_response_headers` setting to support custom response headers in custom HTTP handlers. [#63562](https://github.com/ClickHouse/ClickHouse/pull/63562) ([Grigorii](https://github.com/GSokol)). -* Improve io_uring resubmits visibility. Rename profile event `IOUringSQEsResubmits` -> `IOUringSQEsResubmitsAsync` and add a new one `IOUringSQEsResubmitsSync`. [#63699](https://github.com/ClickHouse/ClickHouse/pull/63699) ([Tomer Shafir](https://github.com/tomershafir)). +* Improve io_uring resubmit visibility. Rename profile event `IOUringSQEsResubmits` -> `IOUringSQEsResubmitsAsync` and add a new one `IOUringSQEsResubmitsSync`. [#63699](https://github.com/ClickHouse/ClickHouse/pull/63699) ([Tomer Shafir](https://github.com/tomershafir)). * Introduce assertions to verify all functions are called with columns of the right size. [#63723](https://github.com/ClickHouse/ClickHouse/pull/63723) ([Raúl Marín](https://github.com/Algunenano)). * `SHOW CREATE TABLE` executed on top of system tables will now show the super handy comment unique for each table which will explain why this table is needed. [#63788](https://github.com/ClickHouse/ClickHouse/pull/63788) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). * Added setting `metadata_storage_type` to keep free space on metadata storage disk. [#64128](https://github.com/ClickHouse/ClickHouse/pull/64128) ([MikhailBurdukov](https://github.com/MikhailBurdukov)). From 7d26f45a2858ac9f7c94d83ee690174a060baced Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 27 Jun 2024 08:03:14 +0000 Subject: [PATCH 93/95] base64En/Decode64Url --> base64En/Decode64URL --- .../functions/string-functions.md | 28 +++++++------- .../functions/string-functions.md | 8 ++-- src/Functions/FunctionBase64Conversion.h | 24 ++++++------ ...ase64UrlDecode.cpp => base64URLDecode.cpp} | 10 ++--- ...ase64UrlEncode.cpp => base64URLEncode.cpp} | 8 ++-- ...64UrlDecode.cpp => tryBase64URLDecode.cpp} | 12 +++--- .../03167_base64_url_functions.sql | 38 +++++++++---------- .../aspell-ignore/en/aspell-dict.txt | 4 +- 8 files changed, 66 insertions(+), 66 deletions(-) rename src/Functions/{base64UrlDecode.cpp => base64URLDecode.cpp} (73%) rename src/Functions/{base64UrlEncode.cpp => base64URLEncode.cpp} (78%) rename src/Functions/{tryBase64UrlDecode.cpp => tryBase64URLDecode.cpp} (69%) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index a258456345e..c068b0e9d17 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1168,14 +1168,14 @@ Result: └────────────────────────────┘ ``` -## base64UrlEncode +## base64URLEncode Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). 
**Syntax** ```sql -base64UrlEncode(url) +base64URLEncode(url) ``` **Arguments** @@ -1189,13 +1189,13 @@ base64UrlEncode(url) **Example** ``` sql -SELECT base64UrlEncode('https://clickhouse.com'); +SELECT base64URLEncode('https://clickhouse.com'); ``` Result: ```result -┌─base64UrlEncode('https://clickhouse.com')─┐ +┌─base64URLEncode('https://clickhouse.com')─┐ │ aHR0cDovL2NsaWNraG91c2UuY29t │ └───────────────────────────────────────────┘ ``` @@ -1234,19 +1234,19 @@ Result: └──────────────────────────────────┘ ``` -## base64UrlDecode +## base64URLDecode Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). Throws an exception in case of an error. **Syntax** ```sql -base64UrlDecode(encodedUrl) +base64URLDecode(encodedUrl) ``` **Arguments** -- `encodedUrl` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, an exception is thrown. +- `encodedURL` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, an exception is thrown. **Returned value** @@ -1255,13 +1255,13 @@ base64UrlDecode(encodedUrl) **Example** ``` sql -SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t'); +SELECT base64URLDecode('aHR0cDovL2NsaWNraG91c2UuY29t'); ``` Result: ```result -┌─base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')─┐ +┌─base64URLDecode('aHR0cDovL2NsaWNraG91c2UuY29t')─┐ │ https://clickhouse.com │ └─────────────────────────────────────────────────┘ ``` @@ -1298,19 +1298,19 @@ SELECT tryBase64Decode('RW5jb2RlZA==') as res, tryBase64Decode('invalid') as res └────────────┴─────────────┘ ``` -## tryBase64UrlDecode +## tryBase64URLDecode -Like `base64UrlDecode` but returns an empty string in case of error. +Like `base64URLDecode` but returns an empty string in case of error. **Syntax** ```sql -tryBase64UrlDecode(encodedUrl) +tryBase64URLDecode(encodedUrl) ``` **Parameters** -- `encodedUrl`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string. +- `encodedURL`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string. **Returned value** @@ -1321,7 +1321,7 @@ tryBase64UrlDecode(encodedUrl) Query: ```sql -SELECT tryBase64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t') as res, tryBase64Decode('aHR0cHM6Ly9jbGlja') as res_invalid; +SELECT tryBase64URLDecode('aHR0cDovL2NsaWNraG91c2UuY29t') as res, tryBase64Decode('aHR0cHM6Ly9jbGlja') as res_invalid; ``` ```response diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index fa76e84f130..2436581fc7f 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -538,7 +538,7 @@ SELECT base58Decode('3dc8KtHrwM'); Синоним: `TO_BASE64`. -## base64UrlEncode(s) +## base64URLEncode(s) Производит кодирование URL (String или FixedString) в base64-представление в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). @@ -548,7 +548,7 @@ SELECT base58Decode('3dc8KtHrwM'); Синоним: `FROM_BASE64`. -## base64UrlDecode(s) +## base64URLDecode(s) Декодирует base64-представление URL в исходную строку в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). 
При невозможности декодирования выбрасывает исключение @@ -556,9 +556,9 @@ SELECT base58Decode('3dc8KtHrwM'); Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. -## tryBase64UrlDecode(s) +## tryBase64URLDecode(s) -Функционал аналогичен base64UrlDecode, но при невозможности декодирования возвращает пустую строку. +Функционал аналогичен base64URLDecode, но при невозможности декодирования возвращает пустую строку. ## endsWith(s, suffix) {#endswith} diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 05914be3837..083179c3ca8 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -25,10 +25,10 @@ namespace ErrorCodes enum class Base64Variant : uint8_t { Normal, - Url + URL }; -inline std::string preprocessBase64Url(std::string_view src) +inline std::string preprocessBase64URL(std::string_view src) { std::string padded_src; padded_src.reserve(src.size() + 3); @@ -70,7 +70,7 @@ inline std::string preprocessBase64Url(std::string_view src) return padded_src; } -inline size_t postprocessBase64Url(UInt8 * dst, size_t out_len) +inline size_t postprocessBase64URL(UInt8 * dst, size_t out_len) { // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#section-5 for (size_t i = 0; i < out_len; ++i) @@ -95,7 +95,7 @@ inline size_t postprocessBase64Url(UInt8 * dst, size_t out_len) template struct Base64Encode { - static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Encode" : "base64UrlEncode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Encode" : "base64URLEncode"; static size_t getBufferSize(size_t string_length, size_t string_count) { @@ -111,8 +111,8 @@ struct Base64Encode /// Memory sanitizer doesn't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. __msan_unpoison(dst, outlen); - if constexpr (variant == Base64Variant::Url) - outlen = postprocessBase64Url(dst, outlen); + if constexpr (variant == Base64Variant::URL) + outlen = postprocessBase64URL(dst, outlen); return outlen; } @@ -121,7 +121,7 @@ struct Base64Encode template struct Base64Decode { - static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Decode" : "base64UrlDecode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Decode" : "base64URLDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { @@ -132,9 +132,9 @@ struct Base64Decode { int rc; size_t outlen = 0; - if constexpr (variant == Base64Variant::Url) + if constexpr (variant == Base64Variant::URL) { - std::string src_padded = preprocessBase64Url(src); + std::string src_padded = preprocessBase64URL(src); rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); } else @@ -156,7 +156,7 @@ struct Base64Decode template struct TryBase64Decode { - static constexpr auto name = (variant == Base64Variant::Normal) ? "tryBase64Decode" : "tryBase64UrlDecode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? 
"tryBase64Decode" : "tryBase64URLDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { @@ -167,9 +167,9 @@ struct TryBase64Decode { int rc; size_t outlen = 0; - if constexpr (variant == Base64Variant::Url) + if constexpr (variant == Base64Variant::URL) { - std::string src_padded = preprocessBase64Url(src); + std::string src_padded = preprocessBase64URL(src); rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); } else diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64URLDecode.cpp similarity index 73% rename from src/Functions/base64UrlDecode.cpp rename to src/Functions/base64URLDecode.cpp index 59975d8f9d1..f5766dc60bd 100644 --- a/src/Functions/base64UrlDecode.cpp +++ b/src/Functions/base64URLDecode.cpp @@ -5,16 +5,16 @@ namespace DB { -REGISTER_FUNCTION(Base64UrlDecode) +REGISTER_FUNCTION(Base64URLDecode) { FunctionDocumentation::Description description = R"(Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; - FunctionDocumentation::Syntax syntax = "base64UrlDecode(encodedUrl)"; - FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; + FunctionDocumentation::Syntax syntax = "base64URLDecode(encodedURL)"; + FunctionDocumentation::Arguments arguments = {{"encodedURL", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; - FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')", "https://clickhouse.com"}}; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64URLDecode('aHR0cDovL2NsaWNraG91c2UuY29t')", "https://clickhouse.com"}}; FunctionDocumentation::Categories categories = {"String encoding"}; - factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); } } diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64URLEncode.cpp similarity index 78% rename from src/Functions/base64UrlEncode.cpp rename to src/Functions/base64URLEncode.cpp index 05d50170c14..73a465a30c5 100644 --- a/src/Functions/base64UrlEncode.cpp +++ b/src/Functions/base64URLEncode.cpp @@ -5,16 +5,16 @@ namespace DB { -REGISTER_FUNCTION(Base64UrlEncode) +REGISTER_FUNCTION(Base64URLEncode) { FunctionDocumentation::Description description = R"(Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; - FunctionDocumentation::Syntax syntax = "base64UrlEncode(url)"; + FunctionDocumentation::Syntax syntax = "base64URLEncode(url)"; FunctionDocumentation::Arguments arguments = {{"url", "String column or constant."}}; FunctionDocumentation::ReturnedValue returned_value = "A string containing the encoded value of the argument."; - FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlEncode('https://clickhouse.com')", "aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ"}}; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64URLEncode('https://clickhouse.com')", "aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ"}}; 
     FunctionDocumentation::Categories categories = {"String encoding"};
 
-    factory.registerFunction<FunctionBase64Conversion<Base64Encode<Base64Variant::Url>>>({description, syntax, arguments, returned_value, examples, categories});
+    factory.registerFunction<FunctionBase64Conversion<Base64Encode<Base64Variant::URL>>>({description, syntax, arguments, returned_value, examples, categories});
 }
 
 }
diff --git a/src/Functions/tryBase64UrlDecode.cpp b/src/Functions/tryBase64URLDecode.cpp
similarity index 69%
rename from src/Functions/tryBase64UrlDecode.cpp
rename to src/Functions/tryBase64URLDecode.cpp
index b9aaf4f9273..b44bc7538ee 100644
--- a/src/Functions/tryBase64UrlDecode.cpp
+++ b/src/Functions/tryBase64URLDecode.cpp
@@ -5,16 +5,16 @@
 namespace DB
 {
 
-REGISTER_FUNCTION(TryBase64UrlDecode)
+REGISTER_FUNCTION(TryBase64URLDecode)
 {
-    FunctionDocumentation::Description description = R"(Decodes an URL from base64, like base64UrlDecode but returns an empty string in case of an error.)";
-    FunctionDocumentation::Syntax syntax = "tryBase64UrlDecode(encodedUrl)";
-    FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string."}};
+    FunctionDocumentation::Description description = R"(Decodes an URL from base64, like base64URLDecode but returns an empty string in case of an error.)";
+    FunctionDocumentation::Syntax syntax = "tryBase64URLDecode(encodedURL)";
+    FunctionDocumentation::Arguments arguments = {{"encodedURL", "String column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string."}};
     FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument.";
-    FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ')", "https://clickhouse.com"}, {"invalid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja')", ""}};
+    FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64URLDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ')", "https://clickhouse.com"}, {"invalid", "SELECT tryBase64URLDecode('aHR0cHM6Ly9jbGlja')", ""}};
     FunctionDocumentation::Categories categories = {"String encoding"};
 
-    factory.registerFunction<FunctionBase64Conversion<TryBase64Decode<Base64Variant::Url>>>({description, syntax, arguments, returned_value, examples, categories});
+    factory.registerFunction<FunctionBase64Conversion<TryBase64Decode<Base64Variant::URL>>>({description, syntax, arguments, returned_value, examples, categories});
 }
 
 }
diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql
index 674f1ae498b..6c394ba6c3a 100644
--- a/tests/queries/0_stateless/03167_base64_url_functions.sql
+++ b/tests/queries/0_stateless/03167_base64_url_functions.sql
@@ -2,35 +2,35 @@
 -- no-fasttest because aklomp-base64 library is required
 
 -- incorrect number of arguments
-SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
-SELECT base64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
-SELECT tryBase64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
-SELECT base64UrlEncode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
-SELECT base64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
-SELECT tryBase64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT base64URLEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT base64URLDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT tryBase64URLDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT base64URLEncode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT base64URLDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT tryBase64URLDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
 
 -- test with valid inputs
-SELECT 'https://clickhouse.com' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
-SELECT '12?' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
-SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
+SELECT 'https://clickhouse.com' AS original, base64URLEncode(original) AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
+SELECT '12?' AS original, base64URLEncode(original) AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
+SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' AS original, base64URLEncode(original) AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
 
 -- encoded value has no padding
-SELECT 'aHR0cHM6Ly9jbGlj' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
+SELECT 'aHR0cHM6Ly9jbGlj' AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
 
 -- encoded value has one-byte padding
-SELECT 'aHR0cHM6Ly9jbGlja2g' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
+SELECT 'aHR0cHM6Ly9jbGlja2g' AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
 
 -- encoded value has two-bytes padding
-SELECT 'aHR0cHM6Ly9jbGljaw' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
+SELECT 'aHR0cHM6Ly9jbGljaw' AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
 
 -- test with invalid inputs
-SELECT base64UrlDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA }
-SELECT tryBase64UrlDecode('https://clickhouse.com');
-SELECT base64UrlDecode('12?'); -- { serverError INCORRECT_DATA }
-SELECT tryBase64UrlDecode('12?');
-SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja'); -- { serverError INCORRECT_DATA }
-SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja');
+SELECT base64URLDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA }
+SELECT tryBase64URLDecode('https://clickhouse.com');
+SELECT base64URLDecode('12?'); -- { serverError INCORRECT_DATA }
+SELECT tryBase64URLDecode('12?');
+SELECT base64URLDecode('aHR0cHM6Ly9jbGlja'); -- { serverError INCORRECT_DATA }
+SELECT tryBase64URLDecode('aHR0cHM6Ly9jbGlja');
 
 -- test FixedString argument
-SELECT toFixedString('https://clickhouse.com', 22) AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded);
+SELECT toFixedString('https://clickhouse.com', 22) AS original, base64URLEncode(original) AS encoded, base64URLDecode(encoded), tryBase64URLDecode(encoded);
 
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index 397cdaff212..d031fe4d957 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -1012,8 +1012,8 @@ Updatable
 Uppercased
 Uptime
 Uptrace
-UrlDecode
-UrlEncode
+URLDecode
+URLEncode
 UserID
 Util
 VARCHAR

From fbb96d17de9f4e8ce03e03feda0e099edfe12039 Mon Sep 17 00:00:00 2001
From: Max K
Date: Thu, 27 Jun 2024 16:44:00 +0200
Subject: [PATCH 94/95] CI: New create release workflow

---
 .github/workflows/create_release.yml | 29 ++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 .github/workflows/create_release.yml

diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
new file mode 100644
index 00000000000..3988df3b2b1
--- /dev/null
+++ b/.github/workflows/create_release.yml
@@ -0,0 +1,29 @@
+name: CreateRelease
+
+concurrency:
+  group: release
+
+'on':
+  workflow_dispatch:
+    inputs:
+      sha:
+        description: 'The SHA hash of the commit from which to create the release'
+        required: true
+        type: string
+      type:
+        description: 'The type of release: "new" for a new release or "patch" for a patch release'
+        required: true
+        type: choice
+        options:
+        - new
+        - patch
+
+jobs:
+  Release:
+    runs-on: [self-hosted, style-checker-aarch64]
+    steps:
+      - name: Check out repository code
+        uses: ClickHouse/checkout@v1
+      - name: Print greeting
+        run: |
+          python3 ./tests/ci/release.py --commit ${{ inputs.sha }} --type ${{ inputs.type }} --dry-run

From 9e4d976e5e466c3d7da8da13ddbcdb87ddc87e4a Mon Sep 17 00:00:00 2001
From: Robert Schulze
Date: Thu, 27 Jun 2024 17:24:43 +0000
Subject: [PATCH 95/95] Fix clickhouse-test invocation

---
 docs/en/development/tests.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md
index 8dff6f0ed1d..269995a1a96 100644
--- a/docs/en/development/tests.md
+++ b/docs/en/development/tests.md
@@ -28,7 +28,7 @@ run, for example, the test `01428_hash_set_nan_key`, change to the repository
 folder and run the following command:
 
 ```
-PATH=$PATH: tests/clickhouse-test 01428_hash_set_nan_key
+PATH=:$PATH tests/clickhouse-test 01428_hash_set_nan_key
 ```
 
 Test results (`stderr` and `stdout`) are written to files `01428_hash_set_nan_key.[stderr|stdout]` which