From b320527158e3318dd28d9eaca4b31178d8a05a34 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:33:39 +0000 Subject: [PATCH 01/45] Fix assert in SpanHolder::finish() with fibers --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 23 ++++++++++ src/CMakeLists.txt | 3 ++ src/Common/OpenTelemetryTraceContext.cpp | 55 +++++++++++++++--------- 4 files changed, 62 insertions(+), 21 deletions(-) diff --git a/contrib/boost b/contrib/boost index 8fe7b3326ef..d6c95434acb 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98 +Subproject commit d6c95434acbb1a02d0b9de52bf4f37cac6c00328 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c9a759eab9c..ae20568f386 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -151,6 +151,7 @@ add_library (_boost_context ${SRCS_CONTEXT}) add_library (boost::context ALIAS _boost_context) target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) + if (SANITIZE OR BOOST_USE_UCONTEXT) target_compile_definitions(_boost_context PUBLIC BOOST_USE_UCONTEXT) endif() @@ -161,6 +162,28 @@ elseif (SANITIZE STREQUAL "thread") target_compile_definitions(_boost_context PUBLIC BOOST_USE_TSAN) endif() +# fiber + +set (SRCS_FIBER + "${LIBRARY_DIR}/libs/fiber/src/context.cpp" + "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" + "${LIBRARY_DIR}/libs/fiber/src/barrier.cpp" + "${LIBRARY_DIR}/libs/fiber/src/condition_variable.cpp" + "${LIBRARY_DIR}/libs/fiber/src/future.cpp" + "${LIBRARY_DIR}/libs/fiber/src/mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/properties.cpp" + "${LIBRARY_DIR}/libs/fiber/src/recursive_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/recursive_timed_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" + "${LIBRARY_DIR}/libs/fiber/src/timed_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" + "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" +) + +add_library (_boost_fiber ${SRCS_FIBER}) +add_library (boost::fiber ALIAS _boost_fiber) +target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) + # coroutine set (SRCS_COROUTINE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 76a67ade99c..c69ac885154 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -547,6 +547,9 @@ endif () target_link_libraries(clickhouse_common_io PUBLIC boost::context) dbms_target_link_libraries(PUBLIC boost::context) +target_link_libraries(clickhouse_common_io PUBLIC boost::fiber) +dbms_target_link_libraries(PUBLIC boost::fiber) + if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::stemmer) dbms_target_link_libraries (PUBLIC ch_contrib::wnb) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 0d89c581318..1c75bd3efaf 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -6,13 +6,26 @@ #include #include #include +#include namespace DB { namespace OpenTelemetry { -thread_local TracingContextOnThread current_thread_trace_context; +static TracingContextOnThread & getCurrentThreadTraceContext() +{ + static boost::fibers::fiber_specific_ptr current_thread_trace_context; + + auto * ptr = current_thread_trace_context.get(); + if (unlikely(!ptr)) + { + ptr = new TracingContextOnThread(); + current_thread_trace_context.reset(ptr); + } + return *ptr; +} + bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { @@ -104,7 +117,7 @@ bool Span::addAttributeImpl(std::string_view name, 
std::string_view value) noexc SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) { - if (!current_thread_trace_context.isTraceEnabled()) + if (!getCurrentThreadTraceContext().isTraceEnabled()) { return; } @@ -112,8 +125,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. try { - this->trace_id = current_thread_trace_context.trace_id; - this->parent_span_id = current_thread_trace_context.span_id; + this->trace_id = getCurrentThreadTraceContext().trace_id; + this->parent_span_id = getCurrentThreadTraceContext().span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; @@ -132,7 +145,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. - current_thread_trace_context.span_id = this->span_id; + getCurrentThreadTraceContext().span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -141,12 +154,12 @@ void SpanHolder::finish() noexcept return; // First of all, restore old value of current span. - assert(current_thread_trace_context.span_id == span_id); - current_thread_trace_context.span_id = parent_span_id; + assert(getCurrentThreadTraceContext().span_id == span_id); + getCurrentThreadTraceContext().span_id = parent_span_id; try { - auto log = current_thread_trace_context.span_log.lock(); + auto log = getCurrentThreadTraceContext().span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -269,7 +282,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return current_thread_trace_context; + return getCurrentThreadTraceContext(); } void TracingContextOnThread::reset() noexcept @@ -291,7 +304,7 @@ TracingContextHolder::TracingContextHolder( /// If any exception is raised during the construction, the tracing is not enabled on current thread. try { - if (current_thread_trace_context.isTraceEnabled()) + if (getCurrentThreadTraceContext().isTraceEnabled()) { /// /// This is not the normal case, @@ -304,15 +317,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// this->is_context_owner = false; - this->root_span.trace_id = current_thread_trace_context.trace_id; - this->root_span.parent_span_id = current_thread_trace_context.span_id; + this->root_span.trace_id = getCurrentThreadTraceContext().trace_id; + this->root_span.parent_span_id = getCurrentThreadTraceContext().span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - current_thread_trace_context.span_id = this->root_span.span_id; + getCurrentThreadTraceContext().span_id = this->root_span.span_id; return; } @@ -356,10 +369,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. 
- current_thread_trace_context = _parent_trace_context; - current_thread_trace_context.span_id = this->root_span.span_id; - current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED; - current_thread_trace_context.span_log = _span_log; + getCurrentThreadTraceContext() = _parent_trace_context; + getCurrentThreadTraceContext().span_id = this->root_span.span_id; + getCurrentThreadTraceContext().trace_flags = TRACE_FLAG_SAMPLED; + getCurrentThreadTraceContext().span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -371,7 +384,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log = current_thread_trace_context.span_log.lock(); + auto shared_span_log = getCurrentThreadTraceContext().span_log.lock(); if (shared_span_log) { try @@ -402,12 +415,14 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - current_thread_trace_context.reset(); + getCurrentThreadTraceContext().reset(); } else { - current_thread_trace_context.span_id = this->root_span.parent_span_id; + getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; } + + } } From 5527d43a5d09f9a9e75d3f9b94bd8ef1bec9980a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:51:17 +0000 Subject: [PATCH 02/45] Use only needed src files --- contrib/boost-cmake/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index ae20568f386..6c722c42e7d 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -167,15 +167,7 @@ endif() set (SRCS_FIBER "${LIBRARY_DIR}/libs/fiber/src/context.cpp" "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" - "${LIBRARY_DIR}/libs/fiber/src/barrier.cpp" - "${LIBRARY_DIR}/libs/fiber/src/condition_variable.cpp" - "${LIBRARY_DIR}/libs/fiber/src/future.cpp" - "${LIBRARY_DIR}/libs/fiber/src/mutex.cpp" - "${LIBRARY_DIR}/libs/fiber/src/properties.cpp" - "${LIBRARY_DIR}/libs/fiber/src/recursive_mutex.cpp" - "${LIBRARY_DIR}/libs/fiber/src/recursive_timed_mutex.cpp" "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" - "${LIBRARY_DIR}/libs/fiber/src/timed_mutex.cpp" "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" ) From c961e3706ed1b4028f7420bd2fa8ac96126f378d Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:52:57 +0000 Subject: [PATCH 03/45] Clean up --- contrib/boost-cmake/CMakeLists.txt | 1 - src/Common/OpenTelemetryTraceContext.cpp | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 6c722c42e7d..c8be40be1d4 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -151,7 +151,6 @@ add_library (_boost_context ${SRCS_CONTEXT}) add_library (boost::context ALIAS _boost_context) target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) - if (SANITIZE OR BOOST_USE_UCONTEXT) target_compile_definitions(_boost_context PUBLIC BOOST_USE_UCONTEXT) endif() diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 1c75bd3efaf..86ce30941a3 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -16,7 +16,7 @@ namespace OpenTelemetry static TracingContextOnThread & getCurrentThreadTraceContext() { static boost::fibers::fiber_specific_ptr current_thread_trace_context; - + auto * ptr = 
current_thread_trace_context.get(); if (unlikely(!ptr)) { @@ -421,8 +421,6 @@ TracingContextHolder::~TracingContextHolder() { getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; } - - } } From 930c8c3043fbb2ff462f6ee3d14b2568271c774d Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 May 2023 18:56:09 +0000 Subject: [PATCH 04/45] Use own fiber local implementation --- src/Common/AsyncTaskExecutor.cpp | 10 +++++ src/Common/AsyncTaskExecutor.h | 25 +++++++++++ src/Common/OpenTelemetryTraceContext.cpp | 56 ++++++++++-------------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/Common/AsyncTaskExecutor.cpp b/src/Common/AsyncTaskExecutor.cpp index 4e2fb18fb78..d0c6454a849 100644 --- a/src/Common/AsyncTaskExecutor.cpp +++ b/src/Common/AsyncTaskExecutor.cpp @@ -3,11 +3,18 @@ namespace DB { +thread_local const Fiber * current_fiber = nullptr; + AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr task_) : task(std::move(task_)) { createFiber(); } +const Fiber * AsyncTaskExecutor::getCurrentFiber() +{ + return current_fiber; +} + void AsyncTaskExecutor::resume() { if (routine_is_finished) @@ -31,7 +38,10 @@ void AsyncTaskExecutor::resume() void AsyncTaskExecutor::resumeUnlocked() { + const auto * parent_fiber = current_fiber; + current_fiber = &fiber; fiber = std::move(fiber).resume(); + current_fiber = parent_fiber; } void AsyncTaskExecutor::cancel() diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index f749c3066fc..cf7cdc5ad82 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -74,6 +74,7 @@ public: ERROR = 4, }; #endif + static const Fiber * getCurrentFiber(); protected: /// Method that is called in resume() before actual fiber resuming. @@ -118,6 +119,30 @@ private: std::unique_ptr task; }; +/// Simple class for storing fiber local variables. +template +class FiberLocalVariable +{ +public: + T & operator*() + { + return get(); + } + + T * operator->() + { + return &get(); + } + +private: + T & get() + { + return data[AsyncTaskExecutor::getCurrentFiber()]; + } + + std::unordered_map data; +}; + String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 86ce30941a3..f25acc571d8 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -6,26 +6,16 @@ #include #include #include -#include + +#include namespace DB { namespace OpenTelemetry { -static TracingContextOnThread & getCurrentThreadTraceContext() -{ - static boost::fibers::fiber_specific_ptr current_thread_trace_context; - - auto * ptr = current_thread_trace_context.get(); - if (unlikely(!ptr)) - { - ptr = new TracingContextOnThread(); - current_thread_trace_context.reset(ptr); - } - return *ptr; -} - +/// This code can be executed inside fiber, we should use fiber local context. 
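+/// (Descriptive note, inferred from the FiberLocalVariable definition added in this patch to AsyncTaskExecutor.h:)
+/// FiberLocalVariable keeps a separate value per fiber, keyed by the pointer returned from
+/// AsyncTaskExecutor::getCurrentFiber(); code running on a plain thread maps to the nullptr key.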
+thread_local FiberLocalVariable current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { @@ -117,7 +107,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) { - if (!getCurrentThreadTraceContext().isTraceEnabled()) + if (!current_fiber_trace_context->isTraceEnabled()) { return; } @@ -125,8 +115,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. try { - this->trace_id = getCurrentThreadTraceContext().trace_id; - this->parent_span_id = getCurrentThreadTraceContext().span_id; + this->trace_id =current_fiber_trace_context->trace_id; + this->parent_span_id =current_fiber_trace_context->span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; @@ -145,7 +135,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. - getCurrentThreadTraceContext().span_id = this->span_id; + current_fiber_trace_context->span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -154,12 +144,12 @@ void SpanHolder::finish() noexcept return; // First of all, restore old value of current span. - assert(getCurrentThreadTraceContext().span_id == span_id); - getCurrentThreadTraceContext().span_id = parent_span_id; + assert(current_fiber_trace_context->span_id == span_id); + current_fiber_trace_context->span_id = parent_span_id; try { - auto log = getCurrentThreadTraceContext().span_log.lock(); + auto log =current_fiber_trace_context->span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -282,7 +272,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return getCurrentThreadTraceContext(); + return*current_fiber_trace_context; } void TracingContextOnThread::reset() noexcept @@ -304,7 +294,7 @@ TracingContextHolder::TracingContextHolder( /// If any exception is raised during the construction, the tracing is not enabled on current thread. try { - if (getCurrentThreadTraceContext().isTraceEnabled()) + if (current_fiber_trace_context->isTraceEnabled()) { /// /// This is not the normal case, @@ -317,15 +307,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// this->is_context_owner = false; - this->root_span.trace_id = getCurrentThreadTraceContext().trace_id; - this->root_span.parent_span_id = getCurrentThreadTraceContext().span_id; + this->root_span.trace_id =current_fiber_trace_context->trace_id; + this->root_span.parent_span_id =current_fiber_trace_context->span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - getCurrentThreadTraceContext().span_id = this->root_span.span_id; + current_fiber_trace_context->span_id = this->root_span.span_id; return; } @@ -369,10 +359,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. 
- getCurrentThreadTraceContext() = _parent_trace_context; - getCurrentThreadTraceContext().span_id = this->root_span.span_id; - getCurrentThreadTraceContext().trace_flags = TRACE_FLAG_SAMPLED; - getCurrentThreadTraceContext().span_log = _span_log; + *current_fiber_trace_context = _parent_trace_context; + current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; + current_fiber_trace_context->span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -384,7 +374,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log = getCurrentThreadTraceContext().span_log.lock(); + auto shared_span_log =current_fiber_trace_context->span_log.lock(); if (shared_span_log) { try @@ -415,11 +405,11 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - getCurrentThreadTraceContext().reset(); + current_fiber_trace_context->reset(); } else { - getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; + current_fiber_trace_context->span_id = this->root_span.parent_span_id; } } From 66971662de3a5cb7c5a3edc29b4a103c7c862329 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 May 2023 18:56:59 +0000 Subject: [PATCH 05/45] Update cmake --- contrib/boost-cmake/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c8be40be1d4..cb0db5622a8 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -174,6 +174,7 @@ set (SRCS_FIBER add_library (_boost_fiber ${SRCS_FIBER}) add_library (boost::fiber ALIAS _boost_fiber) target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) +target_link_libraries(_boost_fiber PRIVATE _boost_context) # coroutine From 0cf6b9f1459175388f7e2a58338a839004e0d6b8 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:27:13 +0000 Subject: [PATCH 06/45] Inherit context from parent fiber --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 15 -------- src/CMakeLists.txt | 3 -- src/Common/AsyncTaskExecutor.cpp | 12 +++---- src/Common/AsyncTaskExecutor.h | 31 ++++------------- src/Common/OpenTelemetryTraceContext.cpp | 44 ++++++++++++++++++++++-- 6 files changed, 55 insertions(+), 52 deletions(-) diff --git a/contrib/boost b/contrib/boost index d6c95434acb..1035c8bfcc9 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit d6c95434acbb1a02d0b9de52bf4f37cac6c00328 +Subproject commit 1035c8bfcc9a3c1cfa7f6e827db94dae1ce1a43a diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index cb0db5622a8..c9a759eab9c 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -161,21 +161,6 @@ elseif (SANITIZE STREQUAL "thread") target_compile_definitions(_boost_context PUBLIC BOOST_USE_TSAN) endif() -# fiber - -set (SRCS_FIBER - "${LIBRARY_DIR}/libs/fiber/src/context.cpp" - "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" - "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" - "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" - "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" -) - -add_library (_boost_fiber ${SRCS_FIBER}) -add_library (boost::fiber ALIAS _boost_fiber) -target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) -target_link_libraries(_boost_fiber PRIVATE _boost_context) - # coroutine set (SRCS_COROUTINE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c69ac885154..76a67ade99c 
100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -547,9 +547,6 @@ endif () target_link_libraries(clickhouse_common_io PUBLIC boost::context) dbms_target_link_libraries(PUBLIC boost::context) -target_link_libraries(clickhouse_common_io PUBLIC boost::fiber) -dbms_target_link_libraries(PUBLIC boost::fiber) - if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::stemmer) dbms_target_link_libraries (PUBLIC ch_contrib::wnb) diff --git a/src/Common/AsyncTaskExecutor.cpp b/src/Common/AsyncTaskExecutor.cpp index d0c6454a849..68af535b22a 100644 --- a/src/Common/AsyncTaskExecutor.cpp +++ b/src/Common/AsyncTaskExecutor.cpp @@ -3,16 +3,16 @@ namespace DB { -thread_local const Fiber * current_fiber = nullptr; +thread_local FiberInfo current_fiber_info; AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr task_) : task(std::move(task_)) { createFiber(); } -const Fiber * AsyncTaskExecutor::getCurrentFiber() +FiberInfo AsyncTaskExecutor::getCurrentFiberInfo() { - return current_fiber; + return current_fiber_info; } void AsyncTaskExecutor::resume() @@ -38,10 +38,10 @@ void AsyncTaskExecutor::resume() void AsyncTaskExecutor::resumeUnlocked() { - const auto * parent_fiber = current_fiber; - current_fiber = &fiber; + auto parent_fiber_info = current_fiber_info; + current_fiber_info = FiberInfo{&fiber, &parent_fiber_info}; fiber = std::move(fiber).resume(); - current_fiber = parent_fiber; + current_fiber_info = parent_fiber_info; } void AsyncTaskExecutor::cancel() diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index cf7cdc5ad82..1c2f758504a 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -24,6 +24,11 @@ enum class AsyncEventTimeoutType using AsyncCallback = std::function; using ResumeCallback = std::function; +struct FiberInfo +{ + const Fiber * fiber = nullptr; + const FiberInfo * parent_fiber_info = nullptr; +}; /// Base class for a task that will be executed in a fiber. /// It has only one method - run, that takes 2 callbacks: @@ -74,7 +79,7 @@ public: ERROR = 4, }; #endif - static const Fiber * getCurrentFiber(); + static FiberInfo getCurrentFiberInfo(); protected: /// Method that is called in resume() before actual fiber resuming. @@ -119,30 +124,6 @@ private: std::unique_ptr task; }; -/// Simple class for storing fiber local variables. -template -class FiberLocalVariable -{ -public: - T & operator*() - { - return get(); - } - - T * operator->() - { - return &get(); - } - -private: - T & get() - { - return data[AsyncTaskExecutor::getCurrentFiber()]; - } - - std::unordered_map data; -}; - String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index f25acc571d8..178efa33817 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -14,8 +14,48 @@ namespace DB namespace OpenTelemetry { -/// This code can be executed inside fiber, we should use fiber local context. -thread_local FiberLocalVariable current_fiber_trace_context; +/// This code can be executed inside several fibers in one thread, +/// we should use fiber local tracing context. +struct FiberLocalTracingContextOnThread +{ +public: + FiberLocalTracingContextOnThread() + { + /// Initialize main context for this thread. + /// Contexts for fibers will inherit this main context. 
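+        /// (Descriptive note, inferred from FiberInfo introduced in this patch:)
+        /// The nullptr key is the context of the plain thread itself, since FiberInfo::fiber
+        /// is nullptr whenever the code is not running inside any fiber.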
+ data[nullptr] = TracingContextOnThread(); + } + + TracingContextOnThread & operator*() + { + return get(); + } + + TracingContextOnThread * operator->() + { + return &get(); + } + +private: + TracingContextOnThread & get() + { + /// Get context for current fiber. + return getContextForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); + } + + TracingContextOnThread & getContextForFiber(FiberInfo info) + { + auto it = data.find(info.fiber); + /// If it's the first request, we need to initialize context for the fiber using context from parent fiber. + if (it == data.end()) + it = data.insert({info.fiber, getContextForFiber(*info.parent_fiber_info)}).first; + return it->second; + } + + std::unordered_map data; +}; + +thread_local FiberLocalTracingContextOnThread current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { From 10e6f5b59a8f19bfab4b24f76c29b2b9a5324555 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:31:34 +0000 Subject: [PATCH 07/45] Fix indents --- src/Common/OpenTelemetryTraceContext.cpp | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 178efa33817..037ada88f80 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -175,7 +175,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. - current_fiber_trace_context->span_id = this->span_id; + current_fiber_trace_context->span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -189,7 +189,7 @@ void SpanHolder::finish() noexcept try { - auto log =current_fiber_trace_context->span_log.lock(); + auto log = current_fiber_trace_context->span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -312,7 +312,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return*current_fiber_trace_context; + return *current_fiber_trace_context; } void TracingContextOnThread::reset() noexcept @@ -347,15 +347,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// this->is_context_owner = false; - this->root_span.trace_id =current_fiber_trace_context->trace_id; - this->root_span.parent_span_id =current_fiber_trace_context->span_id; + this->root_span.trace_id = current_fiber_trace_context->trace_id; + this->root_span.parent_span_id = current_fiber_trace_context->span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->span_id = this->root_span.span_id; return; } @@ -399,10 +399,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. 
- *current_fiber_trace_context = _parent_trace_context; - current_fiber_trace_context->span_id = this->root_span.span_id; - current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; - current_fiber_trace_context->span_log = _span_log; + *current_fiber_trace_context = _parent_trace_context; + current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; + current_fiber_trace_context->span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -414,7 +414,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log =current_fiber_trace_context->span_log.lock(); + auto shared_span_log = current_fiber_trace_context->span_log.lock(); if (shared_span_log) { try @@ -445,11 +445,11 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - current_fiber_trace_context->reset(); + current_fiber_trace_context->reset(); } else { - current_fiber_trace_context->span_id = this->root_span.parent_span_id; + current_fiber_trace_context->span_id = this->root_span.parent_span_id; } } From a7aec49fbbcab364bc443787ec2c11d2bf52762d Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:43:29 +0000 Subject: [PATCH 08/45] Fix indents --- src/Common/OpenTelemetryTraceContext.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 037ada88f80..8cf4879c1e2 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -155,8 +155,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. 
try { - this->trace_id =current_fiber_trace_context->trace_id; - this->parent_span_id =current_fiber_trace_context->span_id; + this->trace_id = current_fiber_trace_context->trace_id; + this->parent_span_id = current_fiber_trace_context->span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; From 7fbf87be176081411918ae35f040f338892d1416 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 18 Apr 2023 13:11:42 +0200 Subject: [PATCH 09/45] rework WriteBufferFromS3, squashed --- contrib/googletest-cmake/CMakeLists.txt | 25 +- src/Backups/BackupIO_S3.cpp | 1 - src/CMakeLists.txt | 1 + src/Disks/DiskLocal.cpp | 1 + .../IO/CachedOnDiskWriteBufferFromFile.cpp | 13 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 3 +- src/IO/S3/Client.cpp | 33 +- src/IO/S3/Client.h | 7 + src/IO/SwapHelper.cpp | 17 + src/IO/SwapHelper.h | 21 +- src/IO/WriteBuffer.h | 5 +- src/IO/WriteBufferFromFileDecorator.cpp | 38 +- src/IO/WriteBufferFromFileDecorator.h | 7 +- src/IO/WriteBufferFromS3.cpp | 946 +++++++------- src/IO/WriteBufferFromS3.h | 95 +- src/IO/WriteBufferFromS3MemoryStream.cpp | 68 + src/IO/WriteBufferFromS3MemoryStream.h | 39 + src/IO/WriteBufferFromS3TaskTracker.cpp | 137 ++ src/IO/WriteBufferFromS3TaskTracker.h | 37 + src/IO/tests/gtest_writebuffer_s3.cpp | 1114 +++++++++++++++++ src/Storages/MergeTree/MergeTreeData.cpp | 1 + .../MergeTree/MergeTreeDeduplicationLog.cpp | 3 + .../MergeTree/MergeTreeMutationEntry.cpp | 1 + src/Storages/StorageS3.cpp | 1 - .../02240_filesystem_query_cache.reference | 1 + .../02240_filesystem_query_cache.sql | 2 +- ...system_cache_on_write_operations.reference | 36 +- ...41_filesystem_cache_on_write_operations.sh | 19 +- ...ilesystem_cache_persistent_files.reference | 17 +- ...events_from_query_log_and_client.reference | 4 +- 30 files changed, 2102 insertions(+), 591 deletions(-) create mode 100644 src/IO/SwapHelper.cpp create mode 100644 src/IO/WriteBufferFromS3MemoryStream.cpp create mode 100644 src/IO/WriteBufferFromS3MemoryStream.h create mode 100644 src/IO/WriteBufferFromS3TaskTracker.cpp create mode 100644 src/IO/WriteBufferFromS3TaskTracker.h create mode 100644 src/IO/tests/gtest_writebuffer_s3.cpp diff --git a/contrib/googletest-cmake/CMakeLists.txt b/contrib/googletest-cmake/CMakeLists.txt index 90fdde0c185..3905df03155 100644 --- a/contrib/googletest-cmake/CMakeLists.txt +++ b/contrib/googletest-cmake/CMakeLists.txt @@ -1,15 +1,30 @@ -set (SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest") +set (SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/googletest") -add_library(_gtest "${SRC_DIR}/src/gtest-all.cc") +add_library(_gtest "${SRC_DIR}/googletest/src/gtest-all.cc") set_target_properties(_gtest PROPERTIES VERSION "1.0.0") target_compile_definitions (_gtest PUBLIC GTEST_HAS_POSIX_RE=0) -target_include_directories(_gtest SYSTEM PUBLIC "${SRC_DIR}/include") -target_include_directories(_gtest PRIVATE "${SRC_DIR}") +target_include_directories(_gtest SYSTEM PUBLIC "${SRC_DIR}/googletest/include") +target_include_directories(_gtest PRIVATE "${SRC_DIR}/googletest") -add_library(_gtest_main "${SRC_DIR}/src/gtest_main.cc") +add_library(_gtest_main "${SRC_DIR}/googletest/src/gtest_main.cc") set_target_properties(_gtest_main PROPERTIES VERSION "1.0.0") target_link_libraries(_gtest_main PUBLIC _gtest) add_library(_gtest_all INTERFACE) target_link_libraries(_gtest_all INTERFACE _gtest _gtest_main) add_library(ch_contrib::gtest_all ALIAS _gtest_all) + + +add_library(_gmock 
"${SRC_DIR}/googlemock/src/gmock-all.cc") +set_target_properties(_gmock PROPERTIES VERSION "1.0.0") +target_compile_definitions (_gmock PUBLIC GTEST_HAS_POSIX_RE=0) +target_include_directories(_gmock SYSTEM PUBLIC "${SRC_DIR}/googlemock/include" "${SRC_DIR}/googletest/include") +target_include_directories(_gmock PRIVATE "${SRC_DIR}/googlemock") + +add_library(_gmock_main "${SRC_DIR}/googlemock/src/gmock_main.cc") +set_target_properties(_gmock_main PROPERTIES VERSION "1.0.0") +target_link_libraries(_gmock_main PUBLIC _gmock) + +add_library(_gmock_all INTERFACE) +target_link_libraries(_gmock_all INTERFACE _gmock _gmock_main) +add_library(ch_contrib::gmock_all ALIAS _gmock_all) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 90333900d4a..84dba63ae4e 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -253,7 +253,6 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) fs::path(s3_uri.key) / file_name, request_settings, std::nullopt, - DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3f4fbb7420..ac99a7c3669 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -612,6 +612,7 @@ if (ENABLE_TESTS) target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::gtest_all + ch_contrib::gmock_all clickhouse_functions clickhouse_aggregate_functions clickhouse_parsers diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 69b70da272a..1abecb7af4e 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -544,6 +544,7 @@ try auto tmp_file = std::make_unique(disk_ptr); auto buf = std::make_unique(std::move(tmp_file)); buf->write(data.data, data.PAGE_SIZE_IN_BYTES); + buf->finalize(); buf->sync(); } return true; diff --git a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp index af2226ea6ca..9153af90312 100644 --- a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp @@ -211,10 +211,16 @@ void CachedOnDiskWriteBufferFromFile::nextImpl() { size_t size = offset(); + /// Write data to cache. + cacheData(working_buffer.begin(), size, throw_on_error_from_cache); + current_download_offset += size; + try { SwapHelper swap(*this, *impl); /// Write data to the underlying buffer. + /// Actually here WriteBufferFromFileDecorator::nextImpl has to be called, but it is pivate method. + /// In particular WriteBufferFromFileDecorator introduces logic with swaps in order to achieve delegation. impl->next(); } catch (...) @@ -225,10 +231,6 @@ void CachedOnDiskWriteBufferFromFile::nextImpl() throw; } - - /// Write data to cache. - cacheData(working_buffer.begin(), size, throw_on_error_from_cache); - current_download_offset += size; } void CachedOnDiskWriteBufferFromFile::cacheData(char * data, size_t size, bool throw_on_error) @@ -292,8 +294,7 @@ void CachedOnDiskWriteBufferFromFile::finalizeImpl() { try { - SwapHelper swap(*this, *impl); - impl->finalize(); + WriteBufferFromFileDecorator::finalizeImpl(); } catch (...) 
{ diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 2eee8bf5693..79b3d3a2b8b 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -161,7 +161,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN WriteMode mode, // S3 doesn't support append, only rewrite std::optional attributes, FinalizeCallback && finalize_callback, - size_t buf_size, + size_t buf_size [[maybe_unused]], const WriteSettings & write_settings) { WriteSettings disk_write_settings = IObjectStorage::patchSettings(write_settings); @@ -180,7 +180,6 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN object.remote_path, settings_ptr->request_settings, attributes, - buf_size, std::move(scheduler), disk_write_settings); diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 3c0a8122a91..3c39893b44e 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -255,7 +255,7 @@ Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) c if (auto uri = getURIForBucket(bucket); uri.has_value()) request.overrideURI(std::move(*uri)); - auto result = Aws::S3::S3Client::HeadObject(request); + auto result = HeadObject(static_cast(request)); if (result.IsSuccess()) return result; @@ -312,70 +312,75 @@ Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) c request.overrideURI(std::move(*bucket_uri)); - return Aws::S3::S3Client::HeadObject(request); + /// The next call is NOT a recurcive call + /// This is a virtuall call Aws::S3::S3Client::HeadObject(const Model::HeadObjectRequest&) + return HeadObject(static_cast(request)); } +/// For each request, we wrap the request functions from Aws::S3::Client with doRequest +/// doRequest calls virtuall function from Aws::S3::Client while DB::S3::Client has not virtual calls for each request type + Model::ListObjectsV2Outcome Client::ListObjectsV2(const ListObjectsV2Request & request) const { - return doRequest(request, [this](const Model::ListObjectsV2Request & req) { return Aws::S3::S3Client::ListObjectsV2(req); }); + return doRequest(request, [this](const Model::ListObjectsV2Request & req) { return ListObjectsV2(req); }); } Model::ListObjectsOutcome Client::ListObjects(const ListObjectsRequest & request) const { - return doRequest(request, [this](const Model::ListObjectsRequest & req) { return Aws::S3::S3Client::ListObjects(req); }); + return doRequest(request, [this](const Model::ListObjectsRequest & req) { return ListObjects(req); }); } Model::GetObjectOutcome Client::GetObject(const GetObjectRequest & request) const { - return doRequest(request, [this](const Model::GetObjectRequest & req) { return Aws::S3::S3Client::GetObject(req); }); + return doRequest(request, [this](const Model::GetObjectRequest & req) { return GetObject(req); }); } Model::AbortMultipartUploadOutcome Client::AbortMultipartUpload(const AbortMultipartUploadRequest & request) const { return doRequest( - request, [this](const Model::AbortMultipartUploadRequest & req) { return Aws::S3::S3Client::AbortMultipartUpload(req); }); + request, [this](const Model::AbortMultipartUploadRequest & req) { return AbortMultipartUpload(req); }); } Model::CreateMultipartUploadOutcome Client::CreateMultipartUpload(const CreateMultipartUploadRequest & request) const { return doRequest( - request, [this](const Model::CreateMultipartUploadRequest & req) { return Aws::S3::S3Client::CreateMultipartUpload(req); }); + request, [this](const 
Model::CreateMultipartUploadRequest & req) { return CreateMultipartUpload(req); }); } Model::CompleteMultipartUploadOutcome Client::CompleteMultipartUpload(const CompleteMultipartUploadRequest & request) const { return doRequest( - request, [this](const Model::CompleteMultipartUploadRequest & req) { return Aws::S3::S3Client::CompleteMultipartUpload(req); }); + request, [this](const Model::CompleteMultipartUploadRequest & req) { return CompleteMultipartUpload(req); }); } Model::CopyObjectOutcome Client::CopyObject(const CopyObjectRequest & request) const { - return doRequest(request, [this](const Model::CopyObjectRequest & req) { return Aws::S3::S3Client::CopyObject(req); }); + return doRequest(request, [this](const Model::CopyObjectRequest & req) { return CopyObject(req); }); } Model::PutObjectOutcome Client::PutObject(const PutObjectRequest & request) const { - return doRequest(request, [this](const Model::PutObjectRequest & req) { return Aws::S3::S3Client::PutObject(req); }); + return doRequest(request, [this](const Model::PutObjectRequest & req) { return PutObject(req); }); } Model::UploadPartOutcome Client::UploadPart(const UploadPartRequest & request) const { - return doRequest(request, [this](const Model::UploadPartRequest & req) { return Aws::S3::S3Client::UploadPart(req); }); + return doRequest(request, [this](const Model::UploadPartRequest & req) { return UploadPart(req); }); } Model::UploadPartCopyOutcome Client::UploadPartCopy(const UploadPartCopyRequest & request) const { - return doRequest(request, [this](const Model::UploadPartCopyRequest & req) { return Aws::S3::S3Client::UploadPartCopy(req); }); + return doRequest(request, [this](const Model::UploadPartCopyRequest & req) { return UploadPartCopy(req); }); } Model::DeleteObjectOutcome Client::DeleteObject(const DeleteObjectRequest & request) const { - return doRequest(request, [this](const Model::DeleteObjectRequest & req) { return Aws::S3::S3Client::DeleteObject(req); }); + return doRequest(request, [this](const Model::DeleteObjectRequest & req) { return DeleteObject(req); }); } Model::DeleteObjectsOutcome Client::DeleteObjects(const DeleteObjectsRequest & request) const { - return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return Aws::S3::S3Client::DeleteObjects(req); }); + return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return DeleteObjects(req); }); } template diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 63feb94e593..330c85c418a 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -40,6 +40,11 @@ struct ServerSideEncryptionKMSConfig #include #include +namespace MockS3 +{ + struct Client; +} + namespace DB::S3 { @@ -195,6 +200,8 @@ public: bool supportsMultiPartCopy() const; private: + friend struct ::MockS3::Client; + Client(size_t max_redirects_, ServerSideEncryptionKMSConfig sse_kms_config_, const std::shared_ptr& credentials_provider, diff --git a/src/IO/SwapHelper.cpp b/src/IO/SwapHelper.cpp new file mode 100644 index 00000000000..4a1cc8acf4c --- /dev/null +++ b/src/IO/SwapHelper.cpp @@ -0,0 +1,17 @@ +#include + +namespace DB +{ + +SwapHelper::SwapHelper(BufferBase & b1_, BufferBase & b2_) + : b1(b1_), b2(b2_) +{ + b1.swap(b2); +} + +SwapHelper::~SwapHelper() +{ + b1.swap(b2); +} + +} diff --git a/src/IO/SwapHelper.h b/src/IO/SwapHelper.h index cedbf5f78fe..fcf32927f23 100644 --- a/src/IO/SwapHelper.h +++ b/src/IO/SwapHelper.h @@ -1,16 +1,19 @@ #pragma once + #include namespace DB { - class SwapHelper - { - public: - SwapHelper(BufferBase & 
b1_, BufferBase & b2_) : b1(b1_), b2(b2_) { b1.swap(b2); } - ~SwapHelper() { b1.swap(b2); } - private: - BufferBase & b1; - BufferBase & b2; - }; +class SwapHelper +{ +public: + SwapHelper(BufferBase & b1_, BufferBase & b2_); + ~SwapHelper(); + +private: + BufferBase & b1; + BufferBase & b2; +}; + } diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 436d07515a3..2c891e17d9a 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -42,7 +42,8 @@ public: { if (!offset()) return; - bytes += offset(); + + auto bytes_in_buffer = offset(); try { @@ -54,9 +55,11 @@ public: * so that later (for example, when the stack was expanded) there was no second attempt to write data. */ pos = working_buffer.begin(); + bytes += bytes_in_buffer; throw; } + bytes += bytes_in_buffer; pos = working_buffer.begin(); } diff --git a/src/IO/WriteBufferFromFileDecorator.cpp b/src/IO/WriteBufferFromFileDecorator.cpp index ac801534b4f..4cc881f177f 100644 --- a/src/IO/WriteBufferFromFileDecorator.cpp +++ b/src/IO/WriteBufferFromFileDecorator.cpp @@ -1,6 +1,7 @@ #include "WriteBufferFromFileDecorator.h" #include +#include namespace DB { @@ -13,12 +14,18 @@ WriteBufferFromFileDecorator::WriteBufferFromFileDecorator(std::unique_ptrfinalized is remain false + /// That leads to situation when the destructor of impl is called with impl->finalized equal false. if (!is_prefinalized) WriteBufferFromFileDecorator::preFinalize(); - impl->finalize(); + { + SwapHelper swap(*this, *impl); + impl->finalize(); + } } WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator() @@ -31,11 +38,21 @@ WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator() { tryLogCurrentException(__PRETTY_FUNCTION__); } + + /// It is not a mistake that swap is called here + /// Swap has been called at constructor, it should be called at destructor + /// In oreder to provide valid buffer for impl's d-tor call + swap(*impl); } void WriteBufferFromFileDecorator::sync() { - impl->sync(); + next(); + + { + SwapHelper swap(*this, *impl); + impl->sync(); + } } std::string WriteBufferFromFileDecorator::getFileName() const @@ -45,11 +62,22 @@ std::string WriteBufferFromFileDecorator::getFileName() const return std::string(); } +void WriteBufferFromFileDecorator::preFinalize() +{ + next(); + + { + SwapHelper swap(*this, *impl); + impl->preFinalize(); + } + + is_prefinalized = true; +} + void WriteBufferFromFileDecorator::nextImpl() { - swap(*impl); + SwapHelper swap(*this, *impl); impl->next(); - swap(*impl); } } diff --git a/src/IO/WriteBufferFromFileDecorator.h b/src/IO/WriteBufferFromFileDecorator.h index dde05276c28..5344bb1425c 100644 --- a/src/IO/WriteBufferFromFileDecorator.h +++ b/src/IO/WriteBufferFromFileDecorator.h @@ -17,12 +17,7 @@ public: std::string getFileName() const override; - void preFinalize() override - { - next(); - impl->preFinalize(); - is_prefinalized = true; - } + void preFinalize() override; const WriteBuffer & getImpl() const { return *impl; } diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5a25cb89107..5630ed2cb68 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -2,13 +2,16 @@ #if USE_AWS_S3 +#include "WriteBufferFromS3.h" +#include "WriteBufferFromS3MemoryStream.h" +#include "WriteBufferFromS3TaskTracker.h" + #include #include #include #include #include -#include #include #include #include @@ -29,11 +32,13 @@ namespace ProfileEvents extern const Event S3CreateMultipartUpload; extern const Event S3CompleteMultipartUpload; + extern const Event 
S3AbortMultipartUpload; extern const Event S3UploadPart; extern const Event S3PutObject; extern const Event DiskS3CreateMultipartUpload; extern const Event DiskS3CompleteMultipartUpload; + extern const Event DiskS3AbortMultipartUpload; extern const Event DiskS3UploadPart; extern const Event DiskS3PutObject; @@ -43,30 +48,105 @@ namespace ProfileEvents namespace DB { -// S3 protocol does not allow to have multipart upload with more than 10000 parts. -// In case server does not return an error on exceeding that number, we print a warning -// because custom S3 implementation may allow relaxed requirements on that. -const int S3_WARN_MAX_PARTS = 10000; namespace ErrorCodes { extern const int S3_ERROR; extern const int INVALID_CONFIG_PARAMETER; + extern const int LOGICAL_ERROR; } -struct WriteBufferFromS3::UploadPartTask +struct WriteBufferFromS3::PartData { - S3::UploadPartRequest req; - bool is_finished = false; - std::string tag; - std::exception_ptr exception; + Memory<> memory; + size_t data_size = 0; + + std::shared_ptr createAwsBuffer() + { + auto buffer = std::make_shared(memory.data(), data_size); + buffer->exceptions(std::ios::badbit); + return buffer; + } + + bool isEmpty() const + { + return data_size == 0; + } }; -struct WriteBufferFromS3::PutObjectTask +struct WriteBufferFromS3::BufferAllocationPolicy { - S3::PutObjectRequest req; - bool is_finished = false; - std::exception_ptr exception; + size_t first_size = 0; + size_t second_size = 0; + + size_t multiply_factor = 0; + size_t multiply_threshold = 0; + size_t max_size = 0; + + size_t current_size = 0; + size_t buffer_number = 0; + + explicit BufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) + : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) + , second_size(settings_.min_upload_part_size) + , multiply_factor(settings_.upload_part_size_multiply_factor) + , multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) + , max_size(settings_.max_upload_part_size) + { + if (settings_.strict_upload_part_size > 0) + { + first_size = settings_.strict_upload_part_size; + second_size = settings_.strict_upload_part_size; + multiply_factor = 1; + multiply_threshold = 10000; + max_size = settings_.max_upload_part_size; + } + else + { + first_size = std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size); + second_size = settings_.min_upload_part_size; + multiply_factor = settings_.upload_part_size_multiply_factor; + multiply_threshold = settings_.upload_part_size_multiply_parts_count_threshold; + max_size = settings_.max_upload_part_size; + } + + chassert(first_size > 0); + chassert(second_size > 0); + chassert(multiply_factor >= 1); + chassert(multiply_threshold > 0); + chassert(max_size > 0); + } + + size_t getNumber() const + { + return buffer_number; + } + + size_t getSize() const + { + chassert(buffer_number > 0); + return current_size; + } + + void next() + { + ++buffer_number; + + if (1 == buffer_number) + { + current_size = first_size; + return; + } + + if (2 == buffer_number) + current_size = second_size; + + if (0 == ((buffer_number-1) % multiply_threshold)) + { + current_size *= multiply_factor; + current_size = std::min(current_size, max_size); + } + } }; WriteBufferFromS3::WriteBufferFromS3( @@ -75,146 +155,88 @@ WriteBufferFromS3::WriteBufferFromS3( const String & key_, const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_, - size_t buffer_size_, ThreadPoolCallbackRunner 
schedule_, const WriteSettings & write_settings_) - : BufferWithOwnMemory(buffer_size_, nullptr, 0) - , bucket(bucket_) + : bucket(bucket_) , key(key_) , request_settings(request_settings_) , upload_settings(request_settings.getUploadSettings()) + , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , strict_upload_part_size(upload_settings.strict_upload_part_size) - , current_upload_part_size(upload_settings.min_upload_part_size) - , schedule(std::move(schedule_)) - , write_settings(write_settings_) + , buffer_allocation_policy(std::make_unique(request_settings_.getUploadSettings())) + , task_tracker(std::make_unique(std::move(schedule_))) { + LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails()); + allocateBuffer(); } void WriteBufferFromS3::nextImpl() { - if (!offset()) + LOG_TRACE(log, "nextImpl with incoming data size {}, memory buffer size {}. {}", offset(), memory.size(), getLogDetails()); + + if (is_prefinalized) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest"); + + /// Make sense to call to before adding new async task to check if there is an exception + task_tracker->getReady(); + + hidePartialData(); + + reallocateFirstBuffer(); + + if (available() > 0) return; - /// Buffer in a bad state after exception - if (temporary_buffer->tellp() == -1) - allocateBuffer(); - else - chassert(temporary_buffer->tellp() == static_cast(last_part_size)); + detachBuffer(); - if (strict_upload_part_size) - processWithStrictParts(); - else - processWithDynamicParts(); + if (!multipart_upload_id.empty() || detached_part_data.size() > 1) + writeMultipartUpload(); - waitForReadyBackgroundTasks(); -} - -void WriteBufferFromS3::processWithStrictParts() -{ - chassert(strict_upload_part_size > 0); - - size_t buffer_size = offset(); - size_t left_in_buffer = buffer_size; - size_t new_size = last_part_size + buffer_size; - size_t buffer_offset = 0; - - if (new_size > strict_upload_part_size) - { - /// Data size will exceed fixed part size threshold for multipart upload, need to use multipart upload. 
- if (multipart_upload_id.empty()) - createMultipartUpload(); - - while (new_size > strict_upload_part_size) - { - size_t to_write = strict_upload_part_size - last_part_size; - temporary_buffer->write(working_buffer.begin() + buffer_offset, to_write); - buffer_offset += to_write; - - writePart(); - allocateBuffer(); - - new_size -= strict_upload_part_size; - left_in_buffer -= to_write; - } - } - - if (left_in_buffer) - { - temporary_buffer->write(working_buffer.begin() + buffer_offset, left_in_buffer); - last_part_size += left_in_buffer; - } - - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, buffer_size); - - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(buffer_size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); -} - -void WriteBufferFromS3::processWithDynamicParts() -{ - chassert(current_upload_part_size > 0); - - size_t size = offset(); - temporary_buffer->write(working_buffer.begin(), size); - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, size); - last_part_size += size; - - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); - - /// Data size exceeds singlepart upload threshold, need to use multipart upload. - if (multipart_upload_id.empty() && last_part_size > upload_settings.max_single_part_upload_size) - createMultipartUpload(); - - if (!multipart_upload_id.empty() && last_part_size > current_upload_part_size) - { - writePart(); - allocateBuffer(); - } -} - -void WriteBufferFromS3::allocateBuffer() -{ - temporary_buffer = Aws::MakeShared("temporary buffer"); - temporary_buffer->exceptions(std::ios::badbit); - last_part_size = 0; -} - -WriteBufferFromS3::~WriteBufferFromS3() -{ -#ifndef NDEBUG - if (!finalized) - { - LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It's a bug"); - std::terminate(); - } -#else - try - { - finalize(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } -#endif + allocateBuffer(); } void WriteBufferFromS3::preFinalize() { - next(); + if (is_prefinalized) + return; - if (multipart_upload_id.empty()) + LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails()); + + task_tracker->getReady(); + + hidePartialData(); + + if (hidden_size > 0) + detachBuffer(); + setFakeBufferWhenPreFinalized(); + + bool do_single_part_upload = false; + + if (multipart_upload_id.empty() && detached_part_data.size() <= 1) { - makeSinglepartUpload(); + if (detached_part_data.empty() || detached_part_data.front().data_size <= upload_settings.max_single_part_upload_size) + do_single_part_upload = true; + } + + if (do_single_part_upload) + { + if (detached_part_data.empty()) + { + makeSinglepartUpload({}); + } + else + { + makeSinglepartUpload(std::move(detached_part_data.front())); + detached_part_data.pop_front(); + } } else { - /// Write rest of the data as last part. - writePart(); + writeMultipartUpload(); } is_prefinalized = true; @@ -222,24 +244,182 @@ void WriteBufferFromS3::preFinalize() void WriteBufferFromS3::finalizeImpl() { + LOG_TRACE(log, "finalizeImpl WriteBufferFromS3. 
{}.", getLogDetails()); + if (!is_prefinalized) preFinalize(); - waitForAllBackgroundTasks(); + chassert(offset() == 0); + chassert(hidden_size == 0); + + task_tracker->getAll(); if (!multipart_upload_id.empty()) + { completeMultipartUpload(); + multipart_upload_finished = true; + } if (request_settings.check_objects_after_upload) { LOG_TRACE(log, "Checking object {} exists after upload", key); S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload"); + + LOG_TRACE(log, "Checking object {} has size as expected {}", key, total_size); + size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage); + if (actual_size != total_size) + throw Exception( + ErrorCodes::S3_ERROR, + "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.", + key, bucket, actual_size, total_size); + LOG_TRACE(log, "Object {} exists after upload", key); } } -void WriteBufferFromS3::fillCreateMultipartRequest(DB::S3::CreateMultipartUploadRequest & req) +String WriteBufferFromS3::getLogDetails() const { + String multipart_upload_details; + if (!multipart_upload_id.empty()) + multipart_upload_details = fmt::format(", upload id {}, upload has finished {}" + , multipart_upload_id, multipart_upload_finished); + + return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}", + bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details); +} + +void WriteBufferFromS3::tryToAbortMultipartUpload() +{ + try + { + task_tracker->safeWaitAll(); + abortMultipartUpload(); + } + catch (...) + { + LOG_ERROR(log, "Multipart upload hasn't aborted. {}", getLogDetails()); + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + +WriteBufferFromS3::~WriteBufferFromS3() +{ + LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails()); + + // That descructor could be call with finalized=false in case of exceptions + if (!finalized) + { + LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails()); + } + + task_tracker->safeWaitAll(); + + if (!multipart_upload_id.empty() && !multipart_upload_finished) + { + LOG_WARNING(log, "WriteBufferFromS3 was neither finished nor aborted, try to abort upload in destructor. 
{}.", getLogDetails()); + tryToAbortMultipartUpload(); + } +} + +void WriteBufferFromS3::hidePartialData() +{ + if (write_settings.remote_throttler) + write_settings.remote_throttler->add(offset(), ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); + + chassert(memory.size() >= hidden_size + offset()); + + hidden_size += offset(); + chassert(memory.data() + hidden_size == working_buffer.begin() + offset()); + chassert(memory.data() + hidden_size == position()); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + chassert(offset() == 0); +} + +void WriteBufferFromS3::reallocateFirstBuffer() +{ + chassert(offset() == 0); + + if (buffer_allocation_policy->getNumber() > 1 || available() > 0) + return; + + const size_t max_first_buffer = buffer_allocation_policy->getSize(); + if (memory.size() == max_first_buffer) + return; + + size_t size = std::min(memory.size() * 2, max_first_buffer); + memory.resize(size); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + + chassert(offset() == 0); + + LOG_TRACE(log, "Reallocated first buffer with size {}. {}", memory.size(), getLogDetails()); +} + +void WriteBufferFromS3::detachBuffer() +{ + size_t data_size = size_t(position() - memory.data()); + chassert(data_size == hidden_size); + + auto buf = std::move(memory); + + WriteBuffer::set(nullptr, 0); + total_size += hidden_size; + hidden_size = 0; + + detached_part_data.push_back({std::move(buf), data_size}); +} + +void WriteBufferFromS3::allocateFirstBuffer() +{ + const auto max_first_buffer = buffer_allocation_policy->getSize(); + const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); + memory = Memory(size); + WriteBuffer::set(memory.data(), memory.size()); + + LOG_TRACE(log, "Allocated first buffer with size {}. {}", memory.size(), getLogDetails()); +} + +void WriteBufferFromS3::allocateBuffer() +{ + buffer_allocation_policy->next(); + chassert(0 == hidden_size); + + if (buffer_allocation_policy->getNumber() == 1) + return allocateFirstBuffer(); + + memory = Memory(buffer_allocation_policy->getSize()); + WriteBuffer::set(memory.data(), memory.size()); + + LOG_TRACE(log, "Allocated buffer with size {}. {}", buffer_allocation_policy->getSize(), getLogDetails()); +} + +void WriteBufferFromS3::setFakeBufferWhenPreFinalized() +{ + WriteBuffer::set(fake_buffer_when_prefinalized, sizeof(fake_buffer_when_prefinalized)); +} + +void WriteBufferFromS3::writeMultipartUpload() +{ + if (multipart_upload_id.empty()) + { + createMultipartUpload(); + } + + while (!detached_part_data.empty()) + { + writePart(std::move(detached_part_data.front())); + detached_part_data.pop_front(); + } +} + +void WriteBufferFromS3::createMultipartUpload() +{ + LOG_TRACE(log, "Create multipart upload. 
Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); + + S3::CreateMultipartUploadRequest req; + req.SetBucket(bucket); req.SetKey(key); @@ -250,12 +430,6 @@ void WriteBufferFromS3::fillCreateMultipartRequest(DB::S3::CreateMultipartUpload req.SetMetadata(object_metadata.value()); client_ptr->setKMSHeaders(req); -} - -void WriteBufferFromS3::createMultipartUpload() -{ - DB::S3::CreateMultipartUploadRequest req; - fillCreateMultipartRequest(req); ProfileEvents::increment(ProfileEvents::S3CreateMultipartUpload); if (write_settings.for_object_storage) @@ -267,184 +441,164 @@ void WriteBufferFromS3::createMultipartUpload() ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - if (outcome.IsSuccess()) - { - multipart_upload_id = outcome.GetResult().GetUploadId(); - LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); - } - else + if (!outcome.IsSuccess()) { ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); } + + multipart_upload_id = outcome.GetResult().GetUploadId(); + LOG_TRACE(log, "Multipart upload has created. {}", getLogDetails()); } -void WriteBufferFromS3::writePart() +void WriteBufferFromS3::abortMultipartUpload() { - auto size = temporary_buffer->tellp(); - - LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Upload_id: {}, Size: {}", bucket, key, multipart_upload_id, size); - - if (size < 0) + if (multipart_upload_id.empty()) { - LOG_WARNING(log, "Skipping part upload. Buffer is in bad state, it means that we have tried to upload something, but got an exception."); + LOG_WARNING(log, "Nothing to abort. {}", getLogDetails()); return; } - if (size == 0) + LOG_WARNING(log, "Abort multipart upload. {}", getLogDetails()); + + S3::AbortMultipartUploadRequest req; + req.SetBucket(bucket); + req.SetKey(key); + req.SetUploadId(multipart_upload_id); + + ProfileEvents::increment(ProfileEvents::S3AbortMultipartUpload); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3AbortMultipartUpload); + + Stopwatch watch; + auto outcome = client_ptr->AbortMultipartUpload(req); + watch.stop(); + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (!outcome.IsSuccess()) { - LOG_TRACE(log, "Skipping writing part. Buffer is empty."); - return; + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); } - if (TSA_SUPPRESS_WARNING_FOR_READ(part_tags).size() == S3_WARN_MAX_PARTS) - { - // Don't throw exception here by ourselves but leave the decision to take by S3 server. - LOG_WARNING(log, "Maximum part number in S3 protocol has reached (too many parts). Server may not accept this whole upload."); - } - - if (schedule) - { - UploadPartTask * task = nullptr; - - { - std::lock_guard lock(bg_tasks_mutex); - task = &upload_object_tasks.emplace_back(); - ++num_added_bg_tasks; - } - - /// Notify waiting thread when task finished - auto task_finish_notify = [&, task]() - { - std::lock_guard lock(bg_tasks_mutex); - task->is_finished = true; - ++num_finished_bg_tasks; - - /// Notification under mutex is important here. - /// Otherwise, WriteBuffer could be destroyed in between - /// Releasing lock and condvar notification. 
- bg_tasks_condvar.notify_one(); - }; - - try - { - fillUploadRequest(task->req); - - schedule([this, task, task_finish_notify]() - { - try - { - processUploadRequest(*task); - } - catch (...) - { - task->exception = std::current_exception(); - } - - task_finish_notify(); - }, 0); - } - catch (...) - { - task_finish_notify(); - throw; - } - } - else - { - UploadPartTask task; - auto & tags = TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags); /// Suppress warning because schedule == false. - - fillUploadRequest(task.req); - processUploadRequest(task); - tags.push_back(task.tag); - } + LOG_WARNING(log, "Multipart upload has aborted successfully. {}", getLogDetails()); } -void WriteBufferFromS3::fillUploadRequest(S3::UploadPartRequest & req) +S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, PartData & data) { - /// Increase part number. - ++part_number; + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); + LOG_TRACE(log, "fillUploadRequest, size {}, key: {}", data.data_size, key); - auto max_part_number = upload_settings.max_part_number; - - if (!multipart_upload_id.empty() && (part_number > max_part_number)) - { - throw Exception( - ErrorCodes::INVALID_CONFIG_PARAMETER, - "Part number exceeded {} while writing {} bytes to S3. " - "Check min_upload_part_size = {}, max_upload_part_size = {}, " - "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, " - "max_single_part_upload_size = {}", - max_part_number, count(), - upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, - upload_settings.upload_part_size_multiply_factor, - upload_settings.upload_part_size_multiply_parts_count_threshold, - upload_settings.max_single_part_upload_size); - } + S3::UploadPartRequest req; /// Setup request. req.SetBucket(bucket); req.SetKey(key); req.SetPartNumber(static_cast(part_number)); req.SetUploadId(multipart_upload_id); - req.SetContentLength(temporary_buffer->tellp()); - req.SetBody(temporary_buffer); - + req.SetContentLength(data.data_size); + req.SetBody(data.createAwsBuffer()); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 req.SetContentType("binary/octet-stream"); - if (!strict_upload_part_size) - { - /// Maybe increase `current_upload_part_size` (we need to increase it sometimes to keep `part_number` less or equal than `max_part_number`). 
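// A rough, self-contained model of the part-size schedule these settings imply
// (an illustrative sketch, not the actual implementation; the same schedule is what
// the "increase_upload_buffer" and "increase_limited" expectations in the gtest
// further down encode): the size starts at min_upload_part_size, is multiplied by
// upload_part_size_multiply_factor after every
// upload_part_size_multiply_parts_count_threshold uploaded parts, and is clamped
// to max_upload_part_size.

#include <algorithm>
#include <cstddef>

size_t modelPartSize(
    size_t part_number,   // 1-based index of the part about to be uploaded
    size_t min_part_size, // s3_min_upload_part_size
    size_t max_part_size, // s3_max_upload_part_size
    size_t factor,        // s3_upload_part_size_multiply_factor
    size_t threshold)     // s3_upload_part_size_multiply_parts_count_threshold, assumed >= 1
{
    size_t size = min_part_size;
    for (size_t n = 1; n < part_number; ++n)
        if (n % threshold == 0)
            size = std::min(size * factor, max_part_size);
    return size;
}

// E.g. min=10, factor=2, threshold=1 gives parts of 10, 20, 40, 80, 160, ...;
// with max_upload_part_size=45 the same settings give 10, 20, 40, 45, 45, 45, ...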
- auto threshold = upload_settings.upload_part_size_multiply_parts_count_threshold; - if (!multipart_upload_id.empty() && (part_number % threshold == 0)) - { - auto max_upload_part_size = upload_settings.max_upload_part_size; - auto upload_part_size_multiply_factor = upload_settings.upload_part_size_multiply_factor; - current_upload_part_size *= upload_part_size_multiply_factor; - current_upload_part_size = std::min(current_upload_part_size, max_upload_part_size); - } - } + return req; } -void WriteBufferFromS3::processUploadRequest(UploadPartTask & task) +void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) { - ProfileEvents::increment(ProfileEvents::S3UploadPart); - if (write_settings.for_object_storage) - ProfileEvents::increment(ProfileEvents::DiskS3UploadPart); - - ResourceCost cost = task.req.GetContentLength(); - ResourceGuard rlock(write_settings.resource_link, cost); - Stopwatch watch; - auto outcome = client_ptr->UploadPart(task.req); - watch.stop(); - rlock.unlock(); // Avoid acquiring other locks under resource lock - - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - - if (outcome.IsSuccess()) + if (data.data_size == 0) { - task.tag = outcome.GetResult().GetETag(); - std::lock_guard lock(bg_tasks_mutex); /// Protect part_tags from race - LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size()); + LOG_TRACE(log, "Skipping writing part as empty."); + return; } - else + + multipart_tags.push_back({}); + size_t part_number = multipart_tags.size(); + LOG_TRACE(log, "WritePart. {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); + + if (multipart_upload_id.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unable to write a part without multipart_upload_id, details: WriteBufferFromS3 created for bucket {}, key {}", + bucket, key); + + if (part_number > upload_settings.max_part_number) { - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure - throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Part number exceeded {} while writing {} bytes to S3. Check min_upload_part_size = {}, max_upload_part_size = {}, " + "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, max_single_part_upload_size = {}", + upload_settings.max_part_number, count(), upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, + upload_settings.upload_part_size_multiply_factor, upload_settings.upload_part_size_multiply_parts_count_threshold, + upload_settings.max_single_part_upload_size); } + + if (data.data_size > upload_settings.max_upload_part_size) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Part size exceeded max_upload_part_size, part number: {}, part size {}, max_upload_part_size {}, {}", + part_number, + data.data_size, + upload_settings.max_upload_part_size, + getLogDetails()); + } + + auto req = getUploadRequest(part_number, data); + auto worker_data = std::make_shared>(std::move(req), std::move(data)); + + auto upload_worker = [&, worker_data, part_number] () + { + LOG_TEST(log, "Writing part started. 
bucket {}, key {}, part id {}", bucket, key, part_number); + + ProfileEvents::increment(ProfileEvents::S3UploadPart); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3UploadPart); + + auto & request = std::get<0>(*worker_data); + + ResourceCost cost = request.GetContentLength(); + ResourceGuard rlock(write_settings.resource_link, cost); + Stopwatch watch; + auto outcome = client_ptr->UploadPart(request); + watch.stop(); + rlock.unlock(); // Avoid acquiring other locks under resource lock + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (!outcome.IsSuccess()) + { + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); + } + + multipart_tags[part_number-1] = outcome.GetResult().GetETag(); + + LOG_TEST(log, "Writing part finished. bucket {}, key{}, part id {}, etag {}", bucket, key, part_number, multipart_tags[part_number-1]); + }; + + task_tracker->add(std::move(upload_worker)); } void WriteBufferFromS3::completeMultipartUpload() { - const auto & tags = TSA_SUPPRESS_WARNING_FOR_READ(part_tags); + LOG_TRACE(log, "Completing multipart upload. {}, Parts: {}", getLogDetails(), multipart_tags.size()); - LOG_TRACE(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); + if (multipart_tags.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Failed to complete multipart upload. No parts have uploaded"); - if (tags.empty()) - throw Exception(ErrorCodes::S3_ERROR, "Failed to complete multipart upload. No parts have uploaded"); + for (size_t i = 0; i < multipart_tags.size(); ++i) + { + const auto tag = multipart_tags.at(i); + if (tag.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Failed to complete multipart upload. Part {} haven't been uploaded.", i); + } S3::CompleteMultipartUploadRequest req; req.SetBucket(bucket); @@ -452,10 +606,10 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetUploadId(multipart_upload_id); Aws::S3::Model::CompletedMultipartUpload multipart_upload; - for (size_t i = 0; i < tags.size(); ++i) + for (size_t i = 0; i < multipart_tags.size(); ++i) { Aws::S3::Model::CompletedPart part; - multipart_upload.AddParts(part.WithETag(tags[i]).WithPartNumber(static_cast(i + 1))); + multipart_upload.AddParts(part.WithETag(multipart_tags[i]).WithPartNumber(static_cast(i + 1))); } req.SetMultipartUpload(multipart_upload); @@ -475,26 +629,24 @@ void WriteBufferFromS3::completeMultipartUpload() if (outcome.IsSuccess()) { - LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); + LOG_TRACE(log, "Multipart upload has completed. {}, Parts: {}", getLogDetails(), multipart_tags.size()); return; } + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + + if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) + { + /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests + /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it + LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error, will retry. 
{}, Parts: {}", getLogDetails(), multipart_tags.size()); + } else { - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); - - if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) - { - /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests - /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it - LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Upload_id: {}, Parts: {}, will retry", bucket, key, multipart_upload_id, tags.size()); - } - else - { - throw S3Exception( - outcome.GetError().GetErrorType(), - "Message: {}, Key: {}, Bucket: {}, Tags: {}", - outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " ")); - } + throw S3Exception( + outcome.GetError().GetErrorType(), + "Message: {}, Key: {}, Bucket: {}, Tags: {}", + outcome.GetError().GetMessage(), key, bucket, fmt::join(multipart_tags.begin(), multipart_tags.end(), " ")); } } @@ -504,73 +656,17 @@ void WriteBufferFromS3::completeMultipartUpload() max_retry, key, bucket); } -void WriteBufferFromS3::makeSinglepartUpload() +S3::PutObjectRequest WriteBufferFromS3::getPutRequest(PartData & data) { - auto size = temporary_buffer->tellp(); - bool with_pool = static_cast(schedule); + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); + LOG_TRACE(log, "getPutRequest, size {}, key {}", data.data_size, key); - LOG_TRACE(log, "Making single part upload. Bucket: {}, Key: {}, Size: {}, WithPool: {}", bucket, key, size, with_pool); + S3::PutObjectRequest req; - if (size < 0) - { - LOG_WARNING(log, "Skipping single part upload. Buffer is in bad state, it mean that we have tried to upload something, but got an exception."); - return; - } - - if (schedule) - { - put_object_task = std::make_unique(); - - /// Notify waiting thread when put object task finished - auto task_notify_finish = [&]() - { - std::lock_guard lock(bg_tasks_mutex); - put_object_task->is_finished = true; - - /// Notification under mutex is important here. - /// Othervies, WriteBuffer could be destroyed in between - /// Releasing lock and condvar notification. - bg_tasks_condvar.notify_one(); - }; - - try - { - fillPutRequest(put_object_task->req); - - schedule([this, task_notify_finish]() - { - try - { - processPutRequest(*put_object_task); - } - catch (...) - { - put_object_task->exception = std::current_exception(); - } - - task_notify_finish(); - }, 0); - } - catch (...) 
- { - task_notify_finish(); - throw; - } - } - else - { - PutObjectTask task; - fillPutRequest(task.req); - processPutRequest(task); - } -} - -void WriteBufferFromS3::fillPutRequest(S3::PutObjectRequest & req) -{ req.SetBucket(bucket); req.SetKey(key); - req.SetContentLength(temporary_buffer->tellp()); - req.SetBody(temporary_buffer); + req.SetContentLength(data.data_size); + req.SetBody(data.createAwsBuffer()); if (object_metadata.has_value()) req.SetMetadata(object_metadata.value()); if (!upload_settings.storage_class_name.empty()) @@ -580,121 +676,73 @@ void WriteBufferFromS3::fillPutRequest(S3::PutObjectRequest & req) req.SetContentType("binary/octet-stream"); client_ptr->setKMSHeaders(req); + + return req; } -void WriteBufferFromS3::processPutRequest(const PutObjectTask & task) +void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data) { - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); - for (size_t i = 0; i < max_retry; ++i) + LOG_TRACE(log, "Making single part upload. {}.", getLogDetails()); + + auto req = getPutRequest(data); + auto worker_data = std::make_shared>(std::move(req), std::move(data)); + + auto upload_worker = [&, worker_data] () { - ProfileEvents::increment(ProfileEvents::S3PutObject); - if (write_settings.for_object_storage) - ProfileEvents::increment(ProfileEvents::DiskS3PutObject); + LOG_TEST(log, "writing single part upload started. bucket {}, key {}", bucket, key); - ResourceCost cost = task.req.GetContentLength(); - ResourceGuard rlock(write_settings.resource_link, cost); - Stopwatch watch; - auto outcome = client_ptr->PutObject(task.req); - watch.stop(); - rlock.unlock(); + auto & request = std::get<0>(*worker_data); + size_t content_length = request.GetContentLength(); - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - - bool with_pool = static_cast(schedule); - if (outcome.IsSuccess()) - { - LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool); - return; - } - else + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + for (size_t i = 0; i < max_retry; ++i) { + ProfileEvents::increment(ProfileEvents::S3PutObject); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3PutObject); + + ResourceCost cost = request.GetContentLength(); + ResourceGuard rlock(write_settings.resource_link, cost); + Stopwatch watch; + auto outcome = client_ptr->PutObject(request); + watch.stop(); + rlock.unlock(); + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (outcome.IsSuccess()) + { + LOG_TRACE(log, "Single part upload has completed. 
bucket {}, key {}, object size {}", bucket, key, content_length); + return; + } + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) { - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests - LOG_INFO(log, "Single part upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Object size: {}, WithPool: {}, will retry", bucket, key, task.req.GetContentLength(), with_pool); + LOG_INFO(log, "Single part upload failed with NO_SUCH_KEY error for bucket {}, key {}, object size {}, will retry", bucket, key, content_length); } else { - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + LOG_ERROR(log, "S3Exception name {}, Message: {}, bucket {}, key {}, object size {}", + outcome.GetError().GetExceptionName(), outcome.GetError().GetMessage(), bucket, key, content_length); throw S3Exception( outcome.GetError().GetErrorType(), - "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}", - outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool); + "Message: {}, bucket {}, key {}, object size {}", + outcome.GetError().GetMessage(), bucket, key, content_length); } } - } - throw S3Exception( - Aws::S3::S3Errors::NO_SUCH_KEY, - "Message: Single part upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}", - max_retry, key, bucket); -} + throw S3Exception( + Aws::S3::S3Errors::NO_SUCH_KEY, + "Message: Single part upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}", + max_retry, key, bucket); + }; -void WriteBufferFromS3::waitForReadyBackgroundTasks() -{ - if (schedule) - { - std::unique_lock lock(bg_tasks_mutex); - - /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock - auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); - - while (!tasks.empty() && tasks.front().is_finished) - { - auto & task = tasks.front(); - auto exception = task.exception; - auto tag = std::move(task.tag); - tasks.pop_front(); - - if (exception) - { - waitForAllBackgroundTasksUnlocked(lock); - std::rethrow_exception(exception); - } - - TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(tag); - } - } -} - -void WriteBufferFromS3::waitForAllBackgroundTasks() -{ - if (schedule) - { - std::unique_lock lock(bg_tasks_mutex); - waitForAllBackgroundTasksUnlocked(lock); - } -} - -void WriteBufferFromS3::waitForAllBackgroundTasksUnlocked(std::unique_lock & bg_tasks_lock) -{ - if (schedule) - { - bg_tasks_condvar.wait(bg_tasks_lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); - - /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock - auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); - while (!tasks.empty()) - { - auto & task = tasks.front(); - - if (task.exception) - std::rethrow_exception(task.exception); - - TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(task.tag); - - tasks.pop_front(); - } - - if (put_object_task) - { - bg_tasks_condvar.wait(bg_tasks_lock, [this]() { return put_object_task->is_finished; }); - if 
(put_object_task->exception) - std::rethrow_exception(put_object_task->exception); - } - } + task_tracker->add(std::move(upload_worker)); } } diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 2374f1502f5..13ed151ad57 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -4,20 +4,16 @@ #if USE_AWS_S3 -#include -#include -#include - #include #include #include #include -#include #include #include -#include - +#include +#include +#include namespace Aws::S3 { @@ -27,8 +23,6 @@ class Client; namespace DB { -class WriteBufferFromFile; - /** * Buffer to write a data to a S3 object with specified bucket and key. * If data size written to the buffer is less than 'max_single_part_upload_size' write is performed using singlepart upload. @@ -45,81 +39,74 @@ public: const String & key_, const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_ = std::nullopt, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, ThreadPoolCallbackRunner schedule_ = {}, const WriteSettings & write_settings_ = {}); ~WriteBufferFromS3() override; - void nextImpl() override; - void preFinalize() override; private: - void allocateBuffer(); - - void processWithStrictParts(); - void processWithDynamicParts(); - - void fillCreateMultipartRequest(S3::CreateMultipartUploadRequest & req); - void createMultipartUpload(); - void writePart(); - void completeMultipartUpload(); - - void makeSinglepartUpload(); - /// Receives response from the server after sending all data. void finalizeImpl() override; - struct UploadPartTask; - void fillUploadRequest(S3::UploadPartRequest & req); - void processUploadRequest(UploadPartTask & task); + String getLogDetails() const; - struct PutObjectTask; - void fillPutRequest(S3::PutObjectRequest & req); - void processPutRequest(const PutObjectTask & task); + struct PartData; + void hidePartialData(); + void allocateFirstBuffer(); + void reallocateFirstBuffer(); + void detachBuffer(); + void allocateBuffer(); + void setFakeBufferWhenPreFinalized(); - void waitForReadyBackgroundTasks(); - void waitForAllBackgroundTasks(); - void waitForAllBackgroundTasksUnlocked(std::unique_lock & bg_tasks_lock); + S3::UploadPartRequest getUploadRequest(size_t part_number, PartData & data); + void writePart(PartData && data); + void writeMultipartUpload(); + void createMultipartUpload(); + void completeMultipartUpload(); + void abortMultipartUpload(); + void tryToAbortMultipartUpload(); + + S3::PutObjectRequest getPutRequest(PartData & data); + void makeSinglepartUpload(PartData && data); const String bucket; const String key; const S3Settings::RequestSettings request_settings; const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const WriteSettings write_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; + Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - /// Strict/static Part size, no adjustments will be done on fly. - size_t strict_upload_part_size = 0; - /// Part size will be adjusted on fly (for bigger uploads) - size_t current_upload_part_size = 0; - std::shared_ptr temporary_buffer; /// Buffer to accumulate data. - size_t last_part_size = 0; - size_t part_number = 0; + struct BufferAllocationPolicy; + std::unique_ptr buffer_allocation_policy; /// Upload in S3 is made in parts. /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. 
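    /// A condensed sketch of that sequence using the same request wrappers as the
    /// methods declared above (illustrative only: retries, profiling counters,
    /// content-length headers and error handling are omitted; `client`, `bucket`,
    /// `key` and `part_bodies` stand in for the members and detached buffers of the
    /// real implementation, and the sketch relies on the includes of this file).
    inline void uploadInPartsSketch(
        const std::shared_ptr<const S3::Client> & client,
        const String & bucket,
        const String & key,
        const std::vector<std::shared_ptr<Aws::IOStream>> & part_bodies)
    {
        /// 1. Initiate the upload and remember its id.
        S3::CreateMultipartUploadRequest create;
        create.SetBucket(bucket);
        create.SetKey(key);
        String upload_id = client->CreateMultipartUpload(create).GetResult().GetUploadId();

        /// 2. Upload each part under that id and collect the returned ETags.
        std::vector<String> etags;
        for (size_t i = 0; i < part_bodies.size(); ++i)
        {
            S3::UploadPartRequest upload;
            upload.SetBucket(bucket);
            upload.SetKey(key);
            upload.SetUploadId(upload_id);
            upload.SetPartNumber(static_cast<int>(i + 1));
            upload.SetBody(part_bodies[i]);
            etags.push_back(client->UploadPart(upload).GetResult().GetETag());
        }

        /// 3. Finish the upload by listing every part with its ETag.
        Aws::S3::Model::CompletedMultipartUpload all_parts;
        for (size_t i = 0; i < etags.size(); ++i)
        {
            Aws::S3::Model::CompletedPart part;
            all_parts.AddParts(part.WithETag(etags[i]).WithPartNumber(static_cast<int>(i + 1)));
        }

        S3::CompleteMultipartUploadRequest complete;
        complete.SetBucket(bucket);
        complete.SetKey(key);
        complete.SetUploadId(upload_id);
        complete.SetMultipartUpload(all_parts);
        client->CompleteMultipartUpload(complete);
    }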
String multipart_upload_id; - std::vector TSA_GUARDED_BY(bg_tasks_mutex) part_tags; + std::deque multipart_tags; + bool multipart_upload_finished = false; + /// Track that prefinalize() is called only once bool is_prefinalized = false; - /// Following fields are for background uploads in thread pool (if specified). - /// We use std::function to avoid dependency of Interpreters - const ThreadPoolCallbackRunner schedule; + /// First fully filled buffer has to be delayed + /// There are two ways after: + /// First is to call prefinalize/finalize, which leads to single part upload + /// Second is to write more data, which leads to multi part upload + std::deque detached_part_data; + char fake_buffer_when_prefinalized[1] = {}; - std::unique_ptr put_object_task; /// Does not need protection by mutex because of the logic around is_finished field. - std::list TSA_GUARDED_BY(bg_tasks_mutex) upload_object_tasks; - int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; - int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + /// offset() and count() are unstable inside nextImpl + /// For example nextImpl changes position hence offset() and count() is changed + /// This vars are dedicated to store information about sizes when offset() and count() are unstable + size_t total_size = 0; + size_t hidden_size = 0; - std::mutex bg_tasks_mutex; - std::condition_variable bg_tasks_condvar; - - Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - - WriteSettings write_settings; + class TaskTracker; + std::unique_ptr task_tracker; }; } diff --git a/src/IO/WriteBufferFromS3MemoryStream.cpp b/src/IO/WriteBufferFromS3MemoryStream.cpp new file mode 100644 index 00000000000..6271f15f055 --- /dev/null +++ b/src/IO/WriteBufferFromS3MemoryStream.cpp @@ -0,0 +1,68 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +MemoryStream::MemoryBuf::MemoryBuf(char * begin_, size_t size_) + : begin(begin_) + , size(size_) +{ + this->setg(begin, begin, begin + size); +} + +MemoryStream::MemoryBuf::int_type MemoryStream::MemoryBuf::underflow() +{ + if (gptr() < egptr()) + return traits_type::to_int_type(*gptr()); + return traits_type::eof(); +} + +MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode mode) +{ + bool out_mode = (std::ios_base::out & mode) != 0; + if (out_mode) + return off_type(-1); + + off_type ret(-1); + + if (way == std::ios_base::beg) + ret = 0; + else if (way == std::ios_base::cur) + ret = gptr() - begin; + else if (way == std::ios_base::end) + ret = size; + + if (ret == off_type(-1)) + return ret; + + ret += off; + if (!(ret >= 0 && size_t(ret) <= size)) + return off_type(-1); + + this->setg(begin, begin + ret, begin + size); + + return pos_type(ret); +} + +MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekpos(pos_type sp, + std::ios_base::openmode mode) +{ + return seekoff(off_type(sp), std::ios_base::beg, mode); +} + +MemoryStream::MemoryStream(char * begin_, size_t size_) + : std::iostream(nullptr) + , mem_buf(begin_, size_) +{ + init(&mem_buf); +} + +} + +#endif + diff --git a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/WriteBufferFromS3MemoryStream.h new file mode 100644 index 00000000000..5a7cc17705d --- /dev/null +++ b/src/IO/WriteBufferFromS3MemoryStream.h @@ -0,0 +1,39 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include "WriteBufferFromS3.h" + +#include + +namespace DB +{ + +struct MemoryStream: std::iostream +{ + struct MemoryBuf: 
std::streambuf + { + MemoryBuf(char * begin_, size_t size_); + + int_type underflow() override; + + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode mode) override; + + pos_type seekpos(pos_type sp, + std::ios_base::openmode mode) override; + + char * begin = nullptr; + size_t size = 0; + }; + + MemoryStream(char * begin_, size_t size_); + + MemoryBuf mem_buf; +}; + +} + +#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp new file mode 100644 index 00000000000..0769f7731c2 --- /dev/null +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -0,0 +1,137 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner scheduler_) + : is_async(bool(scheduler_)) + , scheduler(scheduler_ ? std::move(scheduler_) : syncRunner()) +{} + +WriteBufferFromS3::TaskTracker::~TaskTracker() +{ + safeWaitAll(); +} + +ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() +{ + return [](Callback && callback, int64_t) mutable -> std::future + { + auto package = std::packaged_task(std::move(callback)); + /// No exceptions are propagated, exceptions are packed to future + package(); + return package.get_future(); + }; +} + +void WriteBufferFromS3::TaskTracker::getReady() +{ + LOG_TEST(log, "getReady, in queue {}", futures.size()); + + /// Exceptions are propagated + auto it = futures.begin(); + while (it != futures.end()) + { + chassert(it->valid()); + if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready) + { + ++it; + continue; + } + + try + { + it->get(); + } catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + + it = futures.erase(it); + } + + LOG_TEST(log, "getReady ended, in queue {}", futures.size()); +} + +void WriteBufferFromS3::TaskTracker::getAll() +{ + LOG_TEST(log, "getAll, in queue {}", futures.size()); + + /// Exceptions are propagated + for (auto & future : futures) + { + try + { + future.get(); + } catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + } + futures.clear(); +} + +void WriteBufferFromS3::TaskTracker::safeWaitAll() +{ + LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size()); + + /// Exceptions are not propagated + for (auto & future : futures) + { + LOG_TEST(log, "safeWaitAll, wait future"); + + if (future.valid()) + future.wait(); + } + + LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size()); + + for (auto & future : futures) + { + if (future.valid()) + { + try + { + future.get(); + } catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + } + futures.clear(); + LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size()); +} + +void WriteBufferFromS3::TaskTracker::add(Callback && func) +{ + LOG_TEST(log, "add, in queue {}", futures.size()); + + auto future = scheduler(std::move(func), 0); + auto exit_scope = scope_guard( + [&future]() + { + future.wait(); + } + ); + + futures.push_back(std::move(future)); + + exit_scope.release(); + LOG_TEST(log, "add ended, in queue {}", futures.size()); +} + +bool WriteBufferFromS3::TaskTracker::isAsync() const +{ + return is_async; +} + +} + +#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h new file mode 100644 index 00000000000..fa214a4f8c5 --- /dev/null +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -0,0 +1,37 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include "WriteBufferFromS3.h" + +namespace DB +{ + +class WriteBufferFromS3::TaskTracker +{ +public: + using Callback = std::function; + + explicit TaskTracker(ThreadPoolCallbackRunner scheduler_); + ~TaskTracker(); + + static ThreadPoolCallbackRunner syncRunner(); + + bool isAsync() const; + void getReady(); + void getAll(); + void safeWaitAll(); + void add(Callback && func); + +private: + bool is_async; + ThreadPoolCallbackRunner scheduler; + std::list> futures; + Poco::Logger * log = &Poco::Logger::get("TaskTracker"); +}; + +} + +#endif diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp new file mode 100644 index 00000000000..d7661d3e3d0 --- /dev/null +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -0,0 +1,1114 @@ +#include + +#include "config.h" + +#if USE_AWS_S3 + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int S3_ERROR; +} + +} + +namespace MockS3 +{ + +class Sequencer +{ +public: + size_t next() { return counter++; } + std::string next_id() + { + std::stringstream ss; + ss << "id-" << next(); + return ss.str(); + } + +private: + size_t counter = 0; +}; + +class BucketMemStore +{ +public: + typedef std::string Key; + typedef std::string Data; + typedef std::string ETag; + typedef std::string MPU_ID; + typedef std::map MPUPartsInProgress; + typedef std::vector MPUParts; + + + std::map objects; + std::map multiPartUploads; + std::vector> CompletedPartUploads; + + Sequencer sequencer; + + std::string CreateMPU() + { + auto id = sequencer.next_id(); + multiPartUploads.emplace(id, MPUPartsInProgress{}); + return id; + } + + std::string UploadPart(const std::string & upload_id, const std::string & part) + { + auto etag = sequencer.next_id(); + auto & parts = multiPartUploads.at(upload_id); + parts.emplace(etag, part); + return etag; + } + + void PutObject(const std::string & key, const std::string & data) + { + objects[key] = data; + } + + void CompleteMPU(const std::string & key, const std::string & upload_id, const std::vector & etags) + { + MPUParts completedParts; + completedParts.reserve(etags.size()); + + auto & parts = multiPartUploads.at(upload_id); + for (const auto & tag: etags) { + completedParts.push_back(parts.at(tag)); + } + + std::stringstream file_data; + for (const auto & part_data: completedParts) { + file_data << part_data; + } + + CompletedPartUploads.emplace_back(upload_id, std::move(completedParts)); 
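+        // Materialize the finished object by concatenating the chosen parts in the given ETag order, then drop the in-progress upload.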
+ objects[key] = file_data.str(); + multiPartUploads.erase(upload_id); + } + + void AbortMPU(const std::string & upload_id) + { + multiPartUploads.erase(upload_id); + } + + + const std::vector> & GetCompletedPartUploads() const + { + return CompletedPartUploads; + } + + static std::vector GetPartSizes(const MPUParts & parts) + { + std::vector result; + result.reserve(parts.size()); + for (auto & part_data : parts) + result.push_back(part_data.size()); + + return result; + } + +}; + +class S3MemStrore +{ +public: + void CreateBucket(const std::string & bucket) + { + assert(buckets.count(bucket) == 0); + buckets.emplace(bucket, BucketMemStore{}); + } + + BucketMemStore& GetBucketStore(const std::string & bucket) { + return buckets.at(bucket); + } + +private: + std::map buckets; +}; + +struct EventCounts +{ + size_t headObject = 0; + size_t getObject = 0; + size_t putObject = 0; + size_t multiUploadCreate = 0; + size_t multiUploadComplete = 0; + size_t multiUploadAbort = 0; + size_t uploadParts = 0; + size_t writtenSize = 0; + + size_t totalRequestsCount() const + { + return headObject + getObject + putObject + multiUploadCreate + multiUploadComplete + uploadParts; + } +}; + +struct Client; + +struct InjectionModel +{ + virtual ~InjectionModel() = default; + +#define DeclareInjectCall(ObjectTypePart) \ + virtual std::optional call(const Aws::S3::Model::ObjectTypePart##Request & /*request*/) \ + { \ + return std::nullopt; \ + } + DeclareInjectCall(PutObject) + DeclareInjectCall(HeadObject) + DeclareInjectCall(CreateMultipartUpload) + DeclareInjectCall(CompleteMultipartUpload) + DeclareInjectCall(AbortMultipartUpload) + DeclareInjectCall(UploadPart) +#undef DeclareInjectCall +}; + +struct Client : DB::S3::Client +{ + Client(std::shared_ptr mock_s3_store) + : DB::S3::Client( + 100, + DB::S3::ServerSideEncryptionKMSConfig(), + std::make_shared("", ""), + GetClientConfiguration(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + /* useVirtualAddressing = */ true) + , store(mock_s3_store) + { } + + static std::shared_ptr CreateClient(String bucket = "mock-s3-bucket") + { + auto s3store = std::make_shared(); + s3store->CreateBucket(bucket); + return std::make_shared(s3store); + } + + static DB::S3::PocoHTTPClientConfiguration GetClientConfiguration() + { + DB::RemoteHostFilter remote_host_filter; + return DB::S3::ClientFactory::instance().createClientConfiguration( + "some-region", + remote_host_filter, + /* s3_max_redirects = */ 100, + /* enable_s3_requests_logging = */ true, + /* for_disk_s3 = */ false, + /* get_request_throttler = */ {}, + /* put_request_throttler = */ {} + ); + } + + void setInjectionModel(std::shared_ptr injections_) + { + injections = injections_; + } + + Aws::S3::Model::PutObjectOutcome PutObject(const Aws::S3::Model::PutObjectRequest & request) const override + { + ++counters.putObject; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return *opt_val; + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + std::stringstream data; + data << request.GetBody()->rdbuf(); + bStore.PutObject(request.GetKey(), data.str()); + counters.writtenSize += data.str().length(); + + Aws::S3::Model::PutObjectOutcome outcome; + Aws::S3::Model::PutObjectResult result(outcome.GetResultWithOwnership()); + return result; + } + + Aws::S3::Model::GetObjectOutcome GetObject(const Aws::S3::Model::GetObjectRequest & request) const override + { + ++counters.getObject; + + auto & bStore = store->GetBucketStore(request.GetBucket()); + + auto 
factory = request.GetResponseStreamFactory(); + Aws::Utils::Stream::ResponseStream responseStream(factory); + responseStream.GetUnderlyingStream() << std::stringstream(bStore.objects[request.GetKey()]).rdbuf(); + + Aws::AmazonWebServiceResult awsStream(std::move(responseStream), Aws::Http::HeaderValueCollection()); + Aws::S3::Model::GetObjectResult getObjectResult(std::move(awsStream)); + return Aws::S3::Model::GetObjectOutcome(std::move(getObjectResult)); + } + + Aws::S3::Model::HeadObjectOutcome HeadObject(const Aws::S3::Model::HeadObjectRequest & request) const override + { + ++counters.headObject; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto obj = bStore.objects[request.GetKey()]; + Aws::S3::Model::HeadObjectOutcome outcome; + Aws::S3::Model::HeadObjectResult result(outcome.GetResultWithOwnership()); + result.SetContentLength(obj.length()); + return result; + } + + Aws::S3::Model::CreateMultipartUploadOutcome CreateMultipartUpload(const Aws::S3::Model::CreateMultipartUploadRequest & request) const override + { + ++counters.multiUploadCreate; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto mpu_id = bStore.CreateMPU(); + + Aws::S3::Model::CreateMultipartUploadResult result; + result.SetUploadId(mpu_id.c_str()); + return Aws::S3::Model::CreateMultipartUploadOutcome(result); + } + + Aws::S3::Model::UploadPartOutcome UploadPart(const Aws::S3::Model::UploadPartRequest & request) const override + { + ++counters.uploadParts; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + std::stringstream data; + data << request.GetBody()->rdbuf(); + counters.writtenSize += data.str().length(); + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto etag = bStore.UploadPart(request.GetUploadId(), data.str()); + + Aws::S3::Model::UploadPartResult result; + result.SetETag(etag); + return Aws::S3::Model::UploadPartOutcome(result); + } + + Aws::S3::Model::CompleteMultipartUploadOutcome CompleteMultipartUpload(const Aws::S3::Model::CompleteMultipartUploadRequest & request) const override + { + ++counters.multiUploadComplete; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + + std::vector etags; + for (const auto & x: request.GetMultipartUpload().GetParts()) { + etags.push_back(x.GetETag()); + } + bStore.CompleteMPU(request.GetKey(), request.GetUploadId(), etags); + + Aws::S3::Model::CompleteMultipartUploadResult result; + return Aws::S3::Model::CompleteMultipartUploadOutcome(result); + } + + Aws::S3::Model::AbortMultipartUploadOutcome AbortMultipartUpload(const Aws::S3::Model::AbortMultipartUploadRequest & request) const override + { + ++counters.multiUploadAbort; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + bStore.AbortMPU(request.GetUploadId()); + + Aws::S3::Model::AbortMultipartUploadResult result; + return Aws::S3::Model::AbortMultipartUploadOutcome(result); + } + + std::shared_ptr store; + mutable EventCounts counters; + mutable std::shared_ptr injections; + void resetCounters() const { 
counters = {}; } +}; + +struct PutObjectFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::PutObjectRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "PutObjectFailIngection", false); + } +}; + +struct HeadObjectFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::HeadObjectRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "HeadObjectFailIngection", false); + } +}; + +struct CreateMPUFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::CreateMultipartUploadRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "CreateMPUFailIngection", false); + } +}; + +struct CompleteMPUFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::CompleteMultipartUploadRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "CompleteMPUFailIngection", false); + } +}; + +struct UploadPartFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::UploadPartRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "UploadPartFailIngection", false); + } +}; + +struct BaseSyncPolicy +{ + virtual ~BaseSyncPolicy() = default; + virtual DB::ThreadPoolCallbackRunner getScheduler() { return {}; } + virtual void execute(size_t = 0) {} + virtual void setAutoExecute(bool = true) {} + + virtual size_t size() const { return 0; } + virtual bool empty() const { return size() == 0; } +}; + +struct SimpleAsyncTasks : BaseSyncPolicy +{ + bool auto_execute = false; + std::deque> queue; + + virtual DB::ThreadPoolCallbackRunner getScheduler() override + { + return [this] (std::function && operation, size_t /*priority*/) + { + if (auto_execute) + { + auto task = std::packaged_task(std::move(operation)); + task(); + return task.get_future(); + } + + queue.emplace_back(std::move(operation)); + return queue.back().get_future(); + }; + } + + virtual void execute(size_t limit = 0) override + { + if (limit == 0) + limit = queue.size(); + + while (!queue.empty() && limit) + { + auto & request = queue.front(); + request(); + + queue.pop_front(); + --limit; + } + } + + virtual void setAutoExecute(bool value = true) override + { + auto_execute = value; + if (auto_execute) + execute(); + } + + virtual size_t size() const override { return queue.size(); } +}; + +} + +using namespace DB; + +void writeAsOneBlock(WriteBuffer& buf, size_t size) +{ + std::vector data(size, 'a'); + buf.write(data.data(), data.size()); +} + +void writeAsPieces(WriteBuffer& buf, size_t size) +{ + size_t ceil = 15ull*1024*1024*1024; + size_t piece = 1; + size_t written = 0; + while (written < size) { + size_t len = std::min({piece, size-written, ceil}); + writeAsOneBlock(buf, len); + written += len; + piece *= 2; + } +} + +class WBS3Test : public ::testing::Test +{ +public: + const String bucket = "WBS3Test-bucket"; + + Settings & getSettings() + { + return settings; + } + + MockS3::BaseSyncPolicy & getAsyncPolicy() + { + return *async_policy; + } + + std::unique_ptr getWriteBuffer(String file_name = "file") + { + S3Settings::RequestSettings request_settings; + request_settings.updateFromSettings(settings); + + client->resetCounters(); + + getAsyncPolicy().setAutoExecute(false); + + return std::make_unique( + client, + bucket, + 
file_name, + request_settings, + std::nullopt, + getAsyncPolicy().getScheduler()); + } + + void setInjectionModel(std::shared_ptr injections_) + { + client->setInjectionModel(injections_); + } + + void runSimpleScenario(MockS3::EventCounts expected_counters, size_t size) + { + auto scenario = [&] (std::function writeMethod) { + auto buffer = getWriteBuffer("file"); + writeMethod(*buffer, size); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + expected_counters.writtenSize = size; + assertCountersEQ(expected_counters); + + auto & bStore = client->store->GetBucketStore(bucket); + auto & data = bStore.objects["file"]; + ASSERT_EQ(size, data.size()); + for (char c : data) + ASSERT_EQ('a', c); + }; + + scenario(writeAsOneBlock); + scenario(writeAsPieces); + } + + void assertCountersEQ(const MockS3::EventCounts & canonical) { + const auto & actual = client->counters; + ASSERT_EQ(canonical.headObject, actual.headObject); + ASSERT_EQ(canonical.getObject, actual.getObject); + ASSERT_EQ(canonical.putObject, actual.putObject); + ASSERT_EQ(canonical.multiUploadCreate, actual.multiUploadCreate); + ASSERT_EQ(canonical.multiUploadComplete, actual.multiUploadComplete); + ASSERT_EQ(canonical.multiUploadAbort, actual.multiUploadAbort); + ASSERT_EQ(canonical.uploadParts, actual.uploadParts); + ASSERT_EQ(canonical.writtenSize, actual.writtenSize); + } + + auto getCompletedPartUploads () + { + return client->store->GetBucketStore(bucket).GetCompletedPartUploads(); + } + +protected: + Settings settings; + + std::shared_ptr client; + std::unique_ptr async_policy; + + virtual void SetUp() override + { + client = MockS3::Client::CreateClient(bucket); + async_policy = std::make_unique(); + } + + virtual void TearDown() override + { + client.reset(); + async_policy.reset(); + } +}; + +class SyncAsync : public WBS3Test, public ::testing::WithParamInterface +{ +protected: + bool test_with_pool = false; + + virtual void SetUp() override + { + test_with_pool = GetParam(); + client = MockS3::Client::CreateClient(bucket); + if (test_with_pool) + async_policy = std::make_unique(); + else + async_policy = std::make_unique(); + } +}; + +INSTANTIATE_TEST_SUITE_P(WBS3 + , SyncAsync + , ::testing::Values(true, false) + , [] (const ::testing::TestParamInfo& info_param) { + std::string name = info_param.param ? 
"async" : "sync"; + return name; + }); + +TEST_P(SyncAsync, exception_on_head) { + setInjectionModel(std::make_shared()); + + getSettings().s3_check_objects_after_upload = true; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_head_1"); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("Immediately after upload:")); + throw; + } + }, DB::S3Exception); +} + +TEST_P(SyncAsync, exception_on_put) { + setInjectionModel(std::make_shared()); + + EXPECT_THROW({ + try + { + auto buffer = getWriteBuffer("exception_on_put_1"); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_put_2"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_put_3"); + buffer->write('A'); + getAsyncPolicy().setAutoExecute(); + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + +} + +TEST_P(SyncAsync, exception_on_create_mpu) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_1"); + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_2"); + buffer->write('A'); + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_2"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); +} + + +TEST_P(SyncAsync, exception_on_complete_mpu) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_complete_mpu_1"); + 
buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CompleteMPUFailIngection")); + throw; + } + }, DB::S3Exception); +} + +TEST_P(SyncAsync, exception_on_upload_part) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + MockS3::EventCounts counters = {.multiUploadCreate = 1, .multiUploadAbort = 1}; + + counters.uploadParts = 2; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_1"); + + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_2"); + getAsyncPolicy().setAutoExecute(); + + buffer->write('A'); + buffer->next(); + + buffer->write('A'); + buffer->next(); + + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + counters.uploadParts = 1; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_3"); + buffer->write('A'); + + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_4"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); +} + + +TEST_F(WBS3Test, prefinalize_called_multiple_times) { +#ifdef ABORT_ON_LOGICAL_ERROR + GTEST_SKIP() << "this test trigger LOGICAL_ERROR, runs only if ABORT_ON_LOGICAL_ERROR is not defined"; +#else + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("prefinalize_called_multiple_times"); + buffer->write('A'); + buffer->next(); + buffer->preFinalize(); + buffer->write('A'); + buffer->next(); + buffer->preFinalize(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + ASSERT_EQ(ErrorCodes::LOGICAL_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("write to prefinalized buffer for S3")); + throw; + } + }, DB::Exception); +#endif +} + +TEST_P(SyncAsync, empty_file) { + getSettings().s3_check_objects_after_upload = true; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + runSimpleScenario(counters, 0); +} + +TEST_P(SyncAsync, manual_next_calls) { + getSettings().s3_check_objects_after_upload = true; + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_1"); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + 
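+        // finalize() performs the single (empty) PutObject and, because s3_check_objects_after_upload is set, the two HeadObject requests counted above.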
buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_2"); + buffer->next(); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1, .writtenSize = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_3"); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1, .writtenSize = 2}; + + auto buffer = getWriteBuffer("manual_next_calls_4"); + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } +} + +TEST_P(SyncAsync, small_file_is_one_put_request) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + runSimpleScenario(counters, 1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size-1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size/2); + } + + { + + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + runSimpleScenario(counters, 1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size-1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size/2); + } +} + +TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + MockS3::EventCounts counters = {.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 1); + + counters.uploadParts = 101; + runSimpleScenario(counters, 2*settings.s3_max_single_part_upload_size); + } + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + MockS3::EventCounts counters = {.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 1}; + + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 1); + runSimpleScenario(counters, 2*settings.s3_max_single_part_upload_size); + runSimpleScenario(counters, settings.s3_min_upload_part_size-1); + runSimpleScenario(counters, settings.s3_min_upload_part_size); + } +} + +TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size); + + counters.uploadParts = 3; + runSimpleScenario(counters, 
settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size + 1); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 2*settings.s3_min_upload_part_size - 1); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 2*settings.s3_min_upload_part_size); + } + + + { + // but not in that case, when s3_min_upload_part_size > s3_max_single_part_upload_size + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size + 1); + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size-1); + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size); + + counters.uploadParts = 3; + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size+1); + } +} + +TEST_P(SyncAsync, increase_upload_buffer) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 1; + // parts: 10 20 40 80 160 + // size: 10 30 70 150 310 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 6}; + runSimpleScenario(counters, 350); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 20, 40, 80, 160, 40)); + } + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 2; + getSettings().s3_upload_part_size_multiply_factor = 3; + // parts: 10 10 30 30 90 + // size: 10 20 50 80 170 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 6}; + runSimpleScenario(counters, 190); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 10, 30, 30, 90, 20)); + } +} + +TEST_P(SyncAsync, increase_limited) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 1; + getSettings().s3_max_upload_part_size = 45; + // parts: 10 20 40 45 45 45 + // size: 10 30 70 115 160 205 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 7}; + runSimpleScenario(counters, 220); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 20, 40, 45, 45, 45, 15)); + } +} + +#endif diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9adfcf7fef7..f61c1dad59f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -290,6 +290,7 @@ void MergeTreeData::initializeDirectoriesAndFormatVersion(const std::string & re { auto buf = 
disk->writeFile(format_version_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, getContext()->getWriteSettings()); writeIntText(format_version.toUnderType(), *buf); + buf->finalize(); if (getContext()->getSettingsRef().fsync_metadata) buf->sync(); } diff --git a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp index b843ce6a078..6c6a6ded5dd 100644 --- a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp +++ b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp @@ -160,7 +160,10 @@ void MergeTreeDeduplicationLog::rotate() existing_logs.emplace(current_log_number, log_description); if (current_writer) + { + current_writer->finalize(); current_writer->sync(); + } current_writer = disk->writeFile(log_description.path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append); } diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 2e30a3f3986..feffffb57ea 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -75,6 +75,7 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP TransactionID::write(tid, *out); *out << "\n"; } + out->finalize(); out->sync(); } catch (...) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 00e72482a17..a4d9dc9f2e3 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -777,7 +777,6 @@ public: key, configuration_.request_settings, std::nullopt, - DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "S3ParallelWrite"), context->getWriteSettings()), compression_method, diff --git a/tests/queries/0_stateless/02240_filesystem_query_cache.reference b/tests/queries/0_stateless/02240_filesystem_query_cache.reference index f4b9f7bb127..16c4cd1c049 100644 --- a/tests/queries/0_stateless/02240_filesystem_query_cache.reference +++ b/tests/queries/0_stateless/02240_filesystem_query_cache.reference @@ -6,6 +6,7 @@ SET skip_download_if_exceeds_query_cache=1; SET filesystem_cache_max_download_size=128; DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_4', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; +SYSTEM DROP FILESYSTEM CACHE; INSERT INTO test SELECT number, toString(number) FROM numbers(100); SELECT * FROM test FORMAT Null; SELECT file_segment_range_begin, file_segment_range_end, size FROM system.filesystem_cache ORDER BY file_segment_range_end, size; diff --git a/tests/queries/0_stateless/02240_filesystem_query_cache.sql b/tests/queries/0_stateless/02240_filesystem_query_cache.sql index 94eb4bc5ccd..44856a2188c 100644 --- a/tests/queries/0_stateless/02240_filesystem_query_cache.sql +++ b/tests/queries/0_stateless/02240_filesystem_query_cache.sql @@ -9,8 +9,8 @@ SET filesystem_cache_max_download_size=128; DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_4', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; +SYSTEM DROP FILESYSTEM CACHE; INSERT INTO test SELECT number, toString(number) FROM numbers(100); - SELECT * FROM test FORMAT Null; SELECT file_segment_range_begin, file_segment_range_end, size FROM system.filesystem_cache ORDER BY file_segment_range_end, size; SYSTEM DROP FILESYSTEM CACHE; diff --git 
a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference index f3fac9b32d3..b3b7d12d219 100644 --- a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference +++ b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference @@ -1,6 +1,6 @@ Using storage policy: s3_cache 0 -0 +0 0 Row 1: ────── file_segment_range_begin: 0 @@ -8,11 +8,11 @@ file_segment_range_end: 745 size: 746 state: DOWNLOADED 8 -8 +8 1100 0 2 2 -8 +8 1100 Row 1: ────── file_segment_range_begin: 0 @@ -20,17 +20,17 @@ file_segment_range_end: 1659 size: 1660 state: DOWNLOADED 8 -8 -8 -8 -24 -35 -43 +8 2014 +8 2014 +8 2014 +24 84045 +35 168815 +44 252113 5010500 18816 Using storage policy: local_cache 0 -0 +0 0 Row 1: ────── file_segment_range_begin: 0 @@ -38,11 +38,11 @@ file_segment_range_end: 745 size: 746 state: DOWNLOADED 8 -8 +8 1100 0 2 2 -8 +8 1100 Row 1: ────── file_segment_range_begin: 0 @@ -50,11 +50,11 @@ file_segment_range_end: 1659 size: 1660 state: DOWNLOADED 8 -8 -8 -8 -24 -35 -43 +8 2014 +8 2014 +8 2014 +24 84045 +35 168815 +44 252113 5010500 18816 diff --git a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh index 048fb792e6e..e65bf9cb35f 100755 --- a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh +++ b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh @@ -33,7 +33,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100)" @@ -54,7 +54,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache WHERE cache_hits > 0" @@ -64,7 +64,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do $CLICKHOUSE_CLIENT --query "SELECT * FROM test_02241 FORMAT Null" $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache WHERE cache_hits > 0" - $CLICKHOUSE_CLIENT --query "SELECT count() size FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) size FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SYSTEM DROP FILESYSTEM CACHE" @@ -87,24 +87,23 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical;" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON 
data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100) SETTINGS enable_filesystem_cache_on_write_operations=0" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100)" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(300, 10000)" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SYSTEM START MERGES test_02241" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "OPTIMIZE TABLE test_02241 FINAL" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --mutations_sync=2 --query "ALTER TABLE test_02241 UPDATE value = 'kek' WHERE key = 100" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" - + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(5000000)" $CLICKHOUSE_CLIENT --query "SYSTEM FLUSH LOGS" diff --git a/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference b/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference index 083f0f69dc8..e77afc98007 100644 --- a/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference +++ b/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference @@ -8,7 +8,7 @@ SYSTEM STOP MERGES nopers; INSERT INTO nopers SELECT number, toString(number) FROM numbers(10); SELECT * FROM nopers FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -194 +195 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -21,17 +21,18 @@ ON data_paths.cache_path = caches.cache_path ORDER BY file, cache, size; data.bin 0 114 data.mrk3 0 80 +format_version.txt 0 1 DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_small', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; SYSTEM STOP MERGES test; INSERT INTO test SELECT number, toString(number) FROM numbers(100); SELECT * FROM test FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -1020 +1021 SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path; -4 +5 SELECT count() FROM 
system.filesystem_cache; -4 +5 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -46,17 +47,18 @@ data.bin 0 114 data.bin 0 746 data.mrk3 0 80 data.mrk3 0_persistent 80 +format_version.txt 0 1 DROP TABLE IF EXISTS test2; CREATE TABLE test2 (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_small', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; SYSTEM STOP MERGES test2; INSERT INTO test2 SELECT number, toString(number) FROM numbers(100000); SELECT * FROM test2 FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -794 +795 SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path; -4 +5 SELECT count() FROM system.filesystem_cache; -4 +5 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -71,6 +73,7 @@ data.bin 0 114 data.mrk3 0 80 data.mrk3 0_persistent 80 data.mrk3 0_persistent 520 +format_version.txt 0 1 DROP TABLE test; DROP TABLE test2; DROP TABLE nopers; diff --git a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference index 00e93b1db3d..3f34d5e2c79 100644 --- a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference +++ b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference @@ -1,8 +1,8 @@ INSERT TO S3 [ 0 ] S3CompleteMultipartUpload: 1 [ 0 ] S3CreateMultipartUpload: 1 - [ 0 ] S3HeadObject: 1 - [ 0 ] S3ReadRequestsCount: 1 + [ 0 ] S3HeadObject: 2 + [ 0 ] S3ReadRequestsCount: 2 [ 0 ] S3UploadPart: 1 [ 0 ] S3WriteRequestsCount: 3 CHECK WITH query_log From cd449cca38ce8bb831a75822921dd9f35c649562 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 5 May 2023 13:18:42 +0200 Subject: [PATCH 10/45] WriteBufferFromS3BufferAllocationPolicy for FixedSize and Exp policy --- src/IO/WriteBufferFromS3.cpp | 80 +------------ src/IO/WriteBufferFromS3.h | 5 +- ...riteBufferFromS3BufferAllocationPolicy.cpp | 108 ++++++++++++++++++ .../WriteBufferFromS3BufferAllocationPolicy.h | 26 +++++ src/IO/WriteBufferFromS3MemoryStream.h | 2 - ...02720_s3_strict_upload_part_size.reference | 8 +- .../02720_s3_strict_upload_part_size.sh | 2 +- 7 files changed, 144 insertions(+), 87 deletions(-) create mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp create mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.h diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5630ed2cb68..73d78cb13be 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -74,80 +74,6 @@ struct WriteBufferFromS3::PartData } }; -struct WriteBufferFromS3::BufferAllocationPolicy -{ - size_t first_size = 0; - size_t second_size = 0; - - size_t multiply_factor = 0; - size_t multiply_threshold = 0; - size_t max_size = 0; - - size_t current_size = 0; - size_t buffer_number = 0; - - explicit BufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) - : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) - , second_size(settings_.min_upload_part_size) - , multiply_factor(settings_.upload_part_size_multiply_factor) - , 
multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) - , max_size(settings_.max_upload_part_size) - { - if (settings_.strict_upload_part_size > 0) - { - first_size = settings_.strict_upload_part_size; - second_size = settings_.strict_upload_part_size; - multiply_factor = 1; - multiply_threshold = 10000; - max_size = settings_.max_upload_part_size; - } - else - { - first_size = std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size); - second_size = settings_.min_upload_part_size; - multiply_factor = settings_.upload_part_size_multiply_factor; - multiply_threshold = settings_.upload_part_size_multiply_parts_count_threshold; - max_size = settings_.max_upload_part_size; - } - - chassert(first_size > 0); - chassert(second_size > 0); - chassert(multiply_factor >= 1); - chassert(multiply_threshold > 0); - chassert(max_size > 0); - } - - size_t getNumber() const - { - return buffer_number; - } - - size_t getSize() const - { - chassert(buffer_number > 0); - return current_size; - } - - void next() - { - ++buffer_number; - - if (1 == buffer_number) - { - current_size = first_size; - return; - } - - if (2 == buffer_number) - current_size = second_size; - - if (0 == ((buffer_number-1) % multiply_threshold)) - { - current_size *= multiply_factor; - current_size = std::min(current_size, max_size); - } - } -}; WriteBufferFromS3::WriteBufferFromS3( std::shared_ptr client_ptr_, @@ -164,7 +90,7 @@ WriteBufferFromS3::WriteBufferFromS3( , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , buffer_allocation_policy(std::make_unique(request_settings_.getUploadSettings())) + , buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings())) , task_tracker(std::make_unique(std::move(schedule_))) { LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails()); @@ -488,7 +414,7 @@ void WriteBufferFromS3::abortMultipartUpload() S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, PartData & data) { ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); - LOG_TRACE(log, "fillUploadRequest, size {}, key: {}", data.data_size, key); + LOG_TRACE(log, "getUploadRequest, size {}, key: {}", data.data_size, key); S3::UploadPartRequest req; @@ -515,7 +441,7 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) multipart_tags.push_back({}); size_t part_number = multipart_tags.size(); - LOG_TRACE(log, "WritePart. {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); + LOG_TRACE(log, "writePart {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); if (multipart_upload_id.empty()) throw Exception( diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 13ed151ad57..b0d8d329589 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,6 @@ class Client; namespace DB { - /** * Buffer to write a data to a S3 object with specified bucket and key. * If data size written to the buffer is less than 'max_single_part_upload_size' write is performed using singlepart upload. 
@@ -80,8 +80,7 @@ private: const std::optional> object_metadata; Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - struct BufferAllocationPolicy; - std::unique_ptr buffer_allocation_policy; + IBufferAllocationPolicyPtr buffer_allocation_policy; /// Upload in S3 is made in parts. /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp new file mode 100644 index 00000000000..1e9b209087c --- /dev/null +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -0,0 +1,108 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace +{ + +struct FixedSizeBufferAllocationPolicy : DB::IBufferAllocationPolicy +{ + const size_t size = 0; + size_t buffer_number = 0; + + explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + : size(settings_.strict_upload_part_size) + { + chassert(size > 0); + } + + size_t getNumber() const override { return buffer_number; } + + size_t getSize() const override + { + chassert(buffer_number > 0); + return size; + } + + void next() override + { + ++buffer_number; + } +}; + + +struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy +{ + const size_t first_size = 0; + const size_t second_size = 0; + + const size_t multiply_factor = 0; + const size_t multiply_threshold = 0; + const size_t max_size = 0; + + size_t current_size = 0; + size_t buffer_number = 0; + + explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) + , second_size(settings_.min_upload_part_size) + , multiply_factor(settings_.upload_part_size_multiply_factor) + , multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) + , max_size(settings_.max_upload_part_size) + { + chassert(first_size > 0); + chassert(second_size > 0); + chassert(multiply_factor >= 1); + chassert(multiply_threshold > 0); + chassert(max_size > 0); + } + + size_t getNumber() const override { return buffer_number; } + + size_t getSize() const override + { + chassert(buffer_number > 0); + return current_size; + } + + void next() override + { + ++buffer_number; + + if (1 == buffer_number) + { + current_size = first_size; + return; + } + + if (2 == buffer_number) + current_size = second_size; + + if (0 == ((buffer_number - 1) % multiply_threshold)) + { + current_size *= multiply_factor; + current_size = std::min(current_size, max_size); + } + } +}; + +} + +namespace DB +{ + +IBufferAllocationPolicy::~IBufferAllocationPolicy() { } + +IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) +{ + if (settings_.strict_upload_part_size > 0) + return std::make_unique(settings_); + else + return std::make_unique(settings_); +} + +} + +#endif diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h new file mode 100644 index 00000000000..1ee7c982ed2 --- /dev/null +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h @@ -0,0 +1,26 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +struct IBufferAllocationPolicy +{ + virtual size_t getNumber() const = 0; + virtual size_t getSize() const = 0; + virtual void next() = 0; + virtual ~IBufferAllocationPolicy() = 0; 
+}; + +using IBufferAllocationPolicyPtr = std::unique_ptr; + +IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); + +} + +#endif diff --git a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/WriteBufferFromS3MemoryStream.h index 5a7cc17705d..e9606798910 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.h +++ b/src/IO/WriteBufferFromS3MemoryStream.h @@ -4,8 +4,6 @@ #if USE_AWS_S3 -#include "WriteBufferFromS3.h" - #include namespace DB diff --git a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference index 360b484bf28..f7c4ece5f1f 100644 --- a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference +++ b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference @@ -1,4 +1,4 @@ -Size: 6000001 -Size: 6000001 -Size: 6000001 -Size: 2971517 +part size: 6000001, part number: 1 +part size: 6000001, part number: 2 +part size: 6000001, part number: 3 +part size: 2971517, part number: 4 diff --git a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh index 69e2f734914..9799ef0478a 100755 --- a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh +++ b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh @@ -19,7 +19,7 @@ $CLICKHOUSE_LOCAL -q "SELECT randomPrintableASCII(1023) FROM numbers(20*1024) FO $CLICKHOUSE_CLIENT --send_logs_level=trace --server_logs_file="$log" -q "INSERT INTO FUNCTION s3(s3_conn, filename='$CLICKHOUSE_TEST_UNIQUE_NAME', format='LineAsString', structure='line String') FORMAT LineAsString" --s3_strict_upload_part_size=6000001 < "$in" grep -F '' "$log" || : -grep -o 'WriteBufferFromS3: Writing part.*Size: .*' "$log" | grep -o 'Size: .*' +grep -o 'WriteBufferFromS3: writePart.*, part size: .*' "$log" | grep -o 'part size: .*' $CLICKHOUSE_CLIENT -q "SELECT * FROM s3(s3_conn, filename='$CLICKHOUSE_TEST_UNIQUE_NAME', format='LineAsString', structure='line String') FORMAT LineAsString" > "$out" diff -q "$in" "$out" From c8028bfd7fc83ec64aeb65f75fa7f85b05225fb7 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 5 May 2023 15:46:52 +0000 Subject: [PATCH 11/45] ajust 02240_system_filesystem_cache_table --- ...40_system_filesystem_cache_table.reference | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference b/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference index cf2bf5fb521..f960b4eb21c 100644 --- a/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference +++ b/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference @@ -1,13 +1,15 @@ Using storage policy: s3_cache 0 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 @@ -15,13 +17,15 @@ DOWNLOADED 0 745 746 2 Expect no cache Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 @@ -31,13 +35,15 @@ Expect no cache Using storage policy: local_cache 0 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect 
cache DOWNLOADED 0 79 80 @@ -45,13 +51,15 @@ DOWNLOADED 0 745 746 2 Expect no cache Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 From 60bf45f863a1e4184cd2159d5435f32e228a0008 Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Sat, 6 May 2023 12:11:16 +0200 Subject: [PATCH 12/45] Update WriteBufferFromS3BufferAllocationPolicy.cpp --- src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp index 1e9b209087c..0eec6b0d034 100644 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -93,7 +93,7 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy namespace DB { -IBufferAllocationPolicy::~IBufferAllocationPolicy() { } +IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) { From 8bc9a32d19b5ecd02bc08787800cd475564069fa Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 10 May 2023 18:45:59 +0000 Subject: [PATCH 13/45] fix special build --- src/IO/S3Common.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 370c5911482..71d52c727c7 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -20,8 +20,6 @@ #include #include -namespace Aws::S3 { class Client; } - namespace DB { From 26743b54394de20cf0ff6e3e56b08bdc642c1330 Mon Sep 17 00:00:00 2001 From: "Diego Nieto (lesandie)" Date: Thu, 11 May 2023 15:36:47 +0200 Subject: [PATCH 14/45] Fix Local Cache documentation explanations --- docs/en/operations/storing-data.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index ac6ea22ab75..495716858ec 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -172,17 +172,19 @@ Example of configuration for versions earlier than 22.8: ``` -Cache **configuration settings**: +File Cache **configuration settings**: + +These settings should be defined in the disk configuration section. - `path` - path to the directory with cache. Default: None, this setting is obligatory. - `max_size` - maximum size of the cache in bytes or in readable format, e.g. `ki, Mi, Gi, etc`, example `10Gi` (such format works starting from `22.10` version). When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory. -- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled). +- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. - `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). 
Default: `false`. -- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. +- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. - `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading. @@ -190,19 +192,21 @@ Cache **configuration settings**: - `max_elements` - a limit for a number of cache files. Default: `1048576`. -Cache **query settings**: +File Cache **query settings**: -- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. +Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1 cache per query is disabled. -- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. +- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. This setting should be defined in the disk configuration section. -- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. +- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. This is a profile level configuration setting. -- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. +- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. This setting should be defined in the disk configuration section. -- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. +- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. This is a profile level configuration setting. -- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. 
If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recentltly used` behaviour while keeping query cache limit. +- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. This setting should be defined in the disk configuration section. + +- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit. This is a profile level configuration setting. ** Warning ** Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported. From 10e0c1d832769f3244e4ad7563a02b6cd532883b Mon Sep 17 00:00:00 2001 From: "Diego Nieto (lesandie)" Date: Thu, 11 May 2023 16:06:14 +0200 Subject: [PATCH 15/45] Reworked documentation using local cache section --- docs/en/operations/storing-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 495716858ec..78c0fb8a049 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -194,7 +194,7 @@ These settings should be defined in the disk configuration section. File Cache **query settings**: -Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1 cache per query is disabled. +Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1, cache per query is disabled. - `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. This setting should be defined in the disk configuration section. From eb62030fa425af8ef176e7f7f333b8e34fa4dc4f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 10 May 2023 05:02:52 +0200 Subject: [PATCH 16/45] Fix assigning a setting to NULL in settings profile's definition. 
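This change stores the value, min and max of a settings profile element as std::optional<Field> instead of a plain Field, so a value that is explicitly written as NULL is no longer conflated with a value that was never specified. A minimal illustration of the kind of definition this concerns (hypothetical profile and setting names, sketched on the CREATE SETTINGS PROFILE ... SETTINGS syntax):

    CREATE SETTINGS PROFILE sample_profile SETTINGS max_memory_usage = NULL;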
--- src/Access/SettingsConstraints.cpp | 12 ++++---- src/Access/SettingsProfileElement.cpp | 30 +++++++++---------- src/Access/SettingsProfileElement.h | 6 ++-- .../Access/ASTSettingsProfileElement.cpp | 12 ++++---- .../Access/ASTSettingsProfileElement.h | 6 ++-- .../Access/ParserSettingsProfileElement.cpp | 24 +++++++-------- .../StorageSystemSettingsProfileElements.cpp | 12 ++++---- 7 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/Access/SettingsConstraints.cpp b/src/Access/SettingsConstraints.cpp index e83ab264f4f..12f584cab83 100644 --- a/src/Access/SettingsConstraints.cpp +++ b/src/Access/SettingsConstraints.cpp @@ -105,21 +105,21 @@ void SettingsConstraints::check(const Settings & current_settings, const Setting if (SettingsProfileElements::isAllowBackupSetting(element.setting_name)) continue; - if (!element.value.isNull()) + if (element.value) { - SettingChange value(element.setting_name, element.value); + SettingChange value(element.setting_name, *element.value); check(current_settings, value); } - if (!element.min_value.isNull()) + if (element.min_value) { - SettingChange value(element.setting_name, element.min_value); + SettingChange value(element.setting_name, *element.min_value); check(current_settings, value); } - if (!element.max_value.isNull()) + if (element.max_value) { - SettingChange value(element.setting_name, element.max_value); + SettingChange value(element.setting_name, *element.max_value); check(current_settings, value); } diff --git a/src/Access/SettingsProfileElement.cpp b/src/Access/SettingsProfileElement.cpp index ce56782d887..9358391cb93 100644 --- a/src/Access/SettingsProfileElement.cpp +++ b/src/Access/SettingsProfileElement.cpp @@ -63,18 +63,18 @@ void SettingsProfileElement::init(const ASTSettingsProfileElement & ast, const A max_value = ast.max_value; writability = ast.writability; - if (!value.isNull()) - value = Settings::castValueUtil(setting_name, value); - if (!min_value.isNull()) - min_value = Settings::castValueUtil(setting_name, min_value); - if (!max_value.isNull()) - max_value = Settings::castValueUtil(setting_name, max_value); + if (value) + value = Settings::castValueUtil(setting_name, *value); + if (min_value) + min_value = Settings::castValueUtil(setting_name, *min_value); + if (max_value) + max_value = Settings::castValueUtil(setting_name, *max_value); } } bool SettingsProfileElement::isConstraint() const { - return this->writability || !this->min_value.isNull() || !this->max_value.isNull(); + return this->writability || this->min_value || this->max_value; } std::shared_ptr SettingsProfileElement::toAST() const @@ -187,8 +187,8 @@ Settings SettingsProfileElements::toSettings() const Settings res; for (const auto & elem : *this) { - if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name) && !elem.value.isNull()) - res.set(elem.setting_name, elem.value); + if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name) && elem.value) + res.set(elem.setting_name, *elem.value); } return res; } @@ -200,8 +200,8 @@ SettingsChanges SettingsProfileElements::toSettingsChanges() const { if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name)) { - if (!elem.value.isNull()) - res.push_back({elem.setting_name, elem.value}); + if (elem.value) + res.push_back({elem.setting_name, *elem.value}); } } return res; @@ -214,8 +214,8 @@ SettingsConstraints SettingsProfileElements::toSettingsConstraints(const AccessC if (!elem.setting_name.empty() && elem.isConstraint() && 
!isAllowBackupSetting(elem.setting_name)) res.set( elem.setting_name, - elem.min_value, - elem.max_value, + elem.min_value ? *elem.min_value : Field{}, + elem.max_value ? *elem.max_value : Field{}, elem.writability ? *elem.writability : SettingConstraintWritability::WRITABLE); return res; } @@ -240,8 +240,8 @@ bool SettingsProfileElements::isBackupAllowed() const { for (const auto & setting : *this) { - if (isAllowBackupSetting(setting.setting_name)) - return static_cast(SettingFieldBool{setting.value}); + if (isAllowBackupSetting(setting.setting_name) && setting.value) + return static_cast(SettingFieldBool{*setting.value}); } return true; } diff --git a/src/Access/SettingsProfileElement.h b/src/Access/SettingsProfileElement.h index 7f9379c1e47..7078f565295 100644 --- a/src/Access/SettingsProfileElement.h +++ b/src/Access/SettingsProfileElement.h @@ -23,9 +23,9 @@ struct SettingsProfileElement std::optional parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; auto toTuple() const { return std::tie(parent_profile, setting_name, value, min_value, max_value, writability); } diff --git a/src/Parsers/Access/ASTSettingsProfileElement.cpp b/src/Parsers/Access/ASTSettingsProfileElement.cpp index 76973c428b2..7b29b15cb29 100644 --- a/src/Parsers/Access/ASTSettingsProfileElement.cpp +++ b/src/Parsers/Access/ASTSettingsProfileElement.cpp @@ -35,21 +35,21 @@ void ASTSettingsProfileElement::formatImpl(const FormatSettings & settings, Form formatSettingName(setting_name, settings.ostr); - if (!value.isNull()) + if (value) { - settings.ostr << " = " << applyVisitor(FieldVisitorToString{}, value); + settings.ostr << " = " << applyVisitor(FieldVisitorToString{}, *value); } - if (!min_value.isNull()) + if (min_value) { settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " MIN " << (settings.hilite ? IAST::hilite_none : "") - << applyVisitor(FieldVisitorToString{}, min_value); + << applyVisitor(FieldVisitorToString{}, *min_value); } - if (!max_value.isNull()) + if (max_value) { settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " MAX " << (settings.hilite ? IAST::hilite_none : "") - << applyVisitor(FieldVisitorToString{}, max_value); + << applyVisitor(FieldVisitorToString{}, *max_value); } if (writability) diff --git a/src/Parsers/Access/ASTSettingsProfileElement.h b/src/Parsers/Access/ASTSettingsProfileElement.h index 275257e4f8e..13c1926d9b0 100644 --- a/src/Parsers/Access/ASTSettingsProfileElement.h +++ b/src/Parsers/Access/ASTSettingsProfileElement.h @@ -14,9 +14,9 @@ class ASTSettingsProfileElement : public IAST public: String parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; bool id_mode = false; /// If true then `parent_profile` keeps UUID, not a name. bool use_inherit_keyword = false; /// If true then this element is a part of ASTCreateSettingsProfileQuery. 
diff --git a/src/Parsers/Access/ParserSettingsProfileElement.cpp b/src/Parsers/Access/ParserSettingsProfileElement.cpp index db23a806a12..36330b96622 100644 --- a/src/Parsers/Access/ParserSettingsProfileElement.cpp +++ b/src/Parsers/Access/ParserSettingsProfileElement.cpp @@ -52,7 +52,7 @@ namespace } - bool parseValue(IParserBase::Pos & pos, Expected & expected, Field & res) + bool parseValue(IParserBase::Pos & pos, Expected & expected, std::optional & res) { return IParserBase::wrapParseImpl(pos, [&] { @@ -69,7 +69,7 @@ namespace } - bool parseMinMaxValue(IParserBase::Pos & pos, Expected & expected, Field & min_value, Field & max_value) + bool parseMinMaxValue(IParserBase::Pos & pos, Expected & expected, std::optional & min_value, std::optional & max_value) { return IParserBase::wrapParseImpl(pos, [&] { @@ -124,9 +124,9 @@ namespace IParserBase::Pos & pos, Expected & expected, String & setting_name, - Field & value, - Field & min_value, - Field & max_value, + std::optional & value, + std::optional & min_value, + std::optional & max_value, std::optional & writability) { return IParserBase::wrapParseImpl(pos, [&] @@ -136,9 +136,9 @@ namespace return false; String res_setting_name = getIdentifierName(name_ast); - Field res_value; - Field res_min_value; - Field res_max_value; + std::optional res_value; + std::optional res_min_value; + std::optional res_max_value; std::optional res_writability; bool has_value_or_constraint = false; @@ -151,7 +151,7 @@ namespace if (!has_value_or_constraint) return false; - if (boost::iequals(res_setting_name, "PROFILE") && res_value.isNull() && res_min_value.isNull() && res_max_value.isNull() + if (boost::iequals(res_setting_name, "PROFILE") && !res_value && !res_min_value && !res_max_value && res_writability == SettingConstraintWritability::CONST) { /// Ambiguity: "profile readonly" can be treated either as a profile named "readonly" or @@ -181,9 +181,9 @@ namespace { String parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; bool ok = parseSettingNameWithValueOrConstraints(pos, expected, setting_name, value, min_value, max_value, writability); diff --git a/src/Storages/System/StorageSystemSettingsProfileElements.cpp b/src/Storages/System/StorageSystemSettingsProfileElements.cpp index 6785a4392e1..e01d3cb0ace 100644 --- a/src/Storages/System/StorageSystemSettingsProfileElements.cpp +++ b/src/Storages/System/StorageSystemSettingsProfileElements.cpp @@ -87,27 +87,27 @@ void StorageSystemSettingsProfileElements::fillData(MutableColumns & res_columns size_t current_index = index++; bool inserted_value = false; - if (!element.value.isNull() && !element.setting_name.empty()) + if (element.value && !element.setting_name.empty()) { - String str = Settings::valueToStringUtil(element.setting_name, element.value); + String str = Settings::valueToStringUtil(element.setting_name, *element.value); column_value.insertData(str.data(), str.length()); column_value_null_map.push_back(false); inserted_value = true; } bool inserted_min = false; - if (!element.min_value.isNull() && !element.setting_name.empty()) + if (element.min_value && !element.setting_name.empty()) { - String str = Settings::valueToStringUtil(element.setting_name, element.min_value); + String str = Settings::valueToStringUtil(element.setting_name, *element.min_value); column_min.insertData(str.data(), str.length()); column_min_null_map.push_back(false); inserted_min = 
true; } bool inserted_max = false;
- if (!element.max_value.isNull() && !element.setting_name.empty())
+ if (element.max_value && !element.setting_name.empty()) {
- String str = Settings::valueToStringUtil(element.setting_name, element.max_value);
+ String str = Settings::valueToStringUtil(element.setting_name, *element.max_value); column_max.insertData(str.data(), str.length()); column_max_null_map.push_back(false); inserted_max = true;
From 8fc0083a264cb4f1afb821bd9884ea1f6f414710 Mon Sep 17 00:00:00 2001
From: "Diego Nieto (lesandie)"
Date: Fri, 12 May 2023 11:45:38 +0200
Subject: [PATCH 17/45] Rewrite following conversation/comments
---
 docs/en/operations/storing-data.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index d0102ce38d4..e8b043e7a27 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -172,7 +172,7 @@ Example of configuration for versions earlier than 22.8:
 ```
-File Cache **configuration settings**:
+File Cache **disk configuration settings**:
 These settings should be defined in the disk configuration section.
@@ -180,7 +180,7 @@ These settings should be defined in the disk configuration section.
 - `max_size` - maximum size of the cache in bytes or in readable format, e.g. `ki, Mi, Gi, etc`, example `10Gi` (such format works starting from `22.10` version). When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory.
-- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`.
+- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled).
 - `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
@@ -192,23 +192,23 @@ These settings should be defined in the disk configuration section.
 - `max_elements` - a limit for a number of cache files. Default: `1048576`.
-File Cache **query settings**:
+File Cache **query/profile settings**:
-Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1, cache per query is disabled.
+Some of these settings will disable cache features per query/profile that are enabled by default. For example, you can enable cache in disk configuration and disable it per query/profile by setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that general file and per query cache are enabled.
But if you need to disable this general setting for specific queries, then setting `enable_filesystem_cache_on_write_operations` to `false` means that the write operations cache will be disabled for a specific query/profile.
+
+- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`.
-- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. This is a profile level configuration setting.
+- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`.
-- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. This setting should be defined in the disk configuration section.
+- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. Default: `false`.
-- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. This is a profile level configuration setting.
+- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. It can be turned on for specific queries or enabled in a profile. Default: `false`.
-- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. This setting should be defined in the disk configuration section.
+- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`.
-- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit. This is a profile level configuration setting.
+- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit.
-** Warning ** +**Warning** Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported. Cache **system tables**: From 22f7aa8d89107910f10c8ff5fb92296856c81283 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 12 May 2023 12:00:15 +0200 Subject: [PATCH 18/45] make special build pass --- src/IO/WriteBufferFromS3.h | 5 -- src/IO/tests/gtest_writebuffer_s3.cpp | 104 +++++++++++++------------- 2 files changed, 52 insertions(+), 57 deletions(-) diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index b0d8d329589..e65127872fa 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -16,11 +16,6 @@ #include #include -namespace Aws::S3 -{ -class Client; -} - namespace DB { /** diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index d7661d3e3d0..c0bd6742ea3 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -60,12 +60,12 @@ private: class BucketMemStore { public: - typedef std::string Key; - typedef std::string Data; - typedef std::string ETag; - typedef std::string MPU_ID; - typedef std::map MPUPartsInProgress; - typedef std::vector MPUParts; + using Key = std::string; + using Data = std::string; + using ETag = std::string; + using MPU_ID = std::string; + using MPUPartsInProgress = std::map; + using MPUParts = std::vector; std::map objects; @@ -129,7 +129,7 @@ public: { std::vector result; result.reserve(parts.size()); - for (auto & part_data : parts) + for (const auto & part_data : parts) result.push_back(part_data.size()); return result; @@ -142,7 +142,7 @@ class S3MemStrore public: void CreateBucket(const std::string & bucket) { - assert(buckets.count(bucket) == 0); + assert(!buckets.contains(bucket)); buckets.emplace(bucket, BucketMemStore{}); } @@ -193,14 +193,14 @@ struct InjectionModel struct Client : DB::S3::Client { - Client(std::shared_ptr mock_s3_store) + explicit Client(std::shared_ptr mock_s3_store) : DB::S3::Client( 100, DB::S3::ServerSideEncryptionKMSConfig(), std::make_shared("", ""), GetClientConfiguration(), Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - /* useVirtualAddressing = */ true) + /* use_virtual_addressing = */ true) , store(mock_s3_store) { } @@ -425,8 +425,8 @@ struct BaseSyncPolicy { virtual ~BaseSyncPolicy() = default; virtual DB::ThreadPoolCallbackRunner getScheduler() { return {}; } - virtual void execute(size_t = 0) {} - virtual void setAutoExecute(bool = true) {} + virtual void execute(size_t) {} + virtual void setAutoExecute(bool) {} virtual size_t size() const { return 0; } virtual bool empty() const { return size() == 0; } @@ -437,7 +437,7 @@ struct SimpleAsyncTasks : BaseSyncPolicy bool auto_execute = false; std::deque> queue; - virtual DB::ThreadPoolCallbackRunner getScheduler() override + DB::ThreadPoolCallbackRunner getScheduler() override { return [this] (std::function && operation, size_t /*priority*/) { @@ -453,7 +453,7 @@ struct SimpleAsyncTasks : BaseSyncPolicy }; } - virtual void execute(size_t limit = 0) override + void execute(size_t limit) override { if (limit == 0) limit = queue.size(); @@ -468,14 +468,14 @@ struct SimpleAsyncTasks : BaseSyncPolicy } } - virtual void setAutoExecute(bool value = true) override + void setAutoExecute(bool value) override { auto_execute = value; if (auto_execute) - execute(); + execute(0); } - virtual size_t size() const override { return queue.size(); } + size_t size() const 
override { return queue.size(); } }; } @@ -545,7 +545,7 @@ public: auto buffer = getWriteBuffer("file"); writeMethod(*buffer, size); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); expected_counters.writtenSize = size; @@ -585,13 +585,13 @@ protected: std::shared_ptr client; std::unique_ptr async_policy; - virtual void SetUp() override + void SetUp() override { client = MockS3::Client::CreateClient(bucket); async_policy = std::make_unique(); } - virtual void TearDown() override + void TearDown() override { client.reset(); async_policy.reset(); @@ -603,13 +603,13 @@ class SyncAsync : public WBS3Test, public ::testing::WithParamInterface protected: bool test_with_pool = false; - virtual void SetUp() override + void SetUp() override { test_with_pool = GetParam(); client = MockS3::Client::CreateClient(bucket); if (test_with_pool) async_policy = std::make_unique(); - else + else async_policy = std::make_unique(); } }; @@ -622,7 +622,7 @@ INSTANTIATE_TEST_SUITE_P(WBS3 return name; }); -TEST_P(SyncAsync, exception_on_head) { +TEST_P(SyncAsync, ExceptionOnHead) { setInjectionModel(std::make_shared()); getSettings().s3_check_objects_after_upload = true; @@ -633,7 +633,7 @@ TEST_P(SyncAsync, exception_on_head) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -645,7 +645,7 @@ TEST_P(SyncAsync, exception_on_head) { }, DB::S3Exception); } -TEST_P(SyncAsync, exception_on_put) { +TEST_P(SyncAsync, ExceptionOnPut) { setInjectionModel(std::make_shared()); EXPECT_THROW({ @@ -655,7 +655,7 @@ TEST_P(SyncAsync, exception_on_put) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -671,7 +671,7 @@ TEST_P(SyncAsync, exception_on_put) { auto buffer = getWriteBuffer("exception_on_put_2"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -686,10 +686,10 @@ TEST_P(SyncAsync, exception_on_put) { try { auto buffer = getWriteBuffer("exception_on_put_3"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -702,7 +702,7 @@ TEST_P(SyncAsync, exception_on_put) { } -TEST_P(SyncAsync, exception_on_create_mpu) { +TEST_P(SyncAsync, ExceptionOnCreateMPU) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -716,7 +716,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -733,7 +733,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { buffer->write('A'); buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -749,7 +749,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { auto buffer = getWriteBuffer("exception_on_create_mpu_2"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -762,7 +762,7 @@ TEST_P(SyncAsync, 
exception_on_create_mpu) { } -TEST_P(SyncAsync, exception_on_complete_mpu) { +TEST_P(SyncAsync, ExceptionOnCompleteMPU) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -773,7 +773,7 @@ TEST_P(SyncAsync, exception_on_complete_mpu) { auto buffer = getWriteBuffer("exception_on_complete_mpu_1"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -785,7 +785,7 @@ TEST_P(SyncAsync, exception_on_complete_mpu) { }, DB::S3Exception); } -TEST_P(SyncAsync, exception_on_upload_part) { +TEST_P(SyncAsync, ExceptionOnUploadPart) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -804,7 +804,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } @@ -820,7 +820,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { EXPECT_THROW({ try { auto buffer = getWriteBuffer("exception_on_upload_part_2"); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->write('A'); buffer->next(); @@ -848,7 +848,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -865,7 +865,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { auto buffer = getWriteBuffer("exception_on_upload_part_4"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -879,7 +879,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { } -TEST_F(WBS3Test, prefinalize_called_multiple_times) { +TEST_F(WBS3Test, PrefinalizeCalledMultipleTimes) { #ifdef ABORT_ON_LOGICAL_ERROR GTEST_SKIP() << "this test trigger LOGICAL_ERROR, runs only if ABORT_ON_LOGICAL_ERROR is not defined"; #else @@ -904,14 +904,14 @@ TEST_F(WBS3Test, prefinalize_called_multiple_times) { #endif } -TEST_P(SyncAsync, empty_file) { +TEST_P(SyncAsync, EmptyFile) { getSettings().s3_check_objects_after_upload = true; MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; runSimpleScenario(counters, 0); } -TEST_P(SyncAsync, manual_next_calls) { +TEST_P(SyncAsync, ManualNextCalls) { getSettings().s3_check_objects_after_upload = true; { @@ -920,7 +920,7 @@ TEST_P(SyncAsync, manual_next_calls) { auto buffer = getWriteBuffer("manual_next_calls_1"); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -933,7 +933,7 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->next(); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -947,7 +947,7 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -963,14 +963,14 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->next(); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); } } -TEST_P(SyncAsync, small_file_is_one_put_request) { +TEST_P(SyncAsync, SmallFileIsOnePutRequest) { 
getSettings().s3_check_objects_after_upload = true; { @@ -999,7 +999,7 @@ TEST_P(SyncAsync, small_file_is_one_put_request) { } } -TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { +TEST_P(SyncAsync, LittleBiggerFileIsMultiPartUpload) { getSettings().s3_check_objects_after_upload = true; { @@ -1026,7 +1026,7 @@ TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { } } -TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { +TEST_P(SyncAsync, BiggerFileIsMultiPartUpload) { getSettings().s3_check_objects_after_upload = true; { @@ -1059,7 +1059,7 @@ TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { } } -TEST_P(SyncAsync, increase_upload_buffer) { +TEST_P(SyncAsync, IncreaseUploadBuffer) { getSettings().s3_check_objects_after_upload = true; { @@ -1092,7 +1092,7 @@ TEST_P(SyncAsync, increase_upload_buffer) { } } -TEST_P(SyncAsync, increase_limited) { +TEST_P(SyncAsync, IncreaseLimited) { getSettings().s3_check_objects_after_upload = true; { From a4694ac1858890bcd3d35547cf5c4417252933c9 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 12 May 2023 12:29:29 +0200 Subject: [PATCH 19/45] Add test. --- .../01418_custom_settings.reference | 16 +++++++++---- .../0_stateless/01418_custom_settings.sql | 24 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01418_custom_settings.reference b/tests/queries/0_stateless/01418_custom_settings.reference index cf0cb35c72a..8484a5d0e6f 100644 --- a/tests/queries/0_stateless/01418_custom_settings.reference +++ b/tests/queries/0_stateless/01418_custom_settings.reference @@ -1,3 +1,4 @@ +--- assigning --- 5 UInt8 -177 Int16 98.11 Float64 @@ -6,7 +7,7 @@ custom_a UInt64_5 custom_b Int64_-177 custom_c Float64_98.11 custom_d \'abc def\' - +--- modifying --- changed String \N Nullable(Nothing) 50000 UInt16 @@ -15,9 +16,10 @@ custom_a \'changed\' custom_b NULL custom_c UInt64_50000 custom_d Float64_1.11 - +--- undefined setting --- 404 UInt16 - +--- wrong prefix --- +--- using query context --- -0.333 Float64 custom_e Float64_-0.333 404 UInt16 @@ -25,7 +27,13 @@ custom_e UInt64_404 word String custom_f \'word\' 0 - +--- compound identifier --- test String custom_compound.identifier.v1 \'test\' CREATE SETTINGS PROFILE s1_01418 SETTINGS custom_compound.identifier.v2 = 100 +--- null type --- +\N Nullable(Nothing) +custom_null NULL +\N Nullable(Nothing) +custom_null NULL +CREATE SETTINGS PROFILE s2_01418 SETTINGS custom_null = NULL diff --git a/tests/queries/0_stateless/01418_custom_settings.sql b/tests/queries/0_stateless/01418_custom_settings.sql index 95051db3a34..be18f553589 100644 --- a/tests/queries/0_stateless/01418_custom_settings.sql +++ b/tests/queries/0_stateless/01418_custom_settings.sql @@ -1,3 +1,6 @@ +DROP SETTINGS PROFILE IF EXISTS s1_01418, s2_01418; + +SELECT '--- assigning ---'; SET custom_a = 5; SET custom_b = -177; SET custom_c = 98.11; @@ -8,7 +11,7 @@ SELECT getSetting('custom_c') as v, toTypeName(v); SELECT getSetting('custom_d') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name LIKE 'custom_%' ORDER BY name; -SELECT ''; +SELECT '--- modifying ---'; SET custom_a = 'changed'; SET custom_b = NULL; SET custom_c = 50000; @@ -19,14 +22,15 @@ SELECT getSetting('custom_c') as v, toTypeName(v); SELECT getSetting('custom_d') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name LIKE 'custom_%' ORDER BY name; -SELECT ''; +SELECT '--- undefined setting ---'; SELECT getSetting('custom_e') as v, toTypeName(v); -- { serverError 115 } -- 
Setting not found. SET custom_e = 404; SELECT getSetting('custom_e') as v, toTypeName(v); +SELECT '--- wrong prefix ---'; SET invalid_custom = 8; -- { serverError 115 } -- Setting is neither a builtin nor started with one of the registered prefixes for user-defined settings. -SELECT ''; +SELECT '--- using query context ---'; SELECT getSetting('custom_e') as v, toTypeName(v) SETTINGS custom_e = -0.333; SELECT name, value FROM system.settings WHERE name = 'custom_e' SETTINGS custom_e = -0.333; SELECT getSetting('custom_e') as v, toTypeName(v); @@ -37,7 +41,7 @@ SELECT name, value FROM system.settings WHERE name = 'custom_f' SETTINGS custom_ SELECT getSetting('custom_f') as v, toTypeName(v); -- { serverError 115 } -- Setting not found. SELECT COUNT() FROM system.settings WHERE name = 'custom_f'; -SELECT ''; +SELECT '--- compound identifier ---'; SET custom_compound.identifier.v1 = 'test'; SELECT getSetting('custom_compound.identifier.v1') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name = 'custom_compound.identifier.v1'; @@ -45,3 +49,15 @@ SELECT name, value FROM system.settings WHERE name = 'custom_compound.identifier CREATE SETTINGS PROFILE s1_01418 SETTINGS custom_compound.identifier.v2 = 100; SHOW CREATE SETTINGS PROFILE s1_01418; DROP SETTINGS PROFILE s1_01418; + +SELECT '--- null type ---'; +SELECT getSetting('custom_null') as v, toTypeName(v) SETTINGS custom_null = NULL; +SELECT name, value FROM system.settings WHERE name = 'custom_null' SETTINGS custom_null = NULL; + +SET custom_null = NULL; +SELECT getSetting('custom_null') as v, toTypeName(v); +SELECT name, value FROM system.settings WHERE name = 'custom_null'; + +CREATE SETTINGS PROFILE s2_01418 SETTINGS custom_null = NULL; +SHOW CREATE SETTINGS PROFILE s2_01418; +DROP SETTINGS PROFILE s2_01418; From 37e4c531ba5ff9b639830f32c6a4b8cd9a42d9d6 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 12 May 2023 15:02:37 +0000 Subject: [PATCH 20/45] Don't update contrib --- contrib/boost | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/boost b/contrib/boost index 1035c8bfcc9..8fe7b3326ef 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 1035c8bfcc9a3c1cfa7f6e827db94dae1ce1a43a +Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98 From f0d21a910078c12fe3a4a038ef283a5ceb6dc219 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 3 Mar 2023 11:00:12 +0100 Subject: [PATCH 21/45] row policy template: initial, works, restrictive rules fix --- src/Access/AccessEntityIO.cpp | 4 + src/Access/EnabledRowPolicies.cpp | 36 ++++- src/Access/EnabledRowPolicies.h | 8 ++ src/Access/RowPolicy.h | 1 + src/Access/RowPolicyCache.cpp | 128 ++++++++++++++---- src/Access/RowPolicyCache.h | 7 +- .../Access/ParserCreateRowPolicyQuery.cpp | 1 + src/Parsers/Access/ParserRowPolicyName.cpp | 22 ++- src/Parsers/parseQuery.cpp | 2 + 9 files changed, 183 insertions(+), 26 deletions(-) diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index 80bb63b04bf..008ba5e5dfe 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -28,6 +28,9 @@ #include #include +#include + + namespace DB { namespace ErrorCodes @@ -62,6 +65,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition) const char * end = begin + definition.size(); while (pos < end) { + LOG_TRACE((&Poco::Logger::get("deserializeAccessEntityImpl")), "{}", std::string(pos, end)); queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, 
DBMS_DEFAULT_MAX_PARSER_DEPTH)); while (isWhitespaceASCII(*pos) || *pos == ';') ++pos; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index c00dcf9e3a7..225f211bdd4 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { @@ -18,6 +20,12 @@ size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const return std::hash{}(key.database) - std::hash{}(key.table_name) + static_cast(key.filter_type); } + +// size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const +// { +// return std::hash{}(key.database) + static_cast(key.filter_type); +// } + EnabledRowPolicies::EnabledRowPolicies() : params() { } @@ -32,11 +40,37 @@ EnabledRowPolicies::~EnabledRowPolicies() = default; RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const String & table_name, RowPolicyFilterType filter_type) const { /// We don't lock `mutex` here. + auto loaded = mixed_filters.load(); + { + + + for (auto it = loaded->begin(); it != loaded->end(); ++it) + { + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), " db: {}, table {}", it->first.database, it->first.table_name); + + } + + } + + + + auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) - return {}; + { + it = loaded->find({database, "*", filter_type}); + if (it == loaded->end()) + { + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - not found ({} records)", + database, table_name, loaded->size()); + return {}; + } + } + + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - found ({} records)", + database, table_name, loaded->size()); return it->second; } diff --git a/src/Access/EnabledRowPolicies.h b/src/Access/EnabledRowPolicies.h index b8e6b2e0549..e09d32264f0 100644 --- a/src/Access/EnabledRowPolicies.h +++ b/src/Access/EnabledRowPolicies.h @@ -72,6 +72,14 @@ private: auto toTuple() const { return std::tie(database, table_name, filter_type); } friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() != right.toTuple(); } + // friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) + // { + // return left.database == right.database && left.filter_type == right.filter_type; + // } + // friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) + // { + // return left.database != right.database || left.filter_type != right.filter_type; + // } }; struct Hash diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 99e6f1992f5..31ee876b47b 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -34,6 +34,7 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } + bool isDatabase() const { return full_name.table_name == "*"; } /// Sets that the policy is restrictive. 
/// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 1036df92609..81e5acdf3ce 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace DB { @@ -148,9 +150,19 @@ void RowPolicyCache::ensureAllRowPoliciesRead() for (const UUID & id : access_control.findAll()) { - auto quota = access_control.tryRead(id); - if (quota) - all_policies.emplace(id, PolicyInfo(quota)); + auto policy = access_control.tryRead(id); + if (policy) + { + PolicyInfo policy_info(policy); + if (policy_info.database_and_table_name->second == "*") + { + database_policies.emplace(id, std::move(policy_info)); + } + else + { + table_policies.emplace(id, std::move(policy_info)); + } + } } } @@ -158,15 +170,23 @@ void RowPolicyCache::ensureAllRowPoliciesRead() void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy) { std::lock_guard lock{mutex}; - auto it = all_policies.find(policy_id); - if (it == all_policies.end()) + bool found = true; + + auto it = table_policies.find(policy_id); + if (it == table_policies.end()) { - it = all_policies.emplace(policy_id, PolicyInfo(new_policy)).first; + it = database_policies.find(policy_id); + if (it == database_policies.end()) + { + PolicyMap & policy_map = new_policy->isDatabase() ? database_policies : table_policies; + it = policy_map.emplace(policy_id, PolicyInfo(new_policy)).first; + found = false; + } } - else + + if (found && it->second.policy == new_policy) { - if (it->second.policy == new_policy) - return; + return; } auto & info = it->second; @@ -178,7 +198,15 @@ void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPo void RowPolicyCache::rowPolicyRemoved(const UUID & policy_id) { std::lock_guard lock{mutex}; - all_policies.erase(policy_id); + auto it = database_policies.find(policy_id); + if (it != database_policies.end()) + { + database_policies.erase(it); + } + else + { + table_policies.erase(policy_id); + } mixFilters(); } @@ -215,22 +243,71 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::vector policies; }; - std::unordered_map mixers; + std::unordered_map table_mixers; + std::unordered_map database_mixers; - for (const auto & [policy_id, info] : all_policies) + + for (const auto & [policy_id, info] : database_policies) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - MixedFiltersKey key; - key.database = info.database_and_table_name->first; - key.table_name = info.database_and_table_name->second; for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { auto filter_type_i = static_cast(filter_type); if (info.parsed_filters[filter_type_i]) { - key.filter_type = filter_type; - auto & mixer = mixers[key]; + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "db: {} : {}", key.database, key.table_name); + + auto & mixer = database_mixers[key]; // getting database level mixer + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } + } + } + } + + + for (const auto & [policy_id, info] : table_policies) + { + const auto & policy = *info.policy; + bool match 
= info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + { + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) + { + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: {} : {}", key.database, key.table_name); + auto table_it = table_mixers.find(key); + if (table_it == table_mixers.end()) + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, looking for db"); + MixedFiltersKey database_key = key; + database_key.table_name = "*"; + + auto database_it = database_mixers.find(database_key); + + if (database_it == database_mixers.end()) + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database not found"); + table_it = table_mixers.try_emplace(key).first; + } + else + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database found"); + table_it = table_mixers.insert({key, database_it->second}).first; + } + } + + auto & mixer = table_it->second; // table_mixers[key]; getting table level mixer mixer.database_and_table_name = info.database_and_table_name; if (match) { @@ -242,15 +319,20 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } auto mixed_filters = boost::make_shared(); - for (auto & [key, mixer] : mixers) + + for (auto mixer_map_ptr : { &table_mixers, &database_mixers}) { - auto mixed_filter = std::make_shared(); - mixed_filter->database_and_table_name = std::move(mixer.database_and_table_name); - mixed_filter->expression = std::move(mixer.mixer).getResult(access_control.isEnabledUsersWithoutRowPoliciesCanReadRows()); - mixed_filter->policies = std::move(mixer.policies); - mixed_filters->emplace(key, std::move(mixed_filter)); + for (auto & [key, mixer] : *mixer_map_ptr) + { + auto mixed_filter = std::make_shared(); + mixed_filter->database_and_table_name = std::move(mixer.database_and_table_name); + mixed_filter->expression = std::move(mixer.mixer).getResult(access_control.isEnabledUsersWithoutRowPoliciesCanReadRows()); + mixed_filter->policies = std::move(mixer.policies); + mixed_filters->emplace(key, std::move(mixed_filter)); + } } + enabled.mixed_filters.store(mixed_filters); } diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 4fbf90d1a2d..49e6a2e5df3 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -12,6 +12,7 @@ namespace DB class AccessControl; struct RolesOrUsersSet; struct RowPolicy; + using RowPolicyPtr = std::shared_ptr; /// Stores read and parsed row policies. 
@@ -35,14 +36,18 @@ private: ASTPtr parsed_filters[static_cast(RowPolicyFilterType::MAX)]; }; + using PolicyMap = std::unordered_map; + void ensureAllRowPoliciesRead(); void rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy); void rowPolicyRemoved(const UUID & policy_id); void mixFilters(); void mixFiltersFor(EnabledRowPolicies & enabled); + const AccessControl & access_control; - std::unordered_map all_policies; + PolicyMap database_policies; + PolicyMap table_policies; bool all_policies_read = false; scope_guard subscription; std::map> enabled_row_policies; diff --git a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp index 2c25fc14e7d..03d0754ca1a 100644 --- a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp +++ b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp @@ -203,6 +203,7 @@ namespace bool ParserCreateRowPolicyQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserCreateRowPolicyQuery::parseImpl" == nullptr); bool alter = false; if (attach_mode) { diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index cf5d2ab21b6..e087c45fad9 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace DB { @@ -26,8 +28,19 @@ namespace return IParserBase::wrapParseImpl(pos, [&] { String res_database, res_table_name; - if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) + // if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) + bool any_database = false; + bool any_table = true; + + if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, any_database, res_table_name, any_table)) + { + // poco_assert("parseDatabaseAndTableName failed" == nullptr); + LOG_TRACE((&Poco::Logger::get("ParserRowPolicyName")), "parseDatabaseAndTableName failed"); return false; + } + if (any_table) + res_table_name = "*"; + /// If table is specified without DB it cannot be followed by "ON" /// (but can be followed by "ON CLUSTER"). 
@@ -51,8 +64,10 @@ namespace } + bool parseOnDBAndTableName(IParser::Pos & pos, Expected & expected, String & database, String & table_name) { + // poco_assert("parseOnDBAndTableNames" == nullptr); return IParserBase::wrapParseImpl(pos, [&] { return ParserKeyword{"ON"}.ignore(pos, expected) && parseDBAndTableName(pos, expected, database, table_name); @@ -62,6 +77,9 @@ namespace bool parseOnDBAndTableNames(IParser::Pos & pos, Expected & expected, std::vector> & database_and_table_names) { + // poco_assert("parseOnDBAndTableNames" == nullptr); + + return IParserBase::wrapParseImpl(pos, [&] { if (!ParserKeyword{"ON"}.ignore(pos, expected)) @@ -146,6 +164,7 @@ namespace bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; String cluster; if (!parseRowPolicyNamesAroundON(pos, expected, false, false, allow_on_cluster, full_names, cluster)) @@ -162,6 +181,7 @@ bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte bool ParserRowPolicyNames::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; size_t num_added_names_last_time = 0; String cluster; diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 8d794409f78..1b1ff439e13 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -139,6 +139,8 @@ void writeCommonErrorMessage( if (!query_description.empty()) out << " (" << query_description << ")"; + // poco_assert("writeCommonErrorMessage" == nullptr); + out << ": failed at position " << (last_token.begin - begin + 1); if (last_token.type == TokenType::EndOfStream || last_token.type == TokenType::Semicolon) From 7fa6ea4ccc7c03ea879396572fad24f7ffe93551 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 16 Mar 2023 17:02:05 +0100 Subject: [PATCH 22/45] row policy template: tests and code cleanup, code review changes --- .../statements/create/row-policy.md | 20 ++- src/Access/AccessEntityIO.cpp | 4 - src/Access/EnabledRowPolicies.cpp | 30 +--- src/Access/EnabledRowPolicies.h | 8 - src/Access/RolesOrUsersSet.cpp | 10 +- src/Access/RowPolicy.h | 4 +- src/Access/RowPolicyCache.cpp | 152 ++++++++---------- src/Access/RowPolicyCache.h | 8 +- .../Access/ParserCreateRowPolicyQuery.cpp | 1 - src/Parsers/Access/ParserRowPolicyName.cpp | 26 +-- src/Parsers/parseQuery.cpp | 2 - .../02131_row_policies_combination.sql | 8 + ...ow_policies_database_combination.reference | 42 +++++ ...5341_row_policies_database_combination.sql | 88 ++++++++++ .../25341_row_policy_database.reference | 22 +++ .../0_stateless/25341_row_policy_database.sql | 53 ++++++ 16 files changed, 315 insertions(+), 163 deletions(-) create mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.reference create mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.sql create mode 100644 tests/queries/0_stateless/25341_row_policy_database.reference create mode 100644 tests/queries/0_stateless/25341_row_policy_database.sql diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index aa0a07747f2..83bb2e6bb9a 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -14,8 +14,8 @@ Row policies makes sense only for users with readonly access. 
If user can modify Syntax: ``` sql -CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1 - [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2 ...] +CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1|db1.* + [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2|db2.* ...] [FOR SELECT] USING condition [AS {PERMISSIVE | RESTRICTIVE}] [TO {role1 [, role2 ...] | ALL | ALL EXCEPT role1 [, role2 ...]}] @@ -76,6 +76,20 @@ CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio enables the user `peter` to see rows only if both `b=1` AND `c=2`. +Database policies are combined with table policies. + +For example, the following policies + +``` sql +CREATE ROW POLICY pol1 ON mydb.* USING b=1 TO mira, peter +CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio +``` + +enables the user `peter` to see table1 rows only if both `b=1` AND `c=2`, although +any other table in mydb would have only `b=1` policy applied for the user. + + + ## ON CLUSTER Clause Allows creating row policies on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). @@ -88,3 +102,5 @@ Allows creating row policies on a cluster, see [Distributed DDL](../../../sql-re `CREATE ROW POLICY filter2 ON mydb.mytable USING a<1000 AND b=5 TO ALL EXCEPT mira` `CREATE ROW POLICY filter3 ON mydb.mytable USING 1 TO admin` + +`CREATE ROW POLICY filter4 ON mydb.* USING 1 TO admin` diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index 008ba5e5dfe..80bb63b04bf 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -28,9 +28,6 @@ #include #include -#include - - namespace DB { namespace ErrorCodes @@ -65,7 +62,6 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition) const char * end = begin + definition.size(); while (pos < end) { - LOG_TRACE((&Poco::Logger::get("deserializeAccessEntityImpl")), "{}", std::string(pos, end)); queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH)); while (isWhitespaceASCII(*pos) || *pos == ';') ++pos; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 225f211bdd4..9efac6e992e 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -3,8 +3,6 @@ #include #include -#include - namespace DB { @@ -20,12 +18,6 @@ size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const return std::hash{}(key.database) - std::hash{}(key.table_name) + static_cast(key.filter_type); } - -// size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const -// { -// return std::hash{}(key.database) + static_cast(key.filter_type); -// } - EnabledRowPolicies::EnabledRowPolicies() : params() { } @@ -40,37 +32,17 @@ EnabledRowPolicies::~EnabledRowPolicies() = default; RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const String & table_name, RowPolicyFilterType filter_type) const { /// We don't lock `mutex` here. 
- auto loaded = mixed_filters.load(); - { - - - for (auto it = loaded->begin(); it != loaded->end(); ++it) - { - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), " db: {}, table {}", it->first.database, it->first.table_name); - - } - - } - - - - auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) { - it = loaded->find({database, "*", filter_type}); + it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - not found ({} records)", - database, table_name, loaded->size()); return {}; } } - - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - found ({} records)", - database, table_name, loaded->size()); return it->second; } diff --git a/src/Access/EnabledRowPolicies.h b/src/Access/EnabledRowPolicies.h index e09d32264f0..b8e6b2e0549 100644 --- a/src/Access/EnabledRowPolicies.h +++ b/src/Access/EnabledRowPolicies.h @@ -72,14 +72,6 @@ private: auto toTuple() const { return std::tie(database, table_name, filter_type); } friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() != right.toTuple(); } - // friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) - // { - // return left.database == right.database && left.filter_type == right.filter_type; - // } - // friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) - // { - // return left.database != right.database || left.filter_type != right.filter_type; - // } }; struct Hash diff --git a/src/Access/RolesOrUsersSet.cpp b/src/Access/RolesOrUsersSet.cpp index 52374c3739d..c026ae42f76 100644 --- a/src/Access/RolesOrUsersSet.cpp +++ b/src/Access/RolesOrUsersSet.cpp @@ -228,25 +228,25 @@ void RolesOrUsersSet::add(const std::vector & ids_) bool RolesOrUsersSet::match(const UUID & id) const { - return (all || ids.count(id)) && !except_ids.count(id); + return (all || ids.contains(id)) && !except_ids.contains(id); } bool RolesOrUsersSet::match(const UUID & user_id, const boost::container::flat_set & enabled_roles) const { - if (!all && !ids.count(user_id)) + if (!all && !ids.contains(user_id)) { bool found_enabled_role = std::any_of( - enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return ids.count(enabled_role); }); + enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return ids.contains(enabled_role); }); if (!found_enabled_role) return false; } - if (except_ids.count(user_id)) + if (except_ids.contains(user_id)) return false; bool in_except_list = std::any_of( - enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return except_ids.count(enabled_role); }); + enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return except_ids.contains(enabled_role); }); return !in_except_list; } diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 31ee876b47b..b9ba528e9bb 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -14,6 +14,8 @@ namespace DB */ struct RowPolicy : public IAccessEntity { + static constexpr char ANY_TABLE_MARK[] = "*"; + void setShortName(const String & short_name); void setDatabase(const String & database); void setTableName(const String & table_name); @@ 
-34,7 +36,7 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } - bool isDatabase() const { return full_name.table_name == "*"; } + bool isDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } /// Sets that the policy is restrictive. /// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 81e5acdf3ce..07bec185131 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -11,14 +11,13 @@ #include #include -#include - namespace DB { namespace { - /// Accumulates filters from multiple row policies and joins them using the AND logical operation. + /// Helper to accumulate filters from multiple row policies and join them together + /// by AND or OR logical operations. class FiltersMixer { public: @@ -153,15 +152,7 @@ void RowPolicyCache::ensureAllRowPoliciesRead() auto policy = access_control.tryRead(id); if (policy) { - PolicyInfo policy_info(policy); - if (policy_info.database_and_table_name->second == "*") - { - database_policies.emplace(id, std::move(policy_info)); - } - else - { - table_policies.emplace(id, std::move(policy_info)); - } + all_policies.emplace(id, PolicyInfo(policy)); } } } @@ -170,23 +161,15 @@ void RowPolicyCache::ensureAllRowPoliciesRead() void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy) { std::lock_guard lock{mutex}; - bool found = true; - - auto it = table_policies.find(policy_id); - if (it == table_policies.end()) + auto it = all_policies.find(policy_id); + if (it == all_policies.end()) { - it = database_policies.find(policy_id); - if (it == database_policies.end()) - { - PolicyMap & policy_map = new_policy->isDatabase() ? 
database_policies : table_policies; - it = policy_map.emplace(policy_id, PolicyInfo(new_policy)).first; - found = false; - } + it = all_policies.emplace(policy_id, PolicyInfo(new_policy)).first; } - - if (found && it->second.policy == new_policy) + else { - return; + if (it->second.policy == new_policy) + return; } auto & info = it->second; @@ -198,15 +181,7 @@ void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPo void RowPolicyCache::rowPolicyRemoved(const UUID & policy_id) { std::lock_guard lock{mutex}; - auto it = database_policies.find(policy_id); - if (it != database_policies.end()) - { - database_policies.erase(it); - } - else - { - table_policies.erase(policy_id); - } + all_policies.erase(policy_id); mixFilters(); } @@ -246,73 +221,76 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::unordered_map table_mixers; std::unordered_map database_mixers; - - for (const auto & [policy_id, info] : database_policies) + /// populate database_mixers using database-level policies + /// to aggregate (mix) rules per database + for (const auto & [policy_id, info] : all_policies) { - const auto & policy = *info.policy; - bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + if (info.isDatabase()) { - auto filter_type_i = static_cast(filter_type); - if (info.parsed_filters[filter_type_i]) + const auto & policy = *info.policy; + bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { - MixedFiltersKey key{info.database_and_table_name->first, - info.database_and_table_name->second, - filter_type}; - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "db: {} : {}", key.database, key.table_name); - - auto & mixer = database_mixers[key]; // getting database level mixer - mixer.database_and_table_name = info.database_and_table_name; - if (match) + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) { - mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); - mixer.policies.push_back(info.policy); + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + + auto & mixer = database_mixers[key]; + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } } } } } - - for (const auto & [policy_id, info] : table_policies) + /// populate table_mixers using database_mixers and table-level policies + for (const auto & [policy_id, info] : all_policies) { - const auto & policy = *info.policy; - bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + if (!info.isDatabase()) { - auto filter_type_i = static_cast(filter_type); - if (info.parsed_filters[filter_type_i]) + const auto & policy = *info.policy; + bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { - MixedFiltersKey key{info.database_and_table_name->first, - info.database_and_table_name->second, - filter_type}; - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: {} : {}", key.database, key.table_name); - auto table_it = 
table_mixers.find(key); - if (table_it == table_mixers.end()) + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, looking for db"); - MixedFiltersKey database_key = key; - database_key.table_name = "*"; + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + auto table_it = table_mixers.find(key); + if (table_it == table_mixers.end()) + { /// no exact match - create new mixer + MixedFiltersKey database_key = key; + database_key.table_name = RowPolicy::ANY_TABLE_MARK; - auto database_it = database_mixers.find(database_key); + auto database_it = database_mixers.find(database_key); - if (database_it == database_mixers.end()) - { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database not found"); - table_it = table_mixers.try_emplace(key).first; + if (database_it == database_mixers.end()) + { + table_it = table_mixers.try_emplace(key).first; + } + else + { + /// table policies are based on database ones + table_it = table_mixers.insert({key, database_it->second}).first; + } } - else - { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database found"); - table_it = table_mixers.insert({key, database_it->second}).first; - } - } - auto & mixer = table_it->second; // table_mixers[key]; getting table level mixer - mixer.database_and_table_name = info.database_and_table_name; - if (match) - { - mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); - mixer.policies.push_back(info.policy); + auto & mixer = table_it->second; /// getting table level mixer + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } } } } @@ -320,7 +298,8 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) auto mixed_filters = boost::make_shared(); - for (auto mixer_map_ptr : { &table_mixers, &database_mixers}) + /// retrieve aggregated policies from mixers + for (auto * mixer_map_ptr : {&table_mixers, &database_mixers}) { for (auto & [key, mixer] : *mixer_map_ptr) { @@ -332,7 +311,6 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } } - enabled.mixed_filters.store(mixed_filters); } diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 49e6a2e5df3..7260de22164 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -12,7 +12,6 @@ namespace DB class AccessControl; struct RolesOrUsersSet; struct RowPolicy; - using RowPolicyPtr = std::shared_ptr; /// Stores read and parsed row policies. 
@@ -30,24 +29,21 @@ private: explicit PolicyInfo(const RowPolicyPtr & policy_) { setPolicy(policy_); } void setPolicy(const RowPolicyPtr & policy_); + bool isDatabase() const { return policy->isDatabase(); } RowPolicyPtr policy; const RolesOrUsersSet * roles = nullptr; std::shared_ptr> database_and_table_name; ASTPtr parsed_filters[static_cast(RowPolicyFilterType::MAX)]; }; - using PolicyMap = std::unordered_map; - void ensureAllRowPoliciesRead(); void rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy); void rowPolicyRemoved(const UUID & policy_id); void mixFilters(); void mixFiltersFor(EnabledRowPolicies & enabled); - const AccessControl & access_control; - PolicyMap database_policies; - PolicyMap table_policies; + std::unordered_map all_policies; bool all_policies_read = false; scope_guard subscription; std::map> enabled_row_policies; diff --git a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp index 03d0754ca1a..2c25fc14e7d 100644 --- a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp +++ b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp @@ -203,7 +203,6 @@ namespace bool ParserCreateRowPolicyQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserCreateRowPolicyQuery::parseImpl" == nullptr); bool alter = false; if (attach_mode) { diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index e087c45fad9..e5b4e01d5ac 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -7,8 +7,6 @@ #include #include -#include - namespace DB { @@ -28,19 +26,18 @@ namespace return IParserBase::wrapParseImpl(pos, [&] { String res_database, res_table_name; - // if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) - bool any_database = false; - bool any_table = true; + bool is_any_database = false; + bool is_any_table = false; - if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, any_database, res_table_name, any_table)) + if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, is_any_database, res_table_name, is_any_table) + || is_any_database) { - // poco_assert("parseDatabaseAndTableName failed" == nullptr); - LOG_TRACE((&Poco::Logger::get("ParserRowPolicyName")), "parseDatabaseAndTableName failed"); return false; } - if (any_table) - res_table_name = "*"; - + else if (is_any_table) + { + res_table_name = "*"; // RowPolicy::ANY_TABLE_MARK + } /// If table is specified without DB it cannot be followed by "ON" /// (but can be followed by "ON CLUSTER"). 
@@ -64,10 +61,8 @@ namespace } - bool parseOnDBAndTableName(IParser::Pos & pos, Expected & expected, String & database, String & table_name) { - // poco_assert("parseOnDBAndTableNames" == nullptr); return IParserBase::wrapParseImpl(pos, [&] { return ParserKeyword{"ON"}.ignore(pos, expected) && parseDBAndTableName(pos, expected, database, table_name); @@ -77,9 +72,6 @@ namespace bool parseOnDBAndTableNames(IParser::Pos & pos, Expected & expected, std::vector> & database_and_table_names) { - // poco_assert("parseOnDBAndTableNames" == nullptr); - - return IParserBase::wrapParseImpl(pos, [&] { if (!ParserKeyword{"ON"}.ignore(pos, expected)) @@ -164,7 +156,6 @@ namespace bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; String cluster; if (!parseRowPolicyNamesAroundON(pos, expected, false, false, allow_on_cluster, full_names, cluster)) @@ -181,7 +172,6 @@ bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte bool ParserRowPolicyNames::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; size_t num_added_names_last_time = 0; String cluster; diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 1b1ff439e13..8d794409f78 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -139,8 +139,6 @@ void writeCommonErrorMessage( if (!query_description.empty()) out << " (" << query_description << ")"; - // poco_assert("writeCommonErrorMessage" == nullptr); - out << ": failed at position " << (last_token.begin - begin + 1); if (last_token.type == TokenType::EndOfStream || last_token.type == TokenType::Semicolon) diff --git a/tests/queries/0_stateless/02131_row_policies_combination.sql b/tests/queries/0_stateless/02131_row_policies_combination.sql index b5be672bb1b..1cbbca754b6 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.sql +++ b/tests/queries/0_stateless/02131_row_policies_combination.sql @@ -8,6 +8,8 @@ DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; +-- the test assumes users_without_row_policies_can_read_rows is true + SELECT 'None'; SELECT * FROM 02131_rptable; @@ -52,3 +54,9 @@ SELECT 'None'; SELECT * FROM 02131_rptable; DROP TABLE 02131_rptable; + +DROP ROW POLICY IF EXISTS 02131_filter_1 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_2 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.reference b/tests/queries/0_stateless/25341_row_policies_database_combination.reference new file mode 100644 index 00000000000..68ed02d1dc0 --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policies_database_combination.reference @@ -0,0 +1,42 @@ +None +1 10 +2 20 +3 30 +4 40 +R1: x == 1 +1 10 +R1, R2: (x == 1) OR (x == 2) +1 10 +2 20 +R1, R2: (x == 2) FROM ANOTHER +2 20 +R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3) +1 10 +2 20 +3 30 +R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) +1 10 +2 20 +R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20) +2 20 +2 20 +R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM 
AFTER_RP +2 20 +R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER +2 20 +R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20) +2 20 +R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20) +R4, R5: (x <= 2) AND (y >= 20) +2 20 +R5: (x >= 2) +2 20 +3 30 +4 40 +Policy not applicable +None +1 10 +2 20 +3 30 +4 40 +No problematic policy, select works diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.sql b/tests/queries/0_stateless/25341_row_policies_database_combination.sql new file mode 100644 index 00000000000..aa9454b8c9b --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policies_database_combination.sql @@ -0,0 +1,88 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS 25341_db; +CREATE DATABASE 25341_db; +DROP TABLE IF EXISTS 25341_db.25341_rptable; +DROP TABLE IF EXISTS 25341_db.25341_rptable_another; +CREATE TABLE 25341_db.25341_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; + +INSERT INTO 25341_db.25341_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); + +CREATE TABLE 25341_db.25341_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; + + +DROP ROW POLICY IF EXISTS 25341_filter_1 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_2 ON 25341_db.*; +DROP ROW POLICY IF EXISTS 25341_filter_3 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_4 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_5 ON 25341_db.*; + +-- the test assumes users_without_row_policies_can_read_rows is true + +SELECT 'None'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable USING x=1 AS permissive TO ALL; +SELECT 'R1: x == 1'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_2 ON 25341_db.* USING x=2 AS permissive TO ALL; +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 25341_db.25341_rptable; + +SELECT 'R1, R2: (x == 2) FROM ANOTHER'; +SELECT * FROM 25341_db.25341_rptable_another; + +CREATE ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable USING x=3 AS permissive TO ALL; +SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable USING x<=2 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_5 ON 25341_db.* USING y>=20 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE TABLE 25341_db.25341_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; +SELECT * FROM 25341_db.25341_after_rp; + +-- does not matter if policies or table are created first +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; +SELECT * FROM 25341_db.25341_after_rp; + +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; +SELECT * FROM 25341_db.25341_rptable_another; + +DROP ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable; +SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 25341_filter_2 ON 25341_db.*; +SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable; +SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 
25341_filter_4 ON 25341_db.25341_rptable; +SELECT 'R5: (x >= 2)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE TABLE 25341_db.25341_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; +SELECT 'Policy not applicable'; +SELECT * FROM 25341_db.25341_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query + +DROP ROW POLICY 25341_filter_5 ON 25341_db.*; +SELECT 'None'; +SELECT * FROM 25341_db.25341_rptable; + +SELECT 'No problematic policy, select works'; +SELECT 'Ok' FROM 25341_db.25341_unexpected_columns; + +DROP TABLE 25341_db.25341_rptable; +DROP TABLE 25341_db.25341_rptable_another; +DROP TABLE 25341_db.25341_unexpected_columns; +DROP DATABASE 25341_db; diff --git a/tests/queries/0_stateless/25341_row_policy_database.reference b/tests/queries/0_stateless/25341_row_policy_database.reference new file mode 100644 index 00000000000..57125b64056 --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policy_database.reference @@ -0,0 +1,22 @@ +-- database level policies + -- SHOW CREATE POLICY db1_25341 ON db1_25341.* +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_25341.* +CREATE ROW POLICY `25341_filter_11` ON db1_25341.`25341_rqtable` FOR SELECT USING x = 2 TO ALL +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY tbl1_25341 ON db1_25341.table FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_25341.`*` +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL +R1, R2: (x == 1) OR (x == 2) +1 +2 +Check system.query_log +SELECT \'-- database level policies\'; [] +SELECT \' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_25341.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_25341.`*`\'; [] +SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] +SELECT * FROM 25341_rqtable_default; ['`25341_filter_11_db` ON default.`*`','`25341_filter_11` ON default.`25341_rqtable_default`'] +SELECT \'Check system.query_log\'; [] + -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE +CREATE ROW POLICY db2_25341 ON db1_25341.`*` TO u1_25341 diff --git a/tests/queries/0_stateless/25341_row_policy_database.sql b/tests/queries/0_stateless/25341_row_policy_database.sql new file mode 100644 index 00000000000..9d865487f0b --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policy_database.sql @@ -0,0 +1,53 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS db1_25341; +DROP USER IF EXISTS u1_25341; +CREATE USER u1_25341; + +CREATE DATABASE db1_25341; + +CREATE TABLE db1_25341.25341_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO db1_25341.25341_rqtable VALUES (1), (2), (3), (4); + + +SELECT '-- database level policies'; +CREATE ROW POLICY db1_25341 ON db1_25341.* USING 1 AS PERMISSIVE TO ALL; +CREATE ROW POLICY tbl1_25341 ON db1_25341.table USING 1 AS PERMISSIVE TO ALL; +SELECT ' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*'; +SHOW CREATE POLICY db1_25341 ON db1_25341.*; +SELECT ' -- SHOW CREATE POLICY ON db1_25341.*'; +SHOW CREATE POLICY ON db1_25341.*; +SELECT ' -- SHOW CREATE POLICY ON db1_25341.`*`'; +SHOW CREATE POLICY ON db1_25341.`*`; +DROP POLICY db1_25341 ON db1_25341.*; +DROP POLICY tbl1_25341 ON db1_25341.table; + +CREATE ROW POLICY any_25341 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } + +CREATE TABLE 25341_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; + +CREATE ROW POLICY 25341_filter_11_db ON * USING x=1 AS permissive TO ALL; +CREATE ROW POLICY 
25341_filter_11 ON 25341_rqtable_default USING x=2 AS permissive TO ALL; + +INSERT INTO 25341_rqtable_default VALUES (1), (2), (3), (4); + +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 25341_rqtable_default; + +DROP TABLE 25341_rqtable_default; + +SELECT 'Check system.query_log'; +SYSTEM FLUSH LOGS; +SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; + +DROP ROW POLICY 25341_filter_11_db ON *; +DROP ROW POLICY 25341_filter_11 ON 25341_rqtable_default; + +USE db1_25341; +SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +CREATE ROW POLICY db2_25341 ON * TO u1_25341; +SHOW CREATE POLICY db2_25341 ON *; + +DROP ROW POLICY db2_25341 ON *; + +DROP USER u1_25341; From 28c6d4fb630c47f821d2225677ec305a8fdfd883 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 30 Mar 2023 01:29:02 +0200 Subject: [PATCH 23/45] row policy template - rollback old test to make stylecheck happy --- .../0_stateless/02131_row_policies_combination.sql | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/queries/0_stateless/02131_row_policies_combination.sql b/tests/queries/0_stateless/02131_row_policies_combination.sql index 1cbbca754b6..b5be672bb1b 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.sql +++ b/tests/queries/0_stateless/02131_row_policies_combination.sql @@ -8,8 +8,6 @@ DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; --- the test assumes users_without_row_policies_can_read_rows is true - SELECT 'None'; SELECT * FROM 02131_rptable; @@ -54,9 +52,3 @@ SELECT 'None'; SELECT * FROM 02131_rptable; DROP TABLE 02131_rptable; - -DROP ROW POLICY IF EXISTS 02131_filter_1 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_2 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; From 2ed8e318cfd9d857869de601ed08ff4d9904a7b1 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 30 Mar 2023 10:13:50 +0200 Subject: [PATCH 24/45] row policy template - tests renumbered to keep monotony --- ...w_policies_database_combination.reference} | 0 ...2703_row_policies_database_combination.sql | 88 +++++++++++++++++++ .../02703_row_policy_database.reference | 21 +++++ .../0_stateless/02703_row_policy_database.sql | 53 +++++++++++ ...5341_row_policies_database_combination.sql | 88 ------------------- .../25341_row_policy_database.reference | 22 ----- .../0_stateless/25341_row_policy_database.sql | 53 ----------- 7 files changed, 162 insertions(+), 163 deletions(-) rename tests/queries/0_stateless/{25341_row_policies_database_combination.reference => 02703_row_policies_database_combination.reference} (100%) create mode 100644 tests/queries/0_stateless/02703_row_policies_database_combination.sql create mode 100644 tests/queries/0_stateless/02703_row_policy_database.reference create mode 100644 tests/queries/0_stateless/02703_row_policy_database.sql delete mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.sql delete mode 100644 tests/queries/0_stateless/25341_row_policy_database.reference delete mode 100644 tests/queries/0_stateless/25341_row_policy_database.sql diff --git 
a/tests/queries/0_stateless/25341_row_policies_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_database_combination.reference similarity index 100% rename from tests/queries/0_stateless/25341_row_policies_database_combination.reference rename to tests/queries/0_stateless/02703_row_policies_database_combination.reference diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_database_combination.sql new file mode 100644 index 00000000000..f9b466f1ade --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_database_combination.sql @@ -0,0 +1,88 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS 02703_db; +CREATE DATABASE 02703_db; +DROP TABLE IF EXISTS 02703_db.02703_rptable; +DROP TABLE IF EXISTS 02703_db.02703_rptable_another; +CREATE TABLE 02703_db.02703_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; + +INSERT INTO 02703_db.02703_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); + +CREATE TABLE 02703_db.02703_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 02703_db.02703_rptable; + + +DROP ROW POLICY IF EXISTS 02703_filter_1 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_2 ON 02703_db.*; +DROP ROW POLICY IF EXISTS 02703_filter_3 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_4 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_5 ON 02703_db.*; + +-- the test assumes users_without_row_policies_can_read_rows is true + +SELECT 'None'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_1 ON 02703_db.02703_rptable USING x=1 AS permissive TO ALL; +SELECT 'R1: x == 1'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_2 ON 02703_db.* USING x=2 AS permissive TO ALL; +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 02703_db.02703_rptable; + +SELECT 'R1, R2: (x == 2) FROM ANOTHER'; +SELECT * FROM 02703_db.02703_rptable_another; + +CREATE ROW POLICY 02703_filter_3 ON 02703_db.02703_rptable USING x=3 AS permissive TO ALL; +SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_4 ON 02703_db.02703_rptable USING x<=2 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_5 ON 02703_db.* USING y>=20 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE TABLE 02703_db.02703_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 02703_db.02703_rptable; +SELECT * FROM 02703_db.02703_after_rp; + +-- does not matter if policies or table are created first +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; +SELECT * FROM 02703_db.02703_after_rp; + +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; +SELECT * FROM 02703_db.02703_rptable_another; + +DROP ROW POLICY 02703_filter_1 ON 02703_db.02703_rptable; +SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_2 ON 02703_db.*; +SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_3 ON 02703_db.02703_rptable; +SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_4 ON 
02703_db.02703_rptable; +SELECT 'R5: (x >= 2)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE TABLE 02703_db.02703_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; +SELECT 'Policy not applicable'; +SELECT * FROM 02703_db.02703_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query + +DROP ROW POLICY 02703_filter_5 ON 02703_db.*; +SELECT 'None'; +SELECT * FROM 02703_db.02703_rptable; + +SELECT 'No problematic policy, select works'; +SELECT 'Ok' FROM 02703_db.02703_unexpected_columns; + +DROP TABLE 02703_db.02703_rptable; +DROP TABLE 02703_db.02703_rptable_another; +DROP TABLE 02703_db.02703_unexpected_columns; +DROP DATABASE 02703_db; diff --git a/tests/queries/0_stateless/02703_row_policy_database.reference b/tests/queries/0_stateless/02703_row_policy_database.reference new file mode 100644 index 00000000000..e318dfac429 --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policy_database.reference @@ -0,0 +1,21 @@ +-- database level policies + -- SHOW CREATE POLICY db1_02703 ON db1_02703.* +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_02703.* +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY tbl1_02703 ON db1_02703.table FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_02703.`*` +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +R1, R2: (x == 1) OR (x == 2) +1 +2 +Check system.query_log +SELECT \'-- database level policies\'; [] +SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_02703.`*`\'; [] +SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] +SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] +SELECT \'Check system.query_log\'; [] + -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE +CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 diff --git a/tests/queries/0_stateless/02703_row_policy_database.sql b/tests/queries/0_stateless/02703_row_policy_database.sql new file mode 100644 index 00000000000..85f5a44dfbf --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policy_database.sql @@ -0,0 +1,53 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS db1_02703; +DROP USER IF EXISTS u1_02703; +CREATE USER u1_02703; + +CREATE DATABASE db1_02703; + +CREATE TABLE db1_02703.02703_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO db1_02703.02703_rqtable VALUES (1), (2), (3), (4); + + +SELECT '-- database level policies'; +CREATE ROW POLICY db1_02703 ON db1_02703.* USING 1 AS PERMISSIVE TO ALL; +CREATE ROW POLICY tbl1_02703 ON db1_02703.table USING 1 AS PERMISSIVE TO ALL; +SELECT ' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*'; +SHOW CREATE POLICY db1_02703 ON db1_02703.*; +SELECT ' -- SHOW CREATE POLICY ON db1_02703.*'; +SHOW CREATE POLICY ON db1_02703.*; +SELECT ' -- SHOW CREATE POLICY ON db1_02703.`*`'; +SHOW CREATE POLICY ON db1_02703.`*`; +DROP POLICY db1_02703 ON db1_02703.*; +DROP POLICY tbl1_02703 ON db1_02703.table; + +CREATE ROW POLICY any_02703 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } + +CREATE TABLE 02703_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; + +CREATE ROW POLICY 02703_filter_11_db ON * USING x=1 AS permissive TO ALL; +CREATE ROW POLICY 02703_filter_11 ON 02703_rqtable_default USING x=2 AS permissive TO ALL; + +INSERT INTO 02703_rqtable_default 
VALUES (1), (2), (3), (4); + +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 02703_rqtable_default; + +DROP TABLE 02703_rqtable_default; + +SELECT 'Check system.query_log'; +SYSTEM FLUSH LOGS; +SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; + +DROP ROW POLICY 02703_filter_11_db ON *; +DROP ROW POLICY 02703_filter_11 ON 02703_rqtable_default; + +USE db1_02703; +SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +CREATE ROW POLICY db2_02703 ON * TO u1_02703; +SHOW CREATE POLICY db2_02703 ON *; + +DROP ROW POLICY db2_02703 ON *; + +DROP USER u1_02703; diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.sql b/tests/queries/0_stateless/25341_row_policies_database_combination.sql deleted file mode 100644 index aa9454b8c9b..00000000000 --- a/tests/queries/0_stateless/25341_row_policies_database_combination.sql +++ /dev/null @@ -1,88 +0,0 @@ --- Tags: no-parallel - -DROP DATABASE IF EXISTS 25341_db; -CREATE DATABASE 25341_db; -DROP TABLE IF EXISTS 25341_db.25341_rptable; -DROP TABLE IF EXISTS 25341_db.25341_rptable_another; -CREATE TABLE 25341_db.25341_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; - -INSERT INTO 25341_db.25341_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); - -CREATE TABLE 25341_db.25341_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; - - -DROP ROW POLICY IF EXISTS 25341_filter_1 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_2 ON 25341_db.*; -DROP ROW POLICY IF EXISTS 25341_filter_3 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_4 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_5 ON 25341_db.*; - --- the test assumes users_without_row_policies_can_read_rows is true - -SELECT 'None'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable USING x=1 AS permissive TO ALL; -SELECT 'R1: x == 1'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_2 ON 25341_db.* USING x=2 AS permissive TO ALL; -SELECT 'R1, R2: (x == 1) OR (x == 2)'; -SELECT * FROM 25341_db.25341_rptable; - -SELECT 'R1, R2: (x == 2) FROM ANOTHER'; -SELECT * FROM 25341_db.25341_rptable_another; - -CREATE ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable USING x=3 AS permissive TO ALL; -SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable USING x<=2 AS restrictive TO ALL; -SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_5 ON 25341_db.* USING y>=20 AS restrictive TO ALL; -SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE TABLE 25341_db.25341_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; -SELECT * FROM 25341_db.25341_after_rp; - --- does not matter if policies or table are created first -SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; -SELECT * FROM 25341_db.25341_after_rp; - -SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; -SELECT * FROM 25341_db.25341_rptable_another; - -DROP ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable; -SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; 
-SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_2 ON 25341_db.*; -SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable; -SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable; -SELECT 'R5: (x >= 2)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE TABLE 25341_db.25341_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; -SELECT 'Policy not applicable'; -SELECT * FROM 25341_db.25341_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query - -DROP ROW POLICY 25341_filter_5 ON 25341_db.*; -SELECT 'None'; -SELECT * FROM 25341_db.25341_rptable; - -SELECT 'No problematic policy, select works'; -SELECT 'Ok' FROM 25341_db.25341_unexpected_columns; - -DROP TABLE 25341_db.25341_rptable; -DROP TABLE 25341_db.25341_rptable_another; -DROP TABLE 25341_db.25341_unexpected_columns; -DROP DATABASE 25341_db; diff --git a/tests/queries/0_stateless/25341_row_policy_database.reference b/tests/queries/0_stateless/25341_row_policy_database.reference deleted file mode 100644 index 57125b64056..00000000000 --- a/tests/queries/0_stateless/25341_row_policy_database.reference +++ /dev/null @@ -1,22 +0,0 @@ --- database level policies - -- SHOW CREATE POLICY db1_25341 ON db1_25341.* -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL - -- SHOW CREATE POLICY ON db1_25341.* -CREATE ROW POLICY `25341_filter_11` ON db1_25341.`25341_rqtable` FOR SELECT USING x = 2 TO ALL -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL -CREATE ROW POLICY tbl1_25341 ON db1_25341.table FOR SELECT USING 1 TO ALL - -- SHOW CREATE POLICY ON db1_25341.`*` -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL -R1, R2: (x == 1) OR (x == 2) -1 -2 -Check system.query_log -SELECT \'-- database level policies\'; [] -SELECT \' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*\'; [] -SELECT \' -- SHOW CREATE POLICY ON db1_25341.*\'; [] -SELECT \' -- SHOW CREATE POLICY ON db1_25341.`*`\'; [] -SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] -SELECT * FROM 25341_rqtable_default; ['`25341_filter_11_db` ON default.`*`','`25341_filter_11` ON default.`25341_rqtable_default`'] -SELECT \'Check system.query_log\'; [] - -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE -CREATE ROW POLICY db2_25341 ON db1_25341.`*` TO u1_25341 diff --git a/tests/queries/0_stateless/25341_row_policy_database.sql b/tests/queries/0_stateless/25341_row_policy_database.sql deleted file mode 100644 index 9d865487f0b..00000000000 --- a/tests/queries/0_stateless/25341_row_policy_database.sql +++ /dev/null @@ -1,53 +0,0 @@ --- Tags: no-parallel - -DROP DATABASE IF EXISTS db1_25341; -DROP USER IF EXISTS u1_25341; -CREATE USER u1_25341; - -CREATE DATABASE db1_25341; - -CREATE TABLE db1_25341.25341_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; -INSERT INTO db1_25341.25341_rqtable VALUES (1), (2), (3), (4); - - -SELECT '-- database level policies'; -CREATE ROW POLICY db1_25341 ON db1_25341.* USING 1 AS PERMISSIVE TO ALL; -CREATE ROW POLICY tbl1_25341 ON db1_25341.table USING 1 AS PERMISSIVE TO ALL; -SELECT ' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*'; -SHOW CREATE POLICY db1_25341 ON db1_25341.*; -SELECT ' -- SHOW CREATE POLICY ON db1_25341.*'; -SHOW CREATE POLICY ON db1_25341.*; -SELECT ' -- SHOW CREATE POLICY ON db1_25341.`*`'; -SHOW CREATE POLICY ON db1_25341.`*`; 
-DROP POLICY db1_25341 ON db1_25341.*; -DROP POLICY tbl1_25341 ON db1_25341.table; - -CREATE ROW POLICY any_25341 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } - -CREATE TABLE 25341_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; - -CREATE ROW POLICY 25341_filter_11_db ON * USING x=1 AS permissive TO ALL; -CREATE ROW POLICY 25341_filter_11 ON 25341_rqtable_default USING x=2 AS permissive TO ALL; - -INSERT INTO 25341_rqtable_default VALUES (1), (2), (3), (4); - -SELECT 'R1, R2: (x == 1) OR (x == 2)'; -SELECT * FROM 25341_rqtable_default; - -DROP TABLE 25341_rqtable_default; - -SELECT 'Check system.query_log'; -SYSTEM FLUSH LOGS; -SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; - -DROP ROW POLICY 25341_filter_11_db ON *; -DROP ROW POLICY 25341_filter_11 ON 25341_rqtable_default; - -USE db1_25341; -SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; -CREATE ROW POLICY db2_25341 ON * TO u1_25341; -SHOW CREATE POLICY db2_25341 ON *; - -DROP ROW POLICY db2_25341 ON *; - -DROP USER u1_25341; From 6852ae0d938131863138ed532ed291acbf750444 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Tue, 2 May 2023 21:11:18 +0000 Subject: [PATCH 25/45] row_policy_template - external code review changes --- src/Access/EnabledRowPolicies.cpp | 2 +- src/Access/RowPolicy.h | 4 +++- src/Access/RowPolicyCache.cpp | 16 ++++++++++++---- src/Access/RowPolicyCache.h | 2 +- ...ase.sql => 02703_row_policy_for_database.sql} | 0 5 files changed, 17 insertions(+), 7 deletions(-) rename tests/queries/0_stateless/{02703_row_policy_database.sql => 02703_row_policy_for_database.sql} (100%) diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 9efac6e992e..be78dd62146 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -35,7 +35,7 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const auto loaded = mixed_filters.load(); auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) - { + { /// Look for a policy for database if a table policy not found it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index b9ba528e9bb..348ebfa1637 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -36,7 +36,9 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } - bool isDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } + + /// Applied for entire database + bool isForDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } /// Sets that the policy is restrictive. 
/// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 07bec185131..07670a8fe84 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -218,14 +218,13 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::vector policies; }; - std::unordered_map table_mixers; std::unordered_map database_mixers; /// populate database_mixers using database-level policies /// to aggregate (mix) rules per database for (const auto & [policy_id, info] : all_policies) { - if (info.isDatabase()) + if (info.isForDatabase()) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); @@ -250,10 +249,12 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } } + std::unordered_map table_mixers; + /// populate table_mixers using database_mixers and table-level policies for (const auto & [policy_id, info] : all_policies) { - if (!info.isDatabase()) + if (!info.isForDatabase()) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); @@ -298,7 +299,14 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) auto mixed_filters = boost::make_shared(); - /// retrieve aggregated policies from mixers + /// Retrieve aggregated policies from mixers + /// if a table has a policy for this particular table, we have all needed information in table_mixers + /// (policies for the database are already applied) + /// otherwise we would look for a policy for database using RowPolicy::ANY_TABLE_MARK + /// Consider restrictive policies a=1 for db.t, b=2 for db.* and c=3 for db.* + /// We are going to have two items in mixed_filters: + /// 1. a=1 AND b=2 AND c=3 for db.t (comes from table_mixers, where it had been created with the help of database_mixers) + /// 2. 
b=2 AND c=3 for db.* (comes directly from database_mixers) for (auto * mixer_map_ptr : {&table_mixers, &database_mixers}) { for (auto & [key, mixer] : *mixer_map_ptr) diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 7260de22164..df263416509 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -29,7 +29,7 @@ private: explicit PolicyInfo(const RowPolicyPtr & policy_) { setPolicy(policy_); } void setPolicy(const RowPolicyPtr & policy_); - bool isDatabase() const { return policy->isDatabase(); } + bool isForDatabase() const { return policy->isForDatabase(); } RowPolicyPtr policy; const RolesOrUsersSet * roles = nullptr; std::shared_ptr> database_and_table_name; diff --git a/tests/queries/0_stateless/02703_row_policy_database.sql b/tests/queries/0_stateless/02703_row_policy_for_database.sql similarity index 100% rename from tests/queries/0_stateless/02703_row_policy_database.sql rename to tests/queries/0_stateless/02703_row_policy_for_database.sql From d54a62e8e9ac1db9b0e37b75902e6762e520e3aa Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Wed, 3 May 2023 08:56:45 +0000 Subject: [PATCH 26/45] row_policy_template - forgotten reference test file --- ...database.reference => 02703_row_policy_for_database.reference} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02703_row_policy_database.reference => 02703_row_policy_for_database.reference} (100%) diff --git a/tests/queries/0_stateless/02703_row_policy_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference similarity index 100% rename from tests/queries/0_stateless/02703_row_policy_database.reference rename to tests/queries/0_stateless/02703_row_policy_for_database.reference From 9685aa0c9174d11905871aad2bd598e03d017bcd Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 4 May 2023 14:35:31 +0000 Subject: [PATCH 27/45] row_policy_template - minor changes, test renamed --- ... 
=> 02703_row_policies_for_database_combination.reference} | 0 ...on.sql => 02703_row_policies_for_database_combination.sql} | 0 tests/queries/0_stateless/02703_row_policy_for_database.sql | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename tests/queries/0_stateless/{02703_row_policies_database_combination.reference => 02703_row_policies_for_database_combination.reference} (100%) rename tests/queries/0_stateless/{02703_row_policies_database_combination.sql => 02703_row_policies_for_database_combination.sql} (100%) diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference similarity index 100% rename from tests/queries/0_stateless/02703_row_policies_database_combination.reference rename to tests/queries/0_stateless/02703_row_policies_for_database_combination.reference diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql similarity index 100% rename from tests/queries/0_stateless/02703_row_policies_database_combination.sql rename to tests/queries/0_stateless/02703_row_policies_for_database_combination.sql diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.sql b/tests/queries/0_stateless/02703_row_policy_for_database.sql index 85f5a44dfbf..03183a96b98 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.sql +++ b/tests/queries/0_stateless/02703_row_policy_for_database.sql @@ -10,7 +10,7 @@ CREATE TABLE db1_02703.02703_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; INSERT INTO db1_02703.02703_rqtable VALUES (1), (2), (3), (4); -SELECT '-- database level policies'; +SELECT '-- row policies for database'; CREATE ROW POLICY db1_02703 ON db1_02703.* USING 1 AS PERMISSIVE TO ALL; CREATE ROW POLICY tbl1_02703 ON db1_02703.table USING 1 AS PERMISSIVE TO ALL; SELECT ' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*'; @@ -44,7 +44,7 @@ DROP ROW POLICY 02703_filter_11_db ON *; DROP ROW POLICY 02703_filter_11 ON 02703_rqtable_default; USE db1_02703; -SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +SELECT ' -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE'; CREATE ROW POLICY db2_02703 ON * TO u1_02703; SHOW CREATE POLICY db2_02703 ON *; From 1027db6acaadbb20fcfedaac3416d98bd75b8e7d Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 5 May 2023 11:06:15 +0000 Subject: [PATCH 28/45] row_policy_template - minor change, reference test file fix --- .../0_stateless/02703_row_policy_for_database.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference index e318dfac429..5f0b486c1bd 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.reference +++ b/tests/queries/0_stateless/02703_row_policy_for_database.reference @@ -1,4 +1,4 @@ --- database level policies +-- row policies for database -- SHOW CREATE POLICY db1_02703 ON db1_02703.* CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.* @@ -10,12 +10,12 @@ R1, R2: (x == 1) OR (x == 2) 1 2 Check system.query_log -SELECT \'-- database level policies\'; [] +SELECT \'-- row policies for database\'; [] SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON 
db1_02703.`*`\'; [] SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] SELECT \'Check system.query_log\'; [] - -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE + -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 From 9ef610040f691bc39ca711566de9feb2fbde44f1 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 12 May 2023 22:30:00 +0000 Subject: [PATCH 29/45] row_policy_template - table `*` is respected --- docs/en/operations/system-tables/row_policies.md | 2 +- src/Access/Common/RowPolicyDefs.cpp | 2 +- src/Access/Common/RowPolicyDefs.h | 2 ++ src/Access/EnabledRowPolicies.cpp | 10 +++++++++- src/Access/RowPolicy.h | 4 +--- src/Access/RowPolicyCache.cpp | 2 +- src/Parsers/Access/ASTRowPolicyName.cpp | 11 ++++++++--- src/Parsers/Access/ASTRowPolicyName.h | 3 +++ src/Parsers/Access/ParserRowPolicyName.cpp | 2 +- ...03_row_policies_for_database_combination.reference | 2 ++ .../02703_row_policies_for_database_combination.sql | 10 ++++++++++ .../02703_row_policy_for_database.reference | 9 ++++----- 12 files changed, 43 insertions(+), 16 deletions(-) diff --git a/docs/en/operations/system-tables/row_policies.md b/docs/en/operations/system-tables/row_policies.md index 2c4d060ce66..e92ba1ece74 100644 --- a/docs/en/operations/system-tables/row_policies.md +++ b/docs/en/operations/system-tables/row_policies.md @@ -12,7 +12,7 @@ Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. -- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. Empty if policy for database. - `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Row policy ID. diff --git a/src/Access/Common/RowPolicyDefs.cpp b/src/Access/Common/RowPolicyDefs.cpp index ba7856116f6..b1f882fe971 100644 --- a/src/Access/Common/RowPolicyDefs.cpp +++ b/src/Access/Common/RowPolicyDefs.cpp @@ -22,7 +22,7 @@ String RowPolicyName::toString() const name += backQuoteIfNeed(database); name += '.'; } - name += backQuoteIfNeed(table_name); + name += (table_name == RowPolicyName::ANY_TABLE_MARK ? "*" : backQuoteIfNeed(table_name)); return name; } diff --git a/src/Access/Common/RowPolicyDefs.h b/src/Access/Common/RowPolicyDefs.h index 792884c56df..7ffc99e1272 100644 --- a/src/Access/Common/RowPolicyDefs.h +++ b/src/Access/Common/RowPolicyDefs.h @@ -9,6 +9,8 @@ namespace DB /// Represents the full name of a row policy, e.g. "myfilter ON mydb.mytable". 
struct RowPolicyName { + static constexpr char ANY_TABLE_MARK[] = ""; + String short_name; String database; String table_name; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index be78dd62146..601f004e3ea 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -36,11 +36,19 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) { /// Look for a policy for database if a table policy not found - it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); + it = loaded->find({database, RowPolicyName::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { return {}; } + else + { + // deep copy found policy for database and change its table name to the actual one + auto policy_for_database = std::make_shared(*it->second); + auto database_and_table_name = std::make_shared>(database, table_name); + policy_for_database->database_and_table_name = database_and_table_name; + return policy_for_database; + } } return it->second; diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 348ebfa1637..9c190458620 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -14,8 +14,6 @@ namespace DB */ struct RowPolicy : public IAccessEntity { - static constexpr char ANY_TABLE_MARK[] = "*"; - void setShortName(const String & short_name); void setDatabase(const String & database); void setTableName(const String & table_name); @@ -38,7 +36,7 @@ struct RowPolicy : public IAccessEntity bool isPermissive() const { return !isRestrictive(); } /// Applied for entire database - bool isForDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } + bool isForDatabase() const { return full_name.table_name == RowPolicyName::ANY_TABLE_MARK; } /// Sets that the policy is restrictive. /// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 07670a8fe84..bb9da674477 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -270,7 +270,7 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) if (table_it == table_mixers.end()) { /// no exact match - create new mixer MixedFiltersKey database_key = key; - database_key.table_name = RowPolicy::ANY_TABLE_MARK; + database_key.table_name = RowPolicyName::ANY_TABLE_MARK; auto database_it = database_mixers.find(database_key); diff --git a/src/Parsers/Access/ASTRowPolicyName.cpp b/src/Parsers/Access/ASTRowPolicyName.cpp index 4edfa61f10e..81a90de9d53 100644 --- a/src/Parsers/Access/ASTRowPolicyName.cpp +++ b/src/Parsers/Access/ASTRowPolicyName.cpp @@ -30,6 +30,11 @@ void ASTRowPolicyName::replaceEmptyDatabase(const String & current_database) full_name.database = current_database; } +String ASTRowPolicyNames::tableOrAsterisk(const String & table_name) const +{ + return table_name == RowPolicyName::ANY_TABLE_MARK ? 
"*" : backQuoteIfNeed(table_name); +} + void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const { @@ -73,7 +78,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState const String & table_name = full_name.table_name; if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } } else if (same_db_and_table_name) @@ -92,7 +97,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState settings.ostr << (settings.hilite ? hilite_keyword : "") << " ON " << (settings.hilite ? hilite_none : ""); if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } else { @@ -108,7 +113,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState << (settings.hilite ? hilite_none : ""); if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } } diff --git a/src/Parsers/Access/ASTRowPolicyName.h b/src/Parsers/Access/ASTRowPolicyName.h index 9f4848bd612..86171475a0a 100644 --- a/src/Parsers/Access/ASTRowPolicyName.h +++ b/src/Parsers/Access/ASTRowPolicyName.h @@ -45,5 +45,8 @@ public: ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceEmptyDatabase(const String & current_database); + +private: + String tableOrAsterisk(const String & table_name) const; }; } diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index e5b4e01d5ac..efdff3c24bf 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -36,7 +36,7 @@ namespace } else if (is_any_table) { - res_table_name = "*"; // RowPolicy::ANY_TABLE_MARK + res_table_name = RowPolicyName::ANY_TABLE_MARK; } /// If table is specified without DB it cannot be followed by "ON" diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference index 68ed02d1dc0..fa01904b846 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference @@ -40,3 +40,5 @@ None 3 30 4 40 No problematic policy, select works +Policy for table `*` does not affect other tables in the database +other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql index f9b466f1ade..9941d69979d 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql @@ -86,3 +86,13 @@ DROP TABLE 02703_db.02703_rptable; DROP TABLE 02703_db.02703_rptable_another; DROP TABLE 02703_db.02703_unexpected_columns; DROP DATABASE 02703_db; + +SELECT 'Policy for table `*` does not affect other tables in the database'; +CREATE DATABASE 02703_db_asterisk; +CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; +CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; 
+CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +SELECT 'star', * FROM 02703_db_asterisk.`*`; +SELECT 'other', * FROM 02703_db_asterisk.other; +DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; +DROP DATABASE 02703_db_asterisk; diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference index 5f0b486c1bd..ec03e538d95 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.reference +++ b/tests/queries/0_stateless/02703_row_policy_for_database.reference @@ -1,11 +1,10 @@ -- row policies for database -- SHOW CREATE POLICY db1_02703 ON db1_02703.* -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY db1_02703 ON db1_02703.* FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.* -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY db1_02703 ON db1_02703.* FOR SELECT USING 1 TO ALL CREATE ROW POLICY tbl1_02703 ON db1_02703.table FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.`*` -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL R1, R2: (x == 1) OR (x == 2) 1 2 @@ -15,7 +14,7 @@ SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.`*`\'; [] SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] -SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] +SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.*','`02703_filter_11` ON default.`02703_rqtable_default`'] SELECT \'Check system.query_log\'; [] -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE -CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 +CREATE ROW POLICY db2_02703 ON db1_02703.* TO u1_02703 From ccd82842a0e197f3aa1dbfda2f2977382ad16345 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 12 May 2023 22:32:53 +0000 Subject: [PATCH 30/45] row_policy_template - rollback tweaking found policy for database --- src/Access/EnabledRowPolicies.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 601f004e3ea..8ab1bf5928b 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -41,14 +41,6 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const { return {}; } - else - { - // deep copy found policy for database and change its table name to the actual one - auto policy_for_database = std::make_shared(*it->second); - auto database_and_table_name = std::make_shared>(database, table_name); - policy_for_database->database_and_table_name = database_and_table_name; - return policy_for_database; - } } return it->second; From ad8e114bcf411104e08ffe4c71752e08fa6d308a Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Sat, 13 May 2023 07:20:38 +0000 Subject: [PATCH 31/45] row_policy_template - row policies not in query log if analyzer used --- tests/broken_tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 3278f720daf..0b4efacba0b 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -138,3 +138,4 @@ 01600_parts_states_metrics_long 01600_parts_types_metrics_long 01287_max_execution_speed +02703_row_policy_for_database From 
47e7e7e60a5ac177e51d32dd9cf2080dbab85f75 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Sat, 13 May 2023 22:26:31 +0000 Subject: [PATCH 32/45] row_policy_template - test for asterisk table in a dedicated file --- .../02703_row_policies_for_asterisk.reference | 2 ++ .../0_stateless/02703_row_policies_for_asterisk.sql | 11 +++++++++++ ...03_row_policies_for_database_combination.reference | 2 -- .../02703_row_policies_for_database_combination.sql | 10 ---------- 4 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/02703_row_policies_for_asterisk.reference create mode 100644 tests/queries/0_stateless/02703_row_policies_for_asterisk.sql diff --git a/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference b/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference new file mode 100644 index 00000000000..528bd7ef91e --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference @@ -0,0 +1,2 @@ +Policy for table `*` does not affect other tables in the database +other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql b/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql new file mode 100644 index 00000000000..96b1c01a6d6 --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql @@ -0,0 +1,11 @@ +-- Tags: no-parallel + +SELECT 'Policy for table `*` does not affect other tables in the database'; +CREATE DATABASE 02703_db_asterisk; +CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; +CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +SELECT 'star', * FROM 02703_db_asterisk.`*`; +SELECT 'other', * FROM 02703_db_asterisk.other; +DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; +DROP DATABASE 02703_db_asterisk; diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference index fa01904b846..68ed02d1dc0 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference @@ -40,5 +40,3 @@ None 3 30 4 40 No problematic policy, select works -Policy for table `*` does not affect other tables in the database -other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql index 9941d69979d..f9b466f1ade 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql @@ -86,13 +86,3 @@ DROP TABLE 02703_db.02703_rptable; DROP TABLE 02703_db.02703_rptable_another; DROP TABLE 02703_db.02703_unexpected_columns; DROP DATABASE 02703_db; - -SELECT 'Policy for table `*` does not affect other tables in the database'; -CREATE DATABASE 02703_db_asterisk; -CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; -CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; -CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; -SELECT 'star', * FROM 02703_db_asterisk.`*`; -SELECT 'other', * FROM 
02703_db_asterisk.other; -DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; -DROP DATABASE 02703_db_asterisk; From 665545ec45d4fc0e15e602dcbcc990621d904623 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 May 2023 09:35:04 +0200 Subject: [PATCH 33/45] Fix "reference to local binding" after fixes for clang-17 Follow-up for: #49851 (cc @alexey-milovidov) Signed-off-by: Azat Khuzhin --- src/Coordination/KeeperStorage.cpp | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 58fccc83e6a..7a1a5e42632 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -339,37 +339,37 @@ void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) nodes.emplace(delta.path, UncommittedNode{.node = nullptr}); } - auto & [node, acls, last_applied_zxid] = nodes.at(delta.path); - std::visit( - [&, &my_node = node, &my_acls = acls, &my_last_applied_zxid = last_applied_zxid](const DeltaType & operation) + [&](const DeltaType & operation) { + auto & [node, acls, last_applied_zxid] = nodes.at(delta.path); + if constexpr (std::same_as) { - assert(!my_node); - my_node = std::make_shared(); - my_node->stat = operation.stat; - my_node->setData(operation.data); - my_acls = operation.acls; - my_last_applied_zxid = delta.zxid; + assert(!node); + node = std::make_shared(); + node->stat = operation.stat; + node->setData(operation.data); + acls = operation.acls; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(my_node); - my_node = nullptr; - my_last_applied_zxid = delta.zxid; + assert(node); + node = nullptr; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(my_node); - my_node->invalidateDigestCache(); + assert(node); + node->invalidateDigestCache(); operation.update_fn(*node); - my_last_applied_zxid = delta.zxid; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - my_acls = operation.acls; - my_last_applied_zxid = delta.zxid; + acls = operation.acls; + last_applied_zxid = delta.zxid; } }, delta.operation); From 424a20ee1cb53a0c64d13c6ed08d1ce54872ddb5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 May 2023 14:17:32 +0200 Subject: [PATCH 34/45] Log with warning if the server was terminated forcefully In case of it is terminated forcefully it will not be terminated gracefully (i.e. run dtors and stuff), and by using warning log level those messages will go to clickhouse-server.err.log, in which messages are kept for a longer period then in clickhouse-server.log (at least because it contains only warnings, errors and fatals only). This will help with investigating some obscure issues. Signed-off-by: Azat Khuzhin --- programs/server/Server.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b5e44d90129..632f3f3a02d 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1872,7 +1872,7 @@ try } if (current_connections) - LOG_INFO(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); + LOG_WARNING(log, "Closed all listening sockets. 
Waiting for {} outstanding connections.", current_connections); else LOG_INFO(log, "Closed all listening sockets."); @@ -1884,7 +1884,7 @@ try current_connections = waitServersToFinish(servers, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) - LOG_INFO(log, "Closed connections. But {} remain." + LOG_WARNING(log, "Closed connections. But {} remain." " Tip: To increase wait time add to config: 60", current_connections); else LOG_INFO(log, "Closed connections."); @@ -1900,7 +1900,7 @@ try /// Dump coverage here, because std::atexit callback would not be called. dumpCoverageReportIfPossible(); - LOG_INFO(log, "Will shutdown forcefully."); + LOG_WARNING(log, "Will shutdown forcefully."); safeExit(0); } }); From dccdb3e6786d10803968504d1179804789edaf0a Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 15 May 2023 14:41:51 +0200 Subject: [PATCH 35/45] work with comments on PR --- ...oryStream.cpp => StdIStreamFromMemory.cpp} | 20 ++++------- ...3MemoryStream.h => StdIStreamFromMemory.h} | 17 +++++----- src/IO/WriteBufferFromS3.cpp | 24 ++++++------- src/IO/WriteBufferFromS3.h | 14 +++++++- ...riteBufferFromS3BufferAllocationPolicy.cpp | 34 +++++++++++-------- .../WriteBufferFromS3BufferAllocationPolicy.h | 26 -------------- src/IO/WriteBufferFromS3TaskTracker.cpp | 10 +++--- src/IO/WriteBufferFromS3TaskTracker.h | 10 ++++-- 8 files changed, 72 insertions(+), 83 deletions(-) rename src/IO/{WriteBufferFromS3MemoryStream.cpp => StdIStreamFromMemory.cpp} (67%) rename src/IO/{WriteBufferFromS3MemoryStream.h => StdIStreamFromMemory.h} (60%) delete mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.h diff --git a/src/IO/WriteBufferFromS3MemoryStream.cpp b/src/IO/StdIStreamFromMemory.cpp similarity index 67% rename from src/IO/WriteBufferFromS3MemoryStream.cpp rename to src/IO/StdIStreamFromMemory.cpp index 6271f15f055..3242a7e6383 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.cpp +++ b/src/IO/StdIStreamFromMemory.cpp @@ -1,27 +1,24 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include +#include namespace DB { -MemoryStream::MemoryBuf::MemoryBuf(char * begin_, size_t size_) +StdIStreamFromMemory::MemoryBuf::MemoryBuf(char * begin_, size_t size_) : begin(begin_) , size(size_) { this->setg(begin, begin, begin + size); } -MemoryStream::MemoryBuf::int_type MemoryStream::MemoryBuf::underflow() +StdIStreamFromMemory::MemoryBuf::int_type StdIStreamFromMemory::MemoryBuf::underflow() { if (gptr() < egptr()) return traits_type::to_int_type(*gptr()); return traits_type::eof(); } -MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, +StdIStreamFromMemory::MemoryBuf::pos_type +StdIStreamFromMemory::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode mode) { bool out_mode = (std::ios_base::out & mode) != 0; @@ -49,13 +46,13 @@ MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, return pos_type(ret); } -MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekpos(pos_type sp, +StdIStreamFromMemory::MemoryBuf::pos_type StdIStreamFromMemory::MemoryBuf::seekpos(pos_type sp, std::ios_base::openmode mode) { return seekoff(off_type(sp), std::ios_base::beg, mode); } -MemoryStream::MemoryStream(char * begin_, size_t size_) +StdIStreamFromMemory::StdIStreamFromMemory(char * begin_, size_t size_) : std::iostream(nullptr) , mem_buf(begin_, size_) { @@ -63,6 +60,3 @@ MemoryStream::MemoryStream(char * begin_, size_t size_) } } - -#endif - diff --git 
a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/StdIStreamFromMemory.h similarity index 60% rename from src/IO/WriteBufferFromS3MemoryStream.h rename to src/IO/StdIStreamFromMemory.h index e9606798910..64b147fd296 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.h +++ b/src/IO/StdIStreamFromMemory.h @@ -1,15 +1,15 @@ #pragma once -#include "config.h" - -#if USE_AWS_S3 - #include namespace DB { -struct MemoryStream: std::iostream +/// StdIStreamFromMemory is used in WriteBufferFromS3 as a stream which is passed to the S3::Client +/// It provides istream interface (only reading) over the memory. +/// However S3::Client requires iostream interface it only reads from the stream + +class StdIStreamFromMemory : public std::iostream { struct MemoryBuf: std::streambuf { @@ -27,11 +27,10 @@ struct MemoryStream: std::iostream size_t size = 0; }; - MemoryStream(char * begin_, size_t size_); - MemoryBuf mem_buf; + +public: + StdIStreamFromMemory(char * begin_, size_t size_); }; } - -#endif diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 73d78cb13be..01ab8ff7cbb 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -2,8 +2,8 @@ #if USE_AWS_S3 +#include "StdIStreamFromMemory.h" #include "WriteBufferFromS3.h" -#include "WriteBufferFromS3MemoryStream.h" #include "WriteBufferFromS3TaskTracker.h" #include @@ -63,7 +63,7 @@ struct WriteBufferFromS3::PartData std::shared_ptr createAwsBuffer() { - auto buffer = std::make_shared(memory.data(), data_size); + auto buffer = std::make_shared(memory.data(), data_size); buffer->exceptions(std::ios::badbit); return buffer; } @@ -108,7 +108,7 @@ void WriteBufferFromS3::nextImpl() "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest"); /// Make sense to call to before adding new async task to check if there is an exception - task_tracker->getReady(); + task_tracker->waitReady(); hidePartialData(); @@ -132,7 +132,7 @@ void WriteBufferFromS3::preFinalize() LOG_TRACE(log, "preFinalize WriteBufferFromS3. 
{}", getLogDetails()); - task_tracker->getReady(); + task_tracker->waitReady(); hidePartialData(); @@ -178,7 +178,7 @@ void WriteBufferFromS3::finalizeImpl() chassert(offset() == 0); chassert(hidden_size == 0); - task_tracker->getAll(); + task_tracker->waitAll(); if (!multipart_upload_id.empty()) { @@ -266,10 +266,10 @@ void WriteBufferFromS3::reallocateFirstBuffer() { chassert(offset() == 0); - if (buffer_allocation_policy->getNumber() > 1 || available() > 0) + if (buffer_allocation_policy->getBufferNumber() > 1 || available() > 0) return; - const size_t max_first_buffer = buffer_allocation_policy->getSize(); + const size_t max_first_buffer = buffer_allocation_policy->getBufferSize(); if (memory.size() == max_first_buffer) return; @@ -299,7 +299,7 @@ void WriteBufferFromS3::detachBuffer() void WriteBufferFromS3::allocateFirstBuffer() { - const auto max_first_buffer = buffer_allocation_policy->getSize(); + const auto max_first_buffer = buffer_allocation_policy->getBufferSize(); const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); memory = Memory(size); WriteBuffer::set(memory.data(), memory.size()); @@ -309,16 +309,16 @@ void WriteBufferFromS3::allocateFirstBuffer() void WriteBufferFromS3::allocateBuffer() { - buffer_allocation_policy->next(); + buffer_allocation_policy->nextBuffer(); chassert(0 == hidden_size); - if (buffer_allocation_policy->getNumber() == 1) + if (buffer_allocation_policy->getBufferNumber() == 1) return allocateFirstBuffer(); - memory = Memory(buffer_allocation_policy->getSize()); + memory = Memory(buffer_allocation_policy->getBufferSize()); WriteBuffer::set(memory.data(), memory.size()); - LOG_TRACE(log, "Allocated buffer with size {}. {}", buffer_allocation_policy->getSize(), getLogDetails()); + LOG_TRACE(log, "Allocated buffer with size {}. {}", buffer_allocation_policy->getBufferSize(), getLogDetails()); } void WriteBufferFromS3::setFakeBufferWhenPreFinalized() diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index e65127872fa..ac6c430606f 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -41,6 +40,19 @@ public: void nextImpl() override; void preFinalize() override; +public: + class IBufferAllocationPolicy + { + public: + virtual size_t getBufferNumber() const = 0; + virtual size_t getBufferSize() const = 0; + virtual void nextBuffer() = 0; + virtual ~IBufferAllocationPolicy() = 0; + }; + using IBufferAllocationPolicyPtr = std::unique_ptr; + + static IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); + private: /// Receives response from the server after sending all data. 
void finalizeImpl() override; diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp index 0eec6b0d034..6347c1acfd7 100644 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -2,38 +2,41 @@ #if USE_AWS_S3 -#include +#include + +#include namespace { -struct FixedSizeBufferAllocationPolicy : DB::IBufferAllocationPolicy +class FixedSizeBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy { - const size_t size = 0; + const size_t buffer_size = 0; size_t buffer_number = 0; +public: explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) - : size(settings_.strict_upload_part_size) + : buffer_size(settings_.strict_upload_part_size) { - chassert(size > 0); + chassert(buffer_size > 0); } - size_t getNumber() const override { return buffer_number; } + size_t getBufferNumber() const override { return buffer_number; } - size_t getSize() const override + size_t getBufferSize() const override { chassert(buffer_number > 0); - return size; + return buffer_size; } - void next() override + void nextBuffer() override { ++buffer_number; } }; -struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy +class ExpBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy { const size_t first_size = 0; const size_t second_size = 0; @@ -45,6 +48,7 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy size_t current_size = 0; size_t buffer_number = 0; +public: explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) , second_size(settings_.min_upload_part_size) @@ -59,15 +63,15 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy chassert(max_size > 0); } - size_t getNumber() const override { return buffer_number; } + size_t getBufferNumber() const override { return buffer_number; } - size_t getSize() const override + size_t getBufferSize() const override { chassert(buffer_number > 0); return current_size; } - void next() override + void nextBuffer() override { ++buffer_number; @@ -93,9 +97,9 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy namespace DB { -IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; +WriteBufferFromS3::IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; -IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) +WriteBufferFromS3::IBufferAllocationPolicyPtr WriteBufferFromS3::ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) { if (settings_.strict_upload_part_size > 0) return std::make_unique(settings_); diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h deleted file mode 100644 index 1ee7c982ed2..00000000000 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include - -namespace DB -{ - -struct IBufferAllocationPolicy -{ - virtual size_t getNumber() const = 0; - virtual size_t getSize() const = 0; - virtual void next() = 0; - virtual ~IBufferAllocationPolicy() = 0; -}; - -using IBufferAllocationPolicyPtr = std::unique_ptr; - -IBufferAllocationPolicyPtr ChooseBufferPolicy(const 
S3Settings::RequestSettings::PartUploadSettings & settings_); - -} - -#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp index 0769f7731c2..7826747c0a4 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -28,9 +28,9 @@ ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() }; } -void WriteBufferFromS3::TaskTracker::getReady() +void WriteBufferFromS3::TaskTracker::waitReady() { - LOG_TEST(log, "getReady, in queue {}", futures.size()); + LOG_TEST(log, "waitReady, in queue {}", futures.size()); /// Exceptions are propagated auto it = futures.begin(); @@ -55,12 +55,12 @@ void WriteBufferFromS3::TaskTracker::getReady() it = futures.erase(it); } - LOG_TEST(log, "getReady ended, in queue {}", futures.size()); + LOG_TEST(log, "waitReady ended, in queue {}", futures.size()); } -void WriteBufferFromS3::TaskTracker::getAll() +void WriteBufferFromS3::TaskTracker::waitAll() { - LOG_TEST(log, "getAll, in queue {}", futures.size()); + LOG_TEST(log, "waitAll, in queue {}", futures.size()); /// Exceptions are propagated for (auto & future : futures) diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h index fa214a4f8c5..c978b9a78f0 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.h +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -9,6 +9,12 @@ namespace DB { +/// That class is used only in WriteBufferFromS3 for now. +/// Therefore it declared as a part of WriteBufferFromS3. +/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool. +/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll +/// to help with coordination of the running tasks. + class WriteBufferFromS3::TaskTracker { public: @@ -20,8 +26,8 @@ public: static ThreadPoolCallbackRunner syncRunner(); bool isAsync() const; - void getReady(); - void getAll(); + void waitReady(); + void waitAll(); void safeWaitAll(); void add(Callback && func); From 418a61a68c2e8089b5a7372b08e398757eb9522e Mon Sep 17 00:00:00 2001 From: AVMusorin Date: Fri, 12 May 2023 12:47:14 +0200 Subject: [PATCH 36/45] Allow using Alias column type for KafkaEngine ``` create table kafka ( a UInt32, a_str String Alias toString(a) ) engine = Kafka; create table data ( a UInt32; a_str String ) engine = MergeTree order by tuple(); create materialized view data_mv to data ( a UInt32, a_str String ) as select a, a_str from kafka; ``` Alias type works as expected in comparison with MATERIALIZED/EPHEMERAL or column with default expression. Ref: https://github.com/ClickHouse/ClickHouse/pull/47138 Co-authored-by: Azat Khuzhin --- .../table-engines/integrations/kafka.md | 4 +- src/Storages/ColumnsDescription.cpp | 9 ----- src/Storages/ColumnsDescription.h | 1 - src/Storages/Kafka/StorageKafka.cpp | 14 ++++++- tests/integration/test_storage_kafka/test.py | 38 +++++++++++++++---- 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index ab69e4e90ce..ccfca4c1f1f 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -19,8 +19,8 @@ Kafka lets you: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1], - name2 [type2], + name1 [type1] [ALIAS expr1], + name2 [type2] [ALIAS expr2], ... 
) ENGINE = Kafka() SETTINGS diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 21b140bd73a..8eabae7929c 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -383,15 +383,6 @@ NamesAndTypesList ColumnsDescription::getEphemeral() const return ret; } -NamesAndTypesList ColumnsDescription::getWithDefaultExpression() const -{ - NamesAndTypesList ret; - for (const auto & col : columns) - if (col.default_desc.expression) - ret.emplace_back(col.name, col.type); - return ret; -} - NamesAndTypesList ColumnsDescription::getAll() const { NamesAndTypesList ret; diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index e5ec867cd64..365a999673e 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -132,7 +132,6 @@ public: NamesAndTypesList getInsertable() const; /// ordinary + ephemeral NamesAndTypesList getAliases() const; NamesAndTypesList getEphemeral() const; - NamesAndTypesList getWithDefaultExpression() const; // columns with default expression, for example set by `CREATE TABLE` statement NamesAndTypesList getAllPhysical() const; /// ordinary + materialized. NamesAndTypesList getAll() const; /// ordinary + materialized + aliases + ephemeral /// Returns .size0/.null/... diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 3381561eb1b..7d504833a0a 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -41,6 +41,7 @@ #include #include +#include "Storages/ColumnDefault.h" #include "config_version.h" #include @@ -966,9 +967,18 @@ void registerStorageKafka(StorageFactory & factory) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "kafka_poll_max_batch_size can not be lower than 1"); } - if (args.columns.getOrdinary() != args.columns.getAll() || !args.columns.getWithDefaultExpression().empty()) + NamesAndTypesList supported_columns; + for (const auto & column : args.columns) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL/ALIAS expressions for columns. " + if (column.default_desc.kind == ColumnDefaultKind::Alias) + supported_columns.emplace_back(column.name, column.type); + if (column.default_desc.kind == ColumnDefaultKind::Default && !column.default_desc.expression) + supported_columns.emplace_back(column.name, column.type); + } + // Kafka engine allows only ordinary columns without default expression or alias columns. + if (args.columns.getAll() != supported_columns) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL expressions for columns. " "See https://clickhouse.com/docs/en/engines/table-engines/integrations/kafka/#configuration"); } diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 3a4fa6c6bfe..9a6d3e0513c 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -285,11 +285,11 @@ def avro_confluent_message(schema_registry_client, value): # Tests -def test_kafka_prohibited_column_types(kafka_cluster): +def test_kafka_column_types(kafka_cluster): def assert_returned_exception(e): assert e.value.returncode == 36 assert ( - "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL/ALIAS expressions for columns." + "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL expressions for columns." 
in str(e.value) ) @@ -314,17 +314,39 @@ def test_kafka_prohibited_column_types(kafka_cluster): assert_returned_exception(exception) # check ALIAS - with pytest.raises(QueryRuntimeException) as exception: - instance.query( - """ + instance.query( + """ CREATE TABLE test.kafka (a Int, b String Alias toString(a)) ENGINE = Kafka('{kafka_broker}:19092', '{kafka_topic_new}', '{kafka_group_name_new}', '{kafka_format_json_each_row}', '\\n') + SETTINGS kafka_commit_on_select = 1; """ - ) - assert_returned_exception(exception) + ) + messages = [] + for i in range(5): + messages.append(json.dumps({"a": i})) + kafka_produce(kafka_cluster, "new", messages) + result = "" + expected = TSV( + """ +0\t0 +1\t1 +2\t2 +3\t3 +4\t4 + """ + ) + retries = 50 + while retries > 0: + result += instance.query("SELECT a, b FROM test.kafka", ignore_error=True) + if TSV(result) == expected: + break + retries -= 1 + + assert TSV(result) == expected + + instance.query("DROP TABLE test.kafka SYNC") # check MATERIALIZED - # check ALIAS with pytest.raises(QueryRuntimeException) as exception: instance.query( """ From bf201a09b7e68fde4d84f87388adeb7047416fb0 Mon Sep 17 00:00:00 2001 From: Diego Nieto Date: Mon, 15 May 2023 15:43:39 +0200 Subject: [PATCH 37/45] Update docs/en/operations/storing-data.md @kssenii comments Co-authored-by: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> --- docs/en/operations/storing-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index e8b043e7a27..5804ad8545b 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -194,7 +194,7 @@ These settings should be defined in the disk configuration section. File Cache **query/profile settings**: -Some of these settings will disable cache features per query/profile that are enabled by default. For example, you can enable cache in disk configuration and disable it per query/profile setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that general file and per query cache are enabled. But if you need to disable this general setting per specific queries then setting `enable_filesystem_cache_on_write_operations` to `false` means that write operations cache will be disabled for a specific query/profile. +Some of these settings will disable cache features per query/profile that are enabled by default or in disk configuration settings. For example, you can enable cache in disk configuration and disable it per query/profile setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that "write-though" cache is enabled. But if you need to disable this general setting per specific queries then setting `enable_filesystem_cache_on_write_operations` to `false` means that write operations cache will be disabled for a specific query/profile. - `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. 
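
A minimal sketch of how the two levels of cache settings described in the hunk above interact. The table name `t` and the queries are illustrative only; the two settings are the ones named in the documentation change, and the setup assumes a storage policy whose disk uses the `cache` type with write caching enabled in the server configuration.

```sql
-- Assumed setup: table t is stored on a disk of type `cache`,
-- and cache_on_write_operations is enabled in the disk configuration.

-- Read without using the filesystem cache, for this query only.
SELECT count() FROM t SETTINGS enable_filesystem_cache = 0;

-- Write without populating the write-through cache, for this query only.
INSERT INTO t SELECT * FROM source
SETTINGS enable_filesystem_cache_on_write_operations = 0;
```
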
From b23afdc53390149236d02140dfc96775317a2e6c Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 15:48:00 +0000 Subject: [PATCH 38/45] Fix build for aarch64-darwin --- contrib/boost-cmake/CMakeLists.txt | 18 +- src/Common/AsyncTaskExecutor.h | 44 +- src/Common/OpenTelemetryTraceContext.cpp | 45 +- .../FunctionGenerateRandomStructure.cpp | 457 ++++++++++++++++++ .../FunctionGenerateRandomStructure.h | 45 ++ 5 files changed, 561 insertions(+), 48 deletions(-) create mode 100644 src/Functions/FunctionGenerateRandomStructure.cpp create mode 100644 src/Functions/FunctionGenerateRandomStructure.h diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c9a759eab9c..6f9dce0b042 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -103,11 +103,19 @@ set (SRCS_CONTEXT ) if (ARCH_AARCH64) - set (SRCS_CONTEXT ${SRCS_CONTEXT} - "${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S" - "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S" - "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S" - ) + if (OS_DARWIN) + set (SRCS_CONTEXT ${SRCS_CONTEXT} + "${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_macho_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_macho_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_macho_gas.S" + ) + else() + set (SRCS_CONTEXT ${SRCS_CONTEXT} + "${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S" + ) + endif() elseif (ARCH_PPC64LE) set (SRCS_CONTEXT ${SRCS_CONTEXT} "${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S" diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index 1c2f758504a..f87abd7eb9b 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -79,8 +79,8 @@ public: ERROR = 4, }; #endif - static FiberInfo getCurrentFiberInfo(); + static FiberInfo getCurrentFiberInfo(); protected: /// Method that is called in resume() before actual fiber resuming. /// If it returns false, resume() will return immediately without actual fiber resuming. @@ -124,6 +124,48 @@ private: std::unique_ptr task; }; +/// Simple implementation for fiber local variable. +template +struct FiberLocal +{ +public: + FiberLocal() + { + /// Initialize main instance for this thread. + /// Contexts for fibers will inherit this instance + /// (it could be changed before creating fibers). + data[nullptr] = T(); + } + + T & operator*() + { + return get(); + } + + T * operator->() + { + return &get(); + } + +private: + T & get() + { + /// Get instance for current fiber. + return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); + } + + T & getInstanceForFiber(FiberInfo info) + { + auto it = data.find(info.fiber); + /// If it's the first request, we need to initialize instance for the fiber using instance from parent fiber. 
+ if (it == data.end()) + it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first; + return it->second; + } + + std::unordered_map data; +}; + String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 8cf4879c1e2..40d06e71456 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -14,48 +14,9 @@ namespace DB namespace OpenTelemetry { -/// This code can be executed inside several fibers in one thread, -/// we should use fiber local tracing context. -struct FiberLocalTracingContextOnThread -{ -public: - FiberLocalTracingContextOnThread() - { - /// Initialize main context for this thread. - /// Contexts for fibers will inherit this main context. - data[nullptr] = TracingContextOnThread(); - } - - TracingContextOnThread & operator*() - { - return get(); - } - - TracingContextOnThread * operator->() - { - return &get(); - } - -private: - TracingContextOnThread & get() - { - /// Get context for current fiber. - return getContextForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); - } - - TracingContextOnThread & getContextForFiber(FiberInfo info) - { - auto it = data.find(info.fiber); - /// If it's the first request, we need to initialize context for the fiber using context from parent fiber. - if (it == data.end()) - it = data.insert({info.fiber, getContextForFiber(*info.parent_fiber_info)}).first; - return it->second; - } - - std::unordered_map data; -}; - -thread_local FiberLocalTracingContextOnThread current_fiber_trace_context; +///// This code can be executed inside several fibers in one thread, +///// we should use fiber local tracing context. 
+thread_local FiberLocal current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp new file mode 100644 index 00000000000..023a73fe147 --- /dev/null +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -0,0 +1,457 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +class FunctionGenerateRandomStructure : public IFunction +{ +private: + static constexpr std::array simple_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + static constexpr std::array complex_types + { + TypeIndex::Nullable, + TypeIndex::LowCardinality, + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + }; + + static constexpr std::array map_key_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::UUID, + TypeIndex::LowCardinality, + }; + + static constexpr std::array suspicious_lc_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; + static constexpr size_t MAX_TUPLE_ELEMENTS = 16; + static constexpr size_t MAX_DATETIME64_PRECISION = 9; + static constexpr size_t MAX_DECIMAL32_PRECISION = 9; + static constexpr size_t MAX_DECIMAL64_PRECISION = 18; + static constexpr size_t MAX_DECIMAL128_PRECISION = 38; + static constexpr size_t MAX_DECIMAL256_PRECISION = 76; + static constexpr size_t MAX_DEPTH = 32; + +public: + static constexpr auto name = "generateRandomStructure"; + + explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) + { + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const 
override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", + getName(), arguments.size()); + + + for (size_t i = 0; i != arguments.size(); ++i) + { + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + arguments[i]->getName(), + i + 1, + getName()); + } + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty() && !arguments[0].column->onlyNull()) + { + number_of_columns = arguments[0].column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); + } + + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + seed = arguments[1].column->getUInt(0); + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + auto & string_column = assert_cast(*col_res); + auto & chars = string_column.getChars(); + WriteBufferFromVector buf(chars); + writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); + buf.finalize(); + chars.push_back(0); + string_column.getOffsets().push_back(chars.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); + } + + static String getRandomStructure(size_t seed, const ContextPtr & context) + { + pcg64 rng(seed); + size_t number_of_columns = generateNumberOfColumns(rng); + WriteBufferFromOwnString buf; + writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); + return buf.str(); + } + +private: + + static size_t generateNumberOfColumns(pcg64 & rng) + { + return rng() % MAX_NUMBER_OF_COLUMNS + 1; + } + + static void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + for (size_t i = 0; i != number_of_columns; ++i) + { + if (i != 0) + writeCString(", ", buf); + String column_name = "c" + std::to_string(i + 1); + writeString(column_name, buf); + writeChar(' ', buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); + } + } + + template + static void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) + { + if (allow_complex_types && depth > MAX_DEPTH) + writeRandomType(column_name, rng, buf, depth); + + constexpr auto all_types = getAllTypes(); + auto type = all_types[rng() % 
all_types.size()]; + + switch (type) + { + case TypeIndex::UInt8: + if (rng() % 2) + writeCString("UInt8", buf); + else + writeCString("Bool", buf); + return; + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + return; + case TypeIndex::DateTime64: + writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal32: + writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal64: + writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal128: + writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal256: + writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); + writeChar(')', buf); + return; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); + writeChar(')', buf); + return; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); + writeChar(')', buf); + return; + case TypeIndex::Nullable: + { + writeCString("Nullable(", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Array: + { + writeCString("Array(", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Map: + { + writeCString("Map(", buf); + writeMapKeyType(rng, buf, allow_suspicious_lc_types); + writeCString(", ", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Tuple: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + bool generate_nested = rng() % 2; + bool generate_named_tuple = rng() % 2; + if (generate_nested) + writeCString("Nested(", buf); + else + writeCString("Tuple(", buf); + + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + writeCString(", ", buf); + + String element_name = "e" + std::to_string(i + 1); + if (generate_named_tuple || generate_nested) + { + writeString(element_name, buf); + writeChar(' ', buf); + } + writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); + } + writeChar(')', buf); + return; + } + default: + writeString(magic_enum::enum_name(type), buf); + return; + } + } + + static void writeMapKeyType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + TypeIndex type = map_key_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + { + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + else if (type == TypeIndex::LowCardinality) + { + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); + writeChar(')', buf); + } + else + { + writeString(magic_enum::enum_name(type), buf); + } + } + + static void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + bool make_nullable = rng() % 2; + if (make_nullable) + writeCString("Nullable(", buf); + + 
if (allow_suspicious_lc_types) + { + TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + else + writeString(magic_enum::enum_name(type), buf); + } + else + { + /// Support only String and FixedString. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + + if (make_nullable) + writeChar(')', buf); + } + + static void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) + { + /// Don't generate big enums, because it will lead to really big result + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + size_t num_values = rng() % 16 + 1; + std::vector values(num_values); + + /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] + for (Int16 & x : values) + x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; + /// Make all numbers unique. + std::sort(values.begin(), values.end()); + for (size_t i = 0; i < num_values; ++i) + values[i] += i; + std::shuffle(values.begin(), values.end(), rng); + for (size_t i = 0; i != num_values; ++i) + { + if (i != 0) + writeCString(", ", buf); + writeString("'" + column_name + "V" + std::to_string(values[i]) + "' = " + std::to_string(i), buf); + } + } + + template + static constexpr auto getAllTypes() + { + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + constexpr size_t result_size = simple_types.size() + complex_types_size; + std::array result; + size_t index = 0; + + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + + return result; + } + + bool allow_suspicious_lc_types; +}; + + +REGISTER_FUNCTION(GenerateRandomStructure) +{ + factory.registerFunction( + { + R"( +Generates a random table structure. +This function takes 2 optional constant arguments: +the number of columns in the result structure (random by default) and random seed (random by default) +The maximum number of columns is 128. +The function returns a value of type String. 
+)", + Documentation::Examples{ + {"random", "SELECT generateRandomStructure()"}, + {"with specified number of columns", "SELECT generateRandomStructure(10)"}, + {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, + }, + Documentation::Categories{"Random"} + }, + FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h new file mode 100644 index 00000000000..1d1bcb1a0a8 --- /dev/null +++ b/src/Functions/FunctionGenerateRandomStructure.h @@ -0,0 +1,45 @@ +#include +#include + +#include + +namespace DB +{ + +class FunctionGenerateRandomStructure : public IFunction +{ +public: + static constexpr auto name = "generateRandomStructure"; + + explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) + { + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; + + static String generateRandomStructure(size_t seed, const ContextPtr & context); + +private: + bool allow_suspicious_lc_types; +}; + +} From 78064d062266559b9032d328b84a22e5f60b75d7 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 15:52:14 +0000 Subject: [PATCH 39/45] Better comments --- src/Common/AsyncTaskExecutor.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index f87abd7eb9b..55dc2913c13 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -131,9 +131,9 @@ struct FiberLocal public: FiberLocal() { - /// Initialize main instance for this thread. - /// Contexts for fibers will inherit this instance - /// (it could be changed before creating fibers). + /// Initialize main instance for this thread. Instances for fibers will inherit it, + /// (it's needed because main instance could be changed before creating fibers + /// and changes should be visible in fibers). data[nullptr] = T(); } @@ -150,14 +150,14 @@ public: private: T & get() { - /// Get instance for current fiber. return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); } T & getInstanceForFiber(FiberInfo info) { auto it = data.find(info.fiber); - /// If it's the first request, we need to initialize instance for the fiber using instance from parent fiber. + /// If it's the first request, we need to initialize instance for the fiber + /// using instance from parent fiber or main thread that created fiber. 
if (it == data.end()) it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first; return it->second; From bfcaf95aed8b6f88a02c19c384db9a11a875961d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 18:32:54 +0200 Subject: [PATCH 40/45] Delete unneded files --- .../FunctionGenerateRandomStructure.h | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 src/Functions/FunctionGenerateRandomStructure.h diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h deleted file mode 100644 index 1d1bcb1a0a8..00000000000 --- a/src/Functions/FunctionGenerateRandomStructure.h +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include - -#include - -namespace DB -{ - -class FunctionGenerateRandomStructure : public IFunction -{ -public: - static constexpr auto name = "generateRandomStructure"; - - explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) - { - } - - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; - - static String generateRandomStructure(size_t seed, const ContextPtr & context); - -private: - bool allow_suspicious_lc_types; -}; - -} From 900aca5f0a9c6c498b18cb1778e545b5f4d951f2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 18:33:09 +0200 Subject: [PATCH 41/45] Delete unneded files --- .../FunctionGenerateRandomStructure.cpp | 457 ------------------ 1 file changed, 457 deletions(-) delete mode 100644 src/Functions/FunctionGenerateRandomStructure.cpp diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp deleted file mode 100644 index 023a73fe147..00000000000 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ /dev/null @@ -1,457 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int BAD_ARGUMENTS; -} - -class FunctionGenerateRandomStructure : public IFunction -{ -private: - static constexpr std::array simple_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - 
TypeIndex::Float32, - TypeIndex::Float64, - TypeIndex::Decimal32, - TypeIndex::Decimal64, - TypeIndex::Decimal128, - TypeIndex::Decimal256, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::DateTime64, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::Enum8, - TypeIndex::Enum16, - TypeIndex::IPv4, - TypeIndex::IPv6, - TypeIndex::UUID, - }; - - static constexpr std::array complex_types - { - TypeIndex::Nullable, - TypeIndex::LowCardinality, - TypeIndex::Array, - TypeIndex::Tuple, - TypeIndex::Map, - }; - - static constexpr std::array map_key_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::IPv4, - TypeIndex::Enum8, - TypeIndex::Enum16, - TypeIndex::UUID, - TypeIndex::LowCardinality, - }; - - static constexpr std::array suspicious_lc_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Float32, - TypeIndex::Float64, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::IPv4, - TypeIndex::IPv6, - TypeIndex::UUID, - }; - - static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; - static constexpr size_t MAX_TUPLE_ELEMENTS = 16; - static constexpr size_t MAX_DATETIME64_PRECISION = 9; - static constexpr size_t MAX_DECIMAL32_PRECISION = 9; - static constexpr size_t MAX_DECIMAL64_PRECISION = 18; - static constexpr size_t MAX_DECIMAL128_PRECISION = 38; - static constexpr size_t MAX_DECIMAL256_PRECISION = 76; - static constexpr size_t MAX_DEPTH = 32; - -public: - static constexpr auto name = "generateRandomStructure"; - - explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) - { - } - - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (arguments.size() > 2) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", - getName(), arguments.size()); - - - for (size_t i = 0; i != arguments.size(); ++i) - { - if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type 
{} of the {} argument of function {}, expected unsigned integer or Null", - arguments[i]->getName(), - i + 1, - getName()); - } - } - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - size_t seed = randomSeed(); - size_t number_of_columns = 0; - - if (!arguments.empty() && !arguments[0].column->onlyNull()) - { - number_of_columns = arguments[0].column->getUInt(0); - if (number_of_columns > MAX_NUMBER_OF_COLUMNS) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Maximum allowed number of columns is {}, got {}", - MAX_NUMBER_OF_COLUMNS, - number_of_columns); - } - - if (arguments.size() > 1 && !arguments[1].column->onlyNull()) - seed = arguments[1].column->getUInt(0); - - pcg64 rng(seed); - if (number_of_columns == 0) - number_of_columns = generateNumberOfColumns(rng); - - auto col_res = ColumnString::create(); - auto & string_column = assert_cast(*col_res); - auto & chars = string_column.getChars(); - WriteBufferFromVector buf(chars); - writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); - buf.finalize(); - chars.push_back(0); - string_column.getOffsets().push_back(chars.size()); - return ColumnConst::create(std::move(col_res), input_rows_count); - } - - static String getRandomStructure(size_t seed, const ContextPtr & context) - { - pcg64 rng(seed); - size_t number_of_columns = generateNumberOfColumns(rng); - WriteBufferFromOwnString buf; - writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); - return buf.str(); - } - -private: - - static size_t generateNumberOfColumns(pcg64 & rng) - { - return rng() % MAX_NUMBER_OF_COLUMNS + 1; - } - - static void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - for (size_t i = 0; i != number_of_columns; ++i) - { - if (i != 0) - writeCString(", ", buf); - String column_name = "c" + std::to_string(i + 1); - writeString(column_name, buf); - writeChar(' ', buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); - } - } - - template - static void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) - { - if (allow_complex_types && depth > MAX_DEPTH) - writeRandomType(column_name, rng, buf, depth); - - constexpr auto all_types = getAllTypes(); - auto type = all_types[rng() % all_types.size()]; - - switch (type) - { - case TypeIndex::UInt8: - if (rng() % 2) - writeCString("UInt8", buf); - else - writeCString("Bool", buf); - return; - case TypeIndex::FixedString: - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - return; - case TypeIndex::DateTime64: - writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal32: - writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal64: - writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal128: - writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal256: - writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Enum8: - writeCString("Enum8(", buf); - 
writeEnumValues(column_name, rng, buf, INT8_MAX); - writeChar(')', buf); - return; - case TypeIndex::Enum16: - writeCString("Enum16(", buf); - writeEnumValues(column_name, rng, buf, INT16_MAX); - writeChar(')', buf); - return; - case TypeIndex::LowCardinality: - writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); - writeChar(')', buf); - return; - case TypeIndex::Nullable: - { - writeCString("Nullable(", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - writeChar(')', buf); - return; - } - case TypeIndex::Array: - { - writeCString("Array(", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - writeChar(')', buf); - return; - } - case TypeIndex::Map: - { - writeCString("Map(", buf); - writeMapKeyType(rng, buf, allow_suspicious_lc_types); - writeCString(", ", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - writeChar(')', buf); - return; - } - case TypeIndex::Tuple: - { - size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - bool generate_nested = rng() % 2; - bool generate_named_tuple = rng() % 2; - if (generate_nested) - writeCString("Nested(", buf); - else - writeCString("Tuple(", buf); - - for (size_t i = 0; i != elements; ++i) - { - if (i != 0) - writeCString(", ", buf); - - String element_name = "e" + std::to_string(i + 1); - if (generate_named_tuple || generate_nested) - { - writeString(element_name, buf); - writeChar(' ', buf); - } - writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); - } - writeChar(')', buf); - return; - } - default: - writeString(magic_enum::enum_name(type), buf); - return; - } - } - - static void writeMapKeyType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - TypeIndex type = map_key_types[rng() % map_key_types.size()]; - if (type == TypeIndex::FixedString) - { - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - } - else if (type == TypeIndex::LowCardinality) - { - writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); - writeChar(')', buf); - } - else - { - writeString(magic_enum::enum_name(type), buf); - } - } - - static void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - bool make_nullable = rng() % 2; - if (make_nullable) - writeCString("Nullable(", buf); - - if (allow_suspicious_lc_types) - { - TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; - if (type == TypeIndex::FixedString) - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - else - writeString(magic_enum::enum_name(type), buf); - } - else - { - /// Support only String and FixedString. - if (rng() % 2) - writeCString("String", buf); - else - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - } - - if (make_nullable) - writeChar(')', buf); - } - - static void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) - { - /// Don't generate big enums, because it will lead to really big result - /// and slowness of this function, and it can lead to `Max query size exceeded` - /// while using this function with generateRandom. 
- size_t num_values = rng() % 16 + 1; - std::vector values(num_values); - - /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] - for (Int16 & x : values) - x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; - /// Make all numbers unique. - std::sort(values.begin(), values.end()); - for (size_t i = 0; i < num_values; ++i) - values[i] += i; - std::shuffle(values.begin(), values.end(), rng); - for (size_t i = 0; i != num_values; ++i) - { - if (i != 0) - writeCString(", ", buf); - writeString("'" + column_name + "V" + std::to_string(values[i]) + "' = " + std::to_string(i), buf); - } - } - - template - static constexpr auto getAllTypes() - { - constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; - constexpr size_t result_size = simple_types.size() + complex_types_size; - std::array result; - size_t index = 0; - - for (size_t i = 0; i != simple_types.size(); ++i, ++index) - result[index] = simple_types[i]; - - for (size_t i = 0; i != complex_types_size; ++i, ++index) - result[index] = complex_types[i]; - - return result; - } - - bool allow_suspicious_lc_types; -}; - - -REGISTER_FUNCTION(GenerateRandomStructure) -{ - factory.registerFunction( - { - R"( -Generates a random table structure. -This function takes 2 optional constant arguments: -the number of columns in the result structure (random by default) and random seed (random by default) -The maximum number of columns is 128. -The function returns a value of type String. -)", - Documentation::Examples{ - {"random", "SELECT generateRandomStructure()"}, - {"with specified number of columns", "SELECT generateRandomStructure(10)"}, - {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, - }, - Documentation::Categories{"Random"} - }, - FunctionFactory::CaseSensitive); -} - -} From 4137a5e0582041b0f7fcb388156153c4eb5e360a Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 15 May 2023 18:51:16 +0200 Subject: [PATCH 42/45] use chassert in MergeTreeDeduplicationLog to have better log info --- src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp index b843ce6a078..ac03b0be779 100644 --- a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp +++ b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp @@ -227,7 +227,7 @@ std::pair MergeTreeDeduplicationLog::addPart(const std: return std::make_pair(info, false); } - assert(current_writer != nullptr); + chassert(current_writer != nullptr); /// Create new record MergeTreeDeduplicationLogRecord record; @@ -257,7 +257,7 @@ void MergeTreeDeduplicationLog::dropPart(const MergeTreePartInfo & drop_part_inf if (deduplication_window == 0) return; - assert(current_writer != nullptr); + chassert(current_writer != nullptr); for (auto itr = deduplication_map.begin(); itr != deduplication_map.end(); /* no increment here, we erasing from map */) { From 91db14851397cccb166bc96f4d335dca5928640e Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 4 May 2023 21:28:33 +0000 Subject: [PATCH 43/45] Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks --- ...chronousReadIndirectBufferFromRemoteFS.cpp | 28 +++++++++++-------- ...ynchronousReadIndirectBufferFromRemoteFS.h | 4 ++- .../0_stateless/02731_parquet_s3.reference | 1 + .../queries/0_stateless/02731_parquet_s3.sql | 7 +++++ 4 files changed, 27 insertions(+), 13 deletions(-) create mode 
100644 tests/queries/0_stateless/02731_parquet_s3.reference create mode 100644 tests/queries/0_stateless/02731_parquet_s3.sql diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index bf9a476b785..24b7042e459 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -118,12 +118,7 @@ std::future AsynchronousReadIndirectBufferFromRemot request.size = size; request.offset = file_offset_of_buffer_end; request.priority = base_priority + priority; - - if (bytes_to_ignore) - { - request.ignore = bytes_to_ignore; - bytes_to_ignore = 0; - } + request.ignore = bytes_to_ignore; return reader.submit(request); } @@ -165,8 +160,7 @@ void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t pos void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilEnd() { - read_until_position = impl->getFileSize(); - impl->setReadUntilPosition(*read_until_position); + setReadUntilPosition(impl->getFileSize()); } @@ -228,12 +222,13 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() chassert(memory.size() == read_settings.prefetch_buffer_size || memory.size() == read_settings.remote_fs_buffer_size); std::tie(size, offset) = impl->readInto(memory.data(), memory.size(), file_offset_of_buffer_end, bytes_to_ignore); - bytes_to_ignore = 0; ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads); ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedBytes, size); } + bytes_to_ignore = 0; + chassert(size >= offset); size_t bytes_read = size - offset; @@ -269,7 +264,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } else if (whence == SEEK_CUR) { - new_pos = file_offset_of_buffer_end - (working_buffer.end() - pos) + offset; + new_pos = static_cast(getPosition()) + offset; } else { @@ -277,13 +272,15 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } /// Position is unchanged. - if (new_pos + (working_buffer.end() - pos) == file_offset_of_buffer_end) + if (new_pos == static_cast(getPosition())) return new_pos; bool read_from_prefetch = false; while (true) { - if (file_offset_of_buffer_end - working_buffer.size() <= new_pos && new_pos <= file_offset_of_buffer_end) + /// The first condition implies bytes_to_ignore = 0. + if (!working_buffer.empty() && file_offset_of_buffer_end - working_buffer.size() <= new_pos && + new_pos <= file_offset_of_buffer_end) { /// Position is still inside the buffer. /// Probably it is at the end of the buffer - then we will load data on the following 'next' call. @@ -320,6 +317,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) /// First reset the buffer so the next read will fetch new data to the buffer. 
resetWorkingBuffer(); + bytes_to_ignore = 0; if (read_until_position && new_pos > *read_until_position) { @@ -356,6 +354,12 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } +off_t AsynchronousReadIndirectBufferFromRemoteFS::getPosition() +{ + return file_offset_of_buffer_end - available() + bytes_to_ignore; +} + + void AsynchronousReadIndirectBufferFromRemoteFS::finalize() { resetPrefetch(FilesystemPrefetchState::UNNEEDED); diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 49b44916a46..e8fb3fe248b 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -42,7 +42,7 @@ public: off_t seek(off_t offset_, int whence) override; - off_t getPosition() override { return file_offset_of_buffer_end - available(); } + off_t getPosition() override; String getFileName() const override; @@ -89,6 +89,8 @@ private: std::string current_reader_id; + /// If nonzero then working_buffer is empty. + /// If a prefetch is in flight, the prefetch task has been instructed to ignore this many bytes. size_t bytes_to_ignore = 0; std::optional read_until_position; diff --git a/tests/queries/0_stateless/02731_parquet_s3.reference b/tests/queries/0_stateless/02731_parquet_s3.reference new file mode 100644 index 00000000000..5a5aaeb0068 --- /dev/null +++ b/tests/queries/0_stateless/02731_parquet_s3.reference @@ -0,0 +1 @@ +12639441726720293784 diff --git a/tests/queries/0_stateless/02731_parquet_s3.sql b/tests/queries/0_stateless/02731_parquet_s3.sql new file mode 100644 index 00000000000..3c3f11f535b --- /dev/null +++ b/tests/queries/0_stateless/02731_parquet_s3.sql @@ -0,0 +1,7 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: Depends on AWS + +-- Reading from s3 a parquet file of size between ~1 MB and ~2 MB was broken at some point. 
+insert into function s3(s3_conn, filename='test_02731_parquet_s3.parquet') select cityHash64(number) from numbers(170000) settings s3_truncate_on_insert=1; + +select sum(*) from s3(s3_conn, filename='test_02731_parquet_s3.parquet') settings remote_filesystem_read_method='threadpool', remote_filesystem_read_prefetch=1; From cbc15bf35a61d86e2c38e467d8b0f2210aec8230 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Mon, 15 May 2023 23:13:17 +0200 Subject: [PATCH 44/45] Add `DynamicResourceManager` and `FairPolicy` into scheduling subsystem (#49671) * Add `DynamicResourceManager` and `FairPolicy` into scheduling subsystem * fix test * fix tidy build --- src/IO/Resource/DynamicResourceManager.cpp | 234 ++++++++++++++++++ src/IO/Resource/DynamicResourceManager.h | 93 +++++++ src/IO/Resource/FairPolicy.cpp | 13 + src/IO/Resource/FairPolicy.h | 232 +++++++++++++++++ src/IO/Resource/registerResourceManagers.cpp | 2 + src/IO/Resource/registerSchedulerNodes.cpp | 2 + .../tests/gtest_resource_class_fair.cpp | 187 ++++++++++++++ .../gtest_resource_manager_hierarchical.cpp | 116 +++++++++ src/Interpreters/Context.cpp | 2 +- 9 files changed, 880 insertions(+), 1 deletion(-) create mode 100644 src/IO/Resource/DynamicResourceManager.cpp create mode 100644 src/IO/Resource/DynamicResourceManager.h create mode 100644 src/IO/Resource/FairPolicy.cpp create mode 100644 src/IO/Resource/FairPolicy.h create mode 100644 src/IO/Resource/tests/gtest_resource_class_fair.cpp create mode 100644 src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp diff --git a/src/IO/Resource/DynamicResourceManager.cpp b/src/IO/Resource/DynamicResourceManager.cpp new file mode 100644 index 00000000000..49e12984e33 --- /dev/null +++ b/src/IO/Resource/DynamicResourceManager.cpp @@ -0,0 +1,234 @@ +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int RESOURCE_ACCESS_DENIED; + extern const int RESOURCE_NOT_FOUND; + extern const int INVALID_SCHEDULER_NODE; +} + +DynamicResourceManager::State::State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config) + : classifiers(config) +{ + Poco::Util::AbstractConfiguration::Keys keys; + const String config_prefix = "resources"; + config.keys(config_prefix, keys); + + // Create resource for every element under tag + for (const auto & key : keys) + { + resources.emplace(key, std::make_shared(key, event_queue, config, config_prefix + "." + key)); + } +} + +DynamicResourceManager::State::Resource::Resource( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix) +{ + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_prefix, keys); + + // Sort nodes by path to create parents before children + std::map path2key; + for (const auto & key : keys) + { + if (!startsWith(key, "node")) + continue; + String path = config.getString(config_prefix + "." 
+ key + "[@path]", ""); + if (path.empty()) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Attribute 'path' must be specified in all nodes for resource '{}'", name); + if (path[0] != '/') + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Path must start with '/' for resource '{}'", name); + if (auto [_, inserted] = path2key.emplace(path, key); !inserted) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Duplicate path '{}' for resource '{}'", path, name); + } + + // Create nodes + bool has_root = false; + for (auto [path, key] : path2key) + { + // Validate path + size_t slash = path.rfind('/'); + if (slash == String::npos) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Invalid scheduler node path '{}' for resource '{}'", path, name); + + // Create node + String basename = path.substr(slash + 1); // root name is empty string + auto [iter, _] = nodes.emplace(path, Node(basename, event_queue, config, config_prefix + "." + key)); + if (path == "/") + { + has_root = true; + continue; + } + + // Attach created node to parent (if not root) + // NOTE: resource root is attached to the scheduler using event queue for thread-safety + String parent_path = path.substr(0, slash); + if (parent_path.empty()) + parent_path = "/"; + if (auto parent = nodes.find(parent_path); parent != nodes.end()) + parent->second.ptr->attachChild(iter->second.ptr); + else + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Parent node doesn't exist for path '{}' for resource '{}'", path, name); + } + + if (!has_root) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "undefined root node path '/' for resource '{}'", name); +} + +DynamicResourceManager::State::Resource::~Resource() +{ + // NOTE: we should rely on `attached_to` and cannot use `parent`, + // NOTE: because `parent` can be `nullptr` in case attachment is still in event queue + if (attached_to != nullptr) + { + ISchedulerNode * root = nodes.find("/")->second.ptr.get(); + attached_to->event_queue->enqueue([scheduler = attached_to, root] + { + scheduler->removeChild(root); + }); + } +} + +DynamicResourceManager::State::Node::Node(const String & name, EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) + : type(config.getString(config_prefix + ".type", "fifo")) + , ptr(SchedulerNodeFactory::instance().get(type, event_queue, config, config_prefix)) +{ + ptr->basename = name; +} + +bool DynamicResourceManager::State::Resource::equals(const DynamicResourceManager::State::Resource & o) const +{ + if (nodes.size() != o.nodes.size()) + return false; + + for (const auto & [path, o_node] : o.nodes) + { + auto iter = nodes.find(path); + if (iter == nodes.end()) + return false; + if (!iter->second.equals(o_node)) + return false; + } + + return true; +} + +bool DynamicResourceManager::State::Node::equals(const DynamicResourceManager::State::Node & o) const +{ + if (type != o.type) + return false; + return ptr->equals(o.ptr.get()); +} + +DynamicResourceManager::Classifier::Classifier(const DynamicResourceManager::StatePtr & state_, const String & classifier_name) + : state(state_) +{ + // State is immutable, but nodes are mutable and thread-safe + // So it's safe to obtain node pointers w/o lock + for (auto [resource_name, path] : state->classifiers.get(classifier_name)) + { + if (auto resource_iter = state->resources.find(resource_name); resource_iter != state->resources.end()) + { + const auto & resource = resource_iter->second; + if (auto node_iter = resource->nodes.find(path); 
node_iter != resource->nodes.end()) + { + if (auto * queue = dynamic_cast(node_iter->second.ptr.get())) + resources.emplace(resource_name, ResourceLink{.queue = queue}); + else + throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Unable to access non-queue node at path '{}' for resource '{}'", path, resource_name); + } + else + throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Path '{}' for resource '{}' does not exist", path, resource_name); + } + else + resources.emplace(resource_name, ResourceLink{}); // resource not configured yet - use unlimited resource + } +} + +ResourceLink DynamicResourceManager::Classifier::get(const String & resource_name) +{ + if (auto iter = resources.find(resource_name); iter != resources.end()) + return iter->second; + else + throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "Access denied to resource '{}'", resource_name); +} + +DynamicResourceManager::DynamicResourceManager() + : state(new State()) +{ + scheduler.start(); +} + +void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfiguration & config) +{ + StatePtr new_state = std::make_shared(scheduler.event_queue, config); + + std::lock_guard lock{mutex}; + + // Resource update leads to loss of runtime data of nodes and may lead to temporary violation of constraints (e.g. limits) + // Try to minimise this by reusing "equal" resources (initialized with the same configuration). + for (auto & [name, new_resource] : new_state->resources) + { + if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update + { + State::ResourcePtr old_resource = iter->second; + if (old_resource->equals(*new_resource)) + new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data + } + } + + // Commit new state + // NOTE: dtor will detach from scheduler old resources that are not in use currently + state = new_state; + + // Attach new and updated resources to the scheduler + for (auto & [name, resource] : new_state->resources) + { + const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr; + if (root->parent == nullptr) + { + resource->attached_to = &scheduler; + scheduler.event_queue->enqueue([this, root] + { + scheduler.attachChild(root); + }); + } + } + + // NOTE: after mutex unlock `state` became available for Classifier(s) and must be immutable +} + +ClassifierPtr DynamicResourceManager::acquire(const String & classifier_name) +{ + // Acquire a reference to the current state + StatePtr state_; + { + std::lock_guard lock{mutex}; + state_ = state; + } + + return std::make_shared(state_, classifier_name); +} + +void registerDynamicResourceManager(ResourceManagerFactory & factory) +{ + factory.registerMethod("dynamic"); +} + +} diff --git a/src/IO/Resource/DynamicResourceManager.h b/src/IO/Resource/DynamicResourceManager.h new file mode 100644 index 00000000000..aa1147f1fb2 --- /dev/null +++ b/src/IO/Resource/DynamicResourceManager.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +#include + +namespace DB +{ + +/* + * Implementation of `IResourceManager` supporting arbitrary dynamic hierarchy of scheduler nodes. + * All resources are controlled by single root `SchedulerRoot`. + * + * State of manager is set of resources attached to the scheduler. States are referenced by classifiers. + * Classifiers are used (1) to access resources and (2) to keep shared ownership of resources with pending + * resource requests. 
This allows `ResourceRequest` and `ResourceLink` to hold raw pointers as long as + * `ClassifierPtr` is acquired and held. + * + * Manager can update configuration after initialization. During update, new version of resources are also + * attached to scheduler, so multiple version can coexist for a short perid. This will violate constraints + * (e.g. in-fly-limit), because different version have independent nodes to impose constraints, the same + * violation will apply to fairness. Old version exists as long as there is at least one classifier + * instance referencing it. Classifiers are typically attached to queries and will be destructed with them. + */ +class DynamicResourceManager : public IResourceManager +{ +public: + DynamicResourceManager(); + void updateConfiguration(const Poco::Util::AbstractConfiguration & config) override; + ClassifierPtr acquire(const String & classifier_name) override; + +private: + /// Holds everything required to work with one specific configuration + struct State + { + struct Node + { + String type; + SchedulerNodePtr ptr; + + Node( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix); + bool equals(const Node & o) const; + }; + + struct Resource + { + std::unordered_map nodes; // by path + SchedulerRoot * attached_to = nullptr; + + Resource( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix); + ~Resource(); // unregisters resource from scheduler + bool equals(const Resource & o) const; + }; + + using ResourcePtr = std::shared_ptr; + + std::unordered_map resources; // by name + ClassifiersConfig classifiers; + + State() = default; + explicit State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config); + }; + + using StatePtr = std::shared_ptr; + + /// Created per query, holds State used by that query + class Classifier : public IClassifier + { + public: + Classifier(const StatePtr & state_, const String & classifier_name); + ResourceLink get(const String & resource_name) override; + private: + std::unordered_map resources; // accessible resources by names + StatePtr state; // hold state to avoid ResourceLink invalidation due to resource deregistration from SchedulerRoot + }; + +private: + SchedulerRoot scheduler; + std::mutex mutex; + StatePtr state; +}; + +} diff --git a/src/IO/Resource/FairPolicy.cpp b/src/IO/Resource/FairPolicy.cpp new file mode 100644 index 00000000000..248ff04cbd7 --- /dev/null +++ b/src/IO/Resource/FairPolicy.cpp @@ -0,0 +1,13 @@ +#include + +#include + +namespace DB +{ + +void registerFairPolicy(SchedulerNodeFactory & factory) +{ + factory.registerMethod("fair"); +} + +} diff --git a/src/IO/Resource/FairPolicy.h b/src/IO/Resource/FairPolicy.h new file mode 100644 index 00000000000..9c0c78f057c --- /dev/null +++ b/src/IO/Resource/FairPolicy.h @@ -0,0 +1,232 @@ +#pragma once + +#include +#include + +#include + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_SCHEDULER_NODE; +} + +/* + * Scheduler node that implements weight-based fair scheduling policy. + * Based on Start-time Fair Queueing (SFQ) algorithm. + * + * Algorithm description. + * Virtual runtime (total consumed cost divided by child weight) is tracked for every child. + * Active child with minimum vruntime is selected to be dequeued next. 
On activation, initial vruntime + * of a child is set to vruntime of "start" of the last request. This guarantees immediate processing + * of at least single request of newly activated children and thus best isolation and scheduling latency. + */ +class FairPolicy : public ISchedulerNode +{ + /// Scheduling state of a child + struct Item + { + ISchedulerNode * child = nullptr; + double vruntime = 0; /// total consumed cost divided by child weight + + /// For min-heap by vruntime + bool operator<(const Item & rhs) const noexcept + { + return vruntime > rhs.vruntime; + } + }; + +public: + explicit FairPolicy(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {}) + : ISchedulerNode(event_queue_, config, config_prefix) + {} + + bool equals(ISchedulerNode * other) override + { + if (auto * o = dynamic_cast(other)) + return true; + return false; + } + + void attachChild(const SchedulerNodePtr & child) override + { + // Take ownership + if (auto [it, inserted] = children.emplace(child->basename, child); !inserted) + throw Exception( + ErrorCodes::INVALID_SCHEDULER_NODE, + "Can't add another child with the same path: {}", + it->second->getPath()); + + // Attach + child->setParent(this); + + // At first attach as inactive child. + // Inactive attached child must have `info.parent.idx` equal it's index inside `items` array. + // This is needed to avoid later scanning through inactive `items` in O(N). Important optimization. + // NOTE: vruntime must be equal to `system_vruntime` for fairness. + child->info.parent.idx = items.size(); + items.emplace_back(Item{child.get(), system_vruntime}); + + // Activate child if it is not empty + if (child->isActive()) + activateChildImpl(items.size() - 1); + } + + void removeChild(ISchedulerNode * child) override + { + if (auto iter = children.find(child->basename); iter != children.end()) + { + SchedulerNodePtr removed = iter->second; + + // Deactivate: detach is not very common operation, so we can afford O(N) here + size_t child_idx = 0; + [[ maybe_unused ]] bool found = false; + for (; child_idx != items.size(); child_idx++) + { + if (items[child_idx].child == removed.get()) + { + found = true; + break; + } + } + assert(found); + if (child_idx < heap_size) // Detach of active child requires deactivation at first + { + heap_size--; + std::swap(items[child_idx], items[heap_size]); + // Element was removed from inside of heap -- heap must be rebuilt + std::make_heap(items.begin(), items.begin() + heap_size); + child_idx = heap_size; + } + + // Now detach inactive child + if (child_idx != items.size() - 1) + { + std::swap(items[child_idx], items.back()); + items[child_idx].child->info.parent.idx = child_idx; + } + items.pop_back(); + + // Detach + removed->setParent(nullptr); + + // Get rid of ownership + children.erase(iter); + } + } + + ISchedulerNode * getChild(const String & child_name) override + { + if (auto iter = children.find(child_name); iter != children.end()) + return iter->second.get(); + else + return nullptr; + } + + std::pair dequeueRequest() override + { + if (heap_size == 0) + return {nullptr, false}; + + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + assert(request != nullptr); + std::pop_heap(items.begin(), items.begin() + heap_size); + Item & current = items[heap_size - 1]; + + // SFQ fairness invariant: system vruntime equals last served request start-time + assert(current.vruntime >= system_vruntime); + 
system_vruntime = current.vruntime; + + // By definition vruntime is amount of consumed resource (cost) divided by weight + current.vruntime += double(request->cost) / current.child->info.weight; + max_vruntime = std::max(max_vruntime, current.vruntime); + + if (child_active) // Put active child back in heap after vruntime update + { + std::push_heap(items.begin(), items.begin() + heap_size); + } + else // Deactivate child if it is empty, but remember it's vruntime for latter activations + { + heap_size--; + + // Store index of this inactive child in `parent.idx` + // This enables O(1) search of inactive children instead of O(n) + current.child->info.parent.idx = heap_size; + } + + // Reset any difference between children on busy period end + if (heap_size == 0) + { + // Reset vtime to zero to avoid floating-point error accumulation, + // but do not reset too often, because it's O(N) + UInt64 ns = clock_gettime_ns(); + if (last_reset_ns + 1000000000 < ns) + { + last_reset_ns = ns; + for (Item & item : items) + item.vruntime = 0; + max_vruntime = 0; + } + system_vruntime = max_vruntime; + } + + return {request, heap_size > 0}; + } + + bool isActive() override + { + return heap_size > 0; + } + + void activateChild(ISchedulerNode * child) override + { + // Find this child; this is O(1), thanks to inactive index we hold in `parent.idx` + activateChildImpl(child->info.parent.idx); + } + +private: + void activateChildImpl(size_t inactive_idx) + { + bool activate_parent = heap_size == 0; + + if (heap_size != inactive_idx) + { + std::swap(items[heap_size], items[inactive_idx]); + items[inactive_idx].child->info.parent.idx = inactive_idx; + } + + // Newly activated child should have at least `system_vruntime` to keep fairness + items[heap_size].vruntime = std::max(system_vruntime, items[heap_size].vruntime); + heap_size++; + std::push_heap(items.begin(), items.begin() + heap_size); + + // Recursive activation + if (activate_parent && parent) + parent->activateChild(this); + } + +private: + /// Beginning of `items` vector is heap of active children: [0; `heap_size`). + /// Next go inactive children in unsorted order. + /// NOTE: we have to track vruntime of inactive children for max-min fairness. 
+ std::vector items; + size_t heap_size = 0; + + /// Last request vruntime + double system_vruntime = 0; + double max_vruntime = 0; + UInt64 last_reset_ns = 0; + + /// All children with ownership + std::unordered_map children; // basename -> child +}; + +} diff --git a/src/IO/Resource/registerResourceManagers.cpp b/src/IO/Resource/registerResourceManagers.cpp index 0a394e3f0cd..5217bcdfbec 100644 --- a/src/IO/Resource/registerResourceManagers.cpp +++ b/src/IO/Resource/registerResourceManagers.cpp @@ -4,11 +4,13 @@ namespace DB { +void registerDynamicResourceManager(ResourceManagerFactory &); void registerStaticResourceManager(ResourceManagerFactory &); void registerResourceManagers() { auto & factory = ResourceManagerFactory::instance(); + registerDynamicResourceManager(factory); registerStaticResourceManager(factory); } diff --git a/src/IO/Resource/registerSchedulerNodes.cpp b/src/IO/Resource/registerSchedulerNodes.cpp index 1b58b3981c2..896f96d7f50 100644 --- a/src/IO/Resource/registerSchedulerNodes.cpp +++ b/src/IO/Resource/registerSchedulerNodes.cpp @@ -8,6 +8,7 @@ namespace DB { void registerPriorityPolicy(SchedulerNodeFactory &); +void registerFairPolicy(SchedulerNodeFactory &); void registerSemaphoreConstraint(SchedulerNodeFactory &); void registerFifoQueue(SchedulerNodeFactory &); @@ -17,6 +18,7 @@ void registerSchedulerNodes() // ISchedulerNode registerPriorityPolicy(factory); + registerFairPolicy(factory); // ISchedulerConstraint registerSemaphoreConstraint(factory); diff --git a/src/IO/Resource/tests/gtest_resource_class_fair.cpp b/src/IO/Resource/tests/gtest_resource_class_fair.cpp new file mode 100644 index 00000000000..89ec2ac7c32 --- /dev/null +++ b/src/IO/Resource/tests/gtest_resource_class_fair.cpp @@ -0,0 +1,187 @@ +#include + +#include + +#include + +using namespace DB; + +using ResourceTest = ResourceTestClass; + +TEST(IOResourceFairPolicy, Factory) +{ + ResourceTest t; + + Poco::AutoPtr cfg = new Poco::Util::XMLConfiguration(); + SchedulerNodePtr fair = SchedulerNodeFactory::instance().get("fair", /* event_queue = */ nullptr, *cfg, ""); + EXPECT_TRUE(dynamic_cast(fair.get()) != nullptr); +} + +TEST(IOResourceFairPolicy, FairnessWeights) +{ + ResourceTest t; + + t.add("/"); + t.add("/A", "1.0"); + t.add("/B", "3.0"); + + t.enqueue("/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/B", {10, 10, 10, 10, 10, 10, 10, 10}); + + t.dequeue(4); + t.consumed("A", 10); + t.consumed("B", 30); + + t.dequeue(4); + t.consumed("A", 10); + t.consumed("B", 30); + + t.dequeue(); + t.consumed("A", 60); + t.consumed("B", 20); +} + +TEST(IOResourceFairPolicy, Activation) +{ + ResourceTest t; + + t.add("/"); + t.add("/A"); + t.add("/B"); + t.add("/C"); + + t.enqueue("/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/B", {10}); + t.enqueue("/C", {10, 10}); + + t.dequeue(3); + t.consumed("A", 10); + t.consumed("B", 10); + t.consumed("C", 10); + + t.dequeue(4); + t.consumed("A", 30); + t.consumed("B", 0); + t.consumed("C", 10); + + t.enqueue("/B", {10, 10}); + t.dequeue(1); + t.consumed("B", 10); + + t.enqueue("/C", {10, 10}); + t.dequeue(1); + t.consumed("C", 10); + + t.dequeue(2); // A B or B A + t.consumed("A", 10); + t.consumed("B", 10); +} + +TEST(IOResourceFairPolicy, FairnessMaxMin) +{ + ResourceTest t; + + t.add("/"); + t.add("/A"); + t.add("/B"); + + t.enqueue("/A", {10, 10}); // make sure A is never empty + + for (int i = 0; i < 10; i++) + { + t.enqueue("/A", {10, 10, 10, 10}); + t.enqueue("/B", {10, 10}); + + t.dequeue(6); + t.consumed("A", 40); + t.consumed("B", 20); + } + + 
t.dequeue(2); + t.consumed("A", 20); +} + +TEST(IOResourceFairPolicy, HierarchicalFairness) +{ + ResourceTest t; + + t.add("/"); + t.add("/X"); + t.add("/Y"); + t.add("/X/A"); + t.add("/X/B"); + t.add("/Y/C"); + t.add("/Y/D"); + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 40); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("B", 40); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("C", 40); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("D", 40); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 40); + t.consumed("D", 40); + } +} diff --git a/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp new file mode 100644 index 00000000000..b113da31d59 --- /dev/null +++ b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp @@ -0,0 +1,116 @@ +#include + +#include + +#include +#include + +using namespace DB; + +using ResourceTest = ResourceTestManager; +using TestGuard = ResourceTest::Guard; + +TEST(IOResourceDynamicResourceManager, Smoke) +{ + ResourceTest t; + + t.update(R"CONFIG( + + + + inflight_limit10 + fair + fifo + fifo3 + + + + /fair/A + /fair/B + + + )CONFIG"); + + ClassifierPtr cA = t.manager->acquire("A"); + ClassifierPtr cB = t.manager->acquire("B"); + + for (int i = 0; i < 10; i++) + { + ResourceGuard gA(cA->get("res1"), ResourceGuard::PostponeLocking); + gA.lock(); + gA.setFailure(); + gA.unlock(); + + ResourceGuard gB(cB->get("res1")); + } +} + +TEST(IOResourceDynamicResourceManager, Fairness) +{ + constexpr size_t T = 3; // threads per queue + int N = 100; // requests per thread + ResourceTest t(2 * T + 1); + + t.update(R"CONFIG( + + + + inflight_limit1 + fair + fifo + fifo + fifo + + + + /fair/A + /fair/B + /fair/leader + + + )CONFIG"); + + + // Total cost for A and B 
cannot differ for more than 1 (every request has cost equal to 1). + // Requests from A use `value = 1` and from B `value = -1` is used. + std::atomic unfairness = 0; + auto fairness_diff = [&] (Int64 value) + { + Int64 cur_unfairness = unfairness.fetch_add(value, std::memory_order_relaxed) + value; + EXPECT_NEAR(cur_unfairness, 0, 1); + }; + + for (int thr = 0; thr < T; thr++) + { + t.threads.emplace_back([&] + { + ClassifierPtr c = t.manager->acquire("A"); + ResourceLink link = c->get("res1"); + t.startBusyPeriod(link, 1, N); + for (int req = 0; req < N; req++) + { + TestGuard g(t, link, 1); + fairness_diff(1); + } + }); + } + + for (int thr = 0; thr < T; thr++) + { + t.threads.emplace_back([&] + { + ClassifierPtr c = t.manager->acquire("B"); + ResourceLink link = c->get("res1"); + t.startBusyPeriod(link, 1, N); + for (int req = 0; req < N; req++) + { + TestGuard g(t, link, 1); + fairness_diff(-1); + } + }); + } + + ClassifierPtr c = t.manager->acquire("leader"); + ResourceLink link = c->get("res1"); + t.blockResource(link); +} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e222b8655aa..b4bdb7cf233 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1220,7 +1220,7 @@ ResourceManagerPtr Context::getResourceManager() const { auto lock = getLock(); if (!shared->resource_manager) - shared->resource_manager = ResourceManagerFactory::instance().get(getConfigRef().getString("resource_manager", "static")); + shared->resource_manager = ResourceManagerFactory::instance().get(getConfigRef().getString("resource_manager", "dynamic")); return shared->resource_manager; } From 11b94a626aca1cf346db8c797bbce672a72f0703 Mon Sep 17 00:00:00 2001 From: FFFFFFFHHHHHHH <75292180+FFFFFFFHHHHHHH@users.noreply.github.com> Date: Tue, 16 May 2023 05:20:29 +0800 Subject: [PATCH 45/45] Fix aggregate function kolmogorovSmirnovTest (#49768) --- .../AggregateFunctionKolmogorovSmirnovTest.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h b/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h index 33a9966ee2c..5629de31c88 100644 --- a/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h +++ b/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h @@ -43,6 +43,7 @@ struct KolmogorovSmirnov : public StatisticalSample Float64 now_s = 0; UInt64 pos_x = 0; UInt64 pos_y = 0; + UInt64 pos_tmp; UInt64 n1 = x.size(); UInt64 n2 = y.size(); @@ -65,14 +66,22 @@ struct KolmogorovSmirnov : public StatisticalSample now_s -= n2_d; ++pos_y; } - max_s = std::max(max_s, now_s); - min_s = std::min(min_s, now_s); } else { - now_s += n1_d; - ++pos_x; + pos_tmp = pos_x + 1; + while (pos_tmp < x.size() && unlikely(fabs(x[pos_tmp] - x[pos_x]) <= tol)) + pos_tmp++; + now_s += n1_d * (pos_tmp - pos_x); + pos_x = pos_tmp; + pos_tmp = pos_y + 1; + while (pos_tmp < y.size() && unlikely(fabs(y[pos_tmp] - y[pos_y]) <= tol)) + pos_tmp++; + now_s -= n2_d * (pos_tmp - pos_y); + pos_y = pos_tmp; } + max_s = std::max(max_s, now_s); + min_s = std::min(min_s, now_s); } now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y); min_s = std::min(min_s, now_s);
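The patch above fixes kolmogorovSmirnovTest for samples that contain ties: when x[pos_x] and y[pos_y] coincide within tol, the whole run of equal values is now consumed from both samples before the running CDF difference is compared against max_s/min_s, so the supremum is only sampled after both empirical distribution functions have completed their jump at that point. What follows is a minimal standalone sketch of the same idea, for reference only; it is not the ClickHouse implementation. The helper name ksStatistic and the main() driver are illustrative, while the variable names (now_s, max_s, min_s, pos_x, pos_y, tol) mirror those used in the patch.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Two-sample Kolmogorov-Smirnov D statistic with tie handling:
// D = sup_t |F1(t) - F2(t)|, where F1/F2 are the empirical CDFs of x/y.
static double ksStatistic(std::vector<double> x, std::vector<double> y, double tol = 1e-12)
{
    std::sort(x.begin(), x.end());
    std::sort(y.begin(), y.end());

    const double n1_d = 1.0 / x.size();
    const double n2_d = 1.0 / y.size();
    double now_s = 0.0, max_s = 0.0, min_s = 0.0;
    size_t pos_x = 0, pos_y = 0;

    while (pos_x < x.size() && pos_y < y.size())
    {
        if (std::fabs(x[pos_x] - y[pos_y]) > tol)
        {
            // No tie between the two samples: advance the smaller value.
            if (x[pos_x] < y[pos_y])
            {
                now_s += n1_d;
                ++pos_x;
            }
            else
            {
                now_s -= n2_d;
                ++pos_y;
            }
        }
        else
        {
            // Tie between the samples: consume the whole run of equal values
            // from both sides before sampling the extrema; stopping half-way
            // through the tie block would distort max_s/min_s.
            size_t tmp = pos_x + 1;
            while (tmp < x.size() && std::fabs(x[tmp] - x[pos_x]) <= tol)
                ++tmp;
            now_s += n1_d * (tmp - pos_x);
            pos_x = tmp;

            tmp = pos_y + 1;
            while (tmp < y.size() && std::fabs(y[tmp] - y[pos_y]) <= tol)
                ++tmp;
            now_s -= n2_d * (tmp - pos_y);
            pos_y = tmp;
        }
        max_s = std::max(max_s, now_s);
        min_s = std::min(min_s, now_s);
    }

    // Account for whichever sample was not fully consumed.
    now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y);
    max_s = std::max(max_s, now_s);
    min_s = std::min(min_s, now_s);
    return std::max(max_s, -min_s);
}

int main()
{
    // {1, 2, 2, 3} vs {2, 2, 4}: both CDFs jump by a whole tie block at t = 2;
    // the expected result is D = 1/3.
    printf("D = %.4f\n", ksStatistic({1, 2, 2, 3}, {2, 2, 4}));
}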