diff --git a/.github/workflows/backport.yml b/.github/workflows/backport.yml index 66dddbee640..da42bbae78a 100644 --- a/.github/workflows/backport.yml +++ b/.github/workflows/backport.yml @@ -13,7 +13,7 @@ on: # yamllint disable-line rule:truthy jobs: CherryPick: - runs-on: [self-hosted, style-checker] + runs-on: [self-hosted, style-checker-aarch64] steps: - name: Set envs # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings diff --git a/base/base/LineReader.h b/base/base/LineReader.h index 33daae49974..d4ab327fe00 100644 --- a/base/base/LineReader.h +++ b/base/base/LineReader.h @@ -7,6 +7,7 @@ #include #include +#include <base/defines.h> class LineReader { @@ -20,8 +21,8 @@ public: void addWords(Words && new_words); private: - Words words; - Words words_no_case; + Words words TSA_GUARDED_BY(mutex); + Words words_no_case TSA_GUARDED_BY(mutex); std::mutex mutex; }; @@ -29,7 +30,7 @@ public: using Patterns = std::vector<const char *>; LineReader(const String & history_file_path, bool multiline, Patterns extenders, Patterns delimiters); - virtual ~LineReader() {} + virtual ~LineReader() = default; /// Reads the whole line until delimiter (in multiline mode) or until the last line without extender. /// If resulting line is empty, it means the user interrupted the input. diff --git a/base/base/defines.h b/base/base/defines.h index 084e710abf6..3959e690d71 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -124,6 +124,23 @@ #endif #endif +// Macros for Clang Thread Safety Analysis (TSA). They can be safely ignored by other compilers. +// Feel free to extend, but please stay close to https://clang.llvm.org/docs/ThreadSafetyAnalysis.html#mutexheader +#if defined(__clang__) +# define TSA_GUARDED_BY(...) __attribute__((guarded_by(__VA_ARGS__))) // data is protected by given capability +# define TSA_PT_GUARDED_BY(...) __attribute__((pt_guarded_by(__VA_ARGS__))) // pointed-to data is protected by the given capability +# define TSA_REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__))) // thread needs exclusive possession of given capability +# define TSA_REQUIRES_SHARED(...) __attribute__((requires_shared_capability(__VA_ARGS__))) // thread needs shared possession of given capability +# define TSA_ACQUIRED_AFTER(...) __attribute__((acquired_after(__VA_ARGS__))) // annotated lock must be locked after given lock +# define TSA_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis)) // disable TSA for a function +#else +# define TSA_GUARDED_BY(...) +# define TSA_PT_GUARDED_BY(...) +# define TSA_REQUIRES(...) +# define TSA_REQUIRES_SHARED(...) +# define TSA_NO_THREAD_SAFETY_ANALYSIS +#endif + /// A template function for suppressing warnings about unused variables or function results. template <typename... Args> constexpr void UNUSED(Args &&... args [[maybe_unused]]) diff --git a/base/harmful/harmful.c b/base/harmful/harmful.c index 5a27cae0383..6112f9a339c 100644 --- a/base/harmful/harmful.c +++ b/base/harmful/harmful.c @@ -260,4 +260,35 @@ TRAP(mq_timedreceive) TRAP(wordexp) TRAP(wordfree) +/// C11 threading primitives are not supported by ThreadSanitizer. +/// Also we should avoid using them for compatibility with old libc.
+TRAP(thrd_create) +TRAP(thrd_equal) +TRAP(thrd_current) +TRAP(thrd_sleep) +TRAP(thrd_yield) +TRAP(thrd_exit) +TRAP(thrd_detach) +TRAP(thrd_join) + +TRAP(mtx_init) +TRAP(mtx_lock) +TRAP(mtx_timedlock) +TRAP(mtx_trylock) +TRAP(mtx_unlock) +TRAP(mtx_destroy) +TRAP(call_once) + +TRAP(cnd_init) +TRAP(cnd_signal) +TRAP(cnd_broadcast) +TRAP(cnd_wait) +TRAP(cnd_timedwait) +TRAP(cnd_destroy) + +TRAP(tss_create) +TRAP(tss_get) +TRAP(tss_set) +TRAP(tss_delete) + #endif diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 0560bd46fed..181f5b8ac51 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -112,7 +112,7 @@ endif() # Archiver if (COMPILER_GCC) - find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-13" "llvm-ar-12" "llvm-ar-11") + find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-14" "llvm-ar-13" "llvm-ar-12") else () find_program (LLVM_AR_PATH NAMES "llvm-ar-${COMPILER_VERSION_MAJOR}" "llvm-ar") endif () @@ -126,7 +126,7 @@ message(STATUS "Using archiver: ${CMAKE_AR}") # Ranlib if (COMPILER_GCC) - find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-13" "llvm-ranlib-12" "llvm-ranlib-11") + find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-14" "llvm-ranlib-13" "llvm-ranlib-12") else () find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib-${COMPILER_VERSION_MAJOR}" "llvm-ranlib") endif () @@ -140,7 +140,7 @@ message(STATUS "Using ranlib: ${CMAKE_RANLIB}") # Install Name Tool if (COMPILER_GCC) - find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool" "llvm-install-name-tool-13" "llvm-install-name-tool-12" "llvm-install-name-tool-11") + find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool" "llvm-install-name-tool-14" "llvm-install-name-tool-13" "llvm-install-name-tool-12") else () find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool-${COMPILER_VERSION_MAJOR}" "llvm-install-name-tool") endif () @@ -154,7 +154,7 @@ message(STATUS "Using install-name-tool: ${CMAKE_INSTALL_NAME_TOOL}") # Objcopy if (COMPILER_GCC) - find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-13" "llvm-objcopy-12" "llvm-objcopy-11" "objcopy") + find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-14" "llvm-objcopy-13" "llvm-objcopy-12" "objcopy") else () find_program (OBJCOPY_PATH NAMES "llvm-objcopy-${COMPILER_VERSION_MAJOR}" "llvm-objcopy" "objcopy") endif () @@ -168,7 +168,7 @@ endif () # Strip if (COMPILER_GCC) - find_program (STRIP_PATH NAMES "llvm-strip" "llvm-strip-13" "llvm-strip-12" "llvm-strip-11" "strip") + find_program (STRIP_PATH NAMES "llvm-strip" "llvm-strip-14" "llvm-strip-13" "llvm-strip-12" "strip") else () find_program (STRIP_PATH NAMES "llvm-strip-${COMPILER_VERSION_MAJOR}" "llvm-strip" "strip") endif () diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index 4b8f83df090..6b08f1fda05 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -19,7 +19,6 @@ if (COMPILER_CLANG) # Add some warnings that are not available even with -Wall -Wextra -Wpedantic. # We want to get everything out of the compiler for code quality. add_warning(everything) - add_warning(pedantic) no_warning(vla-extension) no_warning(zero-length-array) @@ -51,6 +50,7 @@ if (COMPILER_CLANG) no_warning(vla) no_warning(weak-template-vtables) no_warning(weak-vtables) + no_warning(thread-safety-negative) # experimental flag, too many false positives # TODO Enable conversion, sign-conversion, double-promotion warnings. 
elseif (COMPILER_GCC) # Add compiler options only to c++ compiler diff --git a/contrib/curl b/contrib/curl index 801bd5138ce..462196e6b4a 160000 --- a/contrib/curl +++ b/contrib/curl @@ -1 +1 @@ -Subproject commit 801bd5138ce31aa0d906fa4e2eabfc599d74e793 +Subproject commit 462196e6b4a47f924293a0e26b8e9c23d37ac26f diff --git a/contrib/curl-cmake/CMakeLists.txt b/contrib/curl-cmake/CMakeLists.txt index b1e1a0ded8a..761ee036e66 100644 --- a/contrib/curl-cmake/CMakeLists.txt +++ b/contrib/curl-cmake/CMakeLists.txt @@ -84,7 +84,6 @@ set (SRCS "${LIBRARY_DIR}/lib/gopher.c" "${LIBRARY_DIR}/lib/idn_win32.c" "${LIBRARY_DIR}/lib/http_proxy.c" - "${LIBRARY_DIR}/lib/non-ascii.c" "${LIBRARY_DIR}/lib/asyn-thread.c" "${LIBRARY_DIR}/lib/curl_gssapi.c" "${LIBRARY_DIR}/lib/http_ntlm.c" @@ -93,10 +92,8 @@ set (SRCS "${LIBRARY_DIR}/lib/curl_sasl.c" "${LIBRARY_DIR}/lib/rand.c" "${LIBRARY_DIR}/lib/curl_multibyte.c" - "${LIBRARY_DIR}/lib/hostcheck.c" "${LIBRARY_DIR}/lib/conncache.c" "${LIBRARY_DIR}/lib/dotdot.c" - "${LIBRARY_DIR}/lib/x509asn1.c" "${LIBRARY_DIR}/lib/http2.c" "${LIBRARY_DIR}/lib/smb.c" "${LIBRARY_DIR}/lib/curl_endian.c" @@ -120,6 +117,9 @@ set (SRCS "${LIBRARY_DIR}/lib/http_aws_sigv4.c" "${LIBRARY_DIR}/lib/mqtt.c" "${LIBRARY_DIR}/lib/rename.c" + "${LIBRARY_DIR}/lib/h2h3.c" + "${LIBRARY_DIR}/lib/headers.c" + "${LIBRARY_DIR}/lib/timediff.c" "${LIBRARY_DIR}/lib/vauth/vauth.c" "${LIBRARY_DIR}/lib/vauth/cleartext.c" "${LIBRARY_DIR}/lib/vauth/cram.c" @@ -142,11 +142,13 @@ set (SRCS "${LIBRARY_DIR}/lib/vtls/sectransp.c" "${LIBRARY_DIR}/lib/vtls/gskit.c" "${LIBRARY_DIR}/lib/vtls/mbedtls.c" - "${LIBRARY_DIR}/lib/vtls/mesalink.c" "${LIBRARY_DIR}/lib/vtls/bearssl.c" "${LIBRARY_DIR}/lib/vtls/keylog.c" + "${LIBRARY_DIR}/lib/vtls/x509asn1.c" + "${LIBRARY_DIR}/lib/vtls/hostcheck.c" "${LIBRARY_DIR}/lib/vquic/ngtcp2.c" "${LIBRARY_DIR}/lib/vquic/quiche.c" + "${LIBRARY_DIR}/lib/vquic/msh3.c" "${LIBRARY_DIR}/lib/vssh/libssh2.c" "${LIBRARY_DIR}/lib/vssh/libssh.c" ) diff --git a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index dc9df48b2c1..a501c4df64f 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -78,6 +78,9 @@ target_compile_options(cxx PUBLIC $<$:-nostdinc++>) # Third party library may have substandard code. target_compile_options(cxx PRIVATE -w) +# Enable support for Clang-Thread-Safety-Analysis in libcxx +target_compile_definitions(cxx PUBLIC -D_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS) + target_link_libraries(cxx PUBLIC cxxabi) # For __udivmodti4, __divmodti4. diff --git a/contrib/librdkafka b/contrib/librdkafka index b8554f16820..6062e711a91 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit b8554f1682062c85ba519eb54ef2f90e02b812cb +Subproject commit 6062e711a919fb3b669b243b7dceabd045d0e4a2 diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index cafc62b365e..2bbdd978e5e 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -37,38 +37,13 @@ export FASTTEST_DATA export FASTTEST_OUT export PATH -server_pid=none - -function stop_server -{ - if ! kill -0 -- "$server_pid" - then - echo "ClickHouse server pid '$server_pid' is not running" - return 0 - fi - - for _ in {1..60} - do - if ! pkill -f "clickhouse-server" && ! 
kill -- "$server_pid" ; then break ; fi - sleep 1 - done - - if kill -0 -- "$server_pid" - then - pstree -apgT - jobs - echo "Failed to kill the ClickHouse server pid '$server_pid'" - return 1 - fi - - server_pid=none -} - function start_server { set -m # Spawn server in its own process groups + local opts=( --config-file "$FASTTEST_DATA/config.xml" + --pid-file "$FASTTEST_DATA/clickhouse-server.pid" -- --path "$FASTTEST_DATA" --user_files_path "$FASTTEST_DATA/user_files" @@ -76,40 +51,22 @@ function start_server --keeper_server.storage_path "$FASTTEST_DATA/coordination" ) clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" & - server_pid=$! set +m - if [ "$server_pid" == "0" ] - then - echo "Failed to start ClickHouse server" - # Avoid zero PID because `kill` treats it as our process group PID. - server_pid="none" - return 1 - fi - - for _ in {1..60} - do - if clickhouse-client --query "select 1" || ! kill -0 -- "$server_pid" - then + for _ in {1..60}; do + if clickhouse-client --query "select 1"; then break fi sleep 1 done - if ! clickhouse-client --query "select 1" - then + if ! clickhouse-client --query "select 1"; then echo "Failed to wait until ClickHouse server starts." - server_pid="none" - return 1 - fi - - if ! kill -0 -- "$server_pid" - then - echo "Wrong clickhouse server started: PID '$server_pid' we started is not running, but '$(pgrep -f clickhouse-server)' is running" - server_pid="none" return 1 fi + local server_pid + server_pid="$(cat "$FASTTEST_DATA/clickhouse-server.pid")" echo "ClickHouse server pid '$server_pid' started and responded" } @@ -254,9 +211,6 @@ function run_tests clickhouse-server --version clickhouse-test --help - # Kill the server in case we are running locally and not in docker - stop_server ||: - start_server set +e @@ -284,6 +238,8 @@ function run_tests | ts '%Y-%m-%d %H:%M:%S' \ | tee "$FASTTEST_OUTPUT/test_result.txt" set -e + + clickhouse stop --pid-path "$FASTTEST_DATA" } case "$stage" in diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 4bc10aee95f..f74760e3339 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -125,16 +125,7 @@ function filter_exists_and_template function stop_server { clickhouse-client --query "select elapsed, query from system.processes" ||: - killall clickhouse-server ||: - for _ in {1..10} - do - if ! pgrep -f clickhouse-server - then - break - fi - sleep 1 - done - killall -9 clickhouse-server ||: + clickhouse stop # Debug. date @@ -159,10 +150,12 @@ function fuzz NEW_TESTS_OPT="${NEW_TESTS_OPT:-}" fi + mkdir -p /var/run/clickhouse-server + # interferes with gdb export CLICKHOUSE_WATCHDOG_ENABLE=0 # NOTE: we use process substitution here to preserve keep $! as a pid of clickhouse-server - clickhouse-server --config-file db/config.xml -- --path db > >(tail -100000 > server.log) 2>&1 & + clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > >(tail -100000 > server.log) 2>&1 & server_pid=$! 
kill -0 $server_pid diff --git a/docker/test/sqlancer/run.sh b/docker/test/sqlancer/run.sh index e465ba1c993..a1891569d34 100755 --- a/docker/test/sqlancer/run.sh +++ b/docker/test/sqlancer/run.sh @@ -21,7 +21,7 @@ export NUM_QUERIES=1000 ( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPDistinct | tee /test_output/TLPDistinct.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPDistinct.err ( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPAggregate | tee /test_output/TLPAggregate.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPAggregate.err -service clickhouse-server stop && sleep 10 +service clickhouse stop ls /var/log/clickhouse-server/ tar czf /test_output/logs.tar.gz -C /var/log/clickhouse-server/ . diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index b598e28d91d..5f55bb9fa21 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -22,17 +22,23 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test function start() { if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + mkdir -p /var/run/clickhouse-server1 + sudo chown clickhouse:clickhouse /var/run/clickhouse-server1 # NOTE We run "clickhouse server" instead of "clickhouse-server" # to make "pidof clickhouse-server" return single pid of the main instance. # We wil run main instance using "service clickhouse-server start" sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \ + --pid-file /var/run/clickhouse-server1/clickhouse-server.pid \ -- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \ --logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \ --tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \ --mysql_port 19004 --postgresql_port 19005 \ --keeper_server.tcp_port 19181 --keeper_server.server_id 2 + mkdir -p /var/run/clickhouse-server2 + sudo chown clickhouse:clickhouse /var/run/clickhouse-server2 sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \ + --pid-file /var/run/clickhouse-server2/clickhouse-server.pid \ -- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \ --logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \ --tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \ @@ -135,6 +141,12 @@ ls -la / /process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv +sudo clickhouse stop ||: +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||: + sudo clickhouse stop --pid-path /var/run/clickhouse-server2 ||: +fi + grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||: pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz ||: diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 727c4f8a0c1..52bf8a60669 100755 
--- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -41,15 +41,18 @@ if [ "$NUM_TRIES" -gt "1" ]; then export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 + mkdir -p /var/run/clickhouse-server # simpliest way to forward env variables to server - sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon + sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon --pid-file /var/run/clickhouse-server/clickhouse-server.pid else sudo clickhouse start fi if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - + mkdir -p /var/run/clickhouse-server1 + sudo chown clickhouse:clickhouse /var/run/clickhouse-server1 sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \ + --pid-file /var/run/clickhouse-server1/clickhouse-server.pid \ -- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \ --logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \ --tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \ @@ -57,7 +60,10 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] --keeper_server.tcp_port 19181 --keeper_server.server_id 2 \ --macros.replica r2 # It doesn't work :( + mkdir -p /var/run/clickhouse-server2 + sudo chown clickhouse:clickhouse /var/run/clickhouse-server2 sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \ + --pid-file /var/run/clickhouse-server2/clickhouse-server.pid \ -- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \ --logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \ --tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \ @@ -133,18 +139,10 @@ clickhouse-client -q "system flush logs" ||: # Stop server so we can safely read data with clickhouse-local. # Why do we read data with clickhouse-local? # Because it's the simplest way to read it when server has crashed. -if [ "$NUM_TRIES" -gt "1" ]; then - clickhouse-client -q "system shutdown" ||: - sleep 10 -else - sudo clickhouse stop ||: -fi - - +sudo clickhouse stop ||: if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then - clickhouse-client --port 19000 -q "system shutdown" ||: - clickhouse-client --port 29000 -q "system shutdown" ||: - sleep 10 + sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||: + sudo clickhouse stop --pid-path /var/run/clickhouse-server2 ||: fi grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||: diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 9688ae424c9..c73784c4ef1 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -7,26 +7,29 @@ set -x # Thread Fuzzer allows to check more permutations of possible thread scheduling # and find more potential issues. +# +# But under thread fuzzer, TSan build is too slow and this produces some flaky +# tests, so for now, as a temporary solution it had been disabled. +if ! 
test -f package_folder/clickhouse-server*tsan*.deb; then + export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 + export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 + export THREAD_FUZZER_SLEEP_TIME_US=100000 -export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 -export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 -export THREAD_FUZZER_SLEEP_TIME_US=100000 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 - -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 -export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 - -export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 -export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 -export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 + export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 + export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 +fi function install_packages() { diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 94fdfd536a7..e195f81b551 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -18,8 +18,10 @@ def get_options(i, backward_compatibility_check): options.append("--db-engine=Ordinary") if i % 3 == 2 and not backward_compatibility_check: - options.append('''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i)) - client_options.append('allow_experimental_database_replicated=1') + options.append( + '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) + ) + client_options.append("allow_experimental_database_replicated=1") # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. 
@@ -37,38 +39,58 @@ def get_options(i, backward_compatibility_check): if i % 15 == 11: client_options.append("join_algorithm='auto'") - client_options.append('max_rows_in_join=1000') + client_options.append("max_rows_in_join=1000") if i == 13: - client_options.append('memory_tracker_fault_probability=0.001') + client_options.append("memory_tracker_fault_probability=0.001") if client_options: - options.append(" --client-option " + ' '.join(client_options)) + options.append(" --client-option " + " ".join(client_options)) - return ' '.join(options) + return " ".join(options) -def run_func_test(cmd, output_prefix, num_processes, skip_tests_option, global_time_limit, backward_compatibility_check): - backward_compatibility_check_option = '--backward-compatibility-check' if backward_compatibility_check else '' - global_time_limit_option = '' +def run_func_test( + cmd, + output_prefix, + num_processes, + skip_tests_option, + global_time_limit, + backward_compatibility_check, +): + backward_compatibility_check_option = ( + "--backward-compatibility-check" if backward_compatibility_check else "" + ) + global_time_limit_option = "" if global_time_limit: global_time_limit_option = "--global_time_limit={}".format(global_time_limit) - output_paths = [os.path.join(output_prefix, "stress_test_run_{}.txt".format(i)) for i in range(num_processes)] + output_paths = [ + os.path.join(output_prefix, "stress_test_run_{}.txt".format(i)) + for i in range(num_processes) + ] pipes = [] for i in range(0, len(output_paths)): - f = open(output_paths[i], 'w') - full_command = "{} {} {} {} {}".format(cmd, get_options(i, backward_compatibility_check), global_time_limit_option, skip_tests_option, backward_compatibility_check_option) + f = open(output_paths[i], "w") + full_command = "{} {} {} {} {}".format( + cmd, + get_options(i, backward_compatibility_check), + global_time_limit_option, + skip_tests_option, + backward_compatibility_check_option, + ) logging.info("Run func tests '%s'", full_command) p = Popen(full_command, shell=True, stdout=f, stderr=f) pipes.append(p) time.sleep(0.5) return pipes + def compress_stress_logs(output_path, files_prefix): cmd = f"cd {output_path} && tar -zcf stress_run_logs.tar.gz {files_prefix}* && rm {files_prefix}*" check_output(cmd, shell=True) + def call_with_retry(query, timeout=30, retry_count=5): for i in range(retry_count): code = call(query, shell=True, stderr=STDOUT, timeout=timeout) @@ -77,6 +99,7 @@ def call_with_retry(query, timeout=30, retry_count=5): else: break + def make_query_command(query): return f"""clickhouse client -q "{query}" --max_untracked_memory=1Gi --memory_profiler_step=1Gi --max_memory_usage_for_user=0""" @@ -93,28 +116,34 @@ def prepare_for_hung_check(drop_databases): # ThreadFuzzer significantly slows down server and causes false-positive hung check failures call_with_retry("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'") - call_with_retry(make_query_command('SELECT 1 FORMAT Null')) + call_with_retry(make_query_command("SELECT 1 FORMAT Null")) # Some tests execute SYSTEM STOP MERGES or similar queries. # It may cause some ALTERs to hang. # Possibly we should fix tests and forbid to use such queries without specifying table. 
- call_with_retry(make_query_command('SYSTEM START MERGES')) - call_with_retry(make_query_command('SYSTEM START DISTRIBUTED SENDS')) - call_with_retry(make_query_command('SYSTEM START TTL MERGES')) - call_with_retry(make_query_command('SYSTEM START MOVES')) - call_with_retry(make_query_command('SYSTEM START FETCHES')) - call_with_retry(make_query_command('SYSTEM START REPLICATED SENDS')) - call_with_retry(make_query_command('SYSTEM START REPLICATION QUEUES')) - call_with_retry(make_query_command('SYSTEM DROP MARK CACHE')) + call_with_retry(make_query_command("SYSTEM START MERGES")) + call_with_retry(make_query_command("SYSTEM START DISTRIBUTED SENDS")) + call_with_retry(make_query_command("SYSTEM START TTL MERGES")) + call_with_retry(make_query_command("SYSTEM START MOVES")) + call_with_retry(make_query_command("SYSTEM START FETCHES")) + call_with_retry(make_query_command("SYSTEM START REPLICATED SENDS")) + call_with_retry(make_query_command("SYSTEM START REPLICATION QUEUES")) + call_with_retry(make_query_command("SYSTEM DROP MARK CACHE")) # Issue #21004, live views are experimental, so let's just suppress it call_with_retry(make_query_command("KILL QUERY WHERE upper(query) LIKE 'WATCH %'")) # Kill other queries which known to be slow # It's query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds - call_with_retry(make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'")) + call_with_retry( + make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'") + ) # Long query from 00084_external_agregation - call_with_retry(make_query_command("KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'")) + call_with_retry( + make_query_command( + "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" + ) + ) if drop_databases: for i in range(5): @@ -123,23 +152,35 @@ def prepare_for_hung_check(drop_databases): # Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds). # # Also specify max_untracked_memory to allow 1GiB of memory to overcommit. - databases = check_output(make_query_command('SHOW DATABASES'), shell=True, timeout=30).decode('utf-8').strip().split() + databases = ( + check_output( + make_query_command("SHOW DATABASES"), shell=True, timeout=30 + ) + .decode("utf-8") + .strip() + .split() + ) for db in databases: if db == "system": continue - command = make_query_command(f'DROP DATABASE {db}') + command = make_query_command(f"DROP DATABASE {db}") # we don't wait for drop Popen(command, shell=True) break except Exception as ex: - logging.error("Failed to SHOW or DROP databasese, will retry %s", str(ex)) + logging.error( + "Failed to SHOW or DROP databasese, will retry %s", str(ex) + ) time.sleep(i) else: - raise Exception("Cannot drop databases after stress tests. Probably server consumed too much memory and cannot execute simple queries") - + raise Exception( + "Cannot drop databases after stress tests. 
Probably server consumed too much memory and cannot execute simple queries" + ) # Wait for last queries to finish if any, not longer than 300 seconds - call(make_query_command(""" + call( + make_query_command( + """ select sleepEachRow(( select maxOrDefault(300 - elapsed) + 1 from system.processes @@ -147,39 +188,58 @@ def prepare_for_hung_check(drop_databases): ) / 300) from numbers(300) format Null - """), shell=True, stderr=STDOUT, timeout=330) + """ + ), + shell=True, + stderr=STDOUT, + timeout=330, + ) # Even if all clickhouse-test processes are finished, there are probably some sh scripts, # which still run some new queries. Let's ignore them. try: query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """ - output = check_output(query, shell=True, stderr=STDOUT, timeout=30).decode('utf-8').strip() + output = ( + check_output(query, shell=True, stderr=STDOUT, timeout=30) + .decode("utf-8") + .strip() + ) if int(output) == 0: return False except: pass return True + if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') - parser = argparse.ArgumentParser(description="ClickHouse script for running stresstest") - parser.add_argument("--test-cmd", default='/usr/bin/clickhouse-test') - parser.add_argument("--skip-func-tests", default='') - parser.add_argument("--client-cmd", default='clickhouse-client') - parser.add_argument("--server-log-folder", default='/var/log/clickhouse-server') + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") + parser = argparse.ArgumentParser( + description="ClickHouse script for running stresstest" + ) + parser.add_argument("--test-cmd", default="/usr/bin/clickhouse-test") + parser.add_argument("--skip-func-tests", default="") + parser.add_argument("--client-cmd", default="clickhouse-client") + parser.add_argument("--server-log-folder", default="/var/log/clickhouse-server") parser.add_argument("--output-folder") parser.add_argument("--global-time-limit", type=int, default=1800) parser.add_argument("--num-parallel", type=int, default=cpu_count()) - parser.add_argument('--backward-compatibility-check', action='store_true') - parser.add_argument('--hung-check', action='store_true', default=False) + parser.add_argument("--backward-compatibility-check", action="store_true") + parser.add_argument("--hung-check", action="store_true", default=False) # make sense only for hung check - parser.add_argument('--drop-databases', action='store_true', default=False) + parser.add_argument("--drop-databases", action="store_true", default=False) args = parser.parse_args() if args.drop_databases and not args.hung_check: raise Exception("--drop-databases only used in hung check (--hung-check)") func_pipes = [] - func_pipes = run_func_test(args.test_cmd, args.output_folder, args.num_parallel, args.skip_func_tests, args.global_time_limit, args.backward_compatibility_check) + func_pipes = run_func_test( + args.test_cmd, + args.output_folder, + args.num_parallel, + args.skip_func_tests, + args.global_time_limit, + args.backward_compatibility_check, + ) logging.info("Will wait functests to finish") while True: @@ -205,32 +265,41 @@ if __name__ == "__main__": have_long_running_queries = True logging.error("Failed to prepare for hung check %s", str(ex)) logging.info("Checking if some queries hung") - cmd = ' '.join([args.test_cmd, - # Do not track memory allocations up to 1Gi, - # this will allow to ignore server memory limit (max_server_memory_usage) for this query. 
- # - # NOTE: memory_profiler_step should be also adjusted, because: - # - # untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step) - # - # NOTE: that if there will be queries with GROUP BY, this trick - # will not work due to CurrentMemoryTracker::check() from - # Aggregator code. - # But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY. - "--client-option", "max_untracked_memory=1Gi", - "--client-option", "max_memory_usage_for_user=0", - "--client-option", "memory_profiler_step=1Gi", - # Use system database to avoid CREATE/DROP DATABASE queries - "--database=system", - "--hung-check", - "00001_select_1" - ]) + cmd = " ".join( + [ + args.test_cmd, + # Do not track memory allocations up to 1Gi, + # this will allow to ignore server memory limit (max_server_memory_usage) for this query. + # + # NOTE: memory_profiler_step should be also adjusted, because: + # + # untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step) + # + # NOTE: that if there will be queries with GROUP BY, this trick + # will not work due to CurrentMemoryTracker::check() from + # Aggregator code. + # But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY. + "--client-option", + "max_untracked_memory=1Gi", + "--client-option", + "max_memory_usage_for_user=0", + "--client-option", + "memory_profiler_step=1Gi", + # Use system database to avoid CREATE/DROP DATABASE queries + "--database=system", + "--hung-check", + "--stress", + "00001_select_1", + ] + ) res = call(cmd, shell=True, stderr=STDOUT) hung_check_status = "No queries hung\tOK\n" if res != 0 and have_long_running_queries: logging.info("Hung check failed with exit code {}".format(res)) hung_check_status = "Hung check failed\tFAIL\n" - with open(os.path.join(args.output_folder, "test_results.tsv"), 'w+') as results: + with open( + os.path.join(args.output_folder, "test_results.tsv"), "w+" + ) as results: results.write(hung_check_status) logging.info("Stress test finished") diff --git a/docs/changelogs/v20.5.1.3833-prestable.md b/docs/changelogs/v20.5.1.3833-prestable.md index 824fb051914..11a67ca8430 100644 --- a/docs/changelogs/v20.5.1.3833-prestable.md +++ b/docs/changelogs/v20.5.1.3833-prestable.md @@ -14,7 +14,7 @@ * Selects with final are executed in parallel. Added setting `max_final_threads` to limit the number of threads used. [#10463](https://github.com/ClickHouse/ClickHouse/pull/10463) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Function that extracts from haystack all matching non-overlapping groups with regular expressions, and put those into `Array(Array(String))` column. [#10534](https://github.com/ClickHouse/ClickHouse/pull/10534) ([Vasily Nemkov](https://github.com/Enmk)). * Added ability to delete a subset of expired rows, which satisfies the condition in WHERE clause. Added ability to replace expired rows with aggregates of them specified in GROUP BY clause. [#10537](https://github.com/ClickHouse/ClickHouse/pull/10537) ([expl0si0nn](https://github.com/expl0si0nn)). -* (Only Linux) Clickhouse server now tries to fallback to ProcfsMetricsProvider when clickhouse binary is not attributed with CAP_NET_ADMIN capability to collect per-query system metrics (for CPU and I/O). [#10544](https://github.com/ClickHouse/ClickHouse/pull/10544) ([Alexander Kazakov](https://github.com/Akazz)). 
+* (Only Linux) ClickHouse server now tries to fallback to ProcfsMetricsProvider when clickhouse binary is not attributed with CAP_NET_ADMIN capability to collect per-query system metrics (for CPU and I/O). [#10544](https://github.com/ClickHouse/ClickHouse/pull/10544) ([Alexander Kazakov](https://github.com/Akazz)). * - Add Arrow IPC File format (Input and Output) - Fix incorrect work of resetParser() for Parquet Input Format - Add zero-copy optimization for ORC for RandomAccessFiles - Add missing halffloat type for input parquet and ORC formats ... [#10580](https://github.com/ClickHouse/ClickHouse/pull/10580) ([Zhanna](https://github.com/FawnD2)). * Allowed to profile memory with finer granularity steps than 4 MiB. Added sampling memory profiler to capture random allocations/deallocations. [#10598](https://github.com/ClickHouse/ClickHouse/pull/10598) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Add new input format `JSONAsString` that accepts a sequence of JSON objects separated by newlines, spaces and/or commas. [#10607](https://github.com/ClickHouse/ClickHouse/pull/10607) ([Kruglov Pavel](https://github.com/Avogar)). diff --git a/docs/changelogs/v21.11.1.8636-prestable.md b/docs/changelogs/v21.11.1.8636-prestable.md index ade8084055c..2aab0293223 100644 --- a/docs/changelogs/v21.11.1.8636-prestable.md +++ b/docs/changelogs/v21.11.1.8636-prestable.md @@ -13,7 +13,7 @@ * Users now can set comments to database in `CREATE DATABASE` statement ... [#29429](https://github.com/ClickHouse/ClickHouse/pull/29429) ([Vasily Nemkov](https://github.com/Enmk)). * New function` mapContainsKeyLike` to get the map that key matches a simple regular expression. [#29471](https://github.com/ClickHouse/ClickHouse/pull/29471) ([凌涛](https://github.com/lingtaolf)). * Huawei OBS Storage support. Closes [#24294](https://github.com/ClickHouse/ClickHouse/issues/24294). [#29511](https://github.com/ClickHouse/ClickHouse/pull/29511) ([kevin wan](https://github.com/MaxWk)). -* Clickhouse HTTP Server can enable HSTS by set `hsts_max_age` in config.xml with a positive number. [#29516](https://github.com/ClickHouse/ClickHouse/pull/29516) ([凌涛](https://github.com/lingtaolf)). +* ClickHouse HTTP Server can enable HSTS by set `hsts_max_age` in config.xml with a positive number. [#29516](https://github.com/ClickHouse/ClickHouse/pull/29516) ([凌涛](https://github.com/lingtaolf)). * - Added MD4 and SHA384 functions. [#29602](https://github.com/ClickHouse/ClickHouse/pull/29602) ([Nikita Tikhomirov](https://github.com/NSTikhomirov)). * Support EXISTS(subquery). Closes [#6852](https://github.com/ClickHouse/ClickHouse/issues/6852). [#29731](https://github.com/ClickHouse/ClickHouse/pull/29731) ([Kseniia Sumarokova](https://github.com/kssenii)). * Added function `ngram`. Closes [#29699](https://github.com/ClickHouse/ClickHouse/issues/29699). [#29738](https://github.com/ClickHouse/ClickHouse/pull/29738) ([Maksim Kita](https://github.com/kitaisreal)). diff --git a/docs/changelogs/v21.12.1.9017-prestable.md b/docs/changelogs/v21.12.1.9017-prestable.md index bcf5424fc63..f5e036c9c52 100644 --- a/docs/changelogs/v21.12.1.9017-prestable.md +++ b/docs/changelogs/v21.12.1.9017-prestable.md @@ -54,7 +54,7 @@ * Add settings `merge_tree_min_rows_for_concurrent_read_for_remote_filesystem` and `merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem`. [#30970](https://github.com/ClickHouse/ClickHouse/pull/30970) ([Kseniia Sumarokova](https://github.com/kssenii)). 
* Do not allow to drop a table or dictionary if some tables or dictionaries depend on it. [#30977](https://github.com/ClickHouse/ClickHouse/pull/30977) ([Alexander Tokmakov](https://github.com/tavplubix)). * Only grab AlterLock when we do alter command. Let's see if the assumption is correct. [#31010](https://github.com/ClickHouse/ClickHouse/pull/31010) ([Amos Bird](https://github.com/amosbird)). -* The local session inside a Clickhouse dictionary source won't send its events to the session log anymore. This fixes a possible deadlock (tsan alert) on shutdown. Also this PR fixes flaky `test_dictionaries_dependency_xml/`. [#31013](https://github.com/ClickHouse/ClickHouse/pull/31013) ([Vitaly Baranov](https://github.com/vitlibar)). +* The local session inside a ClickHouse dictionary source won't send its events to the session log anymore. This fixes a possible deadlock (tsan alert) on shutdown. Also this PR fixes flaky `test_dictionaries_dependency_xml/`. [#31013](https://github.com/ClickHouse/ClickHouse/pull/31013) ([Vitaly Baranov](https://github.com/vitlibar)). * Cancel vertical merges when partition is dropped. This is a follow-up of https://github.com/ClickHouse/ClickHouse/pull/25684 and https://github.com/ClickHouse/ClickHouse/pull/30996. [#31057](https://github.com/ClickHouse/ClickHouse/pull/31057) ([Amos Bird](https://github.com/amosbird)). * Support `IF EXISTS` modifier for `RENAME DATABASE`/`TABLE`/`DICTIONARY` query, If this directive is used, one will not get an error if the DATABASE/TABLE/DICTIONARY to be renamed doesn't exist. [#31081](https://github.com/ClickHouse/ClickHouse/pull/31081) ([victorgao](https://github.com/kafka1991)). * Function name normalization for ALTER queries. This helps avoid metadata mismatch between creating table with indices/projections and adding indices/projections via alter commands. This is a follow-up PR of https://github.com/ClickHouse/ClickHouse/pull/20174. Mark as improvements as there are no bug reports and the senario is somehow rare. [#31095](https://github.com/ClickHouse/ClickHouse/pull/31095) ([Amos Bird](https://github.com/amosbird)). diff --git a/docs/changelogs/v21.12.3.32-stable.md b/docs/changelogs/v21.12.3.32-stable.md index 3c08aae4cba..b650f62dd34 100644 --- a/docs/changelogs/v21.12.3.32-stable.md +++ b/docs/changelogs/v21.12.3.32-stable.md @@ -1,7 +1,7 @@ ### ClickHouse release v21.12.3.32-stable FIXME as compared to v21.12.2.17-stable #### Bug Fix -* Backported in [#33018](https://github.com/ClickHouse/ClickHouse/issues/33018): - Clickhouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). +* Backported in [#33018](https://github.com/ClickHouse/ClickHouse/issues/33018): - ClickHouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). #### Bug Fix (user-visible misbehaviour in official stable or prestable release) diff --git a/docs/changelogs/v21.2.2.8-stable.md b/docs/changelogs/v21.2.2.8-stable.md index 368243120f1..73baea91547 100644 --- a/docs/changelogs/v21.2.2.8-stable.md +++ b/docs/changelogs/v21.2.2.8-stable.md @@ -68,7 +68,7 @@ * Add separate pool for message brokers (RabbitMQ and Kafka). [#19722](https://github.com/ClickHouse/ClickHouse/pull/19722) ([Azat Khuzhin](https://github.com/azat)). 
* In distributed queries if the setting `async_socket_for_remote` is enabled, it was possible to get stack overflow at least in debug build configuration if very deeply nested data type is used in table (e.g. `Array(Array(Array(...more...)))`). This fixes [#19108](https://github.com/ClickHouse/ClickHouse/issues/19108). This change introduces minor backward incompatibility: excessive parenthesis in type definitions no longer supported, example: `Array((UInt8))`. [#19736](https://github.com/ClickHouse/ClickHouse/pull/19736) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Table function `S3` will use global region if the region can't be determined exactly. This closes [#10998](https://github.com/ClickHouse/ClickHouse/issues/10998). [#19750](https://github.com/ClickHouse/ClickHouse/pull/19750) ([Vladimir Chebotarev](https://github.com/excitoon)). -* Clickhouse client query param CTE added test. [#19762](https://github.com/ClickHouse/ClickHouse/pull/19762) ([Maksim Kita](https://github.com/kitaisreal)). +* ClickHouse client query param CTE added test. [#19762](https://github.com/ClickHouse/ClickHouse/pull/19762) ([Maksim Kita](https://github.com/kitaisreal)). * Correctly output infinite arguments for `formatReadableTimeDelta` function. In previous versions, there was implicit conversion to implementation specific integer value. [#19791](https://github.com/ClickHouse/ClickHouse/pull/19791) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * `S3` table function now supports `auto` compression mode (autodetect). This closes [#18754](https://github.com/ClickHouse/ClickHouse/issues/18754). [#19793](https://github.com/ClickHouse/ClickHouse/pull/19793) ([Vladimir Chebotarev](https://github.com/excitoon)). * Set charset to utf8mb4 when interacting with remote MySQL servers. Fixes [#19795](https://github.com/ClickHouse/ClickHouse/issues/19795). [#19800](https://github.com/ClickHouse/ClickHouse/pull/19800) ([Alexey Milovidov](https://github.com/alexey-milovidov)). diff --git a/docs/changelogs/v21.5.1.6601-prestable.md b/docs/changelogs/v21.5.1.6601-prestable.md index 2d09ce6c20b..d64936fefce 100644 --- a/docs/changelogs/v21.5.1.6601-prestable.md +++ b/docs/changelogs/v21.5.1.6601-prestable.md @@ -34,7 +34,7 @@ * Allow to use CTE in VIEW definition. This closes [#22491](https://github.com/ClickHouse/ClickHouse/issues/22491). [#22657](https://github.com/ClickHouse/ClickHouse/pull/22657) ([Amos Bird](https://github.com/amosbird)). * Add metric to track how much time is spend during waiting for Buffer layer lock. [#22725](https://github.com/ClickHouse/ClickHouse/pull/22725) ([Azat Khuzhin](https://github.com/azat)). * Allow RBAC row policy via postgresql protocol. Closes [#22658](https://github.com/ClickHouse/ClickHouse/issues/22658). PostgreSQL protocol is enabled in configuration by default. [#22755](https://github.com/ClickHouse/ClickHouse/pull/22755) ([Kseniia Sumarokova](https://github.com/kssenii)). -* MaterializeMySQL (experimental feature). Make Clickhouse to be able to replicate MySQL databases containing views without failing. This is accomplished by ignoring the views. ... [#22760](https://github.com/ClickHouse/ClickHouse/pull/22760) ([Christian Frøystad](https://github.com/cfroystad)). +* MaterializeMySQL (experimental feature). Make ClickHouse to be able to replicate MySQL databases containing views without failing. This is accomplished by ignoring the views. ... 
[#22760](https://github.com/ClickHouse/ClickHouse/pull/22760) ([Christian Frøystad](https://github.com/cfroystad)). * `dateDiff` now works with `DateTime64` arguments (even for values outside of `DateTime` range) ... [#22931](https://github.com/ClickHouse/ClickHouse/pull/22931) ([Vasily Nemkov](https://github.com/Enmk)). * Set `background_fetches_pool_size` to 8 that is better for production usage with frequent small insertions or slow ZooKeeper cluster. [#22945](https://github.com/ClickHouse/ClickHouse/pull/22945) ([Alexey Milovidov](https://github.com/alexey-milovidov)). * Fix inactive_parts_to_throw_insert=0 with inactive_parts_to_delay_insert>0. [#22947](https://github.com/ClickHouse/ClickHouse/pull/22947) ([Azat Khuzhin](https://github.com/azat)). diff --git a/docs/changelogs/v22.1.1.2542-prestable.md b/docs/changelogs/v22.1.1.2542-prestable.md index e1e30e28ec6..c9071422949 100644 --- a/docs/changelogs/v22.1.1.2542-prestable.md +++ b/docs/changelogs/v22.1.1.2542-prestable.md @@ -84,7 +84,7 @@ #### Bug Fix * Quota limit was not reached, but the limit was exceeded. This PR fixes [#31174](https://github.com/ClickHouse/ClickHouse/issues/31174). [#31656](https://github.com/ClickHouse/ClickHouse/pull/31656) ([sunny](https://github.com/sunny19930321)). -* - Clickhouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). +* - ClickHouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). * Fix null pointer dereference in low cardinality data when deserializing LowCardinality data in the Native format. [#33021](https://github.com/ClickHouse/ClickHouse/pull/33021) ([Harry Lee](https://github.com/HarryLeeIBM)). * Specifically crafted input data for `Native` format may lead to reading uninitialized memory or crash. This is relevant if `clickhouse-server` is open for write access to adversary. [#33050](https://github.com/ClickHouse/ClickHouse/pull/33050) ([Heena Bansal](https://github.com/HeenaBansal2009)). @@ -196,7 +196,7 @@ * NO CL ENTRY: 'Update CHANGELOG.md'. [#32472](https://github.com/ClickHouse/ClickHouse/pull/32472) ([Rich Raposa](https://github.com/rfraposa)). * NO CL ENTRY: 'Revert "Split long tests into multiple checks"'. [#32514](https://github.com/ClickHouse/ClickHouse/pull/32514) ([alesapin](https://github.com/alesapin)). * NO CL ENTRY: 'Revert "Revert "Split long tests into multiple checks""'. [#32515](https://github.com/ClickHouse/ClickHouse/pull/32515) ([alesapin](https://github.com/alesapin)). -* NO CL ENTRY: 'blog post how to enable predictive capabilities in Clickhouse'. [#32768](https://github.com/ClickHouse/ClickHouse/pull/32768) ([Tom Risse](https://github.com/flickerbox-tom)). +* NO CL ENTRY: 'blog post how to enable predictive capabilities in ClickHouse'. [#32768](https://github.com/ClickHouse/ClickHouse/pull/32768) ([Tom Risse](https://github.com/flickerbox-tom)). * NO CL ENTRY: 'Revert "Fix build issue related to azure blob storage"'. [#32845](https://github.com/ClickHouse/ClickHouse/pull/32845) ([alesapin](https://github.com/alesapin)). * NO CL ENTRY: 'Revert "Dictionaries added Date32 type support"'. [#33053](https://github.com/ClickHouse/ClickHouse/pull/33053) ([Alexander Tokmakov](https://github.com/tavplubix)). * NO CL ENTRY: 'Updated Lawrence Berkeley National Lab stats'. 
[#33066](https://github.com/ClickHouse/ClickHouse/pull/33066) ([Michael Smitasin](https://github.com/michaelsmitasin)). diff --git a/docs/changelogs/v22.3.1.1262-prestable.md b/docs/changelogs/v22.3.1.1262-prestable.md index f47afd67021..03e81bd1808 100644 --- a/docs/changelogs/v22.3.1.1262-prestable.md +++ b/docs/changelogs/v22.3.1.1262-prestable.md @@ -65,7 +65,7 @@ * Add setting to lower column case when reading parquet/ORC file. [#35145](https://github.com/ClickHouse/ClickHouse/pull/35145) ([shuchaome](https://github.com/shuchaome)). * Do not retry non-rertiable errors. Closes [#35161](https://github.com/ClickHouse/ClickHouse/issues/35161). [#35172](https://github.com/ClickHouse/ClickHouse/pull/35172) ([Kseniia Sumarokova](https://github.com/kssenii)). * Added disk_name to system.part_log. [#35178](https://github.com/ClickHouse/ClickHouse/pull/35178) ([Artyom Yurkov](https://github.com/Varinara)). -* Currently,Clickhouse validates hosts defined under for URL and Remote Table functions. This PR extends the RemoteHostFilter to Mysql and PostgreSQL table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)). +* Currently,ClickHouse validates hosts defined under for URL and Remote Table functions. This PR extends the RemoteHostFilter to Mysql and PostgreSQL table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)). * Sometimes it is not enough for us to distinguish queries hierachy only by is_initial_query in system.query_log and system.processes. So distributed_depth is needed. [#35207](https://github.com/ClickHouse/ClickHouse/pull/35207) ([李扬](https://github.com/taiyang-li)). * Support test mode for clickhouse-local. [#35264](https://github.com/ClickHouse/ClickHouse/pull/35264) ([Kseniia Sumarokova](https://github.com/kssenii)). * Return const for function getMacro if not in distributed query. Close [#34727](https://github.com/ClickHouse/ClickHouse/issues/34727). [#35289](https://github.com/ClickHouse/ClickHouse/pull/35289) ([李扬](https://github.com/taiyang-li)). diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 7713c397e46..3936b613bcb 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -97,7 +97,7 @@ SELECT library_name, license_type, license_path FROM system.licenses ORDER BY li ## Adding new third-party libraries and maintaining patches in third-party libraries {#adding-third-party-libraries} 1. Each third-party library must reside in a dedicated directory under the `contrib/` directory of the ClickHouse repository. Avoid dumps/copies of external code, instead use Git submodule feature to pull third-party code from an external upstream repository. -2. Submodules are listed in `.gitmodule`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. the external library requires patching/customization, create a fork of the official repository in the [Clickhouse organization in GitHub](https://github.com/ClickHouse). +2. Submodules are listed in `.gitmodule`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. the external library requires patching/customization, create a fork of the official repository in the [ClickHouse organization in GitHub](https://github.com/ClickHouse). 3. 
In the latter case, create a branch with `clickhouse/` prefix from the branch you want to integrate, e.g. `clickhouse/master` (for `master`) or `clickhouse/release/vX.Y.Z` (for a `release/vX.Y.Z` tag). The purpose of this branch is to isolate customization of the library from upstream work. For example, pulls from the upstream repository into the fork will leave all `clickhouse/` branches unaffected. Submodules in `contrib/` must only track `clickhouse/` branches of forked third-party repositories. 4. To patch a fork of a third-party library, create a dedicated branch with `clickhouse/` prefix in the fork, e.g. `clickhouse/fix-some-desaster`. Finally, merge the patch branch into the custom tracking branch (e.g. `clickhouse/master` or `clickhouse/release/vX.Y.Z`) using a PR. 5. Always create patches of third-party libraries with the official repository in mind. Once a PR of a patch branch to the `clickhouse/` branch in the fork repository is done and the submodule version in ClickHouse official repository is bumped, consider opening another PR from the patch branch to the upstream library repository. This ensures, that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers. diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 3e2c7618181..15f66d2695f 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -66,7 +66,7 @@ For a description of parameters, see the [CREATE query description](../../../sql A tuple of column names or arbitrary expressions. Example: `ORDER BY (CounterID, EventDate)`. - ClickHouse uses the sorting key as a primary key if the primary key is not defined obviously by the `PRIMARY KEY` clause. + ClickHouse uses the sorting key as a primary key if the primary key is not defined explicitly by the `PRIMARY KEY` clause. Use the `ORDER BY tuple()` syntax, if you do not need sorting. See [Selecting the Primary Key](#selecting-the-primary-key). diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index cbe586d75a3..3562bdf6d3a 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -27,7 +27,7 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa - The `DROP TABLE` query deletes the replica located on the server where the query is run. - The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas. -ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replicas meta information. Use ZooKeeper version 3.4.5 or newer. +ClickHouse uses [ClickHouse Keeper](../../../guides/sre/keeper/clickhouse-keeper.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended. To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section. @@ -35,7 +35,7 @@ To use replication, set parameters in the [zookeeper](../../../operations/server Don’t neglect the security setting. 
ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. ::: -Example of setting the addresses of the ZooKeeper cluster: +Example of setting the addresses of the ClickHouse Keeper cluster: ``` xml @@ -54,8 +54,8 @@ Example of setting the addresses of the ZooKeeper cluster: ``` -ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments. -In other word, it supports to store the metadata of differnt tables in different ZooKeeper clusters. +ClickHouse also supports storing replicas meta information in an auxiliary ZooKeeper cluster. Do this by providing the ZooKeeper cluster name and path as engine arguments. +In other words, it supports storing the metadata of different tables in different ZooKeeper clusters. Example of setting the addresses of the auxiliary ZooKeeper cluster: @@ -122,8 +122,8 @@ The `Replicated` prefix is added to the table engine name. For example:`Replicat **Replicated\*MergeTree parameters** -- `zoo_path` — The path to the table in ZooKeeper. -- `replica_name` — The replica name in ZooKeeper. +- `zoo_path` — The path to the table in ClickHouse Keeper. +- `replica_name` — The replica name in ClickHouse Keeper. - `other_parameters` — Parameters of an engine which is used for creating the replicated version, for example, version in `ReplacingMergeTree`. Example: @@ -168,18 +168,18 @@ Example: ``` -The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths. +The path to the table in ClickHouse Keeper should be unique for each replicated table. Tables on different shards should have different paths. In this case, the path consists of the following parts: `/clickhouse/tables/` is the common prefix. We recommend using exactly this one. `{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier. -`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query. +`table_name` is the name of the node for the table in ClickHouse Keeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query. *HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name` The two built-in substitutions `{database}` and `{table}` can be used, they expand into the table name and the database name respectively (unless these macros are defined in the `macros` section). So the zookeeper path can be specified as `'/clickhouse/tables/{layer}-{shard}/{database}/{table}'`. -Be careful with table renames when using these built-in substitutions. The path in Zookeeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in Zookeeper, and will go into read-only mode. +Be careful with table renames when using these built-in substitutions. 
The path in ClickHouse Keeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in ClickHouse Keeper, and will go into read-only mode. The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard. @@ -220,21 +220,21 @@ To delete a replica, run `DROP TABLE`. However, only one replica is deleted – ## Recovery After Failures {#recovery-after-failures} -If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper. +If ClickHouse Keeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ClickHouse Keeper. -If ZooKeeper is unavailable during an `INSERT`, or an error occurs when interacting with ZooKeeper, an exception is thrown. +If ClickHouse Keeper is unavailable during an `INSERT`, or an error occurs when interacting with ClickHouse Keeper, an exception is thrown. -After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. +After connecting to ClickHouse Keeper, the system checks whether the set of data in the local file system matches the expected set of data (ClickHouse Keeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas. -If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas. +If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ClickHouse Keeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas. Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data. -When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. +When the server starts (or establishes a new session with ClickHouse Keeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary. If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server enters this in the log and refuses to launch. 
The reason for this is that this case may indicate a configuration error, such as if a replica on a shard was accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by “pushing a button”. -To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables: +To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ClickHouse Keeper with any content, or run the command to restore all replicated tables: ``` bash sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data @@ -249,11 +249,11 @@ If all data and metadata disappeared from one of the servers, follow these steps 1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them. 2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`). 3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.) -4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` +4. To start recovery, create the ClickHouse Keeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data` Then start the server (restart, if it is already running). Data will be downloaded from replicas. -An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. +An alternative recovery option is to delete information about the lost replica from ClickHouse Keeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”. There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once. @@ -276,13 +276,13 @@ Create a MergeTree table with a different name. Move all the data from the direc If you want to get rid of a `ReplicatedMergeTree` table without launching the server: - Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`). -- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`). +- Delete the corresponding path in ClickHouse Keeper (`/path_to_table/replica_name`). After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server. 
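Before touching anything by hand, it can help to confirm which Keeper paths a replica is actually registered under; they are exposed through the `system.replicas` table. A minimal sketch, assuming a locally running server with default client settings; the database and table names are placeholders:

``` bash
# Look up the Keeper/ZooKeeper paths used by a replicated table.
# zookeeper_path and replica_path are columns of system.replicas.
clickhouse-client --query "
    SELECT database, table, zookeeper_path, replica_path
    FROM system.replicas
    WHERE database = 'db_name' AND table = 'table_name'
    FORMAT Vertical"
```

The `replica_path` value is the `/path_to_table/replica_name` node referred to above.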
-## Recovery When Metadata in the Zookeeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} +## Recovery When Metadata in the ClickHouse Keeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged} -If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. +If the data in ClickHouse Keeper was lost or damaged, you can save data by moving it to an unreplicated table as described above. **See Also** diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index c8c430c4e03..e0882d1f9e1 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -5,6 +5,8 @@ sidebar_label: Command-Line Client # Command-line Client +## clickhouse-client + ClickHouse provides a native command-line client: `clickhouse-client`. The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration). [Install](../getting-started/install.md) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. @@ -115,7 +117,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va - `--user, -u` – The username. Default value: default. - `--password` – The password. Default value: empty string. - `--query, -q` – The query to process when using non-interactive mode. You must specify either `query` or `queries-file` option. -- `--queries-file, -qf` – file path with queries to execute. You must specify either `query` or `queries-file` option. +- `--queries-file` – file path with queries to execute. You must specify either `query` or `queries-file` option. - `--database, -d` – Select the current default database. Default value: the current database from the server settings (‘default’ by default). - `--multiline, -m` – If specified, allow multiline queries (do not send the query on Enter). - `--multiquery, -n` – If specified, allow processing multiple queries separated by semicolons. @@ -183,4 +185,3 @@ If the configuration above is applied, the ID of a query is shown in the followi ``` text speedscope:http://speedscope-host/#profileURL=qp%3Fid%3Dc8ecc783-e753-4b38-97f1-42cddfb98b7d ``` - diff --git a/docs/en/interfaces/cpp.md b/docs/en/interfaces/cpp.md index b23e11399d9..364d77c21a4 100644 --- a/docs/en/interfaces/cpp.md +++ b/docs/en/interfaces/cpp.md @@ -7,4 +7,6 @@ sidebar_label: C++ Client Library See README at [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository. -[Original article](https://clickhouse.com/docs/en/interfaces/cpp/) +# userver Asynchronous Framework + +[userver (beta)](https://github.com/userver-framework/userver) has builtin support for ClickHouse. diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index 317e5ca5bda..6708cd103bc 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -28,6 +28,9 @@ ClickHouse, Inc. 
does **not** maintain the tools and libraries listed below and - [Kafka](https://kafka.apache.org) - [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/ClickHouse/clickhouse-go/)) - [stream-loader-clickhouse](https://github.com/adform/stream-loader) +- Batch processing + - [Spark](https://spark.apache.org) + - [spark-clickhouse-connector](https://github.com/housepower/spark-clickhouse-connector) - Stream processing - [Flink](https://flink.apache.org) - [flink-clickhouse-sink](https://github.com/ivi-ru/flink-clickhouse-sink) diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 5baeac80748..0950568cc82 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -325,14 +325,14 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 -- ## Recovering after losing quorum -Because Clickhouse Keeper uses Raft it can tolerate certain amount of node crashes depending on the cluster size. \ +Because ClickHouse Keeper uses Raft it can tolerate certain amount of node crashes depending on the cluster size. \ E.g. for a 3-node cluster, it will continue working correctly if only 1 node crashes. Cluster configuration can be dynamically configured but there are some limitations. Reconfiguration relies on Raft also so to add/remove a node from the cluster you need to have a quorum. If you lose too many nodes in your cluster at the same time without any chance of starting them again, Raft will stop working and not allow you to reconfigure your cluster using the conventional way. -Nevertheless, Clickhouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node. +Nevertheless, ClickHouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node. This should be done only as your last resort if you cannot start your nodes again, or start a new instance on the same endpoint. Important things to note before continuing: diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md index aea218f6ad5..7623f7b7203 100644 --- a/docs/en/operations/named-collections.md +++ b/docs/en/operations/named-collections.md @@ -34,7 +34,7 @@ Example of configuration: AKIAIOSFODNN7EXAMPLE - wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY CSV https://s3.us-east-1.amazonaws.com/yourbucket/mydata/ @@ -227,4 +227,4 @@ SELECT dictGet('dict', 'b', 2); ┌─dictGet('dict', 'b', 2)─┐ │ two │ └─────────────────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index ba2eac1a854..cb22345c3a2 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -1,6 +1,6 @@ # replication_queue -Contains information about tasks from replication queues stored in Clickhouse Keeper, or ZooKeeper, for tables in the `ReplicatedMergeTree` family. +Contains information about tasks from replication queues stored in ClickHouse Keeper, or ZooKeeper, for tables in the `ReplicatedMergeTree` family. 
Columns: diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 836b61d4954..f364bc85088 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -274,6 +274,6 @@ end script ## Antivirus software {#antivirus-software} -If you use antivirus software configure it to skip folders with Clickhouse datafiles (`/var/lib/clickhouse`) otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges. +If you use antivirus software configure it to skip folders with ClickHouse datafiles (`/var/lib/clickhouse`) otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges. [Original article](https://clickhouse.com/docs/en/operations/tips/) diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index 15ac5a9a287..a22b8ae0750 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -31,12 +31,12 @@ $ clickhouse-local --structure "table_structure" --input-format "format_of_incom Arguments: - `-S`, `--structure` — table structure for input data. -- `-if`, `--input-format` — input format, `TSV` by default. +- `--input-format` — input format, `TSV` by default. - `-f`, `--file` — path to data, `stdin` by default. - `-q`, `--query` — queries to execute with `;` as delimeter. You must specify either `query` or `queries-file` option. -- `-qf`, `--queries-file` - file path with queries to execute. You must specify either `query` or `queries-file` option. +- `--queries-file` - file path with queries to execute. You must specify either `query` or `queries-file` option. - `-N`, `--table` — table name where to put output data, `table` by default. -- `-of`, `--format`, `--output-format` — output format, `TSV` by default. +- `--format`, `--output-format` — output format, `TSV` by default. - `-d`, `--database` — default database, `_local` by default. - `--stacktrace` — whether to dump debug output in case of exception. - `--echo` — print query before execution. 
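With the short aliases (`-if`, `-qf`, `-of`) removed, invocations have to spell out the long option names. A minimal illustrative run over a local CSV file follows; the file name, structure, and query are placeholders rather than anything taken from the documentation:

``` bash
# Query a local CSV file with clickhouse-local using only long-form options.
# The incoming data is exposed under the default table name "table".
clickhouse-local \
    --structure "id UInt32, name String" \
    --input-format "CSV" \
    --file "data.csv" \
    --output-format "Pretty" \
    --query "SELECT count(), uniqExact(name) FROM table"
```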
diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index fb821ca7783..58a1d9d56f8 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -19,11 +19,10 @@ This function encrypts data using these modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Syntax** @@ -63,9 +62,9 @@ Insert some data (please avoid storing the keys/ivs in the database as this unde Query: ``` sql -INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ -('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ -('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ +INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ ('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` @@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test; Result: ``` text -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Example with `-gcm`: @@ -116,9 +115,7 @@ Supported encryption modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Syntax** @@ -145,7 +142,7 @@ Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext: Query: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; +SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` Result: @@ -161,14 +158,14 @@ But `encrypt` fails when `key` or `iv` is longer than expected: Query: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 
'iviviviviviviviv123'); +SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Result: ``` text -Received exception from server (version 21.1.2): -Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). +Received exception from server (version 22.6.1): +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). ``` While `aes_encrypt_mysql` produces MySQL-compatitalbe output: @@ -176,7 +173,7 @@ While `aes_encrypt_mysql` produces MySQL-compatitalbe output: Query: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; ``` Result: @@ -192,7 +189,7 @@ Notice how supplying even longer `IV` produces the same result Query: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext ``` Result: @@ -206,7 +203,7 @@ Result: Which is binary equal to what MySQL produces on same inputs: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -224,11 +221,10 @@ This function decrypts ciphertext into a plaintext using these modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Syntax** @@ -265,12 +261,12 @@ Result: │ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ │ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ └──────────────────────┴──────────────────────────────────────────────┘ -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Now let's try to decrypt all that data. 
@@ -284,13 +280,19 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 Result: ``` text -┌─comment─────────────────────────────┬─plaintext─┐ -│ aes-256-cfb128 no IV │ Secret │ -│ aes-256-cfb128 no IV, different key │ �4� - � │ -│ aes-256-cfb128 with IV │ ���6�~ │ - │aes-256-cbc no IV │ �2*4�h3c�4w��@ -└─────────────────────────────────────┴───────────┘ +┌─comment──────────────┬─plaintext──┐ +│ aes-256-gcm │ OQ�E + �t�7T�\���\� │ +│ aes-256-gcm with AAD │ OQ�E + �\��si����;�o�� │ +└──────────────────────┴────────────┘ +┌─comment──────────────────────────┬─plaintext─┐ +│ aes-256-ofb no IV │ Secret │ +│ aes-256-ofb no IV, different key │ �4� + � │ +│ aes-256-ofb with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└──────────────────────────────────┴───────────┘ ``` Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption. @@ -305,9 +307,7 @@ Supported decryption modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Syntax** @@ -332,7 +332,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) Let's decrypt data we've previously encrypted with MySQL: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -347,7 +347,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv Query: ``` sql -SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext +SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext ``` Result: diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 66e9aa98e67..cac546d9a70 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -273,16 +273,16 @@ Converts ASCII Latin symbols in a string to uppercase. ## lowerUTF8 Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It does not detect the language. So for Turkish the result might not be exactly correct. +It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I). If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. -If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. +If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined. ## upperUTF8 Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text. -It does not detect the language. So for Turkish the result might not be exactly correct. +It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I). 
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point. -If the string contains a set of bytes that is not UTF-8, then the behavior is undefined. +If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined. ## isValidUTF8 diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index f4a13708770..305e0c8c5cd 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -7,7 +7,7 @@ sidebar_label: For Searching in Strings The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search. -:::note +:::note Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. ::: @@ -31,7 +31,7 @@ position(needle IN haystack) Alias: `locate(haystack, needle[, start_pos])`. -:::note +:::note Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. ::: @@ -344,24 +344,27 @@ Returns 1, if at least one string needlei matches the string `haystac For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. -:::note +:::note In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. ::: ## match(haystack, pattern) -Checks whether the string matches the `pattern` regular expression. A `re2` regular expression. The [syntax](https://github.com/google/re2/wiki/Syntax) of the `re2` regular expressions is more limited than the syntax of the Perl regular expressions. +Checks whether the string matches the regular expression `pattern` in `re2` syntax. `Re2` has a more limited [syntax](https://github.com/google/re2/wiki/Syntax) than Perl regular expressions. Returns 0 if it does not match, or 1 if it matches. -The regular expression works with the string as if it is a set of bytes. The regular expression can’t contain null bytes. +Matching is based on UTF-8, e.g. `.` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes. The regular expression must not contain null bytes. +If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined. +No automatic Unicode normalization is performed, if you need it you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that. + For patterns to search for substrings in a string, it is better to use LIKE or ‘position’, since they work much faster. ## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. -:::note +:::note The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. 
::: @@ -385,11 +388,11 @@ The same as `multiFuzzyMatchAny`, but returns any index that matches the haystac The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance. -:::note +:::note `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. ::: -:::note +:::note To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. ::: @@ -405,7 +408,7 @@ Extracts all the fragments of a string using a regular expression. If ‘haystac Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -:::note +:::note `extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). ::: @@ -498,6 +501,10 @@ The regular expression can contain the metasymbols `%` and `_`. Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the ‘match’ function. +Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes. +If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined. +No automatic Unicode normalization is performed, if you need it you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that. + For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function. For other regular expressions, the code is the same as for the ‘match’ function. @@ -509,6 +516,8 @@ The same thing as ‘like’, but negative. Case insensitive variant of [like](https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions/#function-like) function. You can use `ILIKE` operator instead of the `ilike` function. +The function ignores the language, e.g. for Turkish (i/İ), the result might be incorrect. + **Syntax** ``` sql @@ -577,7 +586,7 @@ Same as `ngramDistance` but calculates the non-symmetric difference between `nee For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. -:::note +:::note For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. ::: diff --git a/docs/en/sql-reference/operators/index.md b/docs/en/sql-reference/operators/index.md index 5df018bb920..17b8f014366 100644 --- a/docs/en/sql-reference/operators/index.md +++ b/docs/en/sql-reference/operators/index.md @@ -43,28 +43,38 @@ For tuple subtraction: [tupleMinus](../../sql-reference/functions/tuple-function ## Comparison Operators +### equals function `a = b` – The `equals(a, b)` function. `a == b` – The `equals(a, b)` function. +### notEquals function `a != b` – The `notEquals(a, b)` function. `a <> b` – The `notEquals(a, b)` function. 
+### lessOrEquals function `a <= b` – The `lessOrEquals(a, b)` function. +### greaterOrEquals function `a >= b` – The `greaterOrEquals(a, b)` function. +### less function `a < b` – The `less(a, b)` function. +### greater function `a > b` – The `greater(a, b)` function. +### like function `a LIKE s` – The `like(a, b)` function. +### notLike function `a NOT LIKE s` – The `notLike(a, b)` function. +### ilike function `a ILIKE s` – The `ilike(a, b)` function. +### BETWEEN function `a BETWEEN b AND c` – The same as `a >= b AND a <= c`. `a NOT BETWEEN b AND c` – The same as `a < b OR a > c`. @@ -73,20 +83,28 @@ For tuple subtraction: [tupleMinus](../../sql-reference/functions/tuple-function See [IN operators](../../sql-reference/operators/in.md) and [EXISTS](../../sql-reference/operators/exists.md) operator. +### in function `a IN ...` – The `in(a, b)` function. +### notIn function `a NOT IN ...` – The `notIn(a, b)` function. +### globalIn function `a GLOBAL IN ...` – The `globalIn(a, b)` function. +### globalNotIn function `a GLOBAL NOT IN ...` – The `globalNotIn(a, b)` function. +### in subquery function `a = ANY (subquery)` – The `in(a, subquery)` function. +### notIn subquery function `a != ANY (subquery)` – The same as `a NOT IN (SELECT singleValueOrNull(*) FROM subquery)`. +### in subquery function `a = ALL (subquery)` – The same as `a IN (SELECT singleValueOrNull(*) FROM subquery)`. +### notIn subquery function `a != ALL (subquery)` – The `notIn(a, subquery)` function. diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index 45230d0b3b1..e02db6d4f6b 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -48,9 +48,9 @@ You can see that `GROUP BY` for `y = NULL` summed up `x`, as if `NULL` is this v If you pass several keys to `GROUP BY`, the result will give you all the combinations of the selection, as if `NULL` were a specific value. -## WITH ROLLUP Modifier +## ROLLUP Modifier -`WITH ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotals rows are added after the result table. +`ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotals rows are added after the result table. The subtotals are calculated in the reverse order: at first subtotals are calculated for the last key expression in the list, then for the previous one, and so on up to the first key expression. @@ -78,7 +78,7 @@ Consider the table t: Query: ```sql -SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; +SELECT year, month, day, count(*) FROM t GROUP BY ROLLUP(year, month, day); ``` As `GROUP BY` section has three key expressions, the result contains four tables with subtotals "rolled up" from right to left: @@ -109,10 +109,14 @@ As `GROUP BY` section has three key expressions, the result contains four tables │ 0 │ 0 │ 0 │ 6 │ └──────┴───────┴─────┴─────────┘ ``` +The same query also can be written using `WITH` keyword. +```sql +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; +``` -## WITH CUBE Modifier +## CUBE Modifier -`WITH CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotals rows are added after the result table. 
+`CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotals rows are added after the result table. In the subtotals rows the values of all "grouped" key expressions are set to `0` or empty line. @@ -138,7 +142,7 @@ Consider the table t: Query: ```sql -SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE; +SELECT year, month, day, count(*) FROM t GROUP BY CUBE(year, month, day); ``` As `GROUP BY` section has three key expressions, the result contains eight tables with subtotals for all key expression combinations: @@ -196,6 +200,10 @@ Columns, excluded from `GROUP BY`, are filled with zeros. │ 0 │ 0 │ 0 │ 6 │ └──────┴───────┴─────┴─────────┘ +The same query also can be written using `WITH` keyword. +```sql +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE; +``` ## WITH TOTALS Modifier @@ -260,6 +268,39 @@ GROUP BY domain For every different key value encountered, `GROUP BY` calculates a set of aggregate function values. +## GROUPING SETS modifier + +This is the most general modifier. +This modifier allows you to specify several aggregation key sets (grouping sets) manually. +Aggregation is performed separately for each grouping set, after which all results are combined. +If a column is not present in a grouping set, it is filled with a default value. + +In other words, the modifiers described above can all be expressed via `GROUPING SETS`. +Although queries with the `ROLLUP`, `CUBE` and `GROUPING SETS` modifiers can be semantically equivalent, they may have different performance. +`GROUPING SETS` tries to execute everything in parallel, while `ROLLUP` and `CUBE` execute the final merging of the aggregates in a single thread. + +When source columns contain default values, it might be hard to distinguish whether a row is part of the aggregation that uses those columns as keys or not. +To solve this problem, use the `GROUPING` function. + +**Example** + +The following two queries are equivalent. + +```sql +-- Query 1 +SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP; + +-- Query 2 +SELECT year, month, day, count(*) FROM t GROUP BY +GROUPING SETS +( + (year, month, day), + (year, month), + (year), + () +); +``` + +## Implementation Details Aggregation is one of the most important features of a column-oriented DBMS, and thus it’s implementation is one of the most heavily optimized parts of ClickHouse. By default, aggregation is done in memory using a hash-table. It has 40+ specializations that are chosen automatically depending on “grouping key” data types. diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index e53ea41d606..0a563cd6d66 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -10,7 +10,7 @@ ClickHouse supports the standard grammar for defining windows and window functio | Feature | Support or workaround | | --------| ----------| | ad hoc window specification (`count(*) over (partition by id order by time desc)`) | supported | -| expressions involving window functions, e.g. `(count(*) over ()) / 2)` | not supported, wrap in a subquery ([feature request](https://github.com/ClickHouse/ClickHouse/issues/19857)) | +| expressions involving window functions, e.g. `(count(*) over ()) / 2)` | supported | | `WINDOW` clause (`select ... 
from table window w as (partition by id)`) | supported | | `ROWS` frame | supported | | `RANGE` frame | supported, the default | @@ -55,3 +55,372 @@ https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html + +## Syntax + +```text +aggregate_function (column_name) + OVER ([PARTITION BY grouping_column] [ORDER BY sorting_column] + [ROWS or RANGE expression_to_bounds_of_frame]) +``` + +- `PARTITION BY` - defines how to break a result set into groups. +- `ORDER BY` - defines how to order rows inside the group during the calculation of aggregate_function. +- `ROWS or RANGE` - defines the bounds of a frame; aggregate_function is calculated within a frame. + +```text + PARTITION +┌─────────────────┐ <-- UNBOUNDED PRECEDING (BEGINNING of the PARTITION) +│ │ +│ │ +│=================│ <-- N PRECEDING <─┐ +│ N ROWS │ │ F +│ Before CURRENT │ │ R +│~~~~~~~~~~~~~~~~~│ <-- CURRENT ROW │ A +│ M ROWS │ │ M +│ After CURRENT │ │ E +│=================│ <-- M FOLLOWING <─┘ +│ │ +│ │ +└─────────────────┘ <--- UNBOUNDED FOLLOWING (END of the PARTITION) +``` + +## Examples + +```sql +CREATE TABLE wf_partition +( + `part_key` UInt64, + `value` UInt64, + `order` UInt64 +) +ENGINE = Memory; + +INSERT INTO wf_partition FORMAT Values + (1,1,1), (1,2,2), (1,3,3), (2,0,0), (3,0,0); + +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key) AS frame_values +FROM wf_partition +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3] │ <┐ +│ 1 │ 2 │ 2 │ [1,2,3] │ │ 1st group +│ 1 │ 3 │ 3 │ [1,2,3] │ <┘ +│ 2 │ 0 │ 0 │ [0] │ <- 2nd group +│ 3 │ 0 │ 0 │ [0] │ <- 3rd group +└──────────┴───────┴───────┴──────────────┘ +``` + +```sql +CREATE TABLE wf_frame +( + `part_key` UInt64, + `value` UInt64, + `order` UInt64 +) +ENGINE = Memory; + +INSERT INTO wf_frame FORMAT Values + (1,1,1), (1,2,2), (1,3,3), (1,4,4), (1,5,5); + +-- frame is bounded by the bounds of a partition (BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [1,2,3,4,5] │ +│ 1 │ 4 │ 4 │ [1,2,3,4,5] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- short form - no bound expression, no order by +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [1,2,3,4,5] │ +│ 1 │ 4 │ 4 │ [1,2,3,4,5] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- frame is bounded by the beginning of a partition and the current row +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [1,2,3] │ +│ 1 │ 4 │ 4 │ [1,2,3,4] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘
+ +-- short form (frame is bounded by the beginning of a partition and the current row) +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [1,2,3] │ +│ 1 │ 4 │ 4 │ [1,2,3,4] │ +│ 1 │ 5 │ 5 │ [1,2,3,4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- frame is bounded by the beginning of a partition and the current row, but order is backward +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order DESC) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [5,4,3,2,1] │ +│ 1 │ 2 │ 2 │ [5,4,3,2] │ +│ 1 │ 3 │ 3 │ [5,4,3] │ +│ 1 │ 4 │ 4 │ [5,4] │ +│ 1 │ 5 │ 5 │ [5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- sliding frame - 1 PRECEDING ROW AND CURRENT ROW +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN 1 PRECEDING AND CURRENT ROW) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; + +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1] │ +│ 1 │ 2 │ 2 │ [1,2] │ +│ 1 │ 3 │ 3 │ [2,3] │ +│ 1 │ 4 │ 4 │ [3,4] │ +│ 1 │ 5 │ 5 │ [4,5] │ +└──────────┴───────┴───────┴──────────────┘ + +-- sliding frame - Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING +SELECT + part_key, + value, + order, + groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC + Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values +FROM wf_frame +ORDER BY + part_key ASC, + value ASC; +┌─part_key─┬─value─┬─order─┬─frame_values─┐ +│ 1 │ 1 │ 1 │ [1,2,3,4,5] │ +│ 1 │ 2 │ 2 │ [1,2,3,4,5] │ +│ 1 │ 3 │ 3 │ [2,3,4,5] │ +│ 1 │ 4 │ 4 │ [3,4,5] │ +│ 1 │ 5 │ 5 │ [4,5] │ +└──────────┴───────┴───────┴──────────────┘ +``` + +## Real world examples + +### Maximum/total salary per department. + +```sql +CREATE TABLE employees +( + `department` String, + `employee_name` String, + `salary` Float +) +ENGINE = Memory; + +INSERT INTO employees FORMAT Values + ('Finance', 'Jonh', 200), + ('Finance', 'Joan', 210), + ('Finance', 'Jean', 505), + ('IT', 'Tim', 200), + ('IT', 'Anna', 300), + ('IT', 'Elen', 500); + +SELECT + department, + employee_name AS emp, + salary, + max_salary_per_dep, + total_salary_per_dep, + round((salary / total_salary_per_dep) * 100, 2) AS `share_per_dep(%)` +FROM +( + SELECT + department, + employee_name, + salary, + max(salary) OVER wndw AS max_salary_per_dep, + sum(salary) OVER wndw AS total_salary_per_dep + FROM employees + WINDOW wndw AS (PARTITION BY department + rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + ORDER BY + department ASC, + employee_name ASC +); + +┌─department─┬─emp──┬─salary─┬─max_salary_per_dep─┬─total_salary_per_dep─┬─share_per_dep(%)─┐ +│ Finance │ Jean │ 505 │ 505 │ 915 │ 55.19 │ +│ Finance │ Joan │ 210 │ 505 │ 915 │ 22.95 │ +│ Finance │ Jonh │ 200 │ 505 │ 915 │ 21.86 │ +│ IT │ Anna │ 300 │ 500 │ 1000 │ 30 │ +│ IT │ Elen │ 500 │ 500 │ 1000 │ 50 │ +│ IT │ Tim │ 200 │ 500 │ 1000 │ 20 │ +└────────────┴──────┴────────┴────────────────────┴──────────────────────┴──────────────────┘ +``` + +### Cumulative sum. 
+ +```sql +CREATE TABLE warehouse +( + `item` String, + `ts` DateTime, + `value` Float +) +ENGINE = Memory; + +INSERT INTO warehouse VALUES + ('sku38', '2020-01-01', 9), + ('sku38', '2020-02-01', 1), + ('sku38', '2020-03-01', -4), + ('sku1', '2020-01-01', 1), + ('sku1', '2020-02-01', 1), + ('sku1', '2020-03-01', 1); + +SELECT + item, + ts, + value, + sum(value) OVER (PARTITION BY item ORDER BY ts ASC) AS stock_balance +FROM warehouse +ORDER BY + item ASC, + ts ASC; + +┌─item──┬──────────────────ts─┬─value─┬─stock_balance─┐ +│ sku1 │ 2020-01-01 00:00:00 │ 1 │ 1 │ +│ sku1 │ 2020-02-01 00:00:00 │ 1 │ 2 │ +│ sku1 │ 2020-03-01 00:00:00 │ 1 │ 3 │ +│ sku38 │ 2020-01-01 00:00:00 │ 9 │ 9 │ +│ sku38 │ 2020-02-01 00:00:00 │ 1 │ 10 │ +│ sku38 │ 2020-03-01 00:00:00 │ -4 │ 6 │ +└───────┴─────────────────────┴───────┴───────────────┘ +``` + +### Moving / Sliding Average (per 3 rows) + +```sql +CREATE TABLE sensors +( + `metric` String, + `ts` DateTime, + `value` Float +) +ENGINE = Memory; + +INSERT INTO sensors VALUES ('cpu_temp', '2020-01-01 00:00:00', 87), + ('cpu_temp', '2020-01-01 00:00:01', 77), + ('cpu_temp', '2020-01-01 00:00:02', 93), + ('cpu_temp', '2020-01-01 00:00:03', 87), + ('cpu_temp', '2020-01-01 00:00:04', 87), + ('cpu_temp', '2020-01-01 00:00:05', 87), + ('cpu_temp', '2020-01-01 00:00:06', 87), + ('cpu_temp', '2020-01-01 00:00:07', 87); +SELECT + metric, + ts, + value, + avg(value) OVER + (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN 2 PRECEDING AND CURRENT ROW) + AS moving_avg_temp +FROM sensors +ORDER BY + metric ASC, + ts ASC; + +┌─metric───┬──────────────────ts─┬─value─┬───moving_avg_temp─┐ +│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:01 │ 77 │ 82 │ +│ cpu_temp │ 2020-01-01 00:00:02 │ 93 │ 85.66666666666667 │ +│ cpu_temp │ 2020-01-01 00:00:03 │ 87 │ 85.66666666666667 │ +│ cpu_temp │ 2020-01-01 00:00:04 │ 87 │ 89 │ +│ cpu_temp │ 2020-01-01 00:00:05 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:06 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:00:07 │ 87 │ 87 │ +└──────────┴─────────────────────┴───────┴───────────────────┘ +``` + +### Moving / Sliding Average (per 10 seconds) + +```sql +SELECT + metric, + ts, + value, + avg(value) OVER (PARTITION BY metric ORDER BY ts + Range BETWEEN 10 PRECEDING AND CURRENT ROW) AS moving_avg_10_seconds_temp +FROM sensors +ORDER BY + metric ASC, + ts ASC; + +┌─metric───┬──────────────────ts─┬─value─┬─moving_avg_10_seconds_temp─┐ +│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:01:10 │ 77 │ 77 │ +│ cpu_temp │ 2020-01-01 00:02:20 │ 93 │ 93 │ +│ cpu_temp │ 2020-01-01 00:03:30 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:04:40 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:05:50 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:06:00 │ 87 │ 87 │ +│ cpu_temp │ 2020-01-01 00:07:10 │ 87 │ 87 │ +└──────────┴─────────────────────┴───────┴────────────────────────────┘ +``` diff --git a/docs/ru/faq/index.md b/docs/ru/faq/index.md index d362035284d..1d1dc7df819 100644 --- a/docs/ru/faq/index.md +++ b/docs/ru/faq/index.md @@ -39,6 +39,6 @@ Question candidates: - How to kill a process (query) in ClickHouse? - How to implement pivot (like in pandas)? - How to remove the default ClickHouse user through users.d? 
-- Importing MySQL dump to Clickhouse +- Importing MySQL dump to ClickHouse - Window function workarounds (row\_number, lag/lead, running diff/sum/average) ##} diff --git a/docs/ru/interfaces/cli.md b/docs/ru/interfaces/cli.md index 7a2c712d0a3..f51b70c17bf 100644 --- a/docs/ru/interfaces/cli.md +++ b/docs/ru/interfaces/cli.md @@ -121,7 +121,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe - `--user, -u` — имя пользователя, по умолчанию — ‘default’. - `--password` — пароль, по умолчанию — пустая строка. - `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме. -- `--queries-file, -qf` - путь к файлу с запросами для выполнения. Необходимо указать только одну из опций: `query` или `queries-file`. +- `--queries-file` - путь к файлу с запросами для выполнения. Необходимо указать только одну из опций: `query` или `queries-file`. - `--database, -d` — выбрать текущую БД. Без указания значение берется из настроек сервера (по умолчанию — БД ‘default’). - `--multiline, -m` — если указано — разрешить многострочные запросы, не отправлять запрос по нажатию Enter. - `--multiquery, -n` — если указано — разрешить выполнять несколько запросов, разделённых точкой с запятой. diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index 65a6ee08aef..e463c31eb0c 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -28,12 +28,12 @@ $ clickhouse-local --structure "table_structure" --input-format "format_of_incom Ключи команды: - `-S`, `--structure` — структура таблицы, в которую будут помещены входящие данные. -- `-if`, `--input-format` — формат входящих данных. По умолчанию — `TSV`. +- `--input-format` — формат входящих данных. По умолчанию — `TSV`. - `-f`, `--file` — путь к файлу с данными. По умолчанию — `stdin`. - `-q`, `--query` — запросы на выполнение. Разделитель запросов — `;`. -- `-qf`, `--queries-file` - путь к файлу с запросами для выполнения. Необходимо задать либо параметр `query`, либо `queries-file`. +- `--queries-file` - путь к файлу с запросами для выполнения. Необходимо задать либо параметр `query`, либо `queries-file`. - `-N`, `--table` — имя таблицы, в которую будут помещены входящие данные. По умолчанию - `table`. -- `-of`, `--format`, `--output-format` — формат выходных данных. По умолчанию — `TSV`. +- `--format`, `--output-format` — формат выходных данных. По умолчанию — `TSV`. - `-d`, `--database` — база данных по умолчанию. Если не указано, используется значение `_local`. - `--stacktrace` — вывод отладочной информации при исключениях. - `--echo` — перед выполнением запрос выводится в консоль. @@ -109,4 +109,3 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ├──────────┼──────────┤ ... ``` - diff --git a/docs/ru/sql-reference/data-types/lowcardinality.md b/docs/ru/sql-reference/data-types/lowcardinality.md index 14a9e923ac8..2b9abd0ab2d 100644 --- a/docs/ru/sql-reference/data-types/lowcardinality.md +++ b/docs/ru/sql-reference/data-types/lowcardinality.md @@ -55,5 +55,5 @@ ORDER BY id ## Смотрите также -- [Reducing Clickhouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/). 
+- [Reducing ClickHouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/). - [String Optimization (video presentation in Russian)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [Slides in English](https://github.com/ClickHouse/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf). diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 2eaad0e1930..fdb2831c5c3 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -11,7 +11,7 @@ sidebar_label: "Функции для шифрования" Длина инициализирующего вектора всегда 16 байт (лишние байты игнорируются). -Обратите внимание, что до версии Clickhouse 21.1 эти функции работали медленно. +Обратите внимание, что до версии ClickHouse 21.1 эти функции работали медленно. ## encrypt {#encrypt} @@ -19,11 +19,10 @@ sidebar_label: "Функции для шифрования" - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Синтаксис** @@ -63,9 +62,9 @@ ENGINE = Memory; Запрос: ``` sql -INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ -('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ -('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ +INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ ('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` @@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test; Результат: ``` text -┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Пример в режиме `-gcm`: @@ -116,9 +115,7 @@ SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%'; - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, 
aes-192-ofb, aes-256-ofb **Синтаксис** @@ -145,7 +142,7 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) Запрос: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; +SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` Результат: @@ -161,14 +158,14 @@ SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', ' Запрос: ``` sql -SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); +SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Результат: ``` text Received exception from server (version 21.1.2): -Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). ``` Однако функция `aes_encrypt_mysql` в аналогичном случае возвращает результат, который может быть обработан MySQL: @@ -176,7 +173,7 @@ Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid ke Запрос: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; ``` Результат: @@ -192,7 +189,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161 Запрос: ``` sql -SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext +SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext ``` Результат: @@ -206,7 +203,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161 Это совпадает с результатом, возвращаемым MySQL при таких же входящих значениях: ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -224,11 +221,10 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb - aes-128-gcm, aes-192-gcm, aes-256-gcm +- aes-128-ctr, aes-192-ctr, aes-256-ctr **Синтаксис** @@ -265,12 +261,12 @@ SELECT comment, hex(secret) FROM encryption_test; │ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ │ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ └──────────────────────┴──────────────────────────────────────────────┘ 
-┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ -│ aes-256-cfb128 no IV │ B4972BDC4459 │ -│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ -│ aes-256-cfb128 with IV │ 5E6CB398F653 │ -│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ -└─────────────────────────────────────┴──────────────────────────────────┘ +┌─comment──────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-ofb no IV │ B4972BDC4459 │ +│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │ +│ aes-256-ofb with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└──────────────────────────────────┴──────────────────────────────────┘ ``` Теперь попытаемся расшифровать эти данные: @@ -278,19 +274,25 @@ SELECT comment, hex(secret) FROM encryption_test; Запрос: ``` sql -SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; +SELECT comment, decrypt('aes-256-ofb', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; ``` Результат: ``` text -┌─comment─────────────────────────────┬─plaintext─┐ -│ aes-256-cfb128 no IV │ Secret │ -│ aes-256-cfb128 no IV, different key │ �4� - � │ -│ aes-256-cfb128 with IV │ ���6�~ │ - │aes-256-cbc no IV │ �2*4�h3c�4w��@ -└─────────────────────────────────────┴───────────┘ +┌─comment──────────────┬─plaintext──┐ +│ aes-256-gcm │ OQ�E + �t�7T�\���\� │ +│ aes-256-gcm with AAD │ OQ�E + �\��si����;�o�� │ +└──────────────────────┴────────────┘ +┌─comment──────────────────────────┬─plaintext─┐ +│ aes-256-ofb no IV │ Secret │ +│ aes-256-ofb no IV, different key │ �4� + � │ +│ aes-256-ofb with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└──────────────────────────────────┴───────────┘ ``` Обратите внимание, что только часть данных была расшифрована верно. Оставшаяся часть расшифрована некорректно, так как при шифровании использовались другие значения `mode`, `key`, или `iv`. 
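To make the dependency on `mode`, `key`, and `iv` concrete, here is a minimal round-trip sketch, not part of the original documentation, reusing the same 32-byte keys as the examples above: decrypting with the key actually used for encryption returns the plaintext, while a different key of the correct length yields garbage bytes rather than an error for a stream mode such as `aes-256-ofb`.

``` sql
-- Illustrative only: the right key round-trips, a wrong (but correctly sized) key produces garbage.
SELECT
    decrypt('aes-256-ofb', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212'), '12345678910121314151617181920212') AS right_key,
    hex(decrypt('aes-256-ofb', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212'), 'keykeykeykeykeykeykeykeykeykeyke')) AS wrong_key_hex;
```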
@@ -305,9 +307,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc -- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1 -- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8 -- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128 +- aes-128-cfb128 - aes-128-ofb, aes-192-ofb, aes-256-ofb **Синтаксис** @@ -333,7 +333,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) ``` sql -mysql> SET block_encryption_mode='aes-256-cfb128'; +mysql> SET block_encryption_mode='aes-256-ofb'; Query OK, 0 rows affected (0.00 sec) mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; @@ -348,7 +348,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv Запрос: ``` sql -SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; +SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; ``` Результат: diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 9739d9ec841..9a2db0ac2de 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -43,7 +43,7 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na При создании материализованного представления без использования `TO [db].[table]`, нужно обязательно указать `ENGINE` - движок таблицы для хранения данных. -При создании материализованного представления с испольованием `TO [db].[table]`, нельзя указывать `POPULATE`. +При создании материализованного представления с использованием `TO [db].[table]`, нельзя указывать `POPULATE`. Материализованное представление устроено следующим образом: при вставке данных в таблицу, указанную в SELECT-е, кусок вставляемых данных преобразуется этим запросом SELECT, и полученный результат вставляется в представление. diff --git a/docs/zh/operations/utilities/clickhouse-local.md b/docs/zh/operations/utilities/clickhouse-local.md index 9a70e225d69..f6b25a1faf0 100644 --- a/docs/zh/operations/utilities/clickhouse-local.md +++ b/docs/zh/operations/utilities/clickhouse-local.md @@ -29,12 +29,12 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin 参数: - `-S`, `--structure` — 输入数据的表结构。 -- `-if`, `--input-format` — 输入格式化类型, 默认是`TSV`。 +- `--input-format` — 输入格式化类型, 默认是`TSV`。 - `-f`, `--file` — 数据路径, 默认是`stdin`。 - `-q`, `--query` — 要查询的SQL语句使用`;`做分隔符。您必须指定`query`或`queries-file`选项。 -- `-qf`, `--queries-file` - 包含执行查询的文件路径。您必须指定`query`或`queries-file`选项。 +- `--queries-file` - 包含执行查询的文件路径。您必须指定`query`或`queries-file`选项。 - `-N`, `--table` — 数据输出的表名,默认是`table`。 -- `-of`, `--format`, `--output-format` — 输出格式化类型, 默认是`TSV`。 +- `--format`, `--output-format` — 输出格式化类型, 默认是`TSV`。 - `-d`, `--database` — 默认数据库名,默认是`_local`。 - `--stacktrace` — 是否在出现异常时输出栈信息。 - `--echo` — 执行前打印查询。 @@ -53,7 +53,7 @@ clickhouse-local --structure "table_structure" --input-format "format_of_incomin ## 示例 {#examples} ``` bash -echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" -if "CSV" -q "SELECT * FROM table" +echo -e "1,2\n3,4" | clickhouse-local -S "a Int64, b Int64" --input-format "CSV" -q "SELECT * FROM table" Read 2 rows, 32.00 B in 0.000 sec., 5182 rows/sec., 80.97 KiB/sec. 
1 2 3 4 diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index cbbf195a68c..4e3aa701d95 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -994,7 +994,7 @@ void Client::processConfig() /// The value of the option is used as the text of query (or of multiple queries). /// If stdin is not a terminal, INSERT data for the first query is read from it. /// - stdin is not a terminal. In this case queries are read from it. - /// - -qf (--queries-file) command line option is present. + /// - --queries-file command line option is present. /// The value of the option is used as file with query (or of multiple queries) to execute. delayed_interactive = config().has("interactive") && (config().has("query") || config().has("queries-file")); diff --git a/programs/keeper/TinyContext.h b/programs/keeper/TinyContext.h index a53a6d0377d..edc48c7cdc4 100644 --- a/programs/keeper/TinyContext.h +++ b/programs/keeper/TinyContext.h @@ -3,6 +3,7 @@ #include #include +#include namespace DB { @@ -24,9 +25,9 @@ public: private: mutable std::mutex keeper_dispatcher_mutex; - mutable std::shared_ptr keeper_dispatcher; + mutable std::shared_ptr keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex); - ConfigurationPtr config; + ConfigurationPtr config TSA_GUARDED_BY(keeper_dispatcher_mutex); }; } diff --git a/programs/library-bridge/SharedLibraryHandlerFactory.h b/programs/library-bridge/SharedLibraryHandlerFactory.h index 115cc78ae52..1a6dfb01e34 100644 --- a/programs/library-bridge/SharedLibraryHandlerFactory.h +++ b/programs/library-bridge/SharedLibraryHandlerFactory.h @@ -1,6 +1,8 @@ #pragma once #include "SharedLibraryHandler.h" +#include + #include #include @@ -30,7 +32,7 @@ public: private: /// map: dict_id -> sharedLibraryHandler - std::unordered_map library_handlers; + std::unordered_map library_handlers TSA_GUARDED_BY(mutex); std::mutex mutex; }; diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index 5defd01663a..ca0ce513b09 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/programs/odbc-bridge/ODBCPooledConnectionFactory.h b/programs/odbc-bridge/ODBCPooledConnectionFactory.h index 5c198c3be97..4d8d3f50ab9 100644 --- a/programs/odbc-bridge/ODBCPooledConnectionFactory.h +++ b/programs/odbc-bridge/ODBCPooledConnectionFactory.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -165,7 +166,7 @@ public: private: /// [connection_settings_string] -> [connection_pool] using PoolFactory = std::unordered_map; - PoolFactory factory; + PoolFactory factory TSA_GUARDED_BY(mutex); std::mutex mutex; }; diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b013ba9ee05..476725c5627 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1515,7 +1515,7 @@ int Server::main(const std::vector & /*args*/) /// Init trace collector only after trace_log system table was created /// Disable it if we collect test coverage information, because it will work extremely slow. -#if USE_UNWIND && !WITH_COVERAGE && defined(__x86_64__) +#if USE_UNWIND && !WITH_COVERAGE /// Profilers cannot work reliably with any other libunwind or without PHDR cache. 
if (hasPHDRCache()) { diff --git a/programs/server/play.html b/programs/server/play.html index 5d0482c8169..c7ea5e4ef12 100644 --- a/programs/server/play.html +++ b/programs/server/play.html @@ -386,6 +386,39 @@ text-align: center; margin-top: 5em; } + + #chart + { + background-color: var(--element-background-color); + filter: drop-shadow(.2rem .2rem .2rem var(--shadow-color)); + display: none; + height: 70vh; + } + + /* This is for charts (uPlot), Copyright (c) 2022 Leon Sorokin, MIT License, https://github.com/leeoniya/uPlot/ */ + .u-wrap {position: relative;user-select: none;} + .u-over, .u-under, .u-axis {position: absolute;} + .u-under {overflow: hidden;} + .uplot canvas {display: block;position: relative;width: 100%;height: 100%;} + .u-legend {margin: auto;text-align: center; margin-top: 1em; font-family: Liberation Mono, DejaVu Sans Mono, MonoLisa, Consolas, monospace;} + .u-inline {display: block;} + .u-inline * {display: inline-block;} + .u-inline tr {margin-right: 16px;} + .u-legend th {font-weight: 600;} + .u-legend th > * {vertical-align: middle;display: inline-block;} + .u-legend td { min-width: 13em; } + .u-legend .u-marker {width: 1em;height: 1em;margin-right: 4px;background-clip: padding-box !important;} + .u-inline.u-live th::after {content: ":";vertical-align: middle;} + .u-inline:not(.u-live) .u-value {display: none;} + .u-series > * {padding: 4px;} + .u-series th {cursor: pointer;} + .u-legend .u-off > * {opacity: 0.3;} + .u-select {background: rgba(0,0,0,0.07);position: absolute;pointer-events: none;} + .u-cursor-x, .u-cursor-y {position: absolute;left: 0;top: 0;pointer-events: none;will-change: transform;z-index: 100;} + .u-hz .u-cursor-x, .u-vt .u-cursor-y {height: 100%;border-right: 1px dashed #607D8B;} + .u-hz .u-cursor-y, .u-vt .u-cursor-x {width: 100%;border-bottom: 1px dashed #607D8B;} + .u-cursor-pt {position: absolute;top: 0;left: 0;border-radius: 50%;border: 0 solid;pointer-events: none;will-change: transform;z-index: 100;/*this has to be !important since we set inline "background" shorthand */background-clip: padding-box !important;} + .u-axis.u-off, .u-select.u-off, .u-cursor-x.u-off, .u-cursor-y.u-off, .u-cursor-pt.u-off {display: none;} @@ -410,6 +443,7 @@

     
+    <div id="chart"></div>

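The new `#chart` element, styled by the uPlot rules added above, is the mount point for the chart rendering introduced in the hunks below: when a response parses as a two-column JSONCompactColumns array of equal-length columns, it is drawn as a chart instead of a table. A hedged usage sketch (any query returning two numeric columns works; `numbers()` is used only to keep the example self-contained):

``` sql
SELECT number AS x, sin(number / 10) AS y
FROM numbers(100)
FORMAT JSONCompactColumns
```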
@@ -530,8 +564,13 @@ if (status === 200) { let json; try { json = JSON.parse(response); } catch (e) {} + if (json !== undefined && json.statistics !== undefined) { renderResult(json); + } else if (Array.isArray(json) && json.length == 2 && + Array.isArray(json[0]) && Array.isArray(json[1]) && json[0].length > 1 && json[0].length == json[1].length) { + /// If user requested FORMAT JSONCompactColumns, we will render it as a chart. + renderChart(json); } else { renderUnparsedResult(response); } @@ -578,30 +617,27 @@ } } + function clearElement(id) + { + let elem = document.getElementById(id); + while (elem.firstChild) { + elem.removeChild(elem.lastChild); + } + elem.style.display = 'none'; + } + function clear() { - let table = document.getElementById('data-table'); - while (table.firstChild) { - table.removeChild(table.lastChild); - } - - let graph = document.getElementById('graph'); - while (graph.firstChild) { - graph.removeChild(graph.lastChild); - } - graph.style.display = 'none'; - - document.getElementById('data-unparsed').innerText = ''; - document.getElementById('data-unparsed').style.display = 'none'; - - document.getElementById('error').innerText = ''; - document.getElementById('error').style.display = 'none'; + clearElement('data-table'); + clearElement('graph'); + clearElement('chart'); + clearElement('data-unparsed'); + clearElement('error'); + clearElement('hourglass'); + document.getElementById('check-mark').innerText = ''; + document.getElementById('hourglass').innerText = ''; document.getElementById('stats').innerText = ''; - - document.getElementById('hourglass').style.display = 'none'; - document.getElementById('check-mark').style.display = 'none'; - document.getElementById('logo-container').style.display = 'block'; } @@ -738,6 +774,7 @@ } let table = document.getElementById('data-table'); table.appendChild(tbody); + table.style.display = 'table'; } function renderTable(response) @@ -792,6 +829,7 @@ let table = document.getElementById('data-table'); table.appendChild(thead); table.appendChild(tbody); + table.style.display = 'table'; } /// A function to render raw data when non-default format is specified. @@ -873,16 +911,80 @@ svg.style.height = graph.graph().height; } - function setColorTheme(theme) { - window.localStorage.setItem('theme', theme); - document.documentElement.setAttribute('data-theme', theme); + let load_uplot_promise; + function loadUplot() { + if (load_uplot_promise) { return load_uplot_promise; } + load_uplot_promise = loadJS('https://cdn.jsdelivr.net/npm/uplot@1.6.21/dist/uPlot.iife.min.js', + 'sha384-TwdJPnTsKP6pnvFZZKda0WJCXpjcHCa7MYHmjrYDu6rsEsb/UnFdoL0phS5ODqTA'); + return load_uplot_promise; } + let uplot; + async function renderChart(json) + { + await loadUplot(); + clear(); + + let chart = document.getElementById('chart'); + chart.style.display = 'block'; + + let paths = uPlot.paths.stepped({align: 1}); + + const [line_color, fill_color, grid_color, axes_color] = theme == 'light' + ? 
["#F80", "#FED", "#c7d0d9", "#2c3235"] + : ["#888", "#045", "#2c3235", "#c7d0d9"]; + + const opts = { + width: chart.clientWidth, + height: chart.clientHeight, + scales: { x: { time: json[0][0] > 1000000000 && json[0][0] < 2000000000 } }, + axes: [ { stroke: axes_color, + grid: { width: 1 / devicePixelRatio, stroke: grid_color }, + ticks: { width: 1 / devicePixelRatio, stroke: grid_color } }, + { stroke: axes_color, + grid: { width: 1 / devicePixelRatio, stroke: grid_color }, + ticks: { width: 1 / devicePixelRatio, stroke: grid_color } } ], + series: [ { label: "x" }, + { label: "y", stroke: line_color, fill: fill_color, + drawStyle: 0, lineInterpolation: 1, paths } ], + padding: [ null, null, null, (Math.ceil(Math.log10(Math.max(...json[1]))) + Math.floor(Math.log10(Math.max(...json[1])) / 3)) * 6 ], + }; + + uplot = new uPlot(opts, json, chart); + } + + function resizeChart() { + if (uplot) { + let chart = document.getElementById('chart'); + uplot.setSize({ width: chart.clientWidth, height: chart.clientHeight }); + } + } + + function redrawChart() { + if (uplot && document.getElementById('chart').style.display == 'block') { + renderChart(uplot.data); + } + } + + new ResizeObserver(resizeChart).observe(document.getElementById('chart')); + /// First we check if theme is set via the 'theme' GET parameter, if not, we check localStorage, otherwise we check OS preference. let theme = current_url.searchParams.get('theme'); if (['dark', 'light'].indexOf(theme) === -1) { theme = window.localStorage.getItem('theme'); } + if (!theme) { + theme = 'light'; + } + + function setColorTheme(new_theme, update_preference) { + theme = new_theme; + if (update_preference) { + window.localStorage.setItem('theme', theme); + } + document.documentElement.setAttribute('data-theme', theme); + redrawChart(); + } if (theme) { document.documentElement.setAttribute('data-theme', theme); @@ -890,26 +992,21 @@ /// Obtain system-level user preference const media_query_list = window.matchMedia('(prefers-color-scheme: dark)'); if (media_query_list.matches) { - /// Set without saving to localstorage - document.documentElement.setAttribute('data-theme', 'dark'); + setColorTheme('dark'); } /// There is a rumor that on some computers, the theme is changing automatically on day/night. media_query_list.addEventListener('change', function(e) { - if (e.matches) { - document.documentElement.setAttribute('data-theme', 'dark'); - } else { - document.documentElement.setAttribute('data-theme', 'light'); - } + setColorTheme(e.matches ? 'dark' : 'light'); }); } document.getElementById('toggle-light').onclick = function() { - setColorTheme('light'); + setColorTheme('light', true); } document.getElementById('toggle-dark').onclick = function() { - setColorTheme('dark'); + setColorTheme('dark', true); } diff --git a/src/Access/AccessBackup.cpp b/src/Access/AccessBackup.cpp new file mode 100644 index 00000000000..0322ad7457b --- /dev/null +++ b/src/Access/AccessBackup.cpp @@ -0,0 +1,358 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +namespace +{ + /// Represents a list of access entities as they're stored in a backup. 
+ struct AccessEntitiesInBackup + { + std::unordered_map entities; + std::unordered_map> dependencies; + + BackupEntryPtr toBackupEntry() const + { + WriteBufferFromOwnString buf; + + for (const auto & [id, entity] : entities) + { + writeText(id, buf); + writeChar('\t', buf); + writeText(entity->getTypeInfo().name, buf); + writeChar('\t', buf); + writeText(entity->getName(), buf); + writeChar('\n', buf); + writeText(serializeAccessEntity(*entity), buf); + writeChar('\n', buf); + } + + if (!dependencies.empty()) + { + writeText("DEPENDENCIES\n", buf); + for (const auto & [id, name_and_type] : dependencies) + { + writeText(id, buf); + writeChar('\t', buf); + writeText(AccessEntityTypeInfo::get(name_and_type.second).name, buf); + writeChar('\t', buf); + writeText(name_and_type.first, buf); + writeChar('\n', buf); + } + } + + return std::make_shared(buf.str()); + } + + static AccessEntitiesInBackup fromBackupEntry(const IBackupEntry & backup_entry, const String & file_path) + { + try + { + AccessEntitiesInBackup res; + std::unique_ptr buf = backup_entry.getReadBuffer(); + + bool dependencies_found = false; + + while (!buf->eof()) + { + String line; + readStringUntilNewlineInto(line, *buf); + buf->ignore(); + if (line == "DEPENDENCIES") + { + dependencies_found = true; + break; + } + + size_t id_endpos = line.find('\t'); + String id_as_string = line.substr(0, id_endpos); + UUID id = parse(line); + line.clear(); + + String queries; + while (!buf->eof()) + { + String query; + readStringUntilNewlineInto(query, *buf); + buf->ignore(); + if (query.empty()) + break; + if (!queries.empty()) + queries.append("\n"); + queries.append(query); + } + + AccessEntityPtr entity = deserializeAccessEntity(queries); + res.entities.emplace(id, entity); + } + + if (dependencies_found) + { + while (!buf->eof()) + { + String id_as_string; + readStringInto(id_as_string, *buf); + buf->ignore(); + UUID id = parse(id_as_string); + + String type_as_string; + readStringInto(type_as_string, *buf); + buf->ignore(); + AccessEntityType type = AccessEntityTypeInfo::parseType(type_as_string); + + String name; + readStringInto(name, *buf); + buf->ignore(); + + if (!res.entities.contains(id)) + res.dependencies.emplace(id, std::pair{name, type}); + } + } + + return res; + } + catch (Exception & e) + { + e.addMessage("While parsing " + file_path); + throw; + } + } + }; + + std::vector findDependencies(const std::vector> & entities) + { + std::vector res; + for (const auto & entity : entities | boost::adaptors::map_values) + insertAtEnd(res, entity->findDependencies()); + + /// Remove duplicates in the list of dependencies (some entities can refer to other entities). 
+ ::sort(res.begin(), res.end()); + res.erase(std::unique(res.begin(), res.end()), res.end()); + for (const auto & id : entities | boost::adaptors::map_keys) + { + auto it = std::lower_bound(res.begin(), res.end(), id); + if ((it != res.end()) && (*it == id)) + res.erase(it); + } + return res; + } + + std::unordered_map> readDependenciesNamesAndTypes(const std::vector & dependencies, const AccessControl & access_control) + { + std::unordered_map> res; + for (const auto & id : dependencies) + { + if (auto name_and_type = access_control.tryReadNameWithType(id)) + res.emplace(id, name_and_type.value()); + } + return res; + } + + std::unordered_map resolveDependencies(const std::unordered_map> & dependencies, const AccessControl & access_control, bool allow_unresolved_dependencies) + { + std::unordered_map old_to_new_ids; + for (const auto & [id, name_and_type] : dependencies) + { + std::optional new_id; + if (allow_unresolved_dependencies) + new_id = access_control.find(name_and_type.second, name_and_type.first); + else + new_id = access_control.getID(name_and_type.second, name_and_type.first); + if (new_id) + old_to_new_ids.emplace(id, *new_id); + } + return old_to_new_ids; + } + + void generateRandomIDs(std::vector> & entities, std::unordered_map & old_to_new_ids) + { + Poco::UUIDGenerator generator; + for (auto & [id, entity] : entities) + { + UUID new_id; + generator.createRandom().copyTo(reinterpret_cast(&new_id)); + old_to_new_ids.emplace(id, new_id); + id = new_id; + } + } + + void replaceDependencies(std::vector> & entities, const std::unordered_map & old_to_new_ids) + { + for (auto & entity : entities | boost::adaptors::map_values) + { + bool need_replace = false; + for (const auto & dependency : entity->findDependencies()) + { + if (old_to_new_ids.contains(dependency)) + { + need_replace = true; + break; + } + } + + if (!need_replace) + continue; + + auto new_entity = entity->clone(); + new_entity->replaceDependencies(old_to_new_ids); + entity = new_entity; + } + } + + AccessRightsElements getRequiredAccessToRestore(const std::unordered_map & entities) + { + AccessRightsElements res; + for (const auto & entity : entities | boost::adaptors::map_values) + { + auto entity_type = entity->getType(); + switch (entity_type) + { + case User::TYPE: + { + const auto & user = typeid_cast(*entity); + res.emplace_back(AccessType::CREATE_USER); + auto elements = user.access.getElements(); + for (auto & element : elements) + { + if (element.is_partial_revoke) + continue; + element.grant_option = true; + res.emplace_back(element); + } + if (!user.granted_roles.isEmpty()) + res.emplace_back(AccessType::ROLE_ADMIN); + break; + } + + case Role::TYPE: + { + const auto & role = typeid_cast(*entity); + res.emplace_back(AccessType::CREATE_ROLE); + auto elements = role.access.getElements(); + for (auto & element : elements) + { + if (element.is_partial_revoke) + continue; + element.grant_option = true; + res.emplace_back(element); + } + if (!role.granted_roles.isEmpty()) + res.emplace_back(AccessType::ROLE_ADMIN); + break; + } + + case SettingsProfile::TYPE: + { + res.emplace_back(AccessType::CREATE_SETTINGS_PROFILE); + break; + } + + case RowPolicy::TYPE: + { + const auto & policy = typeid_cast(*entity); + res.emplace_back(AccessType::CREATE_ROW_POLICY, policy.getDatabase(), policy.getTableName()); + break; + } + + case Quota::TYPE: + { + res.emplace_back(AccessType::CREATE_QUOTA); + break; + } + + default: + throw Exception("Unknown type: " + toString(entity_type), ErrorCodes::LOGICAL_ERROR); + } + } + 
return res; + } +} + +void backupAccessEntities( + BackupEntriesCollector & backup_entries_collector, + const String & data_path_in_backup, + const AccessControl & access_control, + AccessEntityType type) +{ + auto entities = access_control.readAllForBackup(type, backup_entries_collector.getBackupSettings()); + auto dependencies = readDependenciesNamesAndTypes(findDependencies(entities), access_control); + AccessEntitiesInBackup ab; + boost::range::copy(entities, std::inserter(ab.entities, ab.entities.end())); + ab.dependencies = std::move(dependencies); + backup_entries_collector.addBackupEntry(fs::path{data_path_in_backup} / "access.txt", ab.toBackupEntry()); +} + + +AccessRestoreTask::AccessRestoreTask( + const BackupPtr & backup_, const RestoreSettings & restore_settings_, std::shared_ptr restore_coordination_) + : backup(backup_), restore_settings(restore_settings_), restore_coordination(restore_coordination_) +{ +} + +AccessRestoreTask::~AccessRestoreTask() = default; + +void AccessRestoreTask::addDataPath(const String & data_path) +{ + if (!data_paths.emplace(data_path).second) + return; + + String file_path = fs::path{data_path} / "access.txt"; + auto backup_entry = backup->readFile(file_path); + auto ab = AccessEntitiesInBackup::fromBackupEntry(*backup_entry, file_path); + + boost::range::copy(ab.entities, std::inserter(entities, entities.end())); + boost::range::copy(ab.dependencies, std::inserter(dependencies, dependencies.end())); + for (const auto & id : entities | boost::adaptors::map_keys) + dependencies.erase(id); +} + +bool AccessRestoreTask::hasDataPath(const String & data_path) const +{ + return data_paths.contains(data_path); +} + +AccessRightsElements AccessRestoreTask::getRequiredAccess() const +{ + return getRequiredAccessToRestore(entities); +} + +void AccessRestoreTask::restore(AccessControl & access_control) const +{ + auto old_to_new_ids = resolveDependencies(dependencies, access_control, restore_settings.allow_unresolved_access_dependencies); + + std::vector> new_entities; + boost::range::copy(entities, std::back_inserter(new_entities)); + generateRandomIDs(new_entities, old_to_new_ids); + + replaceDependencies(new_entities, old_to_new_ids); + + access_control.insertFromBackup(new_entities, restore_settings, restore_coordination); +} + +} diff --git a/src/Access/AccessBackup.h b/src/Access/AccessBackup.h new file mode 100644 index 00000000000..5c70e268eae --- /dev/null +++ b/src/Access/AccessBackup.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ +class AccessControl; +enum class AccessEntityType; +class BackupEntriesCollector; +class RestorerFromBackup; +class IBackup; +using BackupPtr = std::shared_ptr; +class IRestoreCoordination; +struct IAccessEntity; +using AccessEntityPtr = std::shared_ptr; +class AccessRightsElements; + + +/// Makes a backup of access entities of a specified type. +void backupAccessEntities( + BackupEntriesCollector & backup_entries_collector, + const String & data_path_in_backup, + const AccessControl & access_control, + AccessEntityType type); + +/// Restores access entities from a backup. +class AccessRestoreTask +{ +public: + AccessRestoreTask( + const BackupPtr & backup_, const RestoreSettings & restore_settings_, std::shared_ptr restore_coordination_); + ~AccessRestoreTask(); + + /// Adds a data path to loads access entities from. + void addDataPath(const String & data_path); + bool hasDataPath(const String & data_path) const; + + /// Checks that the current user can do restoring. 
+ AccessRightsElements getRequiredAccess() const; + + /// Inserts all access entities loaded from all the paths added by addDataPath(). + void restore(AccessControl & access_control) const; + +private: + BackupPtr backup; + RestoreSettings restore_settings; + std::shared_ptr restore_coordination; + std::unordered_map entities; + std::unordered_map> dependencies; + std::unordered_set data_paths; +}; + +} diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index 1141f92a0d8..aa58044a6b0 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -15,7 +15,11 @@ #include #include #include +#include +#include +#include #include +#include #include #include #include @@ -130,7 +134,7 @@ public: } private: - Strings registered_prefixes; + Strings registered_prefixes TSA_GUARDED_BY(mutex); mutable std::mutex mutex; }; @@ -184,39 +188,25 @@ void AccessControl::setUsersConfig(const Poco::Util::AbstractConfiguration & use return; } } - addUsersConfigStorage(users_config_); + addUsersConfigStorage(UsersConfigAccessStorage::STORAGE_TYPE, users_config_, false); } -void AccessControl::addUsersConfigStorage(const Poco::Util::AbstractConfiguration & users_config_) +void AccessControl::addUsersConfigStorage(const String & storage_name_, const Poco::Util::AbstractConfiguration & users_config_, bool allow_backup_) { - addUsersConfigStorage(UsersConfigAccessStorage::STORAGE_TYPE, users_config_); -} - -void AccessControl::addUsersConfigStorage(const String & storage_name_, const Poco::Util::AbstractConfiguration & users_config_) -{ - auto new_storage = std::make_shared(storage_name_, *this); + auto new_storage = std::make_shared(storage_name_, *this, allow_backup_); new_storage->setConfig(users_config_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); } -void AccessControl::addUsersConfigStorage( - const String & users_config_path_, - const String & include_from_path_, - const String & preprocessed_dir_, - const zkutil::GetZooKeeper & get_zookeeper_function_) -{ - addUsersConfigStorage( - UsersConfigAccessStorage::STORAGE_TYPE, users_config_path_, include_from_path_, preprocessed_dir_, get_zookeeper_function_); -} - void AccessControl::addUsersConfigStorage( const String & storage_name_, const String & users_config_path_, const String & include_from_path_, const String & preprocessed_dir_, - const zkutil::GetZooKeeper & get_zookeeper_function_) + const zkutil::GetZooKeeper & get_zookeeper_function_, + bool allow_backup_) { auto storages = getStoragesPtr(); for (const auto & storage : *storages) @@ -227,7 +217,7 @@ void AccessControl::addUsersConfigStorage( return; } } - auto new_storage = std::make_shared(storage_name_, *this); + auto new_storage = std::make_shared(storage_name_, *this, allow_backup_); new_storage->load(users_config_path_, include_from_path_, preprocessed_dir_, get_zookeeper_function_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); @@ -237,7 +227,8 @@ void AccessControl::addUsersConfigStorage( void AccessControl::addReplicatedStorage( const String & storage_name_, const String & zookeeper_path_, - const zkutil::GetZooKeeper & get_zookeeper_function_) + const zkutil::GetZooKeeper & get_zookeeper_function_, + bool allow_backup_) { auto storages = getStoragesPtr(); for (const auto & 
storage : *storages) @@ -245,17 +236,12 @@ void AccessControl::addReplicatedStorage( if (auto replicated_storage = typeid_cast>(storage)) return; } - auto new_storage = std::make_shared(storage_name_, zookeeper_path_, get_zookeeper_function_, *changes_notifier); + auto new_storage = std::make_shared(storage_name_, zookeeper_path_, get_zookeeper_function_, *changes_notifier, allow_backup_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}'", String(new_storage->getStorageType()), new_storage->getStorageName()); } -void AccessControl::addDiskStorage(const String & directory_, bool readonly_) -{ - addDiskStorage(DiskAccessStorage::STORAGE_TYPE, directory_, readonly_); -} - -void AccessControl::addDiskStorage(const String & storage_name_, const String & directory_, bool readonly_) +void AccessControl::addDiskStorage(const String & storage_name_, const String & directory_, bool readonly_, bool allow_backup_) { auto storages = getStoragesPtr(); for (const auto & storage : *storages) @@ -270,13 +256,13 @@ void AccessControl::addDiskStorage(const String & storage_name_, const String & } } } - auto new_storage = std::make_shared(storage_name_, directory_, readonly_, *changes_notifier); + auto new_storage = std::make_shared(storage_name_, directory_, *changes_notifier, readonly_, allow_backup_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}', path: {}", String(new_storage->getStorageType()), new_storage->getStorageName(), new_storage->getPath()); } -void AccessControl::addMemoryStorage(const String & storage_name_) +void AccessControl::addMemoryStorage(const String & storage_name_, bool allow_backup_) { auto storages = getStoragesPtr(); for (const auto & storage : *storages) @@ -284,7 +270,7 @@ void AccessControl::addMemoryStorage(const String & storage_name_) if (auto memory_storage = typeid_cast>(storage)) return; } - auto new_storage = std::make_shared(storage_name_, *changes_notifier); + auto new_storage = std::make_shared(storage_name_, *changes_notifier, allow_backup_); addStorage(new_storage); LOG_DEBUG(getLogger(), "Added {} access storage '{}'", String(new_storage->getStorageType()), new_storage->getStorageName()); } @@ -327,20 +313,23 @@ void AccessControl::addStoragesFromUserDirectoriesConfig( if (type == MemoryAccessStorage::STORAGE_TYPE) { - addMemoryStorage(name); + bool allow_backup = config.getBool(prefix + ".allow_backup", true); + addMemoryStorage(name, allow_backup); } else if (type == UsersConfigAccessStorage::STORAGE_TYPE) { String path = config.getString(prefix + ".path"); if (std::filesystem::path{path}.is_relative() && std::filesystem::exists(config_dir + path)) path = config_dir + path; - addUsersConfigStorage(name, path, include_from_path, dbms_dir, get_zookeeper_function); + bool allow_backup = config.getBool(prefix + ".allow_backup", false); /// We don't backup users.xml by default. 
+ addUsersConfigStorage(name, path, include_from_path, dbms_dir, get_zookeeper_function, allow_backup); } else if (type == DiskAccessStorage::STORAGE_TYPE) { String path = config.getString(prefix + ".path"); bool readonly = config.getBool(prefix + ".readonly", false); - addDiskStorage(name, path, readonly); + bool allow_backup = config.getBool(prefix + ".allow_backup", true); + addDiskStorage(name, path, readonly, allow_backup); } else if (type == LDAPAccessStorage::STORAGE_TYPE) { @@ -349,7 +338,8 @@ void AccessControl::addStoragesFromUserDirectoriesConfig( else if (type == ReplicatedAccessStorage::STORAGE_TYPE) { String zookeeper_path = config.getString(prefix + ".zookeeper_path"); - addReplicatedStorage(name, zookeeper_path, get_zookeeper_function); + bool allow_backup = config.getBool(prefix + ".allow_backup", true); + addReplicatedStorage(name, zookeeper_path, get_zookeeper_function, allow_backup); } else throw Exception("Unknown storage type '" + type + "' at " + prefix + " in config", ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); @@ -383,12 +373,18 @@ void AccessControl::addStoragesFromMainConfig( if (users_config_path != config_path) checkForUsersNotInMainConfig(config, config_path, users_config_path, getLogger()); - addUsersConfigStorage(users_config_path, include_from_path, dbms_dir, get_zookeeper_function); + addUsersConfigStorage( + UsersConfigAccessStorage::STORAGE_TYPE, + users_config_path, + include_from_path, + dbms_dir, + get_zookeeper_function, + /* allow_backup= */ false); } String disk_storage_dir = config.getString("access_control_path", ""); if (!disk_storage_dir.empty()) - addDiskStorage(disk_storage_dir); + addDiskStorage(DiskAccessStorage::STORAGE_TYPE, disk_storage_dir, /* readonly= */ false, /* allow_backup= */ true); if (has_user_directories) addStoragesFromUserDirectoriesConfig(config, "user_directories", config_dir, dbms_dir, include_from_path, get_zookeeper_function); @@ -463,6 +459,23 @@ UUID AccessControl::authenticate(const Credentials & credentials, const Poco::Ne } } +void AccessControl::backup(BackupEntriesCollector & backup_entries_collector, AccessEntityType type, const String & data_path_in_backup) const +{ + backupAccessEntities(backup_entries_collector, data_path_in_backup, *this, type); +} + +void AccessControl::restore(RestorerFromBackup & restorer, const String & data_path_in_backup) +{ + /// The restorer must already know about `data_path_in_backup`, but let's check. + restorer.checkPathInBackupToRestoreAccess(data_path_in_backup); +} + +void AccessControl::insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) +{ + MultipleAccessStorage::insertFromBackup(entities_from_backup, restore_settings, restore_coordination); + changes_notifier->sendNotifications(); +} + void AccessControl::setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config) { external_authenticators->setConfiguration(config, getLogger()); diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index cbc71241316..90ad2895122 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -42,6 +42,8 @@ class ClientInfo; class ExternalAuthenticators; class AccessChangesNotifier; struct Settings; +class BackupEntriesCollector; +class RestorerFromBackup; /// Manages access control entities. @@ -60,37 +62,31 @@ public: void setUsersConfig(const Poco::Util::AbstractConfiguration & users_config_); /// Adds UsersConfigAccessStorage. 
- void addUsersConfigStorage(const Poco::Util::AbstractConfiguration & users_config_); - void addUsersConfigStorage(const String & storage_name_, - const Poco::Util::AbstractConfiguration & users_config_); - - void addUsersConfigStorage(const String & users_config_path_, - const String & include_from_path_, - const String & preprocessed_dir_, - const zkutil::GetZooKeeper & get_zookeeper_function_ = {}); + const Poco::Util::AbstractConfiguration & users_config_, + bool allow_backup_); void addUsersConfigStorage(const String & storage_name_, const String & users_config_path_, const String & include_from_path_, const String & preprocessed_dir_, - const zkutil::GetZooKeeper & get_zookeeper_function_ = {}); + const zkutil::GetZooKeeper & get_zookeeper_function_, + bool allow_backup_); /// Loads access entities from the directory on the local disk. /// Use that directory to keep created users/roles/etc. - void addDiskStorage(const String & directory_, bool readonly_ = false); - void addDiskStorage(const String & storage_name_, const String & directory_, bool readonly_ = false); + void addDiskStorage(const String & storage_name_, const String & directory_, bool readonly_, bool allow_backup_); /// Adds MemoryAccessStorage which keeps access entities in memory. - void addMemoryStorage(); - void addMemoryStorage(const String & storage_name_); + void addMemoryStorage(const String & storage_name_, bool allow_backup_); /// Adds LDAPAccessStorage which allows querying remote LDAP server for user info. void addLDAPStorage(const String & storage_name_, const Poco::Util::AbstractConfiguration & config_, const String & prefix_); void addReplicatedStorage(const String & storage_name, const String & zookeeper_path, - const zkutil::GetZooKeeper & get_zookeeper_function); + const zkutil::GetZooKeeper & get_zookeeper_function, + bool allow_backup); /// Adds storages from config. void addStoragesFromUserDirectoriesConfig(const Poco::Util::AbstractConfiguration & config, @@ -123,6 +119,11 @@ public: scope_guard subscribeForChanges(const std::vector & ids, const OnChangedHandler & handler) const; UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address) const; + + /// Makes a backup of access entities. + void backup(BackupEntriesCollector & backup_entries_collector, AccessEntityType type, const String & data_path_in_backup) const; + static void restore(RestorerFromBackup & restorer, const String & data_path_in_backup); + void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config); /// Sets the default profile's name. @@ -197,6 +198,8 @@ public: /// Gets manager of notifications. 
AccessChangesNotifier & getChangesNotifier(); + void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) override; + private: class ContextAccessCache; class CustomSettingsPrefixes; diff --git a/src/Access/AccessRights.cpp b/src/Access/AccessRights.cpp index c3598df048b..7c3139dbb0f 100644 --- a/src/Access/AccessRights.cpp +++ b/src/Access/AccessRights.cpp @@ -736,6 +736,18 @@ AccessRights::AccessRights(const AccessFlags & access) } +AccessRights::AccessRights(const AccessRightsElement & element) +{ + grant(element); +} + + +AccessRights::AccessRights(const AccessRightsElements & elements) +{ + grant(elements); +} + + bool AccessRights::isEmpty() const { return !root && !root_with_grant_option; diff --git a/src/Access/AccessRights.h b/src/Access/AccessRights.h index ef5995ad9a5..b7499d69f70 100644 --- a/src/Access/AccessRights.h +++ b/src/Access/AccessRights.h @@ -16,6 +16,9 @@ class AccessRights public: AccessRights(); explicit AccessRights(const AccessFlags & access); + explicit AccessRights(const AccessRightsElement & element); + explicit AccessRights(const AccessRightsElements & elements); + ~AccessRights(); AccessRights(const AccessRights & src); AccessRights & operator =(const AccessRights & src); diff --git a/src/Access/Common/AccessEntityType.cpp b/src/Access/Common/AccessEntityType.cpp index b9c618a9fc0..9277961fdae 100644 --- a/src/Access/Common/AccessEntityType.cpp +++ b/src/Access/Common/AccessEntityType.cpp @@ -1,7 +1,9 @@ #include #include #include +#include #include +#include #include @@ -15,6 +17,7 @@ namespace ErrorCodes extern const int UNKNOWN_QUOTA; extern const int THERE_IS_NO_PROFILE; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } @@ -83,4 +86,15 @@ const AccessEntityTypeInfo & AccessEntityTypeInfo::get(AccessEntityType type_) throw Exception("Unknown type: " + std::to_string(static_cast(type_)), ErrorCodes::LOGICAL_ERROR); } +AccessEntityType AccessEntityTypeInfo::parseType(const String & name_) +{ + for (auto type : collections::range(AccessEntityType::MAX)) + { + const auto & info = get(type); + if (boost::iequals(info.name, name_)) + return type; + } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown type: {}", name_); +} + } diff --git a/src/Access/Common/AccessEntityType.h b/src/Access/Common/AccessEntityType.h index 44caeecb37a..12df7cf5538 100644 --- a/src/Access/Common/AccessEntityType.h +++ b/src/Access/Common/AccessEntityType.h @@ -35,6 +35,7 @@ struct AccessEntityTypeInfo String formatEntityNameWithType(const String & entity_name) const; static const AccessEntityTypeInfo & get(AccessEntityType type_); + static AccessEntityType parseType(const String & name_); }; } diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 58c2f3a3b97..41739406de4 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -99,6 +99,7 @@ enum class AccessType \ M(TRUNCATE, "TRUNCATE TABLE", TABLE, ALL) \ M(OPTIMIZE, "OPTIMIZE TABLE", TABLE, ALL) \ + M(BACKUP, "", TABLE, ALL) /* allows to backup tables */\ \ M(KILL_QUERY, "", GLOBAL, ALL) /* allows to kill a query started by another user (anyone can kill his own queries) */\ diff --git a/src/Access/DiskAccessStorage.cpp b/src/Access/DiskAccessStorage.cpp index 57e09d40b35..a9b7a6a265b 100644 --- a/src/Access/DiskAccessStorage.cpp +++ b/src/Access/DiskAccessStorage.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -165,11 +166,12 @@ 
namespace } -DiskAccessStorage::DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_, AccessChangesNotifier & changes_notifier_) +DiskAccessStorage::DiskAccessStorage(const String & storage_name_, const String & directory_path_, AccessChangesNotifier & changes_notifier_, bool readonly_, bool allow_backup_) : IAccessStorage(storage_name_), changes_notifier(changes_notifier_) { directory_path = makeDirectoryPathCanonical(directory_path_); readonly = readonly_; + backup_allowed = allow_backup_; std::error_code create_dir_error_code; std::filesystem::create_directories(directory_path, create_dir_error_code); @@ -457,7 +459,7 @@ AccessEntityPtr DiskAccessStorage::readImpl(const UUID & id, bool throw_if_not_e } -std::optional DiskAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +std::optional> DiskAccessStorage::readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const { std::lock_guard lock{mutex}; auto it = entries_by_id.find(id); @@ -468,21 +470,27 @@ std::optional DiskAccessStorage::readNameImpl(const UUID & id, bool thro else return std::nullopt; } - return it->second.name; + return std::make_pair(it->second.name, it->second.type); } std::optional DiskAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { UUID id = generateRandomID(); - std::lock_guard lock{mutex}; - if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists)) + if (insertWithID(id, new_entity, replace_if_exists, throw_if_exists)) return id; return std::nullopt; } +bool DiskAccessStorage::insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) +{ + std::lock_guard lock{mutex}; + return insertNoLock(id, new_entity, replace_if_exists, throw_if_exists); +} + + bool DiskAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const String & name = new_entity->getName(); @@ -649,4 +657,20 @@ void DiskAccessStorage::deleteAccessEntityOnDisk(const UUID & id) const throw Exception("Couldn't delete " + file_path, ErrorCodes::FILE_DOESNT_EXIST); } + +void DiskAccessStorage::insertFromBackup( + const std::vector> & entities_from_backup, + const RestoreSettings & restore_settings, + std::shared_ptr) +{ + if (!isRestoreAllowed()) + throwRestoreNotAllowed(); + + bool replace_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kReplace); + bool throw_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kCreate); + + for (const auto & [id, entity] : entities_from_backup) + insertWithID(id, entity, replace_if_exists, throw_if_exists); +} + } diff --git a/src/Access/DiskAccessStorage.h b/src/Access/DiskAccessStorage.h index 7784a80e779..1bdefbf82f9 100644 --- a/src/Access/DiskAccessStorage.h +++ b/src/Access/DiskAccessStorage.h @@ -15,7 +15,7 @@ class DiskAccessStorage : public IAccessStorage public: static constexpr char STORAGE_TYPE[] = "local directory"; - DiskAccessStorage(const String & storage_name_, const String & directory_path_, bool readonly_, AccessChangesNotifier & changes_notifier_); + DiskAccessStorage(const String & storage_name_, const String & directory_path_, AccessChangesNotifier & changes_notifier_, bool readonly_, bool allow_backup_); ~DiskAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -29,11 +29,14 @@ public: bool exists(const UUID & id) const override; + 
bool isBackupAllowed() const override { return backup_allowed; } + void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) override; + private: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; - std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override; std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; @@ -47,6 +50,7 @@ private: void listsWritingThreadFunc(); void stopListsWritingThread(); + bool insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists); bool insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists); bool removeNoLock(const UUID & id, bool throw_if_not_exists); bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); @@ -65,7 +69,6 @@ private: }; String directory_path; - std::atomic readonly; std::unordered_map entries_by_id; std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)]; boost::container::flat_set types_of_lists_to_write; @@ -74,6 +77,8 @@ private: std::condition_variable lists_writing_thread_should_exit; /// Signals `lists_writing_thread` to exit. 
bool lists_writing_thread_is_waiting = false; AccessChangesNotifier & changes_notifier; + std::atomic readonly; + std::atomic backup_allowed; mutable std::mutex mutex; }; } diff --git a/src/Access/EnabledSettings.h b/src/Access/EnabledSettings.h index 94d2ec2952f..6532d32cffc 100644 --- a/src/Access/EnabledSettings.h +++ b/src/Access/EnabledSettings.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -42,7 +43,7 @@ private: void setInfo(const std::shared_ptr & info_); const Params params; - std::shared_ptr info; + std::shared_ptr info TSA_GUARDED_BY(mutex); mutable std::mutex mutex; }; } diff --git a/src/Access/ExternalAuthenticators.cpp b/src/Access/ExternalAuthenticators.cpp index 9431fc543d4..8da16b86f4e 100644 --- a/src/Access/ExternalAuthenticators.cpp +++ b/src/Access/ExternalAuthenticators.cpp @@ -231,18 +231,23 @@ void parseLDAPRoleSearchParams(LDAPClient::RoleSearchParams & params, const Poco params.prefix = config.getString(prefix + ".prefix"); } -void ExternalAuthenticators::reset() +void ExternalAuthenticators::resetImpl() { - std::scoped_lock lock(mutex); ldap_client_params_blueprint.clear(); ldap_caches.clear(); kerberos_params.reset(); } +void ExternalAuthenticators::reset() +{ + std::scoped_lock lock(mutex); + resetImpl(); +} + void ExternalAuthenticators::setConfiguration(const Poco::Util::AbstractConfiguration & config, Poco::Logger * log) { std::scoped_lock lock(mutex); - reset(); + resetImpl(); Poco::Util::AbstractConfiguration::Keys all_keys; config.keys("", all_keys); diff --git a/src/Access/ExternalAuthenticators.h b/src/Access/ExternalAuthenticators.h index 0b3af4d0216..bf928c18d5b 100644 --- a/src/Access/ExternalAuthenticators.h +++ b/src/Access/ExternalAuthenticators.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,6 @@ namespace Poco } } - namespace DB { @@ -51,10 +51,12 @@ private: using LDAPCaches = std::map; // server name -> cache using LDAPParams = std::map; // server name -> params - mutable std::recursive_mutex mutex; - LDAPParams ldap_client_params_blueprint; - mutable LDAPCaches ldap_caches; - std::optional kerberos_params; + mutable std::mutex mutex; + LDAPParams ldap_client_params_blueprint TSA_GUARDED_BY(mutex) ; + mutable LDAPCaches ldap_caches TSA_GUARDED_BY(mutex) ; + std::optional kerberos_params TSA_GUARDED_BY(mutex) ; + + void resetImpl() TSA_REQUIRES(mutex); }; void parseLDAPRoleSearchParams(LDAPClient::RoleSearchParams & params, const Poco::Util::AbstractConfiguration & config, const String & prefix); diff --git a/src/Access/GrantedRoles.cpp b/src/Access/GrantedRoles.cpp index 7d16e3e65bb..4df6809e0fe 100644 --- a/src/Access/GrantedRoles.cpp +++ b/src/Access/GrantedRoles.cpp @@ -2,6 +2,8 @@ #include #include #include +#include + namespace DB { @@ -166,4 +168,57 @@ void GrantedRoles::makeIntersection(const GrantedRoles & other) return other.roles_with_admin_option.find(id) == other.roles_with_admin_option.end(); }); } + +std::vector GrantedRoles::findDependencies() const +{ + std::vector res; + boost::range::copy(roles, std::back_inserter(res)); + return res; +} + +void GrantedRoles::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + std::vector new_ids; + + for (auto it = roles.begin(); it != roles.end();) + { + auto id = *it; + auto it_new_id = old_to_new_ids.find(id); + if (it_new_id != old_to_new_ids.end()) + { + auto new_id = it_new_id->second; + new_ids.push_back(new_id); + it = roles.erase(it); + } + else + { + ++it; + } + } + + if (!new_ids.empty()) + { + 
boost::range::copy(new_ids, std::inserter(roles, roles.end())); + new_ids.clear(); + + for (auto it = roles_with_admin_option.begin(); it != roles_with_admin_option.end();) + { + auto id = *it; + auto it_new_id = old_to_new_ids.find(id); + if (it_new_id != old_to_new_ids.end()) + { + auto new_id = it_new_id->second; + new_ids.push_back(new_id); + it = roles_with_admin_option.erase(it); + } + else + { + ++it; + } + } + + boost::range::copy(new_ids, std::inserter(roles_with_admin_option, roles_with_admin_option.end())); + } +} + } diff --git a/src/Access/GrantedRoles.h b/src/Access/GrantedRoles.h index 75ea56aba96..ac252822089 100644 --- a/src/Access/GrantedRoles.h +++ b/src/Access/GrantedRoles.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -24,6 +25,8 @@ public: void revokeAdminOption(const UUID & role_); void revokeAdminOption(const std::vector & roles_); + bool isEmpty() const { return roles.empty(); } + bool isGranted(const UUID & role_) const; bool isGrantedWithAdminOption(const UUID & role_) const; @@ -54,6 +57,9 @@ public: friend bool operator ==(const GrantedRoles & left, const GrantedRoles & right) { return (left.roles == right.roles) && (left.roles_with_admin_option == right.roles_with_admin_option); } friend bool operator !=(const GrantedRoles & left, const GrantedRoles & right) { return !(left == right); } + std::vector findDependencies() const; + void replaceDependencies(const std::unordered_map & old_to_new_ids); + private: boost::container::flat_set roles; boost::container::flat_set roles_with_admin_option; diff --git a/src/Access/IAccessEntity.h b/src/Access/IAccessEntity.h index 04faa879040..5614a172f6f 100644 --- a/src/Access/IAccessEntity.h +++ b/src/Access/IAccessEntity.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -45,6 +46,15 @@ struct IAccessEntity bool operator()(const std::shared_ptr & lhs, const std::shared_ptr & rhs) const { return operator()(*lhs, *rhs); } }; + /// Finds all dependencies. + virtual std::vector findDependencies() const { return {}; } + + /// Replaces dependencies according to a specified map. + virtual void replaceDependencies(const std::unordered_map & /* old_to_new_ids */) {} + + /// Whether this access entity should be written to a backup. 
+ virtual bool isBackupAllowed() const { return false; } + protected: String name; diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index 6b04355099d..476b1674ce1 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB @@ -19,6 +20,7 @@ namespace ErrorCodes extern const int ACCESS_ENTITY_ALREADY_EXISTS; extern const int ACCESS_ENTITY_NOT_FOUND; extern const int ACCESS_STORAGE_READONLY; + extern const int ACCESS_STORAGE_DOESNT_ALLOW_BACKUP; extern const int WRONG_PASSWORD; extern const int IP_ADDRESS_NOT_ALLOWED; extern const int LOGICAL_ERROR; @@ -83,13 +85,15 @@ std::vector IAccessStorage::getIDs(AccessEntityType type, const Strings & String IAccessStorage::readName(const UUID & id) const { - return *readNameImpl(id, /* throw_if_not_exists = */ true); + return readNameWithType(id).first; } std::optional IAccessStorage::readName(const UUID & id, bool throw_if_not_exists) const { - return readNameImpl(id, throw_if_not_exists); + if (auto name_and_type = readNameWithType(id, throw_if_not_exists)) + return name_and_type->first; + return std::nullopt; } @@ -99,7 +103,7 @@ Strings IAccessStorage::readNames(const std::vector & ids, bool throw_if_n res.reserve(ids.size()); for (const auto & id : ids) { - if (auto name = readNameImpl(id, throw_if_not_exists)) + if (auto name = readName(id, throw_if_not_exists)) res.emplace_back(std::move(name).value()); } return res; @@ -118,14 +122,42 @@ Strings IAccessStorage::tryReadNames(const std::vector & ids) const } -std::optional IAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +std::pair IAccessStorage::readNameWithType(const UUID & id) const +{ + return *readNameWithTypeImpl(id, /* throw_if_not_exists = */ true); +} + +std::optional> IAccessStorage::readNameWithType(const UUID & id, bool throw_if_not_exists) const +{ + return readNameWithTypeImpl(id, throw_if_not_exists); +} + +std::optional> IAccessStorage::tryReadNameWithType(const UUID & id) const +{ + return readNameWithTypeImpl(id, /* throw_if_not_exists = */ false); +} + + +std::optional> IAccessStorage::readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const { if (auto entity = read(id, throw_if_not_exists)) - return entity->getName(); + return std::make_pair(entity->getName(), entity->getType()); return std::nullopt; } +std::vector> IAccessStorage::readAllWithIDs(AccessEntityType type) const +{ + std::vector> entities; + for (const auto & id : findAll(type)) + { + if (auto entity = tryRead(id)) + entities.emplace_back(id, entity); + } + return entities; +} + + UUID IAccessStorage::insert(const AccessEntityPtr & entity) { return *insert(entity, /* replace_if_exists = */ false, /* throw_if_exists = */ true); @@ -488,6 +520,29 @@ bool IAccessStorage::isAddressAllowed(const User & user, const Poco::Net::IPAddr } +bool IAccessStorage::isRestoreAllowed() const +{ + return isBackupAllowed() && !isReadOnly(); +} + +std::vector> IAccessStorage::readAllForBackup(AccessEntityType type, const BackupSettings &) const +{ + if (!isBackupAllowed()) + throwBackupNotAllowed(); + + auto res = readAllWithIDs(type); + boost::range::remove_erase_if(res, [](const std::pair & x) { return !x.second->isBackupAllowed(); }); + return res; +} + +void IAccessStorage::insertFromBackup(const std::vector> &, const RestoreSettings &, std::shared_ptr) +{ + if (!isRestoreAllowed()) + throwRestoreNotAllowed(); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, 
"insertFromBackup() is not implemented in {}", getStorageType()); +} + + UUID IAccessStorage::generateRandomID() { static Poco::UUIDGenerator generator; @@ -577,6 +632,7 @@ void IAccessStorage::throwReadonlyCannotRemove(AccessEntityType type, const Stri ErrorCodes::ACCESS_STORAGE_READONLY); } + void IAccessStorage::throwAddressNotAllowed(const Poco::Net::IPAddress & address) { throw Exception("Connections from " + address.toString() + " are not allowed", ErrorCodes::IP_ADDRESS_NOT_ALLOWED); @@ -589,9 +645,20 @@ void IAccessStorage::throwAuthenticationTypeNotAllowed(AuthenticationType auth_t "Authentication type {} is not allowed, check the setting allow_{} in the server configuration", toString(auth_type), AuthenticationTypeInfo::get(auth_type).name); } + void IAccessStorage::throwInvalidCredentials() { throw Exception("Invalid credentials", ErrorCodes::WRONG_PASSWORD); } +void IAccessStorage::throwBackupNotAllowed() const +{ + throw Exception(ErrorCodes::ACCESS_STORAGE_DOESNT_ALLOW_BACKUP, "Backup of access entities is not allowed in {}", getStorageName()); +} + +void IAccessStorage::throwRestoreNotAllowed() const +{ + throw Exception(ErrorCodes::ACCESS_STORAGE_DOESNT_ALLOW_BACKUP, "Restore of access entities is not allowed in {}", getStorageName()); +} + } diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index 5de20cad286..7b43309204d 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -18,6 +18,9 @@ struct User; class Credentials; class ExternalAuthenticators; enum class AuthenticationType; +struct BackupSettings; +struct RestoreSettings; +class IRestoreCoordination; /// Contains entities, i.e. instances of classes derived from IAccessEntity. /// The implementations of this class MUST be thread-safe. @@ -101,6 +104,16 @@ public: std::optional tryReadName(const UUID & id) const; Strings tryReadNames(const std::vector & ids) const; + std::pair readNameWithType(const UUID & id) const; + std::optional> readNameWithType(const UUID & id, bool throw_if_not_exists) const; + std::optional> tryReadNameWithType(const UUID & id) const; + + /// Reads all entities and returns them with their IDs. + template + std::vector>> readAllWithIDs() const; + + std::vector> readAllWithIDs(AccessEntityType type) const; + /// Inserts an entity to the storage. Returns ID of a new entry in the storage. /// Throws an exception if the specified name already exists. UUID insert(const AccessEntityPtr & entity); @@ -143,11 +156,19 @@ public: UUID authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool allow_no_password, bool allow_plaintext_password) const; std::optional authenticate(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const; + /// Returns true if this storage can be stored to or restored from a backup. + virtual bool isBackupAllowed() const { return false; } + virtual bool isRestoreAllowed() const; + + /// Makes a backup of this access storage. 
+ virtual std::vector> readAllForBackup(AccessEntityType type, const BackupSettings & backup_settings) const; + virtual void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination); + protected: virtual std::optional findImpl(AccessEntityType type, const String & name) const = 0; virtual std::vector findAllImpl(AccessEntityType type) const = 0; virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const = 0; - virtual std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const; + virtual std::optional> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const; virtual std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); virtual bool removeImpl(const UUID & id, bool throw_if_not_exists); virtual bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); @@ -170,6 +191,8 @@ protected: [[noreturn]] static void throwAddressNotAllowed(const Poco::Net::IPAddress & address); [[noreturn]] static void throwInvalidCredentials(); [[noreturn]] static void throwAuthenticationTypeNotAllowed(AuthenticationType auth_type); + [[noreturn]] void throwBackupNotAllowed() const; + [[noreturn]] void throwRestoreNotAllowed() const; private: const String storage_name; @@ -218,4 +241,17 @@ std::shared_ptr IAccessStorage::tryRead(const String & name) { return read(name, false); } + +template +std::vector>> IAccessStorage::readAllWithIDs() const +{ + std::vector>> entities; + for (const auto & id : findAll()) + { + if (auto entity = tryRead(id)) + entities.emplace_back(id, entity); + } + return entities; +} + } diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index 480d0050e2a..50e231f2c51 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -28,7 +28,7 @@ namespace ErrorCodes LDAPAccessStorage::LDAPAccessStorage(const String & storage_name_, AccessControl & access_control_, const Poco::Util::AbstractConfiguration & config, const String & prefix) - : IAccessStorage(storage_name_), access_control(access_control_), memory_storage(storage_name_, access_control.getChangesNotifier()) + : IAccessStorage(storage_name_), access_control(access_control_), memory_storage(storage_name_, access_control.getChangesNotifier(), false) { setConfiguration(config, prefix); } @@ -36,6 +36,7 @@ LDAPAccessStorage::LDAPAccessStorage(const String & storage_name_, AccessControl String LDAPAccessStorage::getLDAPServerName() const { + std::scoped_lock lock(mutex); return ldap_server_name; } @@ -442,10 +443,10 @@ AccessEntityPtr LDAPAccessStorage::readImpl(const UUID & id, bool throw_if_not_e } -std::optional LDAPAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +std::optional> LDAPAccessStorage::readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const { std::scoped_lock lock(mutex); - return memory_storage.readName(id, throw_if_not_exists); + return memory_storage.readNameWithType(id, throw_if_not_exists); } @@ -504,4 +505,5 @@ std::optional LDAPAccessStorage::authenticateImpl( return id; } + } diff --git a/src/Access/LDAPAccessStorage.h b/src/Access/LDAPAccessStorage.h index df13eff179b..21413070f4c 100644 --- a/src/Access/LDAPAccessStorage.h +++ b/src/Access/LDAPAccessStorage.h @@ -47,7 +47,7 @@ private: // IAccessStorage implementations. 
virtual std::optional findImpl(AccessEntityType type, const String & name) const override; virtual std::vector findAllImpl(AccessEntityType type) const override; virtual AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; - virtual std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + virtual std::optional> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override; virtual std::optional authenticateImpl(const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, bool throw_if_user_not_exists, bool allow_no_password, bool allow_plaintext_password) const override; void setConfiguration(const Poco::Util::AbstractConfiguration & config, const String & prefix); diff --git a/src/Access/MemoryAccessStorage.cpp b/src/Access/MemoryAccessStorage.cpp index 9ed80f4a64d..ad877e263ad 100644 --- a/src/Access/MemoryAccessStorage.cpp +++ b/src/Access/MemoryAccessStorage.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -8,8 +9,8 @@ namespace DB { -MemoryAccessStorage::MemoryAccessStorage(const String & storage_name_, AccessChangesNotifier & changes_notifier_) - : IAccessStorage(storage_name_), changes_notifier(changes_notifier_) +MemoryAccessStorage::MemoryAccessStorage(const String & storage_name_, AccessChangesNotifier & changes_notifier_, bool allow_backup_) + : IAccessStorage(storage_name_), changes_notifier(changes_notifier_), backup_allowed(allow_backup_) { } @@ -65,14 +66,20 @@ AccessEntityPtr MemoryAccessStorage::readImpl(const UUID & id, bool throw_if_not std::optional MemoryAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { UUID id = generateRandomID(); - std::lock_guard lock{mutex}; - if (insertNoLock(id, new_entity, replace_if_exists, throw_if_exists)) + if (insertWithID(id, new_entity, replace_if_exists, throw_if_exists)) return id; return std::nullopt; } +bool MemoryAccessStorage::insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) +{ + std::lock_guard lock{mutex}; + return insertNoLock(id, new_entity, replace_if_exists, throw_if_exists); +} + + bool MemoryAccessStorage::insertNoLock(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const String & name = new_entity->getName(); @@ -264,4 +271,20 @@ void MemoryAccessStorage::setAll(const std::vector> & entities_from_backup, + const RestoreSettings & restore_settings, + std::shared_ptr) +{ + if (!isRestoreAllowed()) + throwRestoreNotAllowed(); + + bool replace_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kReplace); + bool throw_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kCreate); + + for (const auto & [id, entity] : entities_from_backup) + insertWithID(id, entity, replace_if_exists, throw_if_exists); +} + } diff --git a/src/Access/MemoryAccessStorage.h b/src/Access/MemoryAccessStorage.h index 690383c6941..aa4cd08252c 100644 --- a/src/Access/MemoryAccessStorage.h +++ b/src/Access/MemoryAccessStorage.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -17,7 +18,7 @@ class MemoryAccessStorage : public IAccessStorage public: static constexpr char STORAGE_TYPE[] = "memory"; - explicit MemoryAccessStorage(const String & storage_name_, AccessChangesNotifier & changes_notifier_); + explicit MemoryAccessStorage(const 
String & storage_name_, AccessChangesNotifier & changes_notifier_, bool allow_backup_); const char * getStorageType() const override { return STORAGE_TYPE; } @@ -27,6 +28,9 @@ public: bool exists(const UUID & id) const override; + bool isBackupAllowed() const override { return backup_allowed; } + void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) override; + private: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; @@ -35,9 +39,10 @@ private: bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; - bool insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); - bool removeNoLock(const UUID & id, bool throw_if_not_exists); - bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); + bool insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists); + bool insertNoLock(const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) TSA_REQUIRES(mutex); + bool removeNoLock(const UUID & id, bool throw_if_not_exists) TSA_REQUIRES(mutex); + bool updateNoLock(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) TSA_REQUIRES(mutex); struct Entry { @@ -46,8 +51,9 @@ private: }; mutable std::mutex mutex; - std::unordered_map entries_by_id; /// We want to search entries both by ID and by the pair of name and type. - std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)]; + std::unordered_map entries_by_id TSA_GUARDED_BY(mutex); /// We want to search entries both by ID and by the pair of name and type. 
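[Editor's note] The guarded members above, together with the TSA_REQUIRES markers on the *NoLock helpers, are the thread-safety-analysis pattern this patch applies to every storage: clang's -Wthread-safety then flags any code path that touches a guarded member without holding the mutex. Below is a self-contained sketch of the same pattern on a hypothetical Counter class (illustrative only, not part of the patch).

#include <mutex>
#include <base/defines.h>   /// TSA_GUARDED_BY / TSA_REQUIRES macro wrappers

class Counter
{
public:
    void increment()
    {
        std::lock_guard lock{mutex};
        incrementNoLock();   /// fine: the caller holds `mutex`, as TSA_REQUIRES demands
    }

    /// int read() const { return value; }
    /// ^ would be flagged by -Wthread-safety: `value` is read without holding `mutex`

private:
    void incrementNoLock() TSA_REQUIRES(mutex) { ++value; }

    mutable std::mutex mutex;
    int value TSA_GUARDED_BY(mutex) = 0;
};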
+ std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)] TSA_GUARDED_BY(mutex); AccessChangesNotifier & changes_notifier; + bool backup_allowed = false; }; } diff --git a/src/Access/MultipleAccessStorage.cpp b/src/Access/MultipleAccessStorage.cpp index ce4c9f3fd01..6f654f68e57 100644 --- a/src/Access/MultipleAccessStorage.cpp +++ b/src/Access/MultipleAccessStorage.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -42,14 +43,14 @@ MultipleAccessStorage::~MultipleAccessStorage() void MultipleAccessStorage::setStorages(const std::vector & storages) { - std::unique_lock lock{mutex}; + std::lock_guard lock{mutex}; nested_storages = std::make_shared(storages); ids_cache.reset(); } void MultipleAccessStorage::addStorage(const StoragePtr & new_storage) { - std::unique_lock lock{mutex}; + std::lock_guard lock{mutex}; if (boost::range::find(*nested_storages, new_storage) != nested_storages->end()) return; auto new_storages = std::make_shared(*nested_storages); @@ -59,7 +60,7 @@ void MultipleAccessStorage::addStorage(const StoragePtr & new_storage) void MultipleAccessStorage::removeStorage(const StoragePtr & storage_to_remove) { - std::unique_lock lock{mutex}; + std::lock_guard lock{mutex}; auto it = boost::range::find(*nested_storages, storage_to_remove); if (it == nested_storages->end()) return; @@ -189,10 +190,10 @@ AccessEntityPtr MultipleAccessStorage::readImpl(const UUID & id, bool throw_if_n } -std::optional MultipleAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +std::optional> MultipleAccessStorage::readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const { if (auto storage = findStorage(id)) - return storage->readName(id, throw_if_not_exists); + return storage->readNameWithType(id, throw_if_not_exists); if (throw_if_not_exists) throwNotFound(id); @@ -357,4 +358,65 @@ MultipleAccessStorage::authenticateImpl(const Credentials & credentials, const P return std::nullopt; } + +bool MultipleAccessStorage::isBackupAllowed() const +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + { + if (storage->isBackupAllowed()) + return true; + } + return false; +} + + +bool MultipleAccessStorage::isRestoreAllowed() const +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + { + if (storage->isRestoreAllowed()) + return true; + } + return false; +} + + +std::vector> MultipleAccessStorage::readAllForBackup(AccessEntityType type, const BackupSettings & backup_settings) const +{ + std::vector> res; + auto storages = getStoragesInternal(); + size_t count = 0; + + for (const auto & storage : *storages) + { + if (storage->isBackupAllowed()) + { + insertAtEnd(res, storage->readAllForBackup(type, backup_settings)); + ++count; + } + } + + if (!count) + throwBackupNotAllowed(); + + return res; +} + + +void MultipleAccessStorage::insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) +{ + auto storages = getStoragesInternal(); + for (const auto & storage : *storages) + { + if (storage->isRestoreAllowed()) + { + storage->insertFromBackup(entities_from_backup, restore_settings, restore_coordination); + return; + } + } + throwRestoreNotAllowed(); +} + } diff --git a/src/Access/MultipleAccessStorage.h b/src/Access/MultipleAccessStorage.h index 61a975050b6..2eacdafd3f3 100644 --- a/src/Access/MultipleAccessStorage.h +++ b/src/Access/MultipleAccessStorage.h @@ -1,6 +1,7 @@ 
#pragma once #include +#include #include #include @@ -42,11 +43,16 @@ public: bool exists(const UUID & id) const override; + bool isBackupAllowed() const override; + bool isRestoreAllowed() const override; + std::vector> readAllForBackup(AccessEntityType type, const BackupSettings & backup_settings) const override; + void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) override; + protected: std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; - std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override; std::optional insertImpl(const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists) override; bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; @@ -56,8 +62,8 @@ private: using Storages = std::vector; std::shared_ptr getStoragesInternal() const; - std::shared_ptr nested_storages; - mutable LRUCache ids_cache; + std::shared_ptr nested_storages TSA_GUARDED_BY(mutex); + mutable LRUCache ids_cache TSA_GUARDED_BY(mutex); mutable std::mutex mutex; }; diff --git a/src/Access/Quota.cpp b/src/Access/Quota.cpp index dc855599999..87b15e722c3 100644 --- a/src/Access/Quota.cpp +++ b/src/Access/Quota.cpp @@ -19,5 +19,14 @@ bool Quota::equal(const IAccessEntity & other) const return (all_limits == other_quota.all_limits) && (key_type == other_quota.key_type) && (to_roles == other_quota.to_roles); } +std::vector Quota::findDependencies() const +{ + return to_roles.findDependencies(); } +void Quota::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + to_roles.replaceDependencies(old_to_new_ids); +} + +} diff --git a/src/Access/Quota.h b/src/Access/Quota.h index 487af49d684..eb9edb14fb0 100644 --- a/src/Access/Quota.h +++ b/src/Access/Quota.h @@ -45,6 +45,10 @@ struct Quota : public IAccessEntity std::shared_ptr clone() const override { return cloneImpl(); } static constexpr const auto TYPE = AccessEntityType::QUOTA; AccessEntityType getType() const override { return TYPE; } + + std::vector findDependencies() const override; + void replaceDependencies(const std::unordered_map & old_to_new_ids) override; + bool isBackupAllowed() const override { return true; } }; using QuotaPtr = std::shared_ptr; diff --git a/src/Access/ReplicatedAccessStorage.cpp b/src/Access/ReplicatedAccessStorage.cpp index d3d1ee3fb6b..6a9d716c2f9 100644 --- a/src/Access/ReplicatedAccessStorage.cpp +++ b/src/Access/ReplicatedAccessStorage.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include @@ -33,12 +35,14 @@ ReplicatedAccessStorage::ReplicatedAccessStorage( const String & storage_name_, const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, - AccessChangesNotifier & changes_notifier_) + AccessChangesNotifier & changes_notifier_, + bool allow_backup_) : IAccessStorage(storage_name_) , zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) , watched_queue(std::make_shared>(std::numeric_limits::max())) , changes_notifier(changes_notifier_) + , backup_allowed(allow_backup_) { if (zookeeper_path.empty()) throw Exception("ZooKeeper path must 
be non-empty", ErrorCodes::BAD_ARGUMENTS); @@ -99,6 +103,15 @@ static void retryOnZooKeeperUserError(size_t attempts, Func && function) std::optional ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) { const UUID id = generateRandomID(); + if (insertWithID(id, new_entity, replace_if_exists, throw_if_exists)) + return id; + + return std::nullopt; +} + + +bool ReplicatedAccessStorage::insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists) +{ const AccessEntityTypeInfo type_info = AccessEntityTypeInfo::get(new_entity->getType()); const String & name = new_entity->getName(); LOG_DEBUG(getLogger(), "Inserting entity of type {} named {} with id {}", type_info.name, name, toString(id)); @@ -108,11 +121,11 @@ std::optional ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & retryOnZooKeeperUserError(10, [&]{ ok = insertZooKeeper(zookeeper, id, new_entity, replace_if_exists, throw_if_exists); }); if (!ok) - return std::nullopt; + return false; std::lock_guard lock{mutex}; refreshEntityNoLock(zookeeper, id); - return id; + return true; } @@ -600,4 +613,19 @@ AccessEntityPtr ReplicatedAccessStorage::readImpl(const UUID & id, bool throw_if return entry.entity; } +void ReplicatedAccessStorage::insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) +{ + if (!isRestoreAllowed()) + throwRestoreNotAllowed(); + + if (!restore_coordination->acquireReplicatedAccessStorage(zookeeper_path)) + return; + + bool replace_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kReplace); + bool throw_if_exists = (restore_settings.create_access == RestoreAccessCreationMode::kCreate); + + for (const auto & [id, entity] : entities_from_backup) + insertWithID(id, entity, replace_if_exists, throw_if_exists); +} + } diff --git a/src/Access/ReplicatedAccessStorage.h b/src/Access/ReplicatedAccessStorage.h index f9f579e2ba7..7cccdc1793f 100644 --- a/src/Access/ReplicatedAccessStorage.h +++ b/src/Access/ReplicatedAccessStorage.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -26,7 +27,7 @@ class ReplicatedAccessStorage : public IAccessStorage public: static constexpr char STORAGE_TYPE[] = "replicated"; - ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper, AccessChangesNotifier & changes_notifier_); + ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper, AccessChangesNotifier & changes_notifier_, bool allow_backup); virtual ~ReplicatedAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -36,6 +37,9 @@ public: bool exists(const UUID & id) const override; + bool isBackupAllowed() const override { return backup_allowed; } + void insertFromBackup(const std::vector> & entities_from_backup, const RestoreSettings & restore_settings, std::shared_ptr restore_coordination) override; + private: String zookeeper_path; zkutil::GetZooKeeper get_zookeeper; @@ -50,6 +54,7 @@ private: bool removeImpl(const UUID & id, bool throw_if_not_exists) override; bool updateImpl(const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists) override; + bool insertWithID(const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists, bool throw_if_exists); bool insertZooKeeper(const zkutil::ZooKeeperPtr & 
zookeeper, const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists, bool throw_if_exists); bool removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, bool throw_if_not_exists); bool updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func, bool throw_if_not_exists); @@ -66,10 +71,10 @@ private: bool refresh(); void refreshEntities(const zkutil::ZooKeeperPtr & zookeeper); void refreshEntity(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id); - void refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id); + void refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id) TSA_REQUIRES(mutex); - void setEntityNoLock(const UUID & id, const AccessEntityPtr & entity); - void removeEntityNoLock(const UUID & id); + void setEntityNoLock(const UUID & id, const AccessEntityPtr & entity) TSA_REQUIRES(mutex); + void removeEntityNoLock(const UUID & id) TSA_REQUIRES(mutex); struct Entry { @@ -82,8 +87,9 @@ private: AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; mutable std::mutex mutex; - std::unordered_map entries_by_id; - std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)]; + std::unordered_map entries_by_id TSA_GUARDED_BY(mutex); + std::unordered_map entries_by_name_and_type[static_cast(AccessEntityType::MAX)] TSA_GUARDED_BY(mutex); AccessChangesNotifier & changes_notifier; + bool backup_allowed = false; }; } diff --git a/src/Access/Role.cpp b/src/Access/Role.cpp index 3df562ad1f0..089488e7aba 100644 --- a/src/Access/Role.cpp +++ b/src/Access/Role.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB @@ -12,4 +13,18 @@ bool Role::equal(const IAccessEntity & other) const return (access == other_role.access) && (granted_roles == other_role.granted_roles) && (settings == other_role.settings); } +std::vector Role::findDependencies() const +{ + std::vector res; + insertAtEnd(res, granted_roles.findDependencies()); + insertAtEnd(res, settings.findDependencies()); + return res; +} + +void Role::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + granted_roles.replaceDependencies(old_to_new_ids); + settings.replaceDependencies(old_to_new_ids); +} + } diff --git a/src/Access/Role.h b/src/Access/Role.h index c7f1e107d24..b2f879dc357 100644 --- a/src/Access/Role.h +++ b/src/Access/Role.h @@ -19,6 +19,10 @@ struct Role : public IAccessEntity std::shared_ptr clone() const override { return cloneImpl(); } static constexpr const auto TYPE = AccessEntityType::ROLE; AccessEntityType getType() const override { return TYPE; } + + std::vector findDependencies() const override; + void replaceDependencies(const std::unordered_map & old_to_new_ids) override; + bool isBackupAllowed() const override { return settings.isBackupAllowed(); } }; using RolePtr = std::shared_ptr; diff --git a/src/Access/RolesOrUsersSet.cpp b/src/Access/RolesOrUsersSet.cpp index 3e0d56c77a0..dce89434aff 100644 --- a/src/Access/RolesOrUsersSet.cpp +++ b/src/Access/RolesOrUsersSet.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -286,4 +287,54 @@ bool operator ==(const RolesOrUsersSet & lhs, const RolesOrUsersSet & rhs) return (lhs.all == rhs.all) && (lhs.ids == rhs.ids) && (lhs.except_ids == rhs.except_ids); } +std::vector RolesOrUsersSet::findDependencies() const +{ + std::vector res; + boost::range::copy(ids, std::back_inserter(res)); + boost::range::copy(except_ids, std::back_inserter(res)); + return res; +} + +void 
RolesOrUsersSet::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + std::vector new_ids; + + for (auto it = ids.begin(); it != ids.end();) + { + auto id = *it; + auto it_new_id = old_to_new_ids.find(id); + if (it_new_id != old_to_new_ids.end()) + { + auto new_id = it_new_id->second; + new_ids.push_back(new_id); + it = ids.erase(it); + } + else + { + ++it; + } + } + + boost::range::copy(new_ids, std::inserter(ids, ids.end())); + new_ids.clear(); + + for (auto it = except_ids.begin(); it != except_ids.end();) + { + auto id = *it; + auto it_new_id = old_to_new_ids.find(id); + if (it_new_id != old_to_new_ids.end()) + { + auto new_id = it_new_id->second; + new_ids.push_back(new_id); + it = except_ids.erase(it); + } + else + { + ++it; + } + } + + boost::range::copy(new_ids, std::inserter(except_ids, except_ids.end())); +} + } diff --git a/src/Access/RolesOrUsersSet.h b/src/Access/RolesOrUsersSet.h index 43247057c86..29e499bc81b 100644 --- a/src/Access/RolesOrUsersSet.h +++ b/src/Access/RolesOrUsersSet.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -62,6 +63,9 @@ struct RolesOrUsersSet friend bool operator ==(const RolesOrUsersSet & lhs, const RolesOrUsersSet & rhs); friend bool operator !=(const RolesOrUsersSet & lhs, const RolesOrUsersSet & rhs) { return !(lhs == rhs); } + std::vector findDependencies() const; + void replaceDependencies(const std::unordered_map & old_to_new_ids); + bool all = false; boost::container::flat_set ids; boost::container::flat_set except_ids; diff --git a/src/Access/RowPolicy.cpp b/src/Access/RowPolicy.cpp index c09675e0e34..cc4ad9c6e24 100644 --- a/src/Access/RowPolicy.cpp +++ b/src/Access/RowPolicy.cpp @@ -58,4 +58,14 @@ bool RowPolicy::equal(const IAccessEntity & other) const && restrictive == other_policy.restrictive && (to_roles == other_policy.to_roles); } +std::vector RowPolicy::findDependencies() const +{ + return to_roles.findDependencies(); +} + +void RowPolicy::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + to_roles.replaceDependencies(old_to_new_ids); +} + } diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 9c143aff725..99e6f1992f5 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -46,6 +46,10 @@ struct RowPolicy : public IAccessEntity static constexpr const auto TYPE = AccessEntityType::ROW_POLICY; AccessEntityType getType() const override { return TYPE; } + std::vector findDependencies() const override; + void replaceDependencies(const std::unordered_map & old_to_new_ids) override; + bool isBackupAllowed() const override { return true; } + /// Which roles or users should use this row policy. 
RolesOrUsersSet to_roles; diff --git a/src/Access/SettingsProfile.cpp b/src/Access/SettingsProfile.cpp index 64fb91eb66b..48aa48040ab 100644 --- a/src/Access/SettingsProfile.cpp +++ b/src/Access/SettingsProfile.cpp @@ -1,4 +1,5 @@ #include +#include namespace DB @@ -12,4 +13,18 @@ bool SettingsProfile::equal(const IAccessEntity & other) const return (elements == other_profile.elements) && (to_roles == other_profile.to_roles); } +std::vector SettingsProfile::findDependencies() const +{ + std::vector res; + insertAtEnd(res, elements.findDependencies()); + insertAtEnd(res, to_roles.findDependencies()); + return res; +} + +void SettingsProfile::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + elements.replaceDependencies(old_to_new_ids); + to_roles.replaceDependencies(old_to_new_ids); +} + } diff --git a/src/Access/SettingsProfile.h b/src/Access/SettingsProfile.h index e554924b45e..f85630d324d 100644 --- a/src/Access/SettingsProfile.h +++ b/src/Access/SettingsProfile.h @@ -20,6 +20,10 @@ struct SettingsProfile : public IAccessEntity std::shared_ptr clone() const override { return cloneImpl(); } static constexpr const auto TYPE = AccessEntityType::SETTINGS_PROFILE; AccessEntityType getType() const override { return TYPE; } + + std::vector findDependencies() const override; + void replaceDependencies(const std::unordered_map & old_to_new_ids) override; + bool isBackupAllowed() const override { return elements.isBackupAllowed(); } }; using SettingsProfilePtr = std::shared_ptr; diff --git a/src/Access/SettingsProfileElement.cpp b/src/Access/SettingsProfileElement.cpp index ea6edef94a6..465f26f37d9 100644 --- a/src/Access/SettingsProfileElement.cpp +++ b/src/Access/SettingsProfileElement.cpp @@ -12,6 +12,13 @@ namespace DB { + +namespace +{ + constexpr const char ALLOW_BACKUP_SETTING_NAME[] = "allow_backup"; +} + + SettingsProfileElement::SettingsProfileElement(const ASTSettingsProfileElement & ast) { init(ast, nullptr); @@ -41,7 +48,10 @@ void SettingsProfileElement::init(const ASTSettingsProfileElement & ast, const A /// Optionally check if a setting with that name is allowed. 
if (access_control) - access_control->checkSettingNameIsAllowed(setting_name); + { + if (setting_name != ALLOW_BACKUP_SETTING_NAME) + access_control->checkSettingNameIsAllowed(setting_name); + } value = ast.value; min_value = ast.min_value; @@ -127,6 +137,36 @@ std::shared_ptr SettingsProfileElements::toASTWithNa } +std::vector SettingsProfileElements::findDependencies() const +{ + std::vector res; + for (const auto & element : *this) + { + if (element.parent_profile) + res.push_back(*element.parent_profile); + } + return res; +} + + +void SettingsProfileElements::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + for (auto & element : *this) + { + if (element.parent_profile) + { + auto id = *element.parent_profile; + auto it_new_id = old_to_new_ids.find(id); + if (it_new_id != old_to_new_ids.end()) + { + auto new_id = it_new_id->second; + element.parent_profile = new_id; + } + } + } +} + + void SettingsProfileElements::merge(const SettingsProfileElements & other) { insert(end(), other.begin(), other.end()); @@ -138,8 +178,11 @@ Settings SettingsProfileElements::toSettings() const Settings res; for (const auto & elem : *this) { - if (!elem.setting_name.empty() && !elem.value.isNull()) - res.set(elem.setting_name, elem.value); + if (!elem.setting_name.empty() && (elem.setting_name != ALLOW_BACKUP_SETTING_NAME)) + { + if (!elem.value.isNull()) + res.set(elem.setting_name, elem.value); + } } return res; } @@ -149,8 +192,11 @@ SettingsChanges SettingsProfileElements::toSettingsChanges() const SettingsChanges res; for (const auto & elem : *this) { - if (!elem.setting_name.empty() && !elem.value.isNull()) - res.push_back({elem.setting_name, elem.value}); + if (!elem.setting_name.empty() && (elem.setting_name != ALLOW_BACKUP_SETTING_NAME)) + { + if (!elem.value.isNull()) + res.push_back({elem.setting_name, elem.value}); + } } return res; } @@ -160,7 +206,7 @@ SettingsConstraints SettingsProfileElements::toSettingsConstraints(const AccessC SettingsConstraints res{access_control}; for (const auto & elem : *this) { - if (!elem.setting_name.empty()) + if (!elem.setting_name.empty() && (elem.setting_name != ALLOW_BACKUP_SETTING_NAME)) { if (!elem.min_value.isNull()) res.setMinValue(elem.setting_name, elem.min_value); @@ -189,5 +235,14 @@ std::vector SettingsProfileElements::toProfileIDs() const return res; } +bool SettingsProfileElements::isBackupAllowed() const +{ + for (const auto & setting : *this) + { + if (setting.setting_name == ALLOW_BACKUP_SETTING_NAME) + return static_cast(SettingFieldBool{setting.value}); + } + return true; +} } diff --git a/src/Access/SettingsProfileElement.h b/src/Access/SettingsProfileElement.h index c9f6936c832..818e7804a76 100644 --- a/src/Access/SettingsProfileElement.h +++ b/src/Access/SettingsProfileElement.h @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -57,12 +58,17 @@ public: std::shared_ptr toAST() const; std::shared_ptr toASTWithNames(const AccessControl & access_control) const; + std::vector findDependencies() const; + void replaceDependencies(const std::unordered_map & old_to_new_ids); + void merge(const SettingsProfileElements & other); Settings toSettings() const; SettingsChanges toSettingsChanges() const; SettingsConstraints toSettingsConstraints(const AccessControl & access_control) const; std::vector toProfileIDs() const; + + bool isBackupAllowed() const; }; } diff --git a/src/Access/User.cpp b/src/Access/User.cpp index 0476242c504..c5750cdcd6c 100644 --- a/src/Access/User.cpp +++ b/src/Access/User.cpp @@ -1,5 +1,6 @@ 
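[Editor's note] Taken together, the SettingsProfileElement changes above treat allow_backup as a pseudo-setting: it is accepted in profiles without being checked against the list of real server settings, it is filtered out when elements are converted to Settings/SettingsChanges/SettingsConstraints, and isBackupAllowed() (default true) reads it to decide whether the owning user, role or profile is written to a backup. A hedged, standalone sketch of that behaviour; exact Field construction may differ and the snippet is not part of the patch.

#include <cassert>
#include <Access/SettingsProfileElement.h>

int main()
{
    using namespace DB;

    SettingsProfileElement elem;
    elem.setting_name = "allow_backup";
    elem.value = Field(false);

    SettingsProfileElements elements;
    elements.push_back(elem);

    assert(!elements.isBackupAllowed());           /// the owning entity is excluded from backups
    assert(elements.toSettingsChanges().empty());  /// ... and the pseudo-setting never reaches real settings
}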
#include #include +#include namespace DB @@ -31,4 +32,22 @@ void User::setName(const String & name_) name = name_; } +std::vector User::findDependencies() const +{ + std::vector res; + insertAtEnd(res, default_roles.findDependencies()); + insertAtEnd(res, granted_roles.findDependencies()); + insertAtEnd(res, grantees.findDependencies()); + insertAtEnd(res, settings.findDependencies()); + return res; +} + +void User::replaceDependencies(const std::unordered_map & old_to_new_ids) +{ + default_roles.replaceDependencies(old_to_new_ids); + granted_roles.replaceDependencies(old_to_new_ids); + grantees.replaceDependencies(old_to_new_ids); + settings.replaceDependencies(old_to_new_ids); +} + } diff --git a/src/Access/User.h b/src/Access/User.h index 57a3b178acf..958d8bb486f 100644 --- a/src/Access/User.h +++ b/src/Access/User.h @@ -29,6 +29,10 @@ struct User : public IAccessEntity static constexpr const auto TYPE = AccessEntityType::USER; AccessEntityType getType() const override { return TYPE; } void setName(const String & name_) override; + + std::vector findDependencies() const override; + void replaceDependencies(const std::unordered_map & old_to_new_ids) override; + bool isBackupAllowed() const override { return settings.isBackupAllowed(); } }; using UserPtr = std::shared_ptr; diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index a6c4388fef8..4561d7f5766 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -523,8 +523,11 @@ namespace } } -UsersConfigAccessStorage::UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_) - : IAccessStorage(storage_name_), access_control(access_control_), memory_storage(storage_name_, access_control.getChangesNotifier()) +UsersConfigAccessStorage::UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_, bool allow_backup_) + : IAccessStorage(storage_name_) + , access_control(access_control_) + , memory_storage(storage_name_, access_control.getChangesNotifier(), false) + , backup_allowed(allow_backup_) { } @@ -655,9 +658,9 @@ AccessEntityPtr UsersConfigAccessStorage::readImpl(const UUID & id, bool throw_i } -std::optional UsersConfigAccessStorage::readNameImpl(const UUID & id, bool throw_if_not_exists) const +std::optional> UsersConfigAccessStorage::readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const { - return memory_storage.readName(id, throw_if_not_exists); + return memory_storage.readNameWithType(id, throw_if_not_exists); } } diff --git a/src/Access/UsersConfigAccessStorage.h b/src/Access/UsersConfigAccessStorage.h index 5c99bf30160..3fa8b4185a8 100644 --- a/src/Access/UsersConfigAccessStorage.h +++ b/src/Access/UsersConfigAccessStorage.h @@ -22,7 +22,7 @@ public: static constexpr char STORAGE_TYPE[] = "users.xml"; - UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_); + UsersConfigAccessStorage(const String & storage_name_, AccessControl & access_control_, bool allow_backup_); ~UsersConfigAccessStorage() override; const char * getStorageType() const override { return STORAGE_TYPE; } @@ -44,17 +44,20 @@ public: bool exists(const UUID & id) const override; + bool isBackupAllowed() const override { return backup_allowed; } + private: void parseFromConfig(const Poco::Util::AbstractConfiguration & config); std::optional findImpl(AccessEntityType type, const String & name) const override; std::vector findAllImpl(AccessEntityType type) const override; 
AccessEntityPtr readImpl(const UUID & id, bool throw_if_not_exists) const override; - std::optional readNameImpl(const UUID & id, bool throw_if_not_exists) const override; + std::optional> readNameWithTypeImpl(const UUID & id, bool throw_if_not_exists) const override; AccessControl & access_control; MemoryAccessStorage memory_storage; String path; std::unique_ptr config_reloader; + bool backup_allowed = false; mutable std::mutex load_mutex; }; } diff --git a/src/Access/tests/gtest_access_rights_ops.cpp b/src/Access/tests/gtest_access_rights_ops.cpp index 7513d72ba26..02aafb7415b 100644 --- a/src/Access/tests/gtest_access_rights_ops.cpp +++ b/src/Access/tests/gtest_access_rights_ops.cpp @@ -49,7 +49,7 @@ TEST(AccessRights, Union) "GRANT INSERT ON *.*, " "GRANT SHOW, SELECT, ALTER, CREATE DATABASE, CREATE TABLE, CREATE VIEW, " "CREATE DICTIONARY, DROP DATABASE, DROP TABLE, DROP VIEW, DROP DICTIONARY, " - "TRUNCATE, OPTIMIZE, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, " + "TRUNCATE, OPTIMIZE, BACKUP, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, " "SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, " "SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, " "SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, " diff --git a/src/Access/tests/gtest_replicated_access_storage.cpp b/src/Access/tests/gtest_replicated_access_storage.cpp index c780e598b64..2b80ccfb323 100644 --- a/src/Access/tests/gtest_replicated_access_storage.cpp +++ b/src/Access/tests/gtest_replicated_access_storage.cpp @@ -24,7 +24,7 @@ TEST(ReplicatedAccessStorage, ShutdownWithFailedStartup) try { - auto storage = ReplicatedAccessStorage("replicated", "/clickhouse/access", get_zk, changes_notifier); + auto storage = ReplicatedAccessStorage("replicated", "/clickhouse/access", get_zk, changes_notifier, false); } catch (Exception & e) { diff --git a/src/AggregateFunctions/AggregateFunctionFactory.h b/src/AggregateFunctions/AggregateFunctionFactory.h index e5263a54d79..a860831cb88 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.h +++ b/src/AggregateFunctions/AggregateFunctionFactory.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -105,4 +106,12 @@ private: }; +struct AggregateUtils +{ + static bool isAggregateFunction(const ASTFunction & node) + { + return AggregateFunctionFactory::instance().isAggregateFunctionName(node.name); + } +}; + } diff --git a/src/Backups/BackupCoordinationDistributed.cpp b/src/Backups/BackupCoordinationDistributed.cpp index 7033f65e57b..945239482fc 100644 --- a/src/Backups/BackupCoordinationDistributed.cpp +++ b/src/Backups/BackupCoordinationDistributed.cpp @@ -14,6 +14,7 @@ namespace DB namespace ErrorCodes { extern const int UNEXPECTED_NODE_IN_ZOOKEEPER; + extern const int LOGICAL_ERROR; } /// zookeeper_path/file_names/file_name->checksum_and_size @@ -27,32 +28,40 @@ namespace using FileInfo = IBackupCoordination::FileInfo; using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum; - String serializePartNamesAndChecksums(const std::vector & part_names_and_checksums) + struct ReplicatedPartNames { - WriteBufferFromOwnString out; - writeBinary(part_names_and_checksums.size(), out); - for (const auto & part_name_and_checksum : part_names_and_checksums) - { - writeBinary(part_name_and_checksum.part_name, out); - writeBinary(part_name_and_checksum.checksum, out); - } - return out.str(); - } - - std::vector deserializePartNamesAndChecksums(const String & str) - { - ReadBufferFromString in{str}; std::vector 
part_names_and_checksums; - size_t num; - readBinary(num, in); - part_names_and_checksums.resize(num); - for (size_t i = 0; i != num; ++i) + String table_name_for_logs; + + static String serialize(const std::vector & part_names_and_checksums_, const String & table_name_for_logs_) { - readBinary(part_names_and_checksums[i].part_name, in); - readBinary(part_names_and_checksums[i].checksum, in); + WriteBufferFromOwnString out; + writeBinary(part_names_and_checksums_.size(), out); + for (const auto & part_name_and_checksum : part_names_and_checksums_) + { + writeBinary(part_name_and_checksum.part_name, out); + writeBinary(part_name_and_checksum.checksum, out); + } + writeBinary(table_name_for_logs_, out); + return out.str(); } - return part_names_and_checksums; - } + + static ReplicatedPartNames deserialize(const String & str) + { + ReadBufferFromString in{str}; + ReplicatedPartNames res; + size_t num; + readBinary(num, in); + res.part_names_and_checksums.resize(num); + for (size_t i = 0; i != num; ++i) + { + readBinary(res.part_names_and_checksums[i].part_name, in); + readBinary(res.part_names_and_checksums[i].checksum, in); + } + readBinary(res.table_name_for_logs, in); + return res; + } + }; String serializeFileInfo(const FileInfo & info) { @@ -122,7 +131,7 @@ namespace BackupCoordinationDistributed::BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , preparing_barrier(zookeeper_path_ + "/preparing", get_zookeeper_, "BackupCoordination", "preparing") + , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("BackupCoordination")) { createRootNodes(); } @@ -134,8 +143,8 @@ void BackupCoordinationDistributed::createRootNodes() auto zookeeper = get_zookeeper(); zookeeper->createAncestors(zookeeper_path); zookeeper->createIfNotExists(zookeeper_path, ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_paths", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_parts", ""); + zookeeper->createIfNotExists(zookeeper_path + "/repl_part_names", ""); + zookeeper->createIfNotExists(zookeeper_path + "/repl_data_paths", ""); zookeeper->createIfNotExists(zookeeper_path + "/file_names", ""); zookeeper->createIfNotExists(zookeeper_path + "/file_infos", ""); zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", ""); @@ -147,101 +156,89 @@ void BackupCoordinationDistributed::removeAllNodes() zookeeper->removeRecursive(zookeeper_path); } -void BackupCoordinationDistributed::addReplicatedTableDataPath(const String & table_zk_path, const String & table_data_path) + +void BackupCoordinationDistributed::syncStage(const String & current_host, int new_stage, const Strings & wait_hosts, std::chrono::seconds timeout) { - auto zookeeper = get_zookeeper(); - - String path = zookeeper_path + "/repl_tables_paths/" + escapeForFileName(table_zk_path); - zookeeper->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(table_data_path); - zookeeper->createIfNotExists(path, ""); + stage_sync.syncStage(current_host, new_stage, wait_hosts, timeout); } -void BackupCoordinationDistributed::addReplicatedTablePartNames( - const String & host_id, - const DatabaseAndTableName & table_name, +void BackupCoordinationDistributed::syncStageError(const String & current_host, const String & error_message) +{ + stage_sync.syncStageError(current_host, error_message); +} + + +void BackupCoordinationDistributed::addReplicatedPartNames( const String & 
table_zk_path, + const String & table_name_for_logs, + const String & replica_name, const std::vector & part_names_and_checksums) { - auto zookeeper = get_zookeeper(); - - String path = zookeeper_path + "/repl_tables_parts/" + escapeForFileName(table_zk_path); - zookeeper->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(host_id); - zookeeper->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(table_name.first); - zookeeper->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(table_name.second); - zookeeper->create(path, serializePartNamesAndChecksums(part_names_and_checksums), zkutil::CreateMode::Persistent); -} - -void BackupCoordinationDistributed::finishPreparing(const String & host_id, const String & error_message) -{ - preparing_barrier.finish(host_id, error_message); -} - -void BackupCoordinationDistributed::waitForAllHostsPrepared(const Strings & host_ids, std::chrono::seconds timeout) const -{ - preparing_barrier.waitForAllHostsToFinish(host_ids, timeout); - prepareReplicatedTablesInfo(); -} - -void BackupCoordinationDistributed::prepareReplicatedTablesInfo() const -{ - replicated_tables.emplace(); - auto zookeeper = get_zookeeper(); - - String path = zookeeper_path + "/repl_tables_paths"; - for (const String & escaped_table_zk_path : zookeeper->getChildren(path)) { - String table_zk_path = unescapeForFileName(escaped_table_zk_path); - for (const String & escaped_data_path : zookeeper->getChildren(path + "/" + escaped_table_zk_path)) - { - String data_path = unescapeForFileName(escaped_data_path); - replicated_tables->addDataPath(table_zk_path, data_path); - } + std::lock_guard lock{mutex}; + if (replicated_part_names) + throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after getPartNames()"); } - path = zookeeper_path + "/repl_tables_parts"; + auto zookeeper = get_zookeeper(); + String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_zk_path); + zookeeper->createIfNotExists(path, ""); + path += "/" + escapeForFileName(replica_name); + zookeeper->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent); +} + +Strings BackupCoordinationDistributed::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const +{ + std::lock_guard lock{mutex}; + prepareReplicatedPartNames(); + return replicated_part_names->getPartNames(table_zk_path, replica_name); +} + + +void BackupCoordinationDistributed::addReplicatedDataPath( + const String & table_zk_path, const String & data_path) +{ + auto zookeeper = get_zookeeper(); + String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_zk_path); + zookeeper->createIfNotExists(path, ""); + path += "/"; + zookeeper->create(path, data_path, zkutil::CreateMode::PersistentSequential); +} + +Strings BackupCoordinationDistributed::getReplicatedDataPaths(const String & table_zk_path) const +{ + auto zookeeper = get_zookeeper(); + String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_zk_path); + Strings children = zookeeper->getChildren(path); + Strings data_paths; + data_paths.reserve(children.size()); + for (const String & child : children) + data_paths.push_back(zookeeper->get(path + "/" + child)); + return data_paths; +} + + +void BackupCoordinationDistributed::prepareReplicatedPartNames() const +{ + if (replicated_part_names) + return; + + replicated_part_names.emplace(); + auto zookeeper = get_zookeeper(); + String path = 
zookeeper_path + "/repl_part_names"; for (const String & escaped_table_zk_path : zookeeper->getChildren(path)) { String table_zk_path = unescapeForFileName(escaped_table_zk_path); String path2 = path + "/" + escaped_table_zk_path; - for (const String & escaped_host_id : zookeeper->getChildren(path2)) + for (const String & escaped_replica_name : zookeeper->getChildren(path2)) { - String host_id = unescapeForFileName(escaped_host_id); - String path3 = path2 + "/" + escaped_host_id; - for (const String & escaped_database_name : zookeeper->getChildren(path3)) - { - String database_name = unescapeForFileName(escaped_database_name); - String path4 = path3 + "/" + escaped_database_name; - for (const String & escaped_table_name : zookeeper->getChildren(path4)) - { - String table_name = unescapeForFileName(escaped_table_name); - String path5 = path4 + "/" + escaped_table_name; - auto part_names_and_checksums = deserializePartNamesAndChecksums(zookeeper->get(path5)); - replicated_tables->addPartNames(host_id, {database_name, table_name}, table_zk_path, part_names_and_checksums); - } - } + String replica_name = unescapeForFileName(escaped_replica_name); + auto part_names = ReplicatedPartNames::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name)); + replicated_part_names->addPartNames(table_zk_path, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums); } } - - replicated_tables->preparePartNamesByLocations(); } -Strings BackupCoordinationDistributed::getReplicatedTableDataPaths(const String & table_zk_path) const -{ - return replicated_tables->getDataPaths(table_zk_path); -} - -Strings BackupCoordinationDistributed::getReplicatedTablePartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const -{ - return replicated_tables->getPartNames(host_id, table_name, table_zk_path); -} void BackupCoordinationDistributed::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { @@ -305,12 +302,19 @@ std::vector BackupCoordinationDistributed::getAllFileInfos() const return file_infos; } -Strings BackupCoordinationDistributed::listFiles(const String & prefix, const String & terminator) const +Strings BackupCoordinationDistributed::listFiles(const String & directory, bool recursive) const { auto zookeeper = get_zookeeper(); Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + String prefix = directory; + if (!prefix.empty() && !prefix.ends_with('/')) + prefix += '/'; + String terminator = recursive ? 
"" : "/"; + Strings elements; + std::unordered_set unique_elements; + for (const String & escaped_name : escaped_names) { String name = unescapeForFileName(escaped_name); @@ -321,15 +325,35 @@ Strings BackupCoordinationDistributed::listFiles(const String & prefix, const St if (!terminator.empty()) end_pos = name.find(terminator, start_pos); std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos); - if (!elements.empty() && (elements.back() == new_element)) + if (unique_elements.contains(new_element)) continue; elements.push_back(String{new_element}); + unique_elements.emplace(new_element); } ::sort(elements.begin(), elements.end()); return elements; } +bool BackupCoordinationDistributed::hasFiles(const String & directory) const +{ + auto zookeeper = get_zookeeper(); + Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + + String prefix = directory; + if (!prefix.empty() && !prefix.ends_with('/')) + prefix += '/'; + + for (const String & escaped_name : escaped_names) + { + String name = unescapeForFileName(escaped_name); + if (name.starts_with(prefix)) + return true; + } + + return false; +} + std::optional BackupCoordinationDistributed::getFileInfo(const String & file_name) const { auto zookeeper = get_zookeeper(); diff --git a/src/Backups/BackupCoordinationDistributed.h b/src/Backups/BackupCoordinationDistributed.h index 62136487dbc..2872e1f3ae4 100644 --- a/src/Backups/BackupCoordinationDistributed.h +++ b/src/Backups/BackupCoordinationDistributed.h @@ -2,9 +2,6 @@ #include #include -#include -#include -#include namespace DB @@ -17,24 +14,26 @@ public: BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); ~BackupCoordinationDistributed() override; - void addReplicatedTableDataPath(const String & table_zk_path, const String & table_data_path) override; - void addReplicatedTablePartNames( - const String & host_id, - const DatabaseAndTableName & table_name, + void syncStage(const String & current_host, int new_stage, const Strings & wait_hosts, std::chrono::seconds timeout) override; + void syncStageError(const String & current_host, const String & error_message) override; + + void addReplicatedPartNames( const String & table_zk_path, + const String & table_name_for_logs, + const String & replica_name, const std::vector & part_names_and_checksums) override; - void finishPreparing(const String & host_id, const String & error_message) override; - void waitForAllHostsPrepared(const Strings & host_ids, std::chrono::seconds timeout) const override; + Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const override; - Strings getReplicatedTableDataPaths(const String & table_zk_path) const override; - Strings getReplicatedTablePartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const override; + void addReplicatedDataPath(const String & table_zk_path, const String & data_path) override; + Strings getReplicatedDataPaths(const String & table_zk_path) const override; void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override; void updateFileInfo(const FileInfo & file_info) override; std::vector getAllFileInfos() const override; - Strings listFiles(const String & prefix, const String & terminator) const override; + Strings listFiles(const String & directory, bool recursive) const override; + bool hasFiles(const String & directory) const override; std::optional 
getFileInfo(const String & file_name) const override; std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) const override; std::optional getFileSizeAndChecksum(const String & file_name) const override; @@ -47,12 +46,15 @@ public: private: void createRootNodes(); void removeAllNodes(); - void prepareReplicatedTablesInfo() const; + void prepareReplicatedPartNames() const; const String zookeeper_path; const zkutil::GetZooKeeper get_zookeeper; - BackupCoordinationDistributedBarrier preparing_barrier; - mutable std::optional replicated_tables; + + BackupCoordinationStageSync stage_sync; + + mutable std::mutex mutex; + mutable std::optional replicated_part_names; }; } diff --git a/src/Backups/BackupCoordinationHelpers.cpp b/src/Backups/BackupCoordinationHelpers.cpp index b0327a9b667..9528f888770 100644 --- a/src/Backups/BackupCoordinationHelpers.cpp +++ b/src/Backups/BackupCoordinationHelpers.cpp @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include @@ -16,37 +18,26 @@ namespace ErrorCodes } -struct BackupCoordinationReplicatedTablesInfo::HostAndTableName +namespace { - String host_id; - DatabaseAndTableName table_name; - - struct Less + struct LessReplicaName { - bool operator()(const HostAndTableName & lhs, const HostAndTableName & rhs) const - { - return (lhs.host_id < rhs.host_id) || ((lhs.host_id == rhs.host_id) && (lhs.table_name < rhs.table_name)); - } - - bool operator()(const std::shared_ptr & lhs, const std::shared_ptr & rhs) const - { - return operator()(*lhs, *rhs); - } + bool operator()(const std::shared_ptr & left, const std::shared_ptr & right) { return *left < *right; } }; -}; +} -class BackupCoordinationReplicatedTablesInfo::CoveredPartsFinder +class BackupCoordinationReplicatedPartNames::CoveredPartsFinder { public: - CoveredPartsFinder() = default; + explicit CoveredPartsFinder(const String & table_name_for_logs_) : table_name_for_logs(table_name_for_logs_) {} - void addPart(const String & new_part_name, const std::shared_ptr & host_and_table_name) + void addPartName(const String & new_part_name, const std::shared_ptr & replica_name) { - addPart(MergeTreePartInfo::fromPartName(new_part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING), host_and_table_name); + addPartName(MergeTreePartInfo::fromPartName(new_part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING), replica_name); } - void addPart(MergeTreePartInfo && new_part_info, const std::shared_ptr & host_and_table_name) + void addPartName(MergeTreePartInfo && new_part_info, const std::shared_ptr & replica_name) { auto new_min_block = new_part_info.min_block; auto new_max_block = new_part_info.max_block; @@ -57,7 +48,7 @@ public: if (first_it == parts.end()) { /// All max_blocks < part_info.min_block, so we can safely add the `part_info` to the list of parts. - parts.emplace(new_max_block, PartInfo{std::move(new_part_info), host_and_table_name}); + parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name}); return; } @@ -68,7 +59,7 @@ public: { /// (prev_info.max_block < part_info.min_block) AND (part_info.max_block < current_info.min_block), /// so we can safely add the `part_info` to the list of parts. 
- parts.emplace(new_max_block, PartInfo{std::move(new_part_info), host_and_table_name}); + parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name}); return; } @@ -92,22 +83,19 @@ public: { throw Exception( ErrorCodes::CANNOT_BACKUP_TABLE, - "Intersected parts detected: {} in the table {}.{}{} and {} in the table {}.{}{}. It should be investigated", + "Intersected parts detected in the table {}: {} on replica {} and {} on replica {}. It should be investigated", + table_name_for_logs, part.info.getPartName(), - part.host_and_table_name->table_name.first, - part.host_and_table_name->table_name.second, - part.host_and_table_name->host_id.empty() ? "" : (" on the host " + part.host_and_table_name->host_id), + *part.replica_name, new_part_info.getPartName(), - host_and_table_name->table_name.first, - host_and_table_name->table_name.second, - host_and_table_name->host_id.empty() ? "" : (" on the host " + host_and_table_name->host_id)); + *replica_name); } ++last_it; } /// `part_info` will replace multiple parts [first_it..last_it) parts.erase(first_it, last_it); - parts.emplace(new_max_block, PartInfo{std::move(new_part_info), host_and_table_name}); + parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name}); } bool isCoveredByAnotherPart(const String & part_name) const @@ -156,185 +144,175 @@ private: struct PartInfo { MergeTreePartInfo info; - std::shared_ptr host_and_table_name; + std::shared_ptr replica_name; }; using Parts = std::map; std::unordered_map partitions; + const String table_name_for_logs; }; -void BackupCoordinationReplicatedTablesInfo::addDataPath(const String & table_zk_path, const String & table_data_path) -{ - tables[table_zk_path].data_paths.push_back(table_data_path); -} +BackupCoordinationReplicatedPartNames::BackupCoordinationReplicatedPartNames() = default; +BackupCoordinationReplicatedPartNames::~BackupCoordinationReplicatedPartNames() = default; -Strings BackupCoordinationReplicatedTablesInfo::getDataPaths(const String & table_zk_path) const -{ - auto it = tables.find(table_zk_path); - if (it == tables.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "getDataPaths() called for unknown table_zk_path: {}", table_zk_path); - const auto & replicated_table = it->second; - return replicated_table.data_paths; -} - -void BackupCoordinationReplicatedTablesInfo::addPartNames( - const String & host_id, - const DatabaseAndTableName & table_name, +void BackupCoordinationReplicatedPartNames::addPartNames( const String & table_zk_path, + const String & table_name_for_logs, + const String & replica_name, const std::vector & part_names_and_checksums) { - auto & table = tables[table_zk_path]; - auto & part_locations_by_names = table.part_locations_by_names; - auto host_and_table_name = std::make_shared(); - host_and_table_name->host_id = host_id; - host_and_table_name->table_name = table_name; + if (part_names_prepared) + throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after getPartNames()"); + + auto & table_info = table_infos[table_zk_path]; + if (!table_info.covered_parts_finder) + table_info.covered_parts_finder = std::make_unique(table_name_for_logs); + + auto replica_name_ptr = std::make_shared(replica_name); for (const auto & part_name_and_checksum : part_names_and_checksums) { const auto & part_name = part_name_and_checksum.part_name; const auto & checksum = part_name_and_checksum.checksum; - auto it = part_locations_by_names.find(part_name); - if (it == part_locations_by_names.end()) + auto it = 
table_info.parts_replicas.find(part_name); + if (it == table_info.parts_replicas.end()) { - it = part_locations_by_names.emplace(part_name, PartLocations{}).first; + it = table_info.parts_replicas.emplace(part_name, PartReplicas{}).first; it->second.checksum = checksum; } else { - const auto & existing = it->second; - if (existing.checksum != checksum) + const auto & other = it->second; + if (other.checksum != checksum) { - const auto & existing_host_and_table_name = **existing.host_and_table_names.begin(); + const String & other_replica_name = **other.replica_names.begin(); throw Exception( ErrorCodes::CANNOT_BACKUP_TABLE, - "Table {}.{} has part {} which is different from the part of table {}.{}. Must be the same", - table_name.first, - table_name.second, + "Table {} on replica {} has part {} which is different from the part on replica {}. Must be the same", + table_name_for_logs, + replica_name, part_name, - existing_host_and_table_name.table_name.first, - existing_host_and_table_name.table_name.second); + other_replica_name); } } - auto & host_and_table_names = it->second.host_and_table_names; + auto & replica_names = it->second.replica_names; - /// `host_and_table_names` should be ordered because we need this vector to be in the same order on every replica. - host_and_table_names.insert( - std::upper_bound(host_and_table_names.begin(), host_and_table_names.end(), host_and_table_name, HostAndTableName::Less{}), - host_and_table_name); + /// `replica_names` should be ordered because we need this vector to be in the same order on every replica. + replica_names.insert( + std::upper_bound(replica_names.begin(), replica_names.end(), replica_name_ptr, LessReplicaName{}), replica_name_ptr); + + table_info.covered_parts_finder->addPartName(part_name, replica_name_ptr); } } -Strings BackupCoordinationReplicatedTablesInfo::getPartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const +Strings BackupCoordinationReplicatedPartNames::getPartNames(const String & table_zk_path, const String & replica_name) const { - if (!part_names_by_locations_prepared) - throw Exception(ErrorCodes::LOGICAL_ERROR, "preparePartNamesByLocations() was not called before getPartNames()"); - - auto it = tables.find(table_zk_path); - if (it == tables.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "getPartNames() called for unknown table_zk_path: {}", table_zk_path); - const auto & table = it->second; - auto it2 = table.part_names_by_locations.find(host_id); - if (it2 == table.part_names_by_locations.end()) + preparePartNames(); + auto it = table_infos.find(table_zk_path); + if (it == table_infos.end()) return {}; - const auto & part_names_by_host_id = it2->second; - auto it3 = part_names_by_host_id.find(table_name); - if (it3 == part_names_by_host_id.end()) + const auto & replicas_parts = it->second.replicas_parts; + auto it2 = replicas_parts.find(replica_name); + if (it2 == replicas_parts.end()) return {}; - return it3->second; + return it2->second; } -void BackupCoordinationReplicatedTablesInfo::preparePartNamesByLocations() +void BackupCoordinationReplicatedPartNames::preparePartNames() const { - if (part_names_by_locations_prepared) + if (part_names_prepared) return; - part_names_by_locations_prepared = true; size_t counter = 0; - for (auto & table : tables | boost::adaptors::map_values) + for (const auto & table_info : table_infos | boost::adaptors::map_values) { - CoveredPartsFinder covered_parts_finder; - for (const auto & [part_name, part_locations] : 
table.part_locations_by_names) - covered_parts_finder.addPart(part_name, *part_locations.host_and_table_names.begin()); - - table.part_names_by_locations.clear(); - for (const auto & [part_name, part_locations] : table.part_locations_by_names) + for (const auto & [part_name, part_replicas] : table_info.parts_replicas) { - if (covered_parts_finder.isCoveredByAnotherPart(part_name)) + if (table_info.covered_parts_finder->isCoveredByAnotherPart(part_name)) continue; - size_t chosen_index = (counter++) % part_locations.host_and_table_names.size(); - const auto & chosen_host_id = part_locations.host_and_table_names[chosen_index]->host_id; - const auto & chosen_table_name = part_locations.host_and_table_names[chosen_index]->table_name; - table.part_names_by_locations[chosen_host_id][chosen_table_name].push_back(part_name); + size_t chosen_index = (counter++) % part_replicas.replica_names.size(); + const auto & chosen_replica_name = *part_replicas.replica_names[chosen_index]; + table_info.replicas_parts[chosen_replica_name].push_back(part_name); } } + + part_names_prepared = true; } -BackupCoordinationDistributedBarrier::BackupCoordinationDistributedBarrier( - const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, const String & logger_name_, const String & operation_name_) +/// Helps to wait until all hosts come to a specified stage. +BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , log(&Poco::Logger::get(logger_name_)) - , operation_name(operation_name_) + , log(log_) { createRootNodes(); } -void BackupCoordinationDistributedBarrier::createRootNodes() +void BackupCoordinationStageSync::createRootNodes() { auto zookeeper = get_zookeeper(); zookeeper->createAncestors(zookeeper_path); zookeeper->createIfNotExists(zookeeper_path, ""); } -void BackupCoordinationDistributedBarrier::finish(const String & host_id, const String & error_message) +void BackupCoordinationStageSync::syncStage(const String & current_host, int new_stage, const Strings & wait_hosts, std::chrono::seconds timeout) { - if (error_message.empty()) - LOG_TRACE(log, "Host {} has finished {}", host_id, operation_name); - else - LOG_ERROR(log, "Host {} has failed {} with message: {}", host_id, operation_name, error_message); - + /// Put new stage to ZooKeeper. auto zookeeper = get_zookeeper(); - if (error_message.empty()) - zookeeper->create(zookeeper_path + "/" + host_id + ":ready", "", zkutil::CreateMode::Persistent); - else - zookeeper->create(zookeeper_path + "/" + host_id + ":error", error_message, zkutil::CreateMode::Persistent); -} + zookeeper->createIfNotExists(zookeeper_path + "/" + current_host + "|" + std::to_string(new_stage), ""); -void BackupCoordinationDistributedBarrier::waitForAllHostsToFinish(const Strings & host_ids, const std::chrono::seconds timeout) const -{ - auto zookeeper = get_zookeeper(); + if (wait_hosts.empty() || ((wait_hosts.size() == 1) && (wait_hosts.front() == current_host))) + return; - bool all_hosts_ready = false; - String not_ready_host_id; - String error_host_id; - String error_message; + /// Wait for other hosts. - /// Returns true of everything's ready, or false if we need to wait more. - auto process_nodes = [&](const Strings & nodes) + /// Current stages of all hosts. 
+ std::optional host_with_error; + std::optional error_message; + + std::map> unready_hosts; + for (const String & host : wait_hosts) + unready_hosts.emplace(host, std::optional{}); + + /// Process ZooKeeper's nodes and set `all_hosts_ready` or `unready_host` or `error_message`. + auto process_zk_nodes = [&](const Strings & zk_nodes) { - std::unordered_set set{nodes.begin(), nodes.end()}; - for (const String & host_id : host_ids) + for (const String & zk_node : zk_nodes) { - if (set.contains(host_id + ":error")) + if (zk_node == "error") { - error_host_id = host_id; - error_message = zookeeper->get(zookeeper_path + "/" + host_id + ":error"); + String str = zookeeper->get(zookeeper_path + "/" + zk_node); + size_t separator_pos = str.find('|'); + if (separator_pos == String::npos) + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected value of zk node {}: {}", zookeeper_path + "/" + zk_node, str); + host_with_error = str.substr(0, separator_pos); + error_message = str.substr(separator_pos + 1); return; } - if (!set.contains(host_id + ":ready")) + else if (!zk_node.starts_with("remove_watch-")) { - LOG_TRACE(log, "Waiting for host {} {}", host_id, operation_name); - not_ready_host_id = host_id; - return; + size_t separator_pos = zk_node.find('|'); + if (separator_pos == String::npos) + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zookeeper_path + "/" + zk_node); + String host = zk_node.substr(0, separator_pos); + int found_stage = parseFromString(zk_node.substr(separator_pos + 1)); + auto it = unready_hosts.find(host); + if (it != unready_hosts.end()) + { + auto & stage = it->second; + if (!stage || (stage < found_stage)) + stage = found_stage; + if (stage >= new_stage) + unready_hosts.erase(it); + } } } - - all_hosts_ready = true; }; + /// Wait until all hosts are ready or an error happens or time is out. 
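// A minimal sketch (not from this patch) of the node-name convention consumed by process_zk_nodes() above:
// syncStage() publishes one child per host named "<host>|<stage>", and an "error" child holds "<host>|<message>".
// The lambda below only restates that parsing; its name and the int template argument of parseFromString are
// assumptions made for illustration.
auto parse_host_and_stage = [](const String & zk_node) -> std::pair<String, int>
{
    size_t separator_pos = zk_node.find('|');
    if (separator_pos == String::npos)
        throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zk_node);
    return {zk_node.substr(0, separator_pos), parseFromString<int>(zk_node.substr(separator_pos + 1))};
};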
std::atomic watch_set = false; std::condition_variable watch_triggered_event; @@ -347,33 +325,25 @@ void BackupCoordinationDistributedBarrier::waitForAllHostsToFinish(const Strings auto watch_triggered = [&] { return !watch_set; }; bool use_timeout = (timeout.count() >= 0); - std::chrono::steady_clock::duration time_left = timeout; + std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); + std::chrono::steady_clock::duration elapsed; std::mutex dummy_mutex; - while (true) + while (!unready_hosts.empty() && !error_message) { - if (use_timeout && (time_left.count() <= 0)) - { - Strings children = zookeeper->getChildren(zookeeper_path); - process_nodes(children); - break; - } - watch_set = true; - Strings children = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback); - process_nodes(children); - - if (!error_message.empty() || all_hosts_ready) - break; + Strings nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback); + process_zk_nodes(nodes); + if (!unready_hosts.empty() && !error_message) { + LOG_TRACE(log, "Waiting for host {}", unready_hosts.begin()->first); std::unique_lock dummy_lock{dummy_mutex}; if (use_timeout) { - std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - if (!watch_triggered_event.wait_for(dummy_lock, time_left, watch_triggered)) + elapsed = std::chrono::steady_clock::now() - start_time; + if ((elapsed > timeout) || !watch_triggered_event.wait_for(dummy_lock, timeout - elapsed, watch_triggered)) break; - time_left -= (std::chrono::steady_clock::now() - start_time); } else watch_triggered_event.wait(dummy_lock, watch_triggered); @@ -385,32 +355,26 @@ void BackupCoordinationDistributedBarrier::waitForAllHostsToFinish(const Strings /// Remove watch by triggering it. 
zookeeper->create(zookeeper_path + "/remove_watch-", "", zkutil::CreateMode::EphemeralSequential); std::unique_lock dummy_lock{dummy_mutex}; - watch_triggered_event.wait_for(dummy_lock, timeout, watch_triggered); + watch_triggered_event.wait(dummy_lock, watch_triggered); } - if (!error_message.empty()) + if (error_message) + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Error occurred on host {}: {}", *host_with_error, *error_message); + + if (!unready_hosts.empty()) { throw Exception( ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Host {} failed {} with message: {}", - error_host_id, - operation_name, - error_message); + "Waited for host {} too long ({})", + unready_hosts.begin()->first, + to_string(elapsed)); } +} - if (all_hosts_ready) - { - LOG_TRACE(log, "All hosts have finished {}", operation_name); - return; - } - - - throw Exception( - ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Host {} has failed {}: Time ({}) is out", - not_ready_host_id, - operation_name, - to_string(timeout)); +void BackupCoordinationStageSync::syncStageError(const String & current_host, const String & error_message) +{ + auto zookeeper = get_zookeeper(); + zookeeper->createIfNotExists(zookeeper_path + "/error", current_host + "|" + error_message); } } diff --git a/src/Backups/BackupCoordinationHelpers.h b/src/Backups/BackupCoordinationHelpers.h index d119fc844ce..b0cd0440b98 100644 --- a/src/Backups/BackupCoordinationHelpers.h +++ b/src/Backups/BackupCoordinationHelpers.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -10,81 +11,67 @@ namespace DB { /// Helper designed to be used in an implementation of the IBackupCoordination interface in the part related to replicated tables. -class BackupCoordinationReplicatedTablesInfo +class BackupCoordinationReplicatedPartNames { public: - BackupCoordinationReplicatedTablesInfo() = default; - - /// Adds a data path in backup for a replicated table. - /// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function - /// getReplicatedTableDataPaths(). - void addDataPath(const String & table_zk_path, const String & table_data_path); - - /// Returns all the data paths in backup added for a replicated table (see also addReplicatedTableDataPath()). - Strings getDataPaths(const String & table_zk_path) const; + BackupCoordinationReplicatedPartNames(); + ~BackupCoordinationReplicatedPartNames(); using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum; /// Adds part names which a specified replica of a replicated table is going to put to the backup. /// Multiple replicas of the replicated table call this function and then the added part names can be returned by call of the function - /// getReplicatedTablePartNames(). + /// getPartNames(). /// Checksums are used only to control that parts under the same names on different replicas are the same. void addPartNames( - const String & host_id, - const DatabaseAndTableName & table_name, const String & table_zk_path, + const String & table_name_for_logs, + const String & replica_name, const std::vector & part_names_and_checksums); - void preparePartNamesByLocations(); - /// Returns the names of the parts which a specified replica of a replicated table should put to the backup. 
- /// This is the same list as it was added by call of the function addReplicatedTablePartNames() but without duplications and without + /// This is the same list as it was added by call of the function addPartNames() but without duplications and without /// parts covered by another parts. - Strings getPartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const; + Strings getPartNames(const String & table_zk_path, const String & replica_name) const; private: - class CoveredPartsFinder; - struct HostAndTableName; + void preparePartNames() const; - struct PartLocations + class CoveredPartsFinder; + + struct PartReplicas { - std::vector> host_and_table_names; + std::vector> replica_names; UInt128 checksum; }; struct TableInfo { - Strings data_paths; - std::map part_locations_by_names; /// Should be ordered because we need this map to be in the same order on every replica. - std::unordered_map> part_names_by_locations; + std::map parts_replicas; /// Should be ordered because we need this map to be in the same order on every replica. + mutable std::unordered_map */, Strings> replicas_parts; + std::unique_ptr covered_parts_finder; }; - std::unordered_map tables; - bool part_names_by_locations_prepared = false; + std::map table_infos; /// Should be ordered because we need this map to be in the same order on every replica. + mutable bool part_names_prepared = false; }; -/// Helper designed to be used in the implementation of the BackupCoordinationDistributed and RestoreCoordinationDistributed classes -/// to implement synchronization when we need all hosts to finish a specific task and then continue. -class BackupCoordinationDistributedBarrier +/// Helps to wait until all hosts come to a specified stage. +class BackupCoordinationStageSync { public: - BackupCoordinationDistributedBarrier(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, const String & logger_name_, const String & operation_name_); + BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_); - /// Sets that a specified host has finished the specific task, successfully or with an error. - /// In the latter case `error_message` should be set. - void finish(const String & host_id, const String & error_message = {}); - - /// Waits for a specified list of hosts to finish the specific task. 
- void waitForAllHostsToFinish(const Strings & host_ids, const std::chrono::seconds timeout = std::chrono::seconds(-1) /* no timeout */) const; + void syncStage(const String & current_host, int stage, const Strings & wait_hosts, std::chrono::seconds timeout); + void syncStageError(const String & current_host, const String & error_message); private: void createRootNodes(); String zookeeper_path; zkutil::GetZooKeeper get_zookeeper; - const Poco::Logger * log; - String operation_name; + Poco::Logger * log; }; } diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 2d970feb5a3..55a3c671a6e 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -10,47 +10,43 @@ namespace DB using SizeAndChecksum = IBackupCoordination::SizeAndChecksum; using FileInfo = IBackupCoordination::FileInfo; -BackupCoordinationLocal::BackupCoordinationLocal() : log(&Poco::Logger::get("BackupCoordination")) -{ -} - +BackupCoordinationLocal::BackupCoordinationLocal() = default; BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void BackupCoordinationLocal::addReplicatedTableDataPath(const String & table_zk_path, const String & table_data_path) +void BackupCoordinationLocal::syncStage(const String &, int, const Strings &, std::chrono::seconds) +{ +} + +void BackupCoordinationLocal::syncStageError(const String &, const String &) +{ +} + +void BackupCoordinationLocal::addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) { std::lock_guard lock{mutex}; - replicated_tables.addDataPath(table_zk_path, table_data_path); + replicated_part_names.addPartNames(table_zk_path, table_name_for_logs, replica_name, part_names_and_checksums); } -void BackupCoordinationLocal::addReplicatedTablePartNames(const String & /* host_id */, const DatabaseAndTableName & table_name, const String & table_zk_path, const std::vector & part_names_and_checksums) +Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{mutex}; - replicated_tables.addPartNames("", table_name, table_zk_path, part_names_and_checksums); + return replicated_part_names.getPartNames(table_zk_path, replica_name); } -void BackupCoordinationLocal::finishPreparing(const String & /* host_id */, const String & error_message) -{ - LOG_TRACE(log, "Finished preparing{}", (error_message.empty() ? 
"" : (" with error " + error_message))); - if (!error_message.empty()) - return; - replicated_tables.preparePartNamesByLocations(); -} - -void BackupCoordinationLocal::waitForAllHostsPrepared(const Strings & /* host_ids */, std::chrono::seconds /* timeout */) const -{ -} - -Strings BackupCoordinationLocal::getReplicatedTableDataPaths(const String & table_zk_path) const +void BackupCoordinationLocal::addReplicatedDataPath(const String & table_zk_path, const String & data_path) { std::lock_guard lock{mutex}; - return replicated_tables.getDataPaths(table_zk_path); + replicated_data_paths[table_zk_path].push_back(data_path); } -Strings BackupCoordinationLocal::getReplicatedTablePartNames(const String & /* host_id */, const DatabaseAndTableName & table_name, const String & table_zk_path) const +Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_zk_path) const { std::lock_guard lock{mutex}; - return replicated_tables.getPartNames("", table_name, table_zk_path); + auto it = replicated_data_paths.find(table_zk_path); + if (it == replicated_data_paths.end()) + return {}; + return it->second; } @@ -93,9 +89,14 @@ std::vector BackupCoordinationLocal::getAllFileInfos() const return res; } -Strings BackupCoordinationLocal::listFiles(const String & prefix, const String & terminator) const +Strings BackupCoordinationLocal::listFiles(const String & directory, bool recursive) const { std::lock_guard lock{mutex}; + String prefix = directory; + if (!prefix.empty() && !prefix.ends_with('/')) + prefix += '/'; + String terminator = recursive ? "" : "/"; + Strings elements; for (auto it = file_names.lower_bound(prefix); it != file_names.end(); ++it) { @@ -111,9 +112,25 @@ Strings BackupCoordinationLocal::listFiles(const String & prefix, const String & continue; elements.push_back(String{new_element}); } + return elements; } +bool BackupCoordinationLocal::hasFiles(const String & directory) const +{ + std::lock_guard lock{mutex}; + String prefix = directory; + if (!prefix.empty() && !prefix.ends_with('/')) + prefix += '/'; + + auto it = file_names.lower_bound(prefix); + if (it == file_names.end()) + return false; + + const String & name = it->first; + return name.starts_with(prefix); +} + std::optional BackupCoordinationLocal::getFileInfo(const String & file_name) const { std::lock_guard lock{mutex}; diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index 875e519353b..6529184c61a 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -18,24 +19,22 @@ public: BackupCoordinationLocal(); ~BackupCoordinationLocal() override; - void addReplicatedTableDataPath(const String & table_zk_path, const String & table_data_path) override; - void addReplicatedTablePartNames( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const std::vector & part_names_and_checksums) override; + void syncStage(const String & current_host, int stage, const Strings & wait_hosts, std::chrono::seconds timeout) override; + void syncStageError(const String & current_host, const String & error_message) override; - void finishPreparing(const String & host_id, const String & error_message) override; - void waitForAllHostsPrepared(const Strings & host_ids, std::chrono::seconds timeout) const override; + void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, + const 
std::vector & part_names_and_checksums) override;
+    Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const override;
-    Strings getReplicatedTableDataPaths(const String & table_zk_path) const override;
-    Strings getReplicatedTablePartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const override;
+    void addReplicatedDataPath(const String & table_zk_path, const String & data_path) override;
+    Strings getReplicatedDataPaths(const String & table_zk_path) const override;
     void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override;
     void updateFileInfo(const FileInfo & file_info) override;
     std::vector getAllFileInfos() const override;
-    Strings listFiles(const String & prefix, const String & terminator) const override;
+    Strings listFiles(const String & directory, bool recursive) const override;
+    bool hasFiles(const String & directory) const override;
     std::optional getFileInfo(const String & file_name) const override;
     std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) const override;
@@ -46,13 +45,12 @@ public:
 private:
     mutable std::mutex mutex;
-    BackupCoordinationReplicatedTablesInfo replicated_tables;
-    std::map file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0.
-    std::map file_infos; /// Information about files. Without empty files.
-    Strings archive_suffixes;
-    size_t current_archive_suffix = 0;
-
-    const Poco::Logger * log;
+    BackupCoordinationReplicatedPartNames replicated_part_names TSA_GUARDED_BY(mutex);
+    std::unordered_map replicated_data_paths TSA_GUARDED_BY(mutex);
+    std::map file_names TSA_GUARDED_BY(mutex); /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0.
+    std::map file_infos TSA_GUARDED_BY(mutex); /// Information about files. Without empty files.
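// A minimal sketch (not from this patch) of what the TSA_GUARDED_BY(mutex) annotations above buy us under
// clang's -Wthread-safety: touching a guarded member without holding `mutex` is reported at compile time.
// `countFiles` is a hypothetical accessor shown only for illustration.
size_t countFiles() const
{
    std::lock_guard lock{mutex};   // without this lock, the read of file_names below is flagged by the analysis
    return file_names.size();
}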
+ Strings archive_suffixes TSA_GUARDED_BY(mutex); + size_t current_archive_suffix TSA_GUARDED_BY(mutex) = 0; }; diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp new file mode 100644 index 00000000000..322bc00ee3c --- /dev/null +++ b/src/Backups/BackupEntriesCollector.cpp @@ -0,0 +1,550 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_COLLECT_OBJECTS_FOR_BACKUP; + extern const int CANNOT_BACKUP_TABLE; + extern const int TABLE_IS_DROPPED; + extern const int LOGICAL_ERROR; +} + + +bool BackupEntriesCollector::TableKey::operator ==(const TableKey & right) const +{ + return (name == right.name) && (is_temporary == right.is_temporary); +} + +bool BackupEntriesCollector::TableKey::operator <(const TableKey & right) const +{ + return (name < right.name) || ((name == right.name) && (is_temporary < right.is_temporary)); +} + +std::string_view BackupEntriesCollector::toString(Stage stage) +{ + switch (stage) + { + case Stage::kPreparing: return "Preparing"; + case Stage::kFindingTables: return "Finding tables"; + case Stage::kExtractingDataFromTables: return "Extracting data from tables"; + case Stage::kRunningPostTasks: return "Running post tasks"; + case Stage::kWritingBackup: return "Writing backup"; + case Stage::kError: return "Error"; + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup stage: {}", static_cast(stage)); +} + + +BackupEntriesCollector::BackupEntriesCollector( + const ASTBackupQuery::Elements & backup_query_elements_, + const BackupSettings & backup_settings_, + std::shared_ptr backup_coordination_, + const ContextPtr & context_, + std::chrono::seconds timeout_) + : backup_query_elements(backup_query_elements_) + , backup_settings(backup_settings_) + , backup_coordination(backup_coordination_) + , context(context_) + , timeout(timeout_) + , log(&Poco::Logger::get("BackupEntriesCollector")) +{ +} + +BackupEntriesCollector::~BackupEntriesCollector() = default; + +BackupEntries BackupEntriesCollector::getBackupEntries() +{ + try + { + /// getBackupEntries() must not be called multiple times. + if (current_stage != Stage::kPreparing) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries"); + + /// Calculate the root path for collecting backup entries, it's either empty or has the format "shards//replicas//". + calculateRootPathInBackup(); + + /// Do renaming in the create queries according to the renaming config. + renaming_map = makeRenamingMapFromBackupQuery(backup_query_elements); + + /// Find databases and tables which we're going to put to the backup. + setStage(Stage::kFindingTables); + collectDatabasesAndTablesInfo(); + + /// Make backup entries for the definitions of the found databases. + makeBackupEntriesForDatabasesDefs(); + + /// Make backup entries for the definitions of the found tables. + makeBackupEntriesForTablesDefs(); + + /// Make backup entries for the data of the found tables. + setStage(Stage::kExtractingDataFromTables); + makeBackupEntriesForTablesData(); + + /// Run all the tasks added with addPostCollectingTask(). + setStage(Stage::kRunningPostTasks); + runPostCollectingTasks(); + + /// No more backup entries or tasks are allowed after this point. + setStage(Stage::kWritingBackup); + + return std::move(backup_entries); + } + catch (...) 
+ { + try + { + setStage(Stage::kError, getCurrentExceptionMessage(false)); + } + catch (...) + { + } + throw; + } +} + +void BackupEntriesCollector::setStage(Stage new_stage, const String & error_message) +{ + if (new_stage == Stage::kError) + LOG_ERROR(log, "{} failed with error: {}", toString(current_stage), error_message); + else + LOG_TRACE(log, "{}", toString(new_stage)); + + current_stage = new_stage; + + if (new_stage == Stage::kError) + { + backup_coordination->syncStageError(backup_settings.host_id, error_message); + } + else + { + auto all_hosts + = BackupSettings::Util::filterHostIDs(backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num); + backup_coordination->syncStage(backup_settings.host_id, static_cast(new_stage), all_hosts, timeout); + } +} + +/// Calculates the root path for collecting backup entries, +/// it's either empty or has the format "shards//replicas//". +void BackupEntriesCollector::calculateRootPathInBackup() +{ + root_path_in_backup = "/"; + if (!backup_settings.host_id.empty()) + { + auto [shard_num, replica_num] + = BackupSettings::Util::findShardNumAndReplicaNum(backup_settings.cluster_host_ids, backup_settings.host_id); + root_path_in_backup = root_path_in_backup / fs::path{"shards"} / std::to_string(shard_num) / "replicas" / std::to_string(replica_num); + } + LOG_TRACE(log, "Will use path in backup: {}", doubleQuoteString(String{root_path_in_backup})); +} + +/// Finds databases and tables which we will put to the backup. +void BackupEntriesCollector::collectDatabasesAndTablesInfo() +{ + bool use_timeout = (timeout.count() >= 0); + auto start_time = std::chrono::steady_clock::now(); + + int pass = 0; + do + { + database_infos.clear(); + table_infos.clear(); + consistent = true; + + /// Collect information about databases and tables specified in the BACKUP query. + for (const auto & element : backup_query_elements) + { + switch (element.type) + { + case ASTBackupQuery::ElementType::TABLE: + { + collectTableInfo({element.database_name, element.table_name}, false, element.partitions, true); + break; + } + + case ASTBackupQuery::ElementType::TEMPORARY_TABLE: + { + collectTableInfo({"", element.table_name}, true, element.partitions, true); + break; + } + + case ASTBackupQuery::ElementType::DATABASE: + { + collectDatabaseInfo(element.database_name, element.except_tables, true); + break; + } + + case ASTBackupQuery::ElementType::ALL: + { + collectAllDatabasesInfo(element.except_databases, element.except_tables); + break; + } + } + } + + /// We have to check consistency of collected information to protect from the case when some table or database is + /// renamed during this collecting making the collected information invalid. + checkConsistency(); + + /// Two passes is absolute minimum (see `previous_table_names` & `previous_database_names`). 
+ auto elapsed = std::chrono::steady_clock::now() - start_time; + if (!consistent && (pass >= 2) && use_timeout) + { + if (elapsed > timeout) + throw Exception( + ErrorCodes::CANNOT_COLLECT_OBJECTS_FOR_BACKUP, + "Couldn't collect tables and databases to make a backup (pass #{}, elapsed {})", + pass, + to_string(elapsed)); + } + + if (pass >= 2) + LOG_WARNING(log, "Couldn't collect tables and databases to make a backup (pass #{}, elapsed {})", pass, to_string(elapsed)); + ++pass; + } while (!consistent); + + LOG_INFO(log, "Will backup {} databases and {} tables", database_infos.size(), table_infos.size()); +} + +void BackupEntriesCollector::collectTableInfo( + const QualifiedTableName & table_name, bool is_temporary_table, const std::optional & partitions, bool throw_if_not_found) +{ + /// Gather information about the table. + DatabasePtr database; + StoragePtr storage; + TableLockHolder table_lock; + ASTPtr create_table_query; + + TableKey table_key{table_name, is_temporary_table}; + + if (throw_if_not_found) + { + auto resolved_id = is_temporary_table + ? context->resolveStorageID(StorageID{"", table_name.table}, Context::ResolveExternal) + : context->resolveStorageID(StorageID{table_name.database, table_name.table}, Context::ResolveGlobal); + std::tie(database, storage) = DatabaseCatalog::instance().getDatabaseAndTable(resolved_id, context); + table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); + create_table_query = storage->getCreateQueryForBackup(*this); + } + else + { + auto resolved_id = is_temporary_table + ? context->tryResolveStorageID(StorageID{"", table_name.table}, Context::ResolveExternal) + : context->tryResolveStorageID(StorageID{table_name.database, table_name.table}, Context::ResolveGlobal); + if (!resolved_id.empty()) + std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(resolved_id, context); + + if (storage) + { + try + { + table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); + create_table_query = storage->getCreateQueryForBackup(*this); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::TABLE_IS_DROPPED) + throw; + } + } + + if (!create_table_query) + { + consistent &= !table_infos.contains(table_key); + return; + } + } + + fs::path data_path_in_backup; + if (is_temporary_table) + { + auto table_name_in_backup = renaming_map.getNewTemporaryTableName(table_name.table); + data_path_in_backup = root_path_in_backup / "temporary_tables" / "data" / escapeForFileName(table_name_in_backup); + } + else + { + auto table_name_in_backup = renaming_map.getNewTableName(table_name); + data_path_in_backup + = root_path_in_backup / "data" / escapeForFileName(table_name_in_backup.database) / escapeForFileName(table_name_in_backup.table); + } + + /// Check that information is consistent. + const auto & create = create_table_query->as(); + if ((create.getTable() != table_name.table) || (is_temporary_table != create.temporary) || (create.getDatabase() != table_name.database)) + { + /// Table was renamed recently. + consistent = false; + return; + } + + if (auto it = table_infos.find(table_key); it != table_infos.end()) + { + const auto & table_info = it->second; + if ((table_info.database != database) || (table_info.storage != storage)) + { + /// Table was renamed recently. + consistent = false; + return; + } + } + + /// Add information to `table_infos`. 
+ auto & res_table_info = table_infos[table_key]; + res_table_info.database = database; + res_table_info.storage = storage; + res_table_info.table_lock = table_lock; + res_table_info.create_table_query = create_table_query; + res_table_info.data_path_in_backup = data_path_in_backup; + + if (partitions) + { + if (!res_table_info.partitions) + res_table_info.partitions.emplace(); + insertAtEnd(*res_table_info.partitions, *partitions); + } +} + +void BackupEntriesCollector::collectDatabaseInfo(const String & database_name, const std::set & except_table_names, bool throw_if_not_found) +{ + /// Gather information about the database. + DatabasePtr database; + ASTPtr create_database_query; + + if (throw_if_not_found) + { + database = DatabaseCatalog::instance().getDatabase(database_name); + create_database_query = database->getCreateDatabaseQueryForBackup(); + } + else + { + database = DatabaseCatalog::instance().tryGetDatabase(database_name); + if (!database) + { + consistent &= !database_infos.contains(database_name); + return; + } + + try + { + create_database_query = database->getCreateDatabaseQueryForBackup(); + } + catch (...) + { + /// The database has been dropped recently. + consistent &= !database_infos.contains(database_name); + return; + } + } + + /// Check that information is consistent. + const auto & create = create_database_query->as(); + if (create.getDatabase() != database_name) + { + /// Database was renamed recently. + consistent = false; + return; + } + + if (auto it = database_infos.find(database_name); it != database_infos.end()) + { + const auto & database_info = it->second; + if (database_info.database != database) + { + /// Database was renamed recently. + consistent = false; + return; + } + } + + /// Add information to `database_infos`. + auto & res_database_info = database_infos[database_name]; + res_database_info.database = database; + res_database_info.create_database_query = create_database_query; + + /// Add information about tables too. + for (auto it = database->getTablesIteratorForBackup(*this); it->isValid(); it->next()) + { + if (except_table_names.contains({database_name, it->name()})) + continue; + + collectTableInfo({database_name, it->name()}, /* is_temporary_table= */ false, {}, /* throw_if_not_found= */ false); + if (!consistent) + return; + } +} + +void BackupEntriesCollector::collectAllDatabasesInfo(const std::set & except_database_names, const std::set & except_table_names) +{ + for (const auto & [database_name, database] : DatabaseCatalog::instance().getDatabases()) + { + if (except_database_names.contains(database_name)) + continue; + collectDatabaseInfo(database_name, except_table_names, false); + if (!consistent) + return; + } +} + +/// Check for consistency of collected information about databases and tables. +void BackupEntriesCollector::checkConsistency() +{ + if (!consistent) + return; /// Already inconsistent, no more checks necessary + + /// Databases found while we were scanning tables and while we were scanning databases - must be the same. + for (const auto & [key, table_info] : table_infos) + { + auto it = database_infos.find(key.name.database); + if (it != database_infos.end()) + { + const auto & database_info = it->second; + if (database_info.database != table_info.database) + { + consistent = false; + return; + } + } + } + + /// We need to scan tables at least twice to be sure that we haven't missed any table which could be renamed + /// while we were scanning. 
+ std::set database_names; + std::set table_names; + boost::range::copy(database_infos | boost::adaptors::map_keys, std::inserter(database_names, database_names.end())); + boost::range::copy(table_infos | boost::adaptors::map_keys, std::inserter(table_names, table_names.end())); + + if (!previous_database_names || !previous_table_names || (*previous_database_names != database_names) + || (*previous_table_names != table_names)) + { + previous_database_names = std::move(database_names); + previous_table_names = std::move(table_names); + consistent = false; + } +} + +/// Make backup entries for all the definitions of all the databases found. +void BackupEntriesCollector::makeBackupEntriesForDatabasesDefs() +{ + for (const auto & [database_name, database_info] : database_infos) + { + LOG_TRACE(log, "Adding definition of database {}", backQuoteIfNeed(database_name)); + + ASTPtr new_create_query = database_info.create_database_query; + renameDatabaseAndTableNameInCreateQuery(context->getGlobalContext(), renaming_map, new_create_query); + + String new_database_name = renaming_map.getNewDatabaseName(database_name); + auto metadata_path_in_backup = root_path_in_backup / "metadata" / (escapeForFileName(new_database_name) + ".sql"); + + backup_entries.emplace_back(metadata_path_in_backup, std::make_shared(serializeAST(*new_create_query))); + } +} + +/// Calls IDatabase::backupTable() for all the tables found to make backup entries for tables. +void BackupEntriesCollector::makeBackupEntriesForTablesDefs() +{ + for (const auto & [key, table_info] : table_infos) + { + LOG_TRACE(log, "Adding definition of {}table {}", (key.is_temporary ? "temporary " : ""), key.name.getFullName()); + + ASTPtr new_create_query = table_info.create_table_query; + renameDatabaseAndTableNameInCreateQuery(context->getGlobalContext(), renaming_map, new_create_query); + + fs::path metadata_path_in_backup; + if (key.is_temporary) + { + auto new_name = renaming_map.getNewTemporaryTableName(key.name.table); + metadata_path_in_backup = root_path_in_backup / "temporary_tables" / "metadata" / (escapeForFileName(new_name) + ".sql"); + } + else + { + auto new_name = renaming_map.getNewTableName(key.name); + metadata_path_in_backup + = root_path_in_backup / "metadata" / escapeForFileName(new_name.database) / (escapeForFileName(new_name.table) + ".sql"); + } + + backup_entries.emplace_back(metadata_path_in_backup, std::make_shared(serializeAST(*new_create_query))); + } +} + +void BackupEntriesCollector::makeBackupEntriesForTablesData() +{ + if (backup_settings.structure_only) + return; + + for (const auto & [key, table_info] : table_infos) + { + LOG_TRACE(log, "Adding data of {}table {}", (key.is_temporary ? 
"temporary " : ""), key.name.getFullName()); + const auto & storage = table_info.storage; + const auto & data_path_in_backup = table_info.data_path_in_backup; + const auto & partitions = table_info.partitions; + storage->backupData(*this, data_path_in_backup, partitions); + } +} + +void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry) +{ + if (current_stage == Stage::kWritingBackup) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); + backup_entries.emplace_back(file_name, backup_entry); +} + +void BackupEntriesCollector::addBackupEntries(const BackupEntries & backup_entries_) +{ + if (current_stage == Stage::kWritingBackup) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); + insertAtEnd(backup_entries, backup_entries_); +} + +void BackupEntriesCollector::addBackupEntries(BackupEntries && backup_entries_) +{ + if (current_stage == Stage::kWritingBackup) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); + insertAtEnd(backup_entries, std::move(backup_entries_)); +} + +void BackupEntriesCollector::addPostCollectingTask(std::function task) +{ + if (current_stage == Stage::kWritingBackup) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding post tasks is not allowed"); + post_collecting_tasks.push(std::move(task)); +} + +/// Runs all the tasks added with addPostCollectingTask(). +void BackupEntriesCollector::runPostCollectingTasks() +{ + /// Post collecting tasks can add other post collecting tasks, our code is fine with that. + while (!post_collecting_tasks.empty()) + { + auto task = std::move(post_collecting_tasks.front()); + post_collecting_tasks.pop(); + std::move(task)(); + } +} + +void BackupEntriesCollector::throwPartitionsNotSupported(const StorageID & storage_id, const String & table_engine) +{ + throw Exception( + ErrorCodes::CANNOT_BACKUP_TABLE, + "Table engine {} doesn't support partitions, cannot backup table {}", + table_engine, + storage_id.getFullTableName()); +} + +} diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h new file mode 100644 index 00000000000..1466815f3a7 --- /dev/null +++ b/src/Backups/BackupEntriesCollector.h @@ -0,0 +1,138 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +class IBackupEntry; +using BackupEntryPtr = std::shared_ptr; +using BackupEntries = std::vector>; +class IBackupCoordination; +class IDatabase; +using DatabasePtr = std::shared_ptr; +struct StorageID; + +/// Collects backup entries for all databases and tables which should be put to a backup. +class BackupEntriesCollector : private boost::noncopyable +{ +public: + BackupEntriesCollector(const ASTBackupQuery::Elements & backup_query_elements_, + const BackupSettings & backup_settings_, + std::shared_ptr backup_coordination_, + const ContextPtr & context_, + std::chrono::seconds timeout_ = std::chrono::seconds(-1) /* no timeout */); + ~BackupEntriesCollector(); + + /// Collects backup entries and returns the result. + /// This function first generates a list of databases and then call IDatabase::backup() for each database from this list. + /// At this moment IDatabase::backup() calls IStorage::backup() and they both call addBackupEntry() to build a list of backup entries. 
+ BackupEntries getBackupEntries(); + + const BackupSettings & getBackupSettings() const { return backup_settings; } + std::shared_ptr getBackupCoordination() const { return backup_coordination; } + ContextPtr getContext() const { return context; } + + /// Adds a backup entry which will be later returned by getBackupEntries(). + /// These function can be called by implementations of IStorage::backup() in inherited storage classes. + void addBackupEntry(const String & file_name, BackupEntryPtr backup_entry); + void addBackupEntries(const BackupEntries & backup_entries_); + void addBackupEntries(BackupEntries && backup_entries_); + + /// Adds a function which must be called after all IStorage::backup() have finished their work on all hosts. + /// This function is designed to help making a consistent in some complex cases like + /// 1) we need to join (in a backup) the data of replicated tables gathered on different hosts. + void addPostCollectingTask(std::function task); + + /// Writing a backup includes a few stages: + enum class Stage + { + /// Initial stage. + kPreparing, + + /// Finding all tables and databases which we're going to put to the backup. + kFindingTables, + + /// Making temporary hard links and prepare backup entries. + kExtractingDataFromTables, + + /// Running special tasks for replicated databases or tables which can also prepare some backup entries. + kRunningPostTasks, + + /// Writing backup entries to the backup and removing temporary hard links. + kWritingBackup, + + /// An error happens during any of the stages above, the backup won't be written. + kError, + }; + static std::string_view toString(Stage stage); + + /// Throws an exception that a specified table engine doesn't support partitions. + [[noreturn]] static void throwPartitionsNotSupported(const StorageID & storage_id, const String & table_engine); + +private: + void setStage(Stage new_stage, const String & error_message = {}); + void calculateRootPathInBackup(); + void collectDatabasesAndTablesInfo(); + void collectTableInfo(const QualifiedTableName & table_name, bool is_temporary_table, const std::optional & partitions, bool throw_if_not_found); + void collectDatabaseInfo(const String & database_name, const std::set & except_table_names, bool throw_if_not_found); + void collectAllDatabasesInfo(const std::set & except_database_names, const std::set & except_table_names); + void checkConsistency(); + void makeBackupEntriesForDatabasesDefs(); + void makeBackupEntriesForTablesDefs(); + void makeBackupEntriesForTablesData(); + void runPostCollectingTasks(); + + const ASTBackupQuery::Elements backup_query_elements; + const BackupSettings backup_settings; + std::shared_ptr backup_coordination; + ContextPtr context; + std::chrono::seconds timeout; + Poco::Logger * log; + + Stage current_stage = Stage::kPreparing; + std::filesystem::path root_path_in_backup; + DDLRenamingMap renaming_map; + + struct DatabaseInfo + { + DatabasePtr database; + ASTPtr create_database_query; + }; + + struct TableInfo + { + DatabasePtr database; + StoragePtr storage; + TableLockHolder table_lock; + ASTPtr create_table_query; + std::filesystem::path data_path_in_backup; + std::optional partitions; + }; + + struct TableKey + { + QualifiedTableName name; + bool is_temporary = false; + bool operator ==(const TableKey & right) const; + bool operator <(const TableKey & right) const; + }; + + std::unordered_map database_infos; + std::map table_infos; + std::optional> previous_database_names; + std::optional> previous_table_names; + bool 
consistent = false; + + BackupEntries backup_entries; + std::queue> post_collecting_tasks; +}; + +} diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 4d5f47b4f61..5103518c873 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace Poco { class TemporaryFile; } @@ -41,7 +42,7 @@ public: private: const DiskPtr disk; const String file_path; - mutable std::optional file_size; + mutable std::optional file_size TSA_GUARDED_BY(get_file_size_mutex); mutable std::mutex get_file_size_mutex; const std::optional checksum; const std::shared_ptr temporary_file; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 6fc32e08dbd..09c1cd5e9b5 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -36,7 +36,7 @@ namespace ErrorCodes extern const int WRONG_BASE_BACKUP; extern const int BACKUP_ENTRY_ALREADY_EXISTS; extern const int BACKUP_ENTRY_NOT_FOUND; - extern const int BAD_ARGUMENTS; + extern const int BACKUP_IS_EMPTY; extern const int LOGICAL_ERROR; } @@ -65,6 +65,14 @@ namespace { return hexChecksum(size_and_checksum.second) + std::to_string(size_and_checksum.first); } + + /// We store entries' file names in the backup without leading slashes. + String removeLeadingSlash(const String & path) + { + if (path.starts_with('/')) + return path.substr(1); + return path; + } } @@ -151,7 +159,7 @@ BackupImpl::BackupImpl( , uuid(backup_uuid_) , version(CURRENT_BACKUP_VERSION) , base_backup_info(base_backup_info_) - , log(&Poco::Logger::get("Backup")) + , log(&Poco::Logger::get("BackupImpl")) { open(context_); } @@ -218,13 +226,6 @@ void BackupImpl::close() { std::lock_guard lock{mutex}; - if (!is_internal_backup && writing_finalized) - { - LOG_TRACE(log, "Finalizing backup {}", backup_name); - writeBackupMetadata(); - LOG_INFO(log, "Finalized backup {}", backup_name); - } - archive_readers.clear(); for (auto & archive_writer : archive_writers) archive_writer = {"", nullptr}; @@ -249,10 +250,12 @@ void BackupImpl::writeBackupMetadata() config->setString("timestamp", toString(LocalDateTime{timestamp})); config->setString("uuid", toString(*uuid)); + auto all_file_infos = coordination->getAllFileInfos(); + if (base_backup_info) { bool base_backup_in_use = false; - for (const auto & info : coordination->getAllFileInfos()) + for (const auto & info : all_file_infos) { if (info.base_size) base_backup_in_use = true; @@ -266,13 +269,13 @@ void BackupImpl::writeBackupMetadata() } size_t index = 0; - for (const auto & info : coordination->getAllFileInfos()) + for (const auto & info : all_file_infos) { String prefix = index ? "contents.file[" + std::to_string(index) + "]." 
: "contents.file."; + config->setString(prefix + "name", info.file_name); config->setUInt(prefix + "size", info.size); if (info.size) { - config->setString(prefix + "name", info.file_name); config->setString(prefix + "checksum", hexChecksum(info.checksum)); if (info.base_size) { @@ -303,6 +306,7 @@ void BackupImpl::writeBackupMetadata() else out = writer->writeFile(".backup"); out->write(str.data(), str.size()); + out->finalize(); } void BackupImpl::readBackupMetadata() @@ -375,18 +379,25 @@ void BackupImpl::readBackupMetadata() } } -Strings BackupImpl::listFiles(const String & prefix, const String & terminator) const +Strings BackupImpl::listFiles(const String & directory, bool recursive) const { std::lock_guard lock{mutex}; - if (!prefix.ends_with('/') && !prefix.empty()) - throw Exception("prefix should end with '/'", ErrorCodes::BAD_ARGUMENTS); - return coordination->listFiles(prefix, terminator); + auto adjusted_dir = removeLeadingSlash(directory); + return coordination->listFiles(adjusted_dir, recursive); +} + +bool BackupImpl::hasFiles(const String & directory) const +{ + std::lock_guard lock{mutex}; + auto adjusted_dir = removeLeadingSlash(directory); + return coordination->hasFiles(adjusted_dir); } bool BackupImpl::fileExists(const String & file_name) const { std::lock_guard lock{mutex}; - return coordination->getFileInfo(file_name).has_value(); + auto adjusted_path = removeLeadingSlash(file_name); + return coordination->getFileInfo(adjusted_path).has_value(); } bool BackupImpl::fileExists(const SizeAndChecksum & size_and_checksum) const @@ -398,7 +409,8 @@ bool BackupImpl::fileExists(const SizeAndChecksum & size_and_checksum) const UInt64 BackupImpl::getFileSize(const String & file_name) const { std::lock_guard lock{mutex}; - auto info = coordination->getFileInfo(file_name); + auto adjusted_path = removeLeadingSlash(file_name); + auto info = coordination->getFileInfo(adjusted_path); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); @@ -408,7 +420,8 @@ UInt64 BackupImpl::getFileSize(const String & file_name) const UInt128 BackupImpl::getFileChecksum(const String & file_name) const { std::lock_guard lock{mutex}; - auto info = coordination->getFileInfo(file_name); + auto adjusted_path = removeLeadingSlash(file_name); + auto info = coordination->getFileInfo(adjusted_path); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); @@ -418,7 +431,8 @@ UInt128 BackupImpl::getFileChecksum(const String & file_name) const SizeAndChecksum BackupImpl::getFileSizeAndChecksum(const String & file_name) const { std::lock_guard lock{mutex}; - auto info = coordination->getFileInfo(file_name); + auto adjusted_path = removeLeadingSlash(file_name); + auto info = coordination->getFileInfo(adjusted_path); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); @@ -436,17 +450,18 @@ BackupEntryPtr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) c if (open_mode != OpenMode::READ) throw Exception("Backup is not opened for reading", ErrorCodes::LOGICAL_ERROR); + if (!size_and_checksum.first) + { + /// Entry's data is empty. 
+ return std::make_unique(nullptr, 0, UInt128{0, 0}); + } + auto info_opt = coordination->getFileInfo(size_and_checksum); if (!info_opt) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, formatSizeAndChecksum(size_and_checksum)); const auto & info = *info_opt; - if (!info.size) - { - /// Entry's data is empty. - return std::make_unique(nullptr, 0, UInt128{0, 0}); - } if (!info.base_size) { @@ -494,12 +509,16 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); - if (coordination->getFileInfo(file_name)) + if (writing_finalized) + throw Exception("Backup is already finalized", ErrorCodes::LOGICAL_ERROR); + + auto adjusted_path = removeLeadingSlash(file_name); + if (coordination->getFileInfo(adjusted_path)) throw Exception( ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", backup_name, quoteString(file_name)); FileInfo info; - info.file_name = file_name; + info.file_name = adjusted_path; size_t size = entry->getSize(); info.size = size; @@ -520,13 +539,13 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) } /// Check if an entry with such a name exists in the base backup. - bool base_exists = (base_backup && base_backup->fileExists(file_name)); + bool base_exists = (base_backup && base_backup->fileExists(adjusted_path)); UInt64 base_size = 0; UInt128 base_checksum{0, 0}; if (base_exists) { - base_size = base_backup->getFileSize(file_name); - base_checksum = base_backup->getFileChecksum(file_name); + base_size = base_backup->getFileSize(adjusted_path); + base_checksum = base_backup->getFileChecksum(adjusted_path); } std::unique_ptr read_buffer; /// We'll set that later.
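A minimal standalone sketch of the path normalization rule applied in the hunks above (the helper mirrors removeLeadingSlash() introduced earlier in this patch; the example paths are hypothetical):

#include <string>

/// Entries are stored in the backup without a leading slash, so lookups strip at most one.
std::string removeLeadingSlash(const std::string & path)
{
    if (path.starts_with('/'))   /// std::string::starts_with(char), C++20
        return path.substr(1);
    return path;
}

/// removeLeadingSlash("/data/db/table/part.bin") == "data/db/table/part.bin"
/// removeLeadingSlash("data/db/table/part.bin")  == "data/db/table/part.bin" (already normalized)

Because writeFile(), fileExists(), getFileSize() and getFileChecksum() all normalize the incoming name the same way, "/data/x" and "data/x" refer to the same backup entry.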
@@ -647,6 +666,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) } copyData(*read_buffer, *out); + out->finalize(); } @@ -656,6 +676,19 @@ void BackupImpl::finalizeWriting() if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); + if (writing_finalized) + throw Exception("Backup is already finalized", ErrorCodes::LOGICAL_ERROR); + + if (!coordination->hasFiles("")) + throw Exception("Backup must not be empty", ErrorCodes::BACKUP_IS_EMPTY); + + if (!is_internal_backup) + { + LOG_TRACE(log, "Finalizing backup {}", backup_name); + writeBackupMetadata(); + LOG_TRACE(log, "Finalized backup {}", backup_name); + } + writing_finalized = true; } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index bb31a76ead1..f8c5bc0cf5f 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -57,7 +57,8 @@ public: OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override; UUID getUUID() const override { return *uuid; } - Strings listFiles(const String & prefix, const String & terminator) const override; + Strings listFiles(const String & directory, bool recursive) const override; + bool hasFiles(const String & directory) const override; bool fileExists(const String & file_name) const override; bool fileExists(const SizeAndChecksum & size_and_checksum) const override; UInt64 getFileSize(const String & file_name) const override; diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index 270dad2d594..08a5836ef31 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -1,437 +1,60 @@ #include -#include -#include -#include -#include #include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include +#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int CANNOT_BACKUP_TABLE; - extern const int CANNOT_BACKUP_DATABASE; - extern const int BACKUP_IS_EMPTY; - extern const int LOGICAL_ERROR; -} -namespace +DDLRenamingMap makeRenamingMapFromBackupQuery(const ASTBackupQuery::Elements & elements) { - /// Helper to calculate paths inside a backup. - class PathsInBackup + DDLRenamingMap map; + + for (const auto & element : elements) { - public: - /// Returns the path to metadata in backup. 
- static String getMetadataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) + switch (element.type) { - if (table_name.first.empty() || table_name.second.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); - return getPathForShardAndReplica(shard_index, replica_index) + String{"metadata/"} + escapeForFileName(table_name.first) + "/" - + escapeForFileName(table_name.second) + ".sql"; - } - - static String getMetadataPath(const String & database_name, size_t shard_index, size_t replica_index) - { - if (database_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name must not be empty"); - return getPathForShardAndReplica(shard_index, replica_index) + String{"metadata/"} + escapeForFileName(database_name) + ".sql"; - } - - static String getMetadataPath(const IAST & create_query, size_t shard_index, size_t replica_index) - { - const auto & create = create_query.as(); - if (!create.table) - return getMetadataPath(create.getDatabase(), shard_index, replica_index); - if (create.temporary) - return getMetadataPath({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}, shard_index, replica_index); - return getMetadataPath({create.getDatabase(), create.getTable()}, shard_index, replica_index); - } - - /// Returns the path to table's data in backup. - static String getDataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) - { - if (table_name.first.empty() || table_name.second.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); - assert(!table_name.first.empty() && !table_name.second.empty()); - return getPathForShardAndReplica(shard_index, replica_index) + String{"data/"} + escapeForFileName(table_name.first) + "/" - + escapeForFileName(table_name.second) + "/"; - } - - static String getDataPath(const IAST & create_query, size_t shard_index, size_t replica_index) - { - const auto & create = create_query.as(); - if (!create.table) - return {}; - if (create.temporary) - return getDataPath({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}, shard_index, replica_index); - return getDataPath({create.getDatabase(), create.getTable()}, shard_index, replica_index); - } - - private: - static String getPathForShardAndReplica(size_t shard_index, size_t replica_index) - { - if (shard_index || replica_index) - return fmt::format("shards/{}/replicas/{}/", shard_index, replica_index); - else - return ""; - } - }; - - using Kind = ASTBackupQuery::Kind; - using Element = ASTBackupQuery::Element; - using Elements = ASTBackupQuery::Elements; - using ElementType = ASTBackupQuery::ElementType; - - /// Makes backup entries to backup databases and tables according to the elements of ASTBackupQuery. - /// Keep this class consistent with RestoreTasksBuilder. - class BackupEntriesBuilder - { - public: - BackupEntriesBuilder(const ContextPtr & context_, const BackupSettings & backup_settings_, std::shared_ptr backup_coordination_) - : context(context_), backup_settings(backup_settings_), backup_coordination(backup_coordination_) - { - } - - /// Prepares internal structures for making backup entries. - void prepare(const ASTBackupQuery::Elements & elements, std::chrono::seconds timeout_for_other_nodes_to_prepare) - { - try + case ASTBackupQuery::TABLE: { - prepareImpl(elements); - } - catch (...) 
- { - backup_coordination->finishPreparing(backup_settings.host_id, getCurrentExceptionMessage(false)); - throw; + const String & table_name = element.table_name; + const String & database_name = element.database_name; + const String & new_table_name = element.new_table_name; + const String & new_database_name = element.new_database_name; + assert(!table_name.empty()); + assert(!new_table_name.empty()); + assert(!database_name.empty()); + assert(!new_database_name.empty()); + map.setNewTableName({database_name, table_name}, {new_database_name, new_table_name}); + break; } - /// We've finished restoring metadata, now we will wait for other replicas and shards to finish too. - /// We need this waiting because we're going to call some functions which requires data collected from other nodes too, - /// see IRestoreCoordination::checkTablesNotExistedInReplicatedDBs(), IRestoreCoordination::getReplicatedTableDataPath(). - backup_coordination->finishPreparing(backup_settings.host_id); - - backup_coordination->waitForAllHostsPrepared( - BackupSettings::Util::filterHostIDs( - backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num), - timeout_for_other_nodes_to_prepare); - } - - /// Makes backup entries, should be called after prepare(). - BackupEntries makeBackupEntries() const - { - BackupEntries res; - for (const auto & info : databases | boost::adaptors::map_values) - res.push_back(makeBackupEntryForMetadata(*info.create_query)); - - for (const auto & info : tables | boost::adaptors::map_values) + case ASTBackupQuery::TEMPORARY_TABLE: { - res.push_back(makeBackupEntryForMetadata(*info.create_query)); - appendBackupEntriesForData(res, info); + const String & table_name = element.table_name; + const String & new_table_name = element.new_table_name; + assert(!table_name.empty()); + assert(!new_table_name.empty()); + map.setNewTemporaryTableName(table_name, new_table_name); + break; } - /// A backup cannot be empty. - if (res.empty()) - throw Exception("Backup must not be empty", ErrorCodes::BACKUP_IS_EMPTY); - - return res; - } - - private: - void prepareImpl(const ASTBackupQuery::Elements & elements) - { - calculateShardNumAndReplicaNumInBackup(); - renaming_settings.setFromBackupQuery(elements); - - for (const auto & element : elements) + case ASTBackupQuery::DATABASE: { - switch (element.type) - { - case ElementType::TABLE: - { - prepareToBackupTable(element.name, element.partitions); - break; - } - - case ElementType::DATABASE: - { - const String & database_name = element.name.first; - prepareToBackupDatabase(database_name, element.except_list); - break; - } - - case ElementType::ALL_DATABASES: - { - prepareToBackupAllDatabases(element.except_list); - break; - } - } - } - } - - void calculateShardNumAndReplicaNumInBackup() - { - size_t shard_num = 0; - size_t replica_num = 0; - if (!backup_settings.host_id.empty()) - { - std::tie(shard_num, replica_num) - = BackupSettings::Util::findShardNumAndReplicaNum(backup_settings.cluster_host_ids, backup_settings.host_id); - } - shard_num_in_backup = shard_num; - replica_num_in_backup = replica_num; - } - - /// Prepares to backup a single table and probably its database's definition. 
- void prepareToBackupTable(const DatabaseAndTableName & table_name_, const ASTs & partitions_) - { - auto [database, storage] = DatabaseCatalog::instance().getDatabaseAndTable({table_name_.first, table_name_.second}, context); - prepareToBackupTable(table_name_, {database, storage}, partitions_); - } - - void prepareToBackupTable(const DatabaseAndTableName & table_name_, const DatabaseAndTable & table_, const ASTs & partitions_) - { - const auto & database = table_.first; - const auto & storage = table_.second; - - if (!database->hasTablesToBackup()) - throw Exception( - ErrorCodes::CANNOT_BACKUP_TABLE, - "Cannot backup the {} because it's contained in a hollow database (engine: {})", - formatTableNameOrTemporaryTableName(table_name_), - database->getEngineName()); - - /// Check that we are not trying to backup the same table again. - DatabaseAndTableName name_in_backup = renaming_settings.getNewTableName(table_name_); - if (tables.contains(name_in_backup)) - throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, "Cannot backup the {} twice", formatTableNameOrTemporaryTableName(name_in_backup)); - - /// Make a create query for this table. - auto create_query = prepareCreateQueryForBackup(database->getCreateTableQuery(table_name_.second, context)); - String data_path = PathsInBackup::getDataPath(*create_query, shard_num_in_backup, replica_num_in_backup); - - String zk_path; - BackupEntries data = prepareToBackupTableData(table_name_, storage, partitions_, data_path, zk_path); - - TableInfo info; - info.table_name = table_name_; - info.create_query = create_query; - info.storage = storage; - info.data = std::move(data); - info.data_path = std::move(data_path); - info.zk_path = std::move(zk_path); - tables[name_in_backup] = std::move(info); - } - - BackupEntries prepareToBackupTableData(const DatabaseAndTableName & table_name_, const StoragePtr & storage_, const ASTs & partitions_, const String & data_path, String & zk_path) - { - zk_path.clear(); - - const StorageReplicatedMergeTree * replicated_table = typeid_cast(storage_.get()); - bool has_data = (storage_->hasDataToBackup() || replicated_table) && !backup_settings.structure_only; - if (!has_data) - return {}; - - BackupEntries data = storage_->backupData(context, partitions_); - if (!replicated_table) - return data; - - zk_path = replicated_table->getZooKeeperName() + replicated_table->getZooKeeperPath(); - backup_coordination->addReplicatedTableDataPath(zk_path, data_path); - std::unordered_map parts; - for (const auto & [relative_path, backup_entry] : data) - { - size_t slash_pos = relative_path.find('/'); - if (slash_pos != String::npos) - { - String part_name = relative_path.substr(0, slash_pos); - if (MergeTreePartInfo::tryParsePartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)) - { - auto & hash = parts[part_name]; - if (relative_path.ends_with(".bin")) - { - auto checksum = backup_entry->getChecksum(); - hash.update(relative_path); - hash.update(backup_entry->getSize()); - hash.update(*checksum); - } - } - } + const String & database_name = element.database_name; + const String & new_database_name = element.new_database_name; + assert(!database_name.empty()); + assert(!new_database_name.empty()); + map.setNewDatabaseName(database_name, new_database_name); + break; } - std::vector part_names_and_checksums; - part_names_and_checksums.reserve(parts.size()); - for (auto & [part_name, hash] : parts) - { - UInt128 checksum; - hash.get128(checksum); - auto & part_name_and_checksum = 
part_names_and_checksums.emplace_back(); - part_name_and_checksum.part_name = part_name; - part_name_and_checksum.checksum = checksum; - } - backup_coordination->addReplicatedTablePartNames(backup_settings.host_id, table_name_, zk_path, part_names_and_checksums); - - return data; + case ASTBackupQuery::ALL: break; } - - /// Prepares to restore a database and all tables in it. - void prepareToBackupDatabase(const String & database_name_, const std::set & except_list_) - { - auto database = DatabaseCatalog::instance().getDatabase(database_name_, context); - prepareToBackupDatabase(database_name_, database, except_list_); - } - - void prepareToBackupDatabase(const String & database_name_, const DatabasePtr & database_, const std::set & except_list_) - { - /// Check that we are not trying to restore the same database again. - String name_in_backup = renaming_settings.getNewDatabaseName(database_name_); - if (databases.contains(name_in_backup)) - throw Exception(ErrorCodes::CANNOT_BACKUP_DATABASE, "Cannot backup the database {} twice", backQuoteIfNeed(name_in_backup)); - - /// Of course we're not going to backup the definition of the system or the temporary database. - if (!isSystemOrTemporaryDatabase(database_name_)) - { - /// Make a create query for this database. - auto create_query = prepareCreateQueryForBackup(database_->getCreateDatabaseQuery()); - - DatabaseInfo info; - info.create_query = create_query; - databases[name_in_backup] = std::move(info); - } - - /// Backup tables in this database. - if (database_->hasTablesToBackup()) - { - for (auto it = database_->getTablesIterator(context); it->isValid(); it->next()) - { - if (except_list_.contains(it->name())) - continue; - prepareToBackupTable({database_name_, it->name()}, {database_, it->table()}, {}); - } - } - } - - /// Prepares to backup all the databases contained in the backup. - void prepareToBackupAllDatabases(const std::set & except_list_) - { - for (const auto & [database_name, database] : DatabaseCatalog::instance().getDatabases()) - { - if (except_list_.contains(database_name)) - continue; - if (isSystemOrTemporaryDatabase(database_name)) - continue; - prepareToBackupDatabase(database_name, database, {}); - } - } - - /// Do renaming in the create query according to the renaming config. 
- std::shared_ptr prepareCreateQueryForBackup(const ASTPtr & ast) const - { - ASTPtr query = ast; - ::DB::renameInCreateQuery(query, context, renaming_settings); - auto create_query = typeid_cast>(query); - replaceTableUUIDWithMacroInReplicatedTableDef(*create_query, create_query->uuid); - create_query->uuid = UUIDHelpers::Nil; - create_query->to_inner_uuid = UUIDHelpers::Nil; - return create_query; - } - - static bool isSystemOrTemporaryDatabase(const String & database_name) - { - return (database_name == DatabaseCatalog::SYSTEM_DATABASE) || (database_name == DatabaseCatalog::TEMPORARY_DATABASE); - } - - std::pair makeBackupEntryForMetadata(const IAST & create_query) const - { - auto metadata_entry = std::make_unique(serializeAST(create_query)); - String metadata_path = PathsInBackup::getMetadataPath(create_query, shard_num_in_backup, replica_num_in_backup); - return {metadata_path, std::move(metadata_entry)}; - } - - struct TableInfo; - - void appendBackupEntriesForData(BackupEntries & res, const TableInfo & info) const - { - if (info.zk_path.empty()) - { - for (const auto & [relative_path, backup_entry] : info.data) - res.emplace_back(info.data_path + relative_path, backup_entry); - return; - } - - Strings data_paths = backup_coordination->getReplicatedTableDataPaths(info.zk_path); - Strings part_names = backup_coordination->getReplicatedTablePartNames(backup_settings.host_id, info.table_name, info.zk_path); - std::unordered_set part_names_set{part_names.begin(), part_names.end()}; - for (const auto & [relative_path, backup_entry] : info.data) - { - size_t slash_pos = relative_path.find('/'); - if (slash_pos != String::npos) - { - String part_name = relative_path.substr(0, slash_pos); - if (MergeTreePartInfo::tryParsePartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)) - { - if (!part_names_set.contains(part_name)) - continue; - for (const auto & data_path : data_paths) - res.emplace_back(data_path + relative_path, backup_entry); - continue; - } - } - res.emplace_back(info.data_path + relative_path, backup_entry); - } - } - - /// Information which is used to make an instance of RestoreTableFromBackupTask. - struct TableInfo - { - DatabaseAndTableName table_name; - ASTPtr create_query; - StoragePtr storage; - BackupEntries data; - String data_path; - String zk_path; - }; - - /// Information which is used to make an instance of RestoreDatabaseFromBackupTask. - struct DatabaseInfo - { - ASTPtr create_query; - }; - - ContextPtr context; - BackupSettings backup_settings; - std::shared_ptr backup_coordination; - size_t shard_num_in_backup = 0; - size_t replica_num_in_backup = 0; - DDLRenamingSettings renaming_settings; - std::unordered_map databases; - std::map tables; - }; -} - - -BackupEntries makeBackupEntries( - const ContextPtr & context, - const Elements & elements, - const BackupSettings & backup_settings, - std::shared_ptr backup_coordination, - std::chrono::seconds timeout_for_other_nodes_to_prepare) -{ - BackupEntriesBuilder builder{context, backup_settings, backup_coordination}; - builder.prepare(elements, timeout_for_other_nodes_to_prepare); - return builder.makeBackupEntries(); + } + return map; } @@ -499,13 +122,73 @@ void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries /// And IBackup's implementation should remove the backup in its destructor if finalizeWriting() hasn't called before. 
std::rethrow_exception(exception); } +} - backup->finalizeWriting(); + +void restoreTablesData(DataRestoreTasks && tasks, ThreadPool & thread_pool) +{ + size_t num_active_jobs = 0; + std::mutex mutex; + std::condition_variable event; + std::exception_ptr exception; + + for (auto & task : tasks) + { + { + std::unique_lock lock{mutex}; + if (exception) + break; + ++num_active_jobs; + } + + auto job = [&]() + { + SCOPE_EXIT({ + std::lock_guard lock{mutex}; + if (!--num_active_jobs) + event.notify_all(); + }); + + { + std::lock_guard lock{mutex}; + if (exception) + return; + } + + try + { + std::move(task)(); + } + catch (...) + { + std::lock_guard lock{mutex}; + if (!exception) + exception = std::current_exception(); + } + }; + + if (!thread_pool.trySchedule(job)) + job(); + } + + { + std::unique_lock lock{mutex}; + event.wait(lock, [&] { return !num_active_jobs; }); + } + + tasks.clear(); + + if (exception) + { + /// We don't call finalizeWriting() if an error occurs. + /// And IBackup's implementation should remove the backup in its destructor if finalizeWriting() hasn't called before. + std::rethrow_exception(exception); + } } /// Returns access required to execute BACKUP query. -AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & elements, const BackupSettings & backup_settings) +AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & elements) { AccessRightsElements required_access; for (const auto & element : elements) @@ -514,32 +197,27 @@ AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & { case ASTBackupQuery::TABLE: { - if (element.is_temp_db) - break; - AccessFlags flags = AccessType::SHOW_TABLES; - if (!backup_settings.structure_only) - flags |= AccessType::SELECT; - required_access.emplace_back(flags, element.name.first, element.name.second); + required_access.emplace_back(AccessType::BACKUP, element.database_name, element.table_name); break; } + + case ASTBackupQuery::TEMPORARY_TABLE: + { + /// It's always allowed to backup temporary tables. + break; + } + case ASTBackupQuery::DATABASE: { - if (element.is_temp_db) - break; - AccessFlags flags = AccessType::SHOW_TABLES | AccessType::SHOW_DATABASES; - if (!backup_settings.structure_only) - flags |= AccessType::SELECT; - required_access.emplace_back(flags, element.name.first); - /// TODO: It's better to process `element.except_list` somehow. + /// TODO: It's better to process `element.except_tables` somehow. + required_access.emplace_back(AccessType::BACKUP, element.database_name); break; } - case ASTBackupQuery::ALL_DATABASES: + + case ASTBackupQuery::ALL: { - AccessFlags flags = AccessType::SHOW_TABLES | AccessType::SHOW_DATABASES; - if (!backup_settings.structure_only) - flags |= AccessType::SELECT; - required_access.emplace_back(flags); - /// TODO: It's better to process `element.except_list` somehow. + /// TODO: It's better to process `element.except_databases` & `element.except_tables` somehow. 
+ required_access.emplace_back(AccessType::BACKUP); break; } } diff --git a/src/Backups/BackupUtils.h b/src/Backups/BackupUtils.h index 315443a0f1d..cda9121b1fa 100644 --- a/src/Backups/BackupUtils.h +++ b/src/Backups/BackupUtils.h @@ -7,29 +7,23 @@ namespace DB { class IBackup; -using BackupPtr = std::shared_ptr; using BackupMutablePtr = std::shared_ptr; class IBackupEntry; -using BackupEntryPtr = std::shared_ptr; -using BackupEntries = std::vector>; -struct BackupSettings; -class IBackupCoordination; +using BackupEntries = std::vector>>; +using DataRestoreTasks = std::vector>; class AccessRightsElements; -class Context; -using ContextPtr = std::shared_ptr; +class DDLRenamingMap; -/// Prepares backup entries. -BackupEntries makeBackupEntries( - const ContextPtr & context, - const ASTBackupQuery::Elements & elements, - const BackupSettings & backup_settings, - std::shared_ptr backup_coordination, - std::chrono::seconds timeout_for_other_nodes_to_prepare = std::chrono::seconds::zero()); +/// Initializes a DDLRenamingMap from a BACKUP or RESTORE query. +DDLRenamingMap makeRenamingMapFromBackupQuery(const ASTBackupQuery::Elements & elements); /// Write backup entries to an opened backup. void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, ThreadPool & thread_pool); +/// Run data restoring tasks which insert data to tables. +void restoreTablesData(DataRestoreTasks && tasks, ThreadPool & thread_pool); + /// Returns access required to execute BACKUP query. -AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & elements, const BackupSettings & backup_settings); +AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & elements); } diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index e84ec350be7..c19d730cf7e 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -4,13 +4,13 @@ #include #include #include +#include #include #include -#include #include #include #include -#include +#include #include #include #include @@ -51,119 +51,121 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c { UUID backup_uuid = UUIDHelpers::generateV4(); auto backup_query = std::static_pointer_cast(query->clone()); - auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); + auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); + + bool on_cluster = !backup_query->cluster.empty(); + ContextPtr context_in_use = context; + ContextMutablePtr mutable_context; + if (on_cluster || backup_settings.async) + context_in_use = mutable_context = Context::createCopy(context); addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP, backup_settings.internal); - std::shared_ptr backup_coordination; - SCOPE_EXIT({ - if (backup_coordination && !backup_settings.internal) - backup_coordination->drop(); - }); - - BackupMutablePtr backup; - ContextPtr cloned_context; - bool on_cluster = !backup_query->cluster.empty(); - std::shared_ptr on_cluster_io; - - try - { - auto access_to_check = getRequiredAccessToBackup(backup_query->elements, backup_settings); - if (!on_cluster) - context->checkAccess(access_to_check); - - ClusterPtr cluster; - if (on_cluster) - { - backup_query->cluster = context->getMacros()->expand(backup_query->cluster); - cluster = context->getCluster(backup_query->cluster); - backup_settings.cluster_host_ids = cluster->getHostIDs(); - if 
(backup_settings.coordination_zk_path.empty()) - { - String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid); - } - backup_settings.copySettingsToQuery(*backup_query); - } - - if (!backup_settings.coordination_zk_path.empty()) - backup_coordination = std::make_shared( - backup_settings.coordination_zk_path, - [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }); - else - backup_coordination = std::make_shared(); - - BackupFactory::CreateParams backup_create_params; - backup_create_params.open_mode = IBackup::OpenMode::WRITE; - backup_create_params.context = context; - backup_create_params.backup_info = backup_info; - backup_create_params.base_backup_info = backup_settings.base_backup_info; - backup_create_params.compression_method = backup_settings.compression_method; - backup_create_params.compression_level = backup_settings.compression_level; - backup_create_params.password = backup_settings.password; - backup_create_params.backup_uuid = backup_uuid; - backup_create_params.is_internal_backup = backup_settings.internal; - backup_create_params.backup_coordination = backup_coordination; - backup = BackupFactory::instance().createBackup(backup_create_params); - - ContextMutablePtr mutable_context; - if (on_cluster || backup_settings.async) - cloned_context = mutable_context = Context::createCopy(context); - else - cloned_context = context; /// No need to clone context - - if (on_cluster) - { - DDLQueryOnClusterParams params; - params.cluster = cluster; - params.only_shard_num = backup_settings.shard_num; - params.only_replica_num = backup_settings.replica_num; - params.access_to_check = access_to_check; - mutable_context->setSetting("distributed_ddl_task_timeout", -1); // No timeout - mutable_context->setSetting("distributed_ddl_output_mode", Field{"throw"}); - auto res = executeDDLQueryOnCluster(backup_query, mutable_context, params); - on_cluster_io = std::make_shared(std::move(res)); - } - } - catch (...) - { - setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); - throw; - } - auto job = [this, - backup, backup_uuid, backup_query, backup_settings, - backup_coordination, - on_cluster_io, - cloned_context](bool in_separate_thread) + backup_info, + on_cluster, + context_in_use, + mutable_context](bool in_separate_thread) mutable { try { - if (on_cluster_io) + /// Checks access rights if this is not ON CLUSTER query. + /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.) + auto required_access = getRequiredAccessToBackup(backup_query->elements); + if (!on_cluster) + context_in_use->checkAccess(required_access); + + /// Make a backup coordination. 
+ std::shared_ptr backup_coordination; + SCOPE_EXIT({ + if (backup_coordination && !backup_settings.internal) + backup_coordination->drop(); + }); + + ClusterPtr cluster; + if (on_cluster) { + backup_query->cluster = context_in_use->getMacros()->expand(backup_query->cluster); + cluster = context_in_use->getCluster(backup_query->cluster); + backup_settings.cluster_host_ids = cluster->getHostIDs(); + if (backup_settings.coordination_zk_path.empty()) + { + String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); + backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid); + } + } + + if (!backup_settings.coordination_zk_path.empty()) + { + backup_coordination = std::make_shared( + backup_settings.coordination_zk_path, + [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); }); + } + else + { + backup_coordination = std::make_shared(); + } + + /// Opens a backup for writing. + BackupFactory::CreateParams backup_create_params; + backup_create_params.open_mode = IBackup::OpenMode::WRITE; + backup_create_params.context = context_in_use; + backup_create_params.backup_info = backup_info; + backup_create_params.base_backup_info = backup_settings.base_backup_info; + backup_create_params.compression_method = backup_settings.compression_method; + backup_create_params.compression_level = backup_settings.compression_level; + backup_create_params.password = backup_settings.password; + backup_create_params.backup_uuid = backup_uuid; + backup_create_params.is_internal_backup = backup_settings.internal; + backup_create_params.backup_coordination = backup_coordination; + BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params); + + /// Write the backup. 
+ if (on_cluster) + { + DDLQueryOnClusterParams params; + params.cluster = cluster; + params.only_shard_num = backup_settings.shard_num; + params.only_replica_num = backup_settings.replica_num; + params.access_to_check = required_access; + mutable_context->setSetting("distributed_ddl_task_timeout", -1); // No timeout + mutable_context->setSetting("distributed_ddl_output_mode", Field{"throw"}); + backup_settings.copySettingsToQuery(*backup_query); + auto res = executeDDLQueryOnCluster(backup_query, mutable_context, params); + auto on_cluster_io = std::make_shared(std::move(res)); PullingPipelineExecutor executor(on_cluster_io->pipeline); Block block; - while (executor.pull(block)) - ; - backup->finalizeWriting(); + while (executor.pull(block)); } else { std::optional query_scope; if (in_separate_thread) - query_scope.emplace(cloned_context); + query_scope.emplace(context_in_use); - backup_query->setDatabase(cloned_context->getCurrentDatabase()); + backup_query->setCurrentDatabase(context_in_use->getCurrentDatabase()); + + BackupEntries backup_entries; + { + auto timeout = std::chrono::seconds{context_in_use->getConfigRef().getInt("backups.backup_prepare_timeout", -1)}; + BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context_in_use, timeout}; + backup_entries = backup_entries_collector.getBackupEntries(); + } - auto timeout_for_preparing = std::chrono::seconds{cloned_context->getConfigRef().getInt("backups.backup_prepare_timeout", -1)}; - auto backup_entries - = makeBackupEntries(cloned_context, backup_query->elements, backup_settings, backup_coordination, timeout_for_preparing); writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool); } + + /// Finalize backup (write its metadata). + if (!backup_settings.internal) + backup->finalizeWriting(); + + /// Close the backup. + backup.reset(); + setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); } catch (...) 
@@ -175,7 +177,7 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c }; if (backup_settings.async) - backups_thread_pool.scheduleOrThrowOnError([job] { job(true); }); + backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); }); else job(false); @@ -187,85 +189,99 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte { UUID restore_uuid = UUIDHelpers::generateV4(); auto restore_query = std::static_pointer_cast(query->clone()); - auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); + auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); + + bool on_cluster = !restore_query->cluster.empty(); + ContextMutablePtr context_in_use = context; + if (restore_settings.async || on_cluster) + context_in_use = Context::createCopy(context); addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING, restore_settings.internal); - std::shared_ptr restore_coordination; - SCOPE_EXIT({ - if (restore_coordination && !restore_settings.internal) - restore_coordination->drop(); - }); - - ContextMutablePtr cloned_context; - std::shared_ptr on_cluster_io; - bool on_cluster = !restore_query->cluster.empty(); - - try - { - auto access_to_check = getRequiredAccessToRestore(restore_query->elements, restore_settings); - if (!on_cluster) - context->checkAccess(access_to_check); - - ClusterPtr cluster; - if (on_cluster) - { - restore_query->cluster = context->getMacros()->expand(restore_query->cluster); - cluster = context->getCluster(restore_query->cluster); - restore_settings.cluster_host_ids = cluster->getHostIDs(); - if (restore_settings.coordination_zk_path.empty()) - { - String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid); - } - restore_settings.copySettingsToQuery(*restore_query); - } - - if (!restore_settings.coordination_zk_path.empty()) - restore_coordination = std::make_shared( - restore_settings.coordination_zk_path, - [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }); - else - restore_coordination = std::make_shared(); - - if (on_cluster || restore_settings.async) - cloned_context = Context::createCopy(context); - else - cloned_context = context; /// No need to clone context - - if (on_cluster) - { - DDLQueryOnClusterParams params; - params.cluster = cluster; - params.only_shard_num = restore_settings.shard_num; - params.only_replica_num = restore_settings.replica_num; - params.access_to_check = access_to_check; - cloned_context->setSetting("distributed_ddl_task_timeout", -1); // No timeout - cloned_context->setSetting("distributed_ddl_output_mode", Field{"throw"}); - auto res = executeDDLQueryOnCluster(restore_query, cloned_context, params); - on_cluster_io = std::make_shared(std::move(res)); - } - } - catch (...) - { - setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); - throw; - } - auto job = [this, - backup_info, restore_uuid, restore_query, restore_settings, - restore_coordination, - on_cluster_io, - cloned_context](bool in_separate_thread) + backup_info, + on_cluster, + context_in_use](bool in_separate_thread) mutable { try { - if (on_cluster_io) + /// Open the backup for reading. 
+ BackupFactory::CreateParams backup_open_params; + backup_open_params.open_mode = IBackup::OpenMode::READ; + backup_open_params.context = context_in_use; + backup_open_params.backup_info = backup_info; + backup_open_params.base_backup_info = restore_settings.base_backup_info; + backup_open_params.password = restore_settings.password; + BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); + + String current_database = context_in_use->getCurrentDatabase(); + + /// Checks access rights if this is an ON CLUSTER query. + /// (If this isn't an ON CLUSTER query, RestorerFromBackup will check access rights later.) + ClusterPtr cluster; + if (on_cluster) { + restore_query->cluster = context_in_use->getMacros()->expand(restore_query->cluster); + cluster = context_in_use->getCluster(restore_query->cluster); + restore_settings.cluster_host_ids = cluster->getHostIDs(); + + /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect + /// because different replicas can contain different sets of tables and so the required access rights can differ too. + /// So the right way is to go through the entire cluster and check access for each host. + auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num); + for (const auto * address : addresses) + { + restore_settings.host_id = address->toString(); + auto restore_elements = restore_query->elements; + String addr_database = address->default_database.empty() ? current_database : address->default_database; + for (auto & element : restore_elements) + element.setCurrentDatabase(addr_database); + RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context_in_use, {}}; + dummy_restorer.checkAccessOnly(); + } + } + + /// Make a restore coordination. + std::shared_ptr restore_coordination; + SCOPE_EXIT({ + if (restore_coordination && !restore_settings.internal) + restore_coordination->drop(); + }); + + if (on_cluster && restore_settings.coordination_zk_path.empty()) + { + String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); + restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid); + } + + if (!restore_settings.coordination_zk_path.empty()) + { + restore_coordination = std::make_shared( + restore_settings.coordination_zk_path, + [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); }); + } + else + { + restore_coordination = std::make_shared(); + } + + /// Do RESTORE.
+ if (on_cluster) + { + + DDLQueryOnClusterParams params; + params.cluster = cluster; + params.only_shard_num = restore_settings.shard_num; + params.only_replica_num = restore_settings.replica_num; + context_in_use->setSetting("distributed_ddl_task_timeout", -1); // No timeout + context_in_use->setSetting("distributed_ddl_output_mode", Field{"throw"}); + restore_settings.copySettingsToQuery(*restore_query); + auto res = executeDDLQueryOnCluster(restore_query, context_in_use, params); + auto on_cluster_io = std::make_shared(std::move(res)); PullingPipelineExecutor executor(on_cluster_io->pipeline); Block block; while (executor.pull(block)) @@ -275,24 +291,20 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte { std::optional query_scope; if (in_separate_thread) - query_scope.emplace(cloned_context); + query_scope.emplace(context_in_use); - restore_query->setDatabase(cloned_context->getCurrentDatabase()); + restore_query->setCurrentDatabase(current_database); - BackupFactory::CreateParams backup_open_params; - backup_open_params.open_mode = IBackup::OpenMode::READ; - backup_open_params.context = cloned_context; - backup_open_params.backup_info = backup_info; - backup_open_params.base_backup_info = restore_settings.base_backup_info; - backup_open_params.password = restore_settings.password; - BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); + DataRestoreTasks data_restore_tasks; + { + auto timeout = std::chrono::seconds{context_in_use->getConfigRef().getInt("backups.restore_metadata_timeout", -1)}; + RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination, + backup, context_in_use, timeout}; + restorer.restoreMetadata(); + data_restore_tasks = restorer.getDataRestoreTasks(); + } - auto timeout_for_restoring_metadata - = std::chrono::seconds{cloned_context->getConfigRef().getInt("backups.restore_metadata_timeout", -1)}; - auto restore_tasks = makeRestoreTasks( - cloned_context, backup, restore_query->elements, restore_settings, restore_coordination, timeout_for_restoring_metadata); - restoreMetadata(restore_tasks, restore_settings, restore_coordination, timeout_for_restoring_metadata); - restoreData(restore_tasks, restores_thread_pool); + restoreTablesData(std::move(data_restore_tasks), restores_thread_pool); } setStatus(restore_uuid, BackupStatus::RESTORED); @@ -306,7 +318,7 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte }; if (restore_settings.async) - backups_thread_pool.scheduleOrThrowOnError([job] { job(true); }); + backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); }); else job(false); diff --git a/src/Backups/DDLCompareUtils.cpp b/src/Backups/DDLCompareUtils.cpp deleted file mode 100644 index 625a0befe63..00000000000 --- a/src/Backups/DDLCompareUtils.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include - - -namespace DB -{ -namespace -{ - std::shared_ptr prepareDDLToCompare(const ASTCreateQuery & ast) - { - auto res = typeid_cast>(ast.shared_from_this()); - - std::shared_ptr clone; - auto get_clone = [&] - { - if (!clone) - { - clone = typeid_cast>(res->clone()); - res = clone; - } - return clone; - }; - - /// Remove UUIDs. - if (res->uuid != UUIDHelpers::Nil) - get_clone()->uuid = UUIDHelpers::Nil; - - if (res->to_inner_uuid != UUIDHelpers::Nil) - get_clone()->to_inner_uuid = UUIDHelpers::Nil; - - /// Clear IF NOT EXISTS flag. 
- if (res->if_not_exists) - get_clone()->if_not_exists = false; - - return res; - } -} - - -bool areTableDefinitionsSame(const IAST & table1, const IAST & table2) -{ - auto ast1 = typeid_cast>(table1.shared_from_this()); - if (!ast1 || !ast1->table) - return false; - - auto ast2 = typeid_cast>(table2.shared_from_this()); - if (!ast2 || !ast2->table) - return false; - - if ((ast1->uuid != ast2->uuid) || (ast1->to_inner_uuid != ast2->to_inner_uuid) || - (ast1->if_not_exists != ast2->if_not_exists)) - { - ast1 = prepareDDLToCompare(*ast1); - ast2 = prepareDDLToCompare(*ast2); - } - - return serializeAST(*ast1) == serializeAST(*ast1); -} - - -bool areDatabaseDefinitionsSame(const IAST & database1, const IAST & database2) -{ - auto ast1 = typeid_cast>(database1.shared_from_this()); - if (!ast1 || ast1->table || !ast1->database) - return false; - - auto ast2 = typeid_cast>(database2.shared_from_this()); - if (!ast2 || ast2->table || !ast2->database) - return false; - - if ((ast1->uuid != ast2->uuid) || (ast1->if_not_exists != ast2->if_not_exists)) - { - ast1 = prepareDDLToCompare(*ast1); - ast2 = prepareDDLToCompare(*ast2); - } - - return serializeAST(*ast1) == serializeAST(*ast1); -} - - -bool areTableDataCompatible(const IAST & src_table, const IAST & dest_table) -{ - return areTableDefinitionsSame(src_table, dest_table); -} - -} diff --git a/src/Backups/DDLCompareUtils.h b/src/Backups/DDLCompareUtils.h deleted file mode 100644 index acb99c243ea..00000000000 --- a/src/Backups/DDLCompareUtils.h +++ /dev/null @@ -1,17 +0,0 @@ -#pragma once - - -namespace DB -{ -class IAST; - -/// Checks that two table definitions are actually the same. -bool areTableDefinitionsSame(const IAST & table1, const IAST & table2); - -/// Checks that two database definitions are actually the same. -bool areDatabaseDefinitionsSame(const IAST & database1, const IAST & database2); - -/// Whether the data from the first table can be attached to the second table. -bool areTableDataCompatible(const IAST & src_table, const IAST & dest_table); - -} diff --git a/src/Backups/DDLRenamingVisitor.cpp b/src/Backups/DDLRenamingVisitor.cpp deleted file mode 100644 index fc5cd6f3958..00000000000 --- a/src/Backups/DDLRenamingVisitor.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int WRONG_DDL_RENAMING_SETTINGS; - extern const int LOGICAL_ERROR; -} - -namespace -{ - /// Replaces names of tables and databases used in a CREATE query, which can be either CREATE TABLE or - /// CREATE DICTIONARY or CREATE VIEW or CREATE TEMPORARY TABLE or CREATE DATABASE query. 
- void visitCreateQuery(ASTCreateQuery & create, const DDLRenamingVisitor::Data & data) - { - if (create.table) - { - DatabaseAndTableName table_name; - table_name.second = create.getTable(); - if (create.temporary) - table_name.first = DatabaseCatalog::TEMPORARY_DATABASE; - else if (create.database) - table_name.first = create.getDatabase(); - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name specified in the CREATE TABLE query must not be empty"); - - table_name = data.renaming_settings.getNewTableName(table_name); - - if (table_name.first == DatabaseCatalog::TEMPORARY_DATABASE) - { - create.temporary = true; - create.setDatabase(""); - } - else - { - create.temporary = false; - create.setDatabase(table_name.first); - } - create.setTable(table_name.second); - } - else if (create.database) - { - String database_name = create.getDatabase(); - database_name = data.renaming_settings.getNewDatabaseName(database_name); - create.setDatabase(database_name); - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name specified in the CREATE DATABASE query must not be empty"); - - if (!create.as_table.empty() && !create.as_database.empty()) - std::tie(create.as_database, create.as_table) = data.renaming_settings.getNewTableName({create.as_database, create.as_table}); - - if (!create.to_table_id.table_name.empty() && !create.to_table_id.database_name.empty()) - { - auto to_table = data.renaming_settings.getNewTableName({create.to_table_id.database_name, create.to_table_id.table_name}); - create.to_table_id = StorageID{to_table.first, to_table.second}; - } - } - - /// Replaces names of a database and a table in a expression like `db`.`table` - void visitTableExpression(ASTTableExpression & expr, const DDLRenamingVisitor::Data & data) - { - if (!expr.database_and_table_name) - return; - - ASTIdentifier * id = expr.database_and_table_name->as(); - if (!id) - return; - - auto table_id = id->createTable(); - if (!table_id) - return; - - const String & db_name = table_id->getDatabaseName(); - const String & table_name = table_id->shortName(); - if (db_name.empty() || table_name.empty()) - return; - - String new_db_name, new_table_name; - std::tie(new_db_name, new_table_name) = data.renaming_settings.getNewTableName({db_name, table_name}); - if ((new_db_name == db_name) && (new_table_name == table_name)) - return; - - expr.database_and_table_name = std::make_shared(Strings{new_db_name, new_table_name}); - expr.children.push_back(expr.database_and_table_name); - } - - /// Replaces a database's name passed via an argument of the function merge() or the table engine Merge. - void visitFunctionMerge(ASTFunction & function, const DDLRenamingVisitor::Data & data) - { - if (!function.arguments) - return; - - /// The first argument is a database's name and we can rename it. - /// The second argument is a regular expression and we can do nothing about it. - auto & args = function.arguments->as().children; - size_t db_name_arg_index = 0; - if (args.size() <= db_name_arg_index) - return; - - String db_name = evaluateConstantExpressionForDatabaseName(args[db_name_arg_index], data.context)->as().value.safeGet(); - if (db_name.empty()) - return; - - String new_db_name = data.renaming_settings.getNewDatabaseName(db_name); - if (new_db_name == db_name) - return; - args[db_name_arg_index] = std::make_shared(new_db_name); - } - - /// Replaces names of a table and a database passed via arguments of the function remote() or cluster() or the table engine Distributed. 
- void visitFunctionRemote(ASTFunction & function, const DDLRenamingVisitor::Data & data) - { - if (!function.arguments) - return; - - /// The first argument is an address or cluster's name, so we skip it. - /// The second argument can be either 'db.name' or just 'db' followed by the third argument 'table'. - auto & args = function.arguments->as().children; - - const auto * second_arg_as_function = args[1]->as(); - if (second_arg_as_function && TableFunctionFactory::instance().isTableFunctionName(second_arg_as_function->name)) - return; - - size_t db_name_index = 1; - if (args.size() <= db_name_index) - return; - - String name = evaluateConstantExpressionForDatabaseName(args[db_name_index], data.context)->as().value.safeGet(); - - size_t table_name_index = static_cast(-1); - - QualifiedTableName qualified_name; - - if (function.name == "Distributed") - qualified_name.table = name; - else - qualified_name = QualifiedTableName::parseFromString(name); - - if (qualified_name.database.empty()) - { - std::swap(qualified_name.database, qualified_name.table); - table_name_index = 2; - if (args.size() <= table_name_index) - return; - qualified_name.table = evaluateConstantExpressionForDatabaseName(args[table_name_index], data.context)->as().value.safeGet(); - } - - const String & db_name = qualified_name.database; - const String & table_name = qualified_name.table; - - if (db_name.empty() || table_name.empty()) - return; - - String new_db_name, new_table_name; - std::tie(new_db_name, new_table_name) = data.renaming_settings.getNewTableName({db_name, table_name}); - if ((new_db_name == db_name) && (new_table_name == table_name)) - return; - - if (table_name_index != static_cast(-1)) - { - if (new_db_name != db_name) - args[db_name_index] = std::make_shared(new_db_name); - if (new_table_name != table_name) - args[table_name_index] = std::make_shared(new_table_name); - } - else - { - args[db_name_index] = std::make_shared(new_db_name); - args.insert(args.begin() + db_name_index + 1, std::make_shared(new_table_name)); - } - } - - /// Replaces names of tables and databases used in arguments of a table function or a table engine. - void visitFunction(ASTFunction & function, const DDLRenamingVisitor::Data & data) - { - if ((function.name == "merge") || (function.name == "Merge")) - { - visitFunctionMerge(function, data); - } - else if ((function.name == "remote") || (function.name == "remoteSecure") || (function.name == "cluster") || - (function.name == "clusterAllReplicas") || (function.name == "Distributed")) - { - visitFunctionRemote(function, data); - } - } - - /// Replaces names of a table and a database used in source parameters of a dictionary. 
- void visitDictionary(ASTDictionary & dictionary, const DDLRenamingVisitor::Data & data) - { - if (!dictionary.source || dictionary.source->name != "clickhouse" || !dictionary.source->elements) - return; - - auto & elements = dictionary.source->elements->as().children; - String db_name, table_name; - size_t db_name_index = static_cast(-1); - size_t table_name_index = static_cast(-1); - - for (size_t i = 0; i != elements.size(); ++i) - { - auto & pair = elements[i]->as(); - if (pair.first == "db") - { - if (db_name_index != static_cast(-1)) - return; - db_name = pair.second->as().value.safeGet(); - db_name_index = i; - } - else if (pair.first == "table") - { - if (table_name_index != static_cast(-1)) - return; - table_name = pair.second->as().value.safeGet(); - table_name_index = i; - } - } - - if (db_name.empty() || table_name.empty()) - return; - - String new_db_name, new_table_name; - std::tie(new_db_name, new_table_name) = data.renaming_settings.getNewTableName({db_name, table_name}); - if ((new_db_name == db_name) && (new_table_name == table_name)) - return; - - if (new_db_name != db_name) - { - auto & pair = elements[db_name_index]->as(); - pair.replace(pair.second, std::make_shared(new_db_name)); - } - if (new_table_name != table_name) - { - auto & pair = elements[table_name_index]->as(); - pair.replace(pair.second, std::make_shared(new_table_name)); - } - } -} - - -void DDLRenamingSettings::setNewTableName(const DatabaseAndTableName & old_table_name, const DatabaseAndTableName & new_table_name) -{ - if (old_table_name.first.empty() || old_table_name.second.empty() || new_table_name.first.empty() || new_table_name.second.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty names are not allowed for DDLRenamingSettings::setNewTableName"); - - auto it = old_to_new_table_names.find(old_table_name); - if ((it != old_to_new_table_names.end())) - { - if (it->second == new_table_name) - return; - throw Exception(ErrorCodes::WRONG_DDL_RENAMING_SETTINGS, "Wrong renaming: it's specified that table {}.{} should be renamed to {}.{} and to {}.{} at the same time", - backQuoteIfNeed(old_table_name.first), backQuoteIfNeed(old_table_name.second), - backQuoteIfNeed(it->second.first), backQuoteIfNeed(it->second.second), - backQuoteIfNeed(new_table_name.first), backQuoteIfNeed(new_table_name.second)); - } - old_to_new_table_names[old_table_name] = new_table_name; -} - -void DDLRenamingSettings::setNewDatabaseName(const String & old_database_name, const String & new_database_name) -{ - if (old_database_name.empty() || new_database_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty names are not allowed for DDLRenamingSettings::setNewDatabaseName"); - - auto it = old_to_new_database_names.find(old_database_name); - if ((it != old_to_new_database_names.end())) - { - if (it->second == new_database_name) - return; - throw Exception(ErrorCodes::WRONG_DDL_RENAMING_SETTINGS, "Wrong renaming: it's specified that database {} should be renamed to {} and to {} at the same time", - backQuoteIfNeed(old_database_name), backQuoteIfNeed(it->second), backQuoteIfNeed(new_database_name)); - } - old_to_new_database_names[old_database_name] = new_database_name; -} - -void DDLRenamingSettings::setFromBackupQuery(const ASTBackupQuery & backup_query) -{ - setFromBackupQuery(backup_query.elements); -} - -void DDLRenamingSettings::setFromBackupQuery(const ASTBackupQuery::Elements & backup_query_elements) -{ - old_to_new_table_names.clear(); - old_to_new_database_names.clear(); - - using ElementType = 
ASTBackupQuery::ElementType; - - for (const auto & element : backup_query_elements) - { - switch (element.type) - { - case ElementType::TABLE: - { - const String & table_name = element.name.second; - String database_name = element.name.first; - if (element.is_temp_db) - database_name = DatabaseCatalog::TEMPORARY_DATABASE; - assert(!table_name.empty()); - assert(!database_name.empty()); - - const String & new_table_name = element.new_name.second; - String new_database_name = element.new_name.first; - if (element.is_temp_db) - new_database_name = DatabaseCatalog::TEMPORARY_DATABASE; - assert(!new_table_name.empty()); - assert(!new_database_name.empty()); - - setNewTableName({database_name, table_name}, {new_database_name, new_table_name}); - break; - } - - case ASTBackupQuery::DATABASE: - { - String database_name = element.name.first; - if (element.is_temp_db) - database_name = DatabaseCatalog::TEMPORARY_DATABASE; - assert(!database_name.empty()); - - String new_database_name = element.new_name.first; - if (element.is_temp_db) - new_database_name = DatabaseCatalog::TEMPORARY_DATABASE; - assert(!new_database_name.empty()); - - setNewDatabaseName(database_name, new_database_name); - break; - } - - case ASTBackupQuery::ALL_DATABASES: break; - } - } -} - -DatabaseAndTableName DDLRenamingSettings::getNewTableName(const DatabaseAndTableName & old_table_name) const -{ - auto it = old_to_new_table_names.find(old_table_name); - if (it != old_to_new_table_names.end()) - return it->second; - return {getNewDatabaseName(old_table_name.first), old_table_name.second}; -} - -const String & DDLRenamingSettings::getNewDatabaseName(const String & old_database_name) const -{ - auto it = old_to_new_database_names.find(old_database_name); - if (it != old_to_new_database_names.end()) - return it->second; - return old_database_name; -} - - -bool DDLRenamingVisitor::needChildVisit(ASTPtr &, const ASTPtr &) { return true; } - -void DDLRenamingVisitor::visit(ASTPtr & ast, const Data & data) -{ - if (auto * create = ast->as()) - visitCreateQuery(*create, data); - else if (auto * expr = ast->as()) - visitTableExpression(*expr, data); - else if (auto * function = ast->as()) - visitFunction(*function, data); - else if (auto * dictionary = ast->as()) - visitDictionary(*dictionary, data); -} - -void renameInCreateQuery(ASTPtr & ast, const ContextPtr & global_context, const DDLRenamingSettings & renaming_settings) -{ - try - { - DDLRenamingVisitor::Data data{renaming_settings, global_context}; - DDLRenamingVisitor::Visitor{data}.visit(ast); - } - catch (...) - { - tryLogCurrentException("Backup", "Error while renaming in AST"); - } -} - -} diff --git a/src/Backups/DDLRenamingVisitor.h b/src/Backups/DDLRenamingVisitor.h deleted file mode 100644 index c255039dea7..00000000000 --- a/src/Backups/DDLRenamingVisitor.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -using DatabaseAndTableName = std::pair; -class IAST; -using ASTPtr = std::shared_ptr; -class Context; -using ContextPtr = std::shared_ptr; - -/// Keeps information about renamings of databases or tables being processed -/// while we're making a backup or while we're restoring from a backup. 
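/// A minimal standalone sketch (illustrative types, not from this codebase) of the lookup rule implemented
/// by DDLRenamingSettings: an exact (database, table) mapping wins, otherwise only the database part is
/// renamed, and registering two different targets for the same source is rejected.
#include <map>
#include <stdexcept>
#include <string>
#include <utility>

struct RenamingMapSketch
{
    using NameAndTable = std::pair<std::string, std::string>;

    std::map<NameAndTable, NameAndTable> tables;
    std::map<std::string, std::string> databases;

    void setNewTableName(const NameAndTable & from, const NameAndTable & to)
    {
        auto [it, inserted] = tables.emplace(from, to);
        if (!inserted && it->second != to)
            throw std::runtime_error("conflicting renamings for the same table");
    }

    NameAndTable getNewTableName(const NameAndTable & from) const
    {
        if (auto it = tables.find(from); it != tables.end())
            return it->second;                            /// exact table mapping
        if (auto it = databases.find(from.first); it != databases.end())
            return {it->second, from.second};             /// only the database part changes
        return from;                                      /// no renaming configured
    }
};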
-class DDLRenamingSettings -{ -public: - DDLRenamingSettings() = default; - - void setNewTableName(const DatabaseAndTableName & old_table_name, const DatabaseAndTableName & new_table_name); - void setNewDatabaseName(const String & old_database_name, const String & new_database_name); - - void setFromBackupQuery(const ASTBackupQuery & backup_query); - void setFromBackupQuery(const ASTBackupQuery::Elements & backup_query_elements); - - /// Changes names according to the renaming. - DatabaseAndTableName getNewTableName(const DatabaseAndTableName & old_table_name) const; - const String & getNewDatabaseName(const String & old_database_name) const; - -private: - std::map old_to_new_table_names; - std::unordered_map old_to_new_database_names; -}; - - -/// Changes names in AST according to the renaming settings. -void renameInCreateQuery(ASTPtr & ast, const ContextPtr & global_context, const DDLRenamingSettings & renaming_settings); - -/// Visits ASTCreateQuery and changes names of tables and databases according to passed DDLRenamingConfig. -class DDLRenamingVisitor -{ -public: - struct Data - { - const DDLRenamingSettings & renaming_settings; - ContextPtr context; - }; - - using Visitor = InDepthNodeVisitor; - - static bool needChildVisit(ASTPtr &, const ASTPtr &); - static void visit(ASTPtr & ast, const Data & data); -}; - -} diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index 4e8375a078e..467c8fea4cd 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -36,18 +36,19 @@ public: /// Returns UUID of the backup. virtual UUID getUUID() const = 0; - /// Returns names of entries stored in the backup. - /// If `prefix` isn't empty the function will return only the names starting with - /// the prefix (but without the prefix itself). - /// If the `terminator` isn't empty the function will returns only parts of the names - /// before the terminator. For example, list("", "") returns names of all the entries - /// in the backup; and list("data/", "/") return kind of a list of folders and - /// files stored in the "data/" directory inside the backup. - virtual Strings listFiles(const String & prefix = "", const String & terminator = "/") const = 0; /// NOLINT + /// Returns names of entries stored in a specified directory in the backup. + /// If `directory` is empty or '/' the functions returns entries in the backup's root. + virtual Strings listFiles(const String & directory, bool recursive = false) const = 0; + + /// Checks if a specified directory contains any files. + /// The function returns the same as `!listFiles(directory).empty()`. + virtual bool hasFiles(const String & directory) const = 0; + + using SizeAndChecksum = std::pair; /// Checks if an entry with a specified name exists. virtual bool fileExists(const String & file_name) const = 0; - virtual bool fileExists(const std::pair & size_and_checksum) const = 0; + virtual bool fileExists(const SizeAndChecksum & size_and_checksum) const = 0; /// Returns the size of the entry's data. /// This function does the same as `read(file_name)->getSize()` but faster. @@ -57,8 +58,6 @@ public: /// This function does the same as `read(file_name)->getCheckum()` but faster. virtual UInt128 getFileChecksum(const String & file_name) const = 0; - using SizeAndChecksum = std::pair; - /// Returns both the size and checksum in one call. 
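/// A hedged usage sketch of the reworked listing API above (the data path argument is hypothetical):
/// listFiles() now takes a directory plus a recursive flag instead of a prefix/terminator pair, and
/// hasFiles() answers the common "is this directory non-empty?" question directly.
Strings listTableFilesInBackupSketch(const IBackup & backup, const String & data_path_in_backup)
{
    /// Immediate entries only; pass recursive = true to also descend into subdirectories.
    return backup.listFiles(data_path_in_backup, /* recursive = */ false);
}

bool tableHasDataInBackupSketch(const IBackup & backup, const String & data_path_in_backup)
{
    /// Equivalent to !listFiles(data_path_in_backup).empty(); the old interface spelled the listing
    /// itself as listFiles(data_path_in_backup, "/").
    return backup.hasFiles(data_path_in_backup);
}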
virtual SizeAndChecksum getFileSizeAndChecksum(const String & file_name) const = 0; diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index e09f1d973b9..92b7139ed5f 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -6,7 +6,6 @@ namespace DB { -using DatabaseAndTableName = std::pair; /// Keeps information about files contained in a backup. class IBackupCoordination @@ -14,10 +13,11 @@ class IBackupCoordination public: virtual ~IBackupCoordination() = default; - /// Adds a data path in backup for a replicated table. - /// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function - /// getReplicatedTableDataPaths(). - virtual void addReplicatedTableDataPath(const String & table_zk_path, const String & table_data_path) = 0; + /// Sets the current stage and waits for other hosts to come to this stage too. + virtual void syncStage(const String & current_host, int stage, const Strings & wait_hosts, std::chrono::seconds timeout) = 0; + + /// Sets that the current host encountered an error, so other hosts should know that and stop waiting in syncStage(). + virtual void syncStageError(const String & current_host, const String & error_message) = 0; struct PartNameAndChecksum { @@ -27,30 +27,23 @@ public: /// Adds part names which a specified replica of a replicated table is going to put to the backup. /// Multiple replicas of the replicated table call this function and then the added part names can be returned by call of the function - /// getReplicatedTablePartNames(). + /// getReplicatedPartNames(). /// Checksums are used only to control that parts under the same names on different replicas are the same. - virtual void addReplicatedTablePartNames( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const std::vector & part_names_and_checksums) - = 0; - - /// Sets that a specified host finished preparations for copying the backup's files, successfully or not. - /// `error_message` should be set to true if it was not successful. - virtual void finishPreparing(const String & host_id, const String & error_message = {}) = 0; - - /// Waits for a specified time for specified hosts to finish preparation for copying the backup's files. - virtual void - waitForAllHostsPrepared(const Strings & host_ids, std::chrono::seconds timeout = std::chrono::seconds(-1) /* no timeout */) const = 0; - - /// Returns all the data paths in backup added for a replicated table (see also addReplicatedTableDataPath()). - virtual Strings getReplicatedTableDataPaths(const String & table_zk_path) const = 0; + virtual void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, + const std::vector & part_names_and_checksums) = 0; /// Returns the names of the parts which a specified replica of a replicated table should put to the backup. - /// This is the same list as it was added by call of the function addReplicatedTablePartNames() but without duplications and without + /// This is the same list as it was added by call of the function addReplicatedPartNames() but without duplications and without /// parts covered by another parts. 
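/// A minimal standalone sketch (simplified, illustrative types) of the bookkeeping behind
/// addReplicatedPartNames()/getReplicatedPartNames(): every replica announces its parts, duplicates
/// collapse into a single entry, and a checksum mismatch for the same part name means the replicas
/// diverged, so the backup has to fail. Dropping parts covered by bigger merged parts is omitted here.
#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

struct ReplicatedPartsSketch
{
    std::map<std::string, uint64_t> checksum_by_part;     /// part name -> checksum (shortened to 64 bits)

    void addPart(const std::string & part_name, uint64_t checksum)
    {
        auto [it, inserted] = checksum_by_part.emplace(part_name, checksum);
        if (!inserted && it->second != checksum)
            throw std::runtime_error("part " + part_name + " differs between replicas");
    }

    std::vector<std::string> partNames() const            /// deduplicated, ordered by the map
    {
        std::vector<std::string> names;
        names.reserve(checksum_by_part.size());
        for (const auto & entry : checksum_by_part)
            names.push_back(entry.first);
        return names;
    }
};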
- virtual Strings getReplicatedTablePartNames(const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path) const = 0; + virtual Strings getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const = 0; + + /// Adds a data path in backup for a replicated table. + /// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function + /// getReplicatedDataPaths(). + virtual void addReplicatedDataPath(const String & table_zk_path, const String & data_path) = 0; + + /// Returns all the data paths in backup added for a replicated table (see also addReplicatedDataPath()). + virtual Strings getReplicatedDataPaths(const String & table_zk_path) const = 0; struct FileInfo { @@ -87,7 +80,8 @@ public: virtual void updateFileInfo(const FileInfo & file_info) = 0; virtual std::vector getAllFileInfos() const = 0; - virtual Strings listFiles(const String & prefix, const String & terminator) const = 0; + virtual Strings listFiles(const String & directory, bool recursive) const = 0; + virtual bool hasFiles(const String & directory) const = 0; using SizeAndChecksum = std::pair; diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index 473b3199d04..fd9a67e1b96 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -13,53 +13,22 @@ class IRestoreCoordination public: virtual ~IRestoreCoordination() = default; + /// Sets the current stage and waits for other hosts to come to this stage too. + virtual void syncStage(const String & current_host, int stage, const Strings & wait_hosts, std::chrono::seconds timeout) = 0; + + /// Sets that the current host encountered an error, so other hosts should know that and stop waiting in syncStage(). + virtual void syncStageError(const String & current_host, const String & error_message) = 0; + /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. - virtual bool startCreatingTableInReplicatedDB( - const String & host_id, const String & database_name, const String & database_zk_path, const String & table_name) - = 0; - - /// Sets that either we have been created a table in a replicated database or failed doing that. - /// In the latter case `error_message` should be set. - /// Calling this function unblocks other hosts waiting for this table to be created (see waitForCreatingTableInReplicatedDB()). - virtual void finishCreatingTableInReplicatedDB( - const String & host_id, - const String & database_name, - const String & database_zk_path, - const String & table_name, - const String & error_message = {}) - = 0; - - /// Wait for another host to create a table in a replicated database. - virtual void waitForTableCreatedInReplicatedDB( - const String & database_name, - const String & database_zk_path, - const String & table_name, - std::chrono::seconds timeout = std::chrono::seconds(-1) /* no timeout */) - = 0; - - /// Adds a path in backup used by a replicated table. - /// This function can be called multiple times for the same table with different `host_id`, and in that case - /// getReplicatedTableDataPath() will choose `data_path_in_backup` with the lexicographycally first `host_id`. 
- virtual void addReplicatedTableDataPath( - const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path, const String & data_path_in_backup) - = 0; - - /// Sets that a specified host has finished restoring metadata, successfully or with an error. - /// In the latter case `error_message` should be set. - virtual void finishRestoringMetadata(const String & host_id, const String & error_message = {}) = 0; - - /// Waits for a specified list of hosts to finish restoring their metadata. - virtual void waitForAllHostsRestoredMetadata( - const Strings & host_ids, std::chrono::seconds timeout = std::chrono::seconds(-1) /* no timeout */) const = 0; - - /// Gets path in backup used by a replicated table. - virtual String getReplicatedTableDataPath(const String & table_zk_path) const = 0; + virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0; /// Sets that this replica is going to restore a partition in a replicated table. /// The function returns false if this partition is being already restored by another replica. - virtual bool startInsertingDataToPartitionInReplicatedTable( - const String & host_id, const DatabaseAndTableName & table_name, const String & table_zk_path, const String & partition_name) - = 0; + virtual bool acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) = 0; + + /// Sets that this replica is going to restore a ReplicatedAccessStorage. + /// The function returns false if this access storage is being already restored by another replica. + virtual bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) = 0; /// Removes remotely stored information. virtual void drop() {} diff --git a/src/Backups/IRestoreTask.h b/src/Backups/IRestoreTask.h deleted file mode 100644 index 6e6a28eacf0..00000000000 --- a/src/Backups/IRestoreTask.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -/// Represents a task of restoring something (database / table / table's part) from backup. -class IRestoreTask -{ -public: - IRestoreTask() = default; - virtual ~IRestoreTask() = default; - - enum class RestoreKind - { - /// This task restores metadata (definitions of databases and tables). - /// Tasks restoring metadata are executed first and strictly in one thread. - METADATA, - - /// This task restores tables' data. Such tasks can be executed in parallel. - DATA, - }; - - virtual RestoreKind getRestoreKind() const { return RestoreKind::DATA; } - - /// Perform restoring, the function also can return a list of nested tasks that should be run later. 
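/// A minimal standalone sketch (single-threaded, illustrative types) of how tasks following this
/// interface could be driven: each task may return nested tasks from run(), and those are queued
/// behind the remaining ones. The real scheduler additionally ran all METADATA tasks first and could
/// execute DATA tasks in parallel, which is omitted here.
#include <deque>
#include <memory>
#include <vector>

struct RestoreTaskSketch
{
    virtual ~RestoreTaskSketch() = default;
    virtual std::vector<std::unique_ptr<RestoreTaskSketch>> run() = 0;
};

inline void runAllTasksSketch(std::deque<std::unique_ptr<RestoreTaskSketch>> tasks)
{
    while (!tasks.empty())
    {
        auto task = std::move(tasks.front());
        tasks.pop_front();
        for (auto & nested : task->run())
            tasks.push_back(std::move(nested));
    }
}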
- virtual std::vector> run() = 0; -}; - -using RestoreTaskPtr = std::unique_ptr; -using RestoreTasks = std::vector; - -} diff --git a/src/Backups/RestoreCoordinationDistributed.cpp b/src/Backups/RestoreCoordinationDistributed.cpp index 8b1360aa744..e131ce7fe24 100644 --- a/src/Backups/RestoreCoordinationDistributed.cpp +++ b/src/Backups/RestoreCoordinationDistributed.cpp @@ -1,248 +1,15 @@ #include -#include #include #include -#include -#include -#include -#include -#include -#include namespace DB { -namespace ErrorCodes -{ - extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; -} - -namespace -{ - struct ReplicatedTableDataPath - { - String host_id; - DatabaseAndTableName table_name; - String data_path_in_backup; - - String serialize() const - { - WriteBufferFromOwnString out; - writeBinary(host_id, out); - writeBinary(table_name.first, out); - writeBinary(table_name.second, out); - writeBinary(data_path_in_backup, out); - return out.str(); - } - - static ReplicatedTableDataPath deserialize(const String & str) - { - ReadBufferFromString in{str}; - ReplicatedTableDataPath res; - readBinary(res.host_id, in); - readBinary(res.table_name.first, in); - readBinary(res.table_name.second, in); - readBinary(res.data_path_in_backup, in); - return res; - } - }; -} - - -class RestoreCoordinationDistributed::ReplicatedDatabasesMetadataSync -{ -public: - ReplicatedDatabasesMetadataSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) - : zookeeper_path(zookeeper_path_), get_zookeeper(get_zookeeper_), log(&Poco::Logger::get("RestoreCoordination")) - { - createRootNodes(); - } - - /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. - bool startCreatingTable( - const String & host_id_, const String & database_name_, const String & database_zk_path_, const String & table_name_) - { - auto zookeeper = get_zookeeper(); - - String path = zookeeper_path + "/" + escapeForFileName(database_zk_path_); - zookeeper->createIfNotExists(path, ""); - - TableStatus status; - status.host_id = host_id_; - status.table_name = DatabaseAndTableName{database_name_, table_name_}; - - path += "/" + escapeForFileName(table_name_); - auto code = zookeeper->tryCreate(path, status.serialize(), zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException(code, path); - - return (code == Coordination::Error::ZOK); - } - - /// Sets that either we have been created a table in a replicated database or failed doing that. - /// In the latter case `error_message` should be set. - /// Calling this function unblocks other hosts waiting for this table to be created (see waitForCreatingTableInReplicatedDB()). 
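/// A minimal sketch of the fixed-order binary serialization used by ReplicatedTableDataPath above and
/// TableStatus below, assuming the usual IO helpers from this codebase (WriteBufferFromOwnString,
/// ReadBufferFromString, writeBinary, readBinary); the struct and its fields are illustrative.
struct StatusSketch
{
    String host_id;
    bool ready = false;

    String serialize() const
    {
        WriteBufferFromOwnString out;
        writeBinary(host_id, out);
        writeBinary(ready, out);
        return out.str();
    }

    static StatusSketch deserialize(const String & str)
    {
        ReadBufferFromString in{str};
        StatusSketch res;
        readBinary(res.host_id, in);       /// fields must be read in the same order they were written
        readBinary(res.ready, in);
        return res;
    }
};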
- void finishCreatingTable( - const String & /* host_id_ */, - const String & database_name_, - const String & database_zk_path_, - const String & table_name_, - const String & error_message_) - { - if (error_message_.empty()) - LOG_TRACE(log, "Created table {}.{}", database_name_, table_name_); - else - LOG_TRACE(log, "Failed to created table {}.{}: {}", database_name_, table_name_, error_message_); - - auto zookeeper = get_zookeeper(); - String path = zookeeper_path + "/" + escapeForFileName(database_zk_path_) + "/" + escapeForFileName(table_name_); - - auto status = TableStatus::deserialize(zookeeper->get(path)); - - status.error_message = error_message_; - status.ready = error_message_.empty(); - - zookeeper->set(path, status.serialize()); - } - - /// Wait for another host to create a table in a replicated database. - void waitForTableCreated( - const String & /* database_name_ */, const String & database_zk_path_, const String & table_name_, std::chrono::seconds timeout_) - { - auto zookeeper = get_zookeeper(); - String path = zookeeper_path + "/" + escapeForFileName(database_zk_path_) + "/" + escapeForFileName(table_name_); - - TableStatus status; - - std::atomic watch_set = false; - std::condition_variable watch_triggered_event; - - auto watch_callback = [&](const Coordination::WatchResponse &) - { - watch_set = false; /// After it's triggered it's not set until we call getChildrenWatch() again. - watch_triggered_event.notify_all(); - }; - - auto watch_triggered = [&] { return !watch_set; }; - - bool use_timeout = (timeout_.count() >= 0); - std::chrono::steady_clock::duration time_left = timeout_; - std::mutex dummy_mutex; - - while (true) - { - if (use_timeout && (time_left.count() <= 0)) - { - status = TableStatus::deserialize(zookeeper->get(path)); - break; - } - - watch_set = true; - status = TableStatus::deserialize(zookeeper->getWatch(path, nullptr, watch_callback)); - - if (!status.error_message.empty() || status.ready) - break; - - LOG_TRACE(log, "Waiting for host {} to create table {}.{}", status.host_id, status.table_name.first, status.table_name.second); - - { - std::unique_lock dummy_lock{dummy_mutex}; - if (use_timeout) - { - std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - if (!watch_triggered_event.wait_for(dummy_lock, time_left, watch_triggered)) - break; - time_left -= (std::chrono::steady_clock::now() - start_time); - } - else - watch_triggered_event.wait(dummy_lock, watch_triggered); - } - } - - if (watch_set) - { - /// Remove watch by triggering it. 
- ++status.increment; - zookeeper->set(path, status.serialize()); - std::unique_lock dummy_lock{dummy_mutex}; - watch_triggered_event.wait_for(dummy_lock, timeout_, watch_triggered); - } - - if (!status.error_message.empty()) - throw Exception( - ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Host {} failed to create table {}.{}: {}", status.host_id, status.table_name.first, status.table_name.second, status.error_message); - - if (status.ready) - { - LOG_TRACE(log, "Host {} created table {}.{}", status.host_id, status.table_name.first, status.table_name.second); - return; - } - - throw Exception( - ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Host {} was unable to create table {}.{} in {}", - status.host_id, - status.table_name.first, - table_name_, - to_string(timeout_)); - } - -private: - void createRootNodes() - { - auto zookeeper = get_zookeeper(); - zookeeper->createAncestors(zookeeper_path); - zookeeper->createIfNotExists(zookeeper_path, ""); - } - - struct TableStatus - { - String host_id; - DatabaseAndTableName table_name; - bool ready = false; - String error_message; - size_t increment = 0; - - String serialize() const - { - WriteBufferFromOwnString out; - writeBinary(host_id, out); - writeBinary(table_name.first, out); - writeBinary(table_name.second, out); - writeBinary(ready, out); - writeBinary(error_message, out); - writeBinary(increment, out); - return out.str(); - } - - static TableStatus deserialize(const String & str) - { - ReadBufferFromString in{str}; - TableStatus res; - readBinary(res.host_id, in); - readBinary(res.table_name.first, in); - readBinary(res.table_name.second, in); - readBinary(res.ready, in); - readBinary(res.error_message, in); - readBinary(res.increment, in); - return res; - } - }; - - const String zookeeper_path; - const zkutil::GetZooKeeper get_zookeeper; - const Poco::Logger * log; -}; - - RestoreCoordinationDistributed::RestoreCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , replicated_databases_metadata_sync( - std::make_unique(zookeeper_path_ + "/repl_databases_metadata", get_zookeeper_)) - , all_metadata_barrier(zookeeper_path_ + "/all_metadata", get_zookeeper_, "RestoreCoordination", "restoring metadata") + , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("RestoreCoordination")) { createRootNodes(); } @@ -254,8 +21,58 @@ void RestoreCoordinationDistributed::createRootNodes() auto zookeeper = get_zookeeper(); zookeeper->createAncestors(zookeeper_path); zookeeper->createIfNotExists(zookeeper_path, ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_paths", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_partitions", ""); + zookeeper->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", ""); + zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", ""); + zookeeper->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", ""); +} + +void RestoreCoordinationDistributed::syncStage(const String & current_host, int new_stage, const Strings & wait_hosts, std::chrono::seconds timeout) +{ + stage_sync.syncStage(current_host, new_stage, wait_hosts, timeout); +} + +void RestoreCoordinationDistributed::syncStageError(const String & current_host, const String & error_message) +{ + stage_sync.syncStageError(current_host, error_message); +} + +bool RestoreCoordinationDistributed::acquireCreatingTableInReplicatedDatabase(const String & 
database_zk_path, const String & table_name) +{ + auto zookeeper = get_zookeeper(); + + String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path); + zookeeper->createIfNotExists(path, ""); + + path += "/" + escapeForFileName(table_name); + auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, path); + + return (code == Coordination::Error::ZOK); +} + +bool RestoreCoordinationDistributed::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) +{ + auto zookeeper = get_zookeeper(); + + String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path); + auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, path); + + return (code == Coordination::Error::ZOK); +} + +bool RestoreCoordinationDistributed::acquireReplicatedAccessStorage(const String & access_storage_zk_path) +{ + auto zookeeper = get_zookeeper(); + + String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path); + auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, path); + + return (code == Coordination::Error::ZOK); } void RestoreCoordinationDistributed::removeAllNodes() @@ -264,104 +81,6 @@ void RestoreCoordinationDistributed::removeAllNodes() zookeeper->removeRecursive(zookeeper_path); } -bool RestoreCoordinationDistributed::startCreatingTableInReplicatedDB( - const String & host_id, const String & database_name, const String & database_zk_path, const String & table_name) -{ - return replicated_databases_metadata_sync->startCreatingTable(host_id, database_name, database_zk_path, table_name); -} - -/// Ends creating table in a replicated database, successfully or with an error. -/// In the latter case `error_message` should be set. -void RestoreCoordinationDistributed::finishCreatingTableInReplicatedDB( - const String & host_id, - const String & database_name, - const String & database_zk_path, - const String & table_name, - const String & error_message) -{ - return replicated_databases_metadata_sync->finishCreatingTable(host_id, database_name, database_zk_path, table_name, error_message); -} - -/// Wait for another host to create a table in a replicated database. 
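/// A hedged caller-side sketch of the "acquire" pattern implemented above (restore_coordination,
/// table_zk_path and attachRestoredPartsSketch() are assumed names, not from this diff): the first
/// replica whose tryCreate() succeeds wins and does the work, the others skip it and rely on ordinary
/// replication to deliver the data to them.
if (restore_coordination->acquireInsertingDataIntoReplicatedTable(table_zk_path))
    attachRestoredPartsSketch();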
-void RestoreCoordinationDistributed::waitForTableCreatedInReplicatedDB( - const String & database_name, const String & database_zk_path, const String & table_name, std::chrono::seconds timeout) -{ - return replicated_databases_metadata_sync->waitForTableCreated(database_name, database_zk_path, table_name, timeout); -} - -void RestoreCoordinationDistributed::finishRestoringMetadata(const String & host_id, const String & error_message) -{ - all_metadata_barrier.finish(host_id, error_message); -} - -void RestoreCoordinationDistributed::waitForAllHostsRestoredMetadata(const Strings & host_ids, std::chrono::seconds timeout) const -{ - all_metadata_barrier.waitForAllHostsToFinish(host_ids, timeout); -} - -void RestoreCoordinationDistributed::addReplicatedTableDataPath( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & data_path_in_backup) -{ - auto zookeeper = get_zookeeper(); - String path = zookeeper_path + "/repl_tables_paths/" + escapeForFileName(table_zk_path); - - ReplicatedTableDataPath new_info; - new_info.host_id = host_id; - new_info.table_name = table_name; - new_info.data_path_in_backup = data_path_in_backup; - String new_info_str = new_info.serialize(); - - auto code = zookeeper->tryCreate(path, new_info_str, zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException(code, path); - - while (code != Coordination::Error::ZOK) - { - Coordination::Stat stat; - ReplicatedTableDataPath cur_info = ReplicatedTableDataPath::deserialize(zookeeper->get(path, &stat)); - if ((cur_info.host_id < host_id) || ((cur_info.host_id == host_id) && (cur_info.table_name <= table_name))) - break; - code = zookeeper->trySet(path, new_info_str, stat.version); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZBADVERSION)) - throw zkutil::KeeperException(code, path); - } -} - -String RestoreCoordinationDistributed::getReplicatedTableDataPath(const String & table_zk_path_) const -{ - auto zookeeper = get_zookeeper(); - String path = zookeeper_path + "/repl_tables_paths/" + escapeForFileName(table_zk_path_); - auto info = ReplicatedTableDataPath::deserialize(zookeeper->get(path)); - return info.data_path_in_backup; -} - -bool RestoreCoordinationDistributed::startInsertingDataToPartitionInReplicatedTable( - const String & host_id_, - const DatabaseAndTableName & table_name_, - const String & table_zk_path_, - const String & partition_name_) -{ - auto zookeeper = get_zookeeper(); - - String path = zookeeper_path + "/repl_tables_partitions/" + escapeForFileName(table_zk_path_); - zookeeper->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(partition_name_); - String new_info = host_id_ + "|" + table_name_.first + "|" + table_name_.second; - - auto code = zookeeper->tryCreate(path, new_info, zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException(code, path); - - if (code == Coordination::Error::ZOK) - return true; - - return zookeeper->get(path) == new_info; -} - void RestoreCoordinationDistributed::drop() { removeAllNodes(); diff --git a/src/Backups/RestoreCoordinationDistributed.h b/src/Backups/RestoreCoordinationDistributed.h index 689f3456945..0ea5db3f062 100644 --- a/src/Backups/RestoreCoordinationDistributed.h +++ b/src/Backups/RestoreCoordinationDistributed.h @@ -2,7 +2,6 @@ #include #include -#include namespace 
DB @@ -15,50 +14,22 @@ public: RestoreCoordinationDistributed(const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper); ~RestoreCoordinationDistributed() override; + /// Sets the current stage and waits for other hosts to come to this stage too. + void syncStage(const String & current_host, int new_stage, const Strings & wait_hosts, std::chrono::seconds timeout) override; + + /// Sets that the current host encountered an error, so other hosts should know that and stop waiting in syncStage(). + void syncStageError(const String & current_host, const String & error_message) override; + /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. - bool startCreatingTableInReplicatedDB( - const String & host_id, const String & database_name, const String & database_zk_path, const String & table_name) override; - - /// Sets that either we have been created a table in a replicated database or failed doing that. - /// In the latter case `error_message` should be set. - /// Calling this function unblocks other hosts waiting for this table to be created (see waitForCreatingTableInReplicatedDB()). - void finishCreatingTableInReplicatedDB( - const String & host_id, - const String & database_name, - const String & database_zk_path, - const String & table_name, - const String & error_message) override; - - /// Wait for another host to create a table in a replicated database. - void waitForTableCreatedInReplicatedDB( - const String & database_name, const String & database_zk_path, const String & table_name, std::chrono::seconds timeout) override; - - /// Sets path in backup used by a replicated table. - /// This function can be called multiple times for the same table with different `host_id`, and in that case - /// getReplicatedTableDataPath() will choose `data_path_in_backup` with the lexicographycally first `host_id`. - void addReplicatedTableDataPath( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & data_path_in_backup) override; - - /// Sets that a specified host has finished restoring metadata, successfully or with an error. - /// In the latter case `error_message` should be set. - void finishRestoringMetadata(const String & host_id, const String & error_message) override; - - /// Waits for all hosts to finish restoring their metadata (i.e. to finish creating databases and tables). Returns false if time is out. - void waitForAllHostsRestoredMetadata(const Strings & host_ids, std::chrono::seconds timeout) const override; - - /// Gets path in backup used by a replicated table. - String getReplicatedTableDataPath(const String & table_zk_path) const override; + bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; /// Sets that this replica is going to restore a partition in a replicated table. /// The function returns false if this partition is being already restored by another replica. - bool startInsertingDataToPartitionInReplicatedTable( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & partition_name) override; + bool acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) override; + + /// Sets that this replica is going to restore a ReplicatedAccessStorage. + /// The function returns false if this access storage is being already restored by another replica. 
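/// A hedged caller-side sketch of the stage barrier declared above (the stage constant, host list and
/// timeout are assumed, restoreMetadataSketch() is a stand-in for the per-host work): every host
/// announces the stage it reached and blocks until the others reach it too; on failure it reports
/// syncStageError() so the waiters stop waiting instead of running into the timeout.
constexpr int kRestoredMetadataStage = 1;
try
{
    restoreMetadataSketch();
    restore_coordination->syncStage(current_host, kRestoredMetadataStage, all_hosts, std::chrono::seconds{300});
}
catch (...)
{
    restore_coordination->syncStageError(current_host, getCurrentExceptionMessage(false));
    throw;
}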
+ bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override; /// Removes remotely stored information. void drop() override; @@ -71,8 +42,7 @@ private: const String zookeeper_path; const zkutil::GetZooKeeper get_zookeeper; - std::unique_ptr replicated_databases_metadata_sync; - BackupCoordinationDistributedBarrier all_metadata_barrier; + BackupCoordinationStageSync stage_sync; }; } diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp index a0f61ca3a23..9cecc3f90c9 100644 --- a/src/Backups/RestoreCoordinationLocal.cpp +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -1,107 +1,35 @@ #include -#include -#include -#include -#include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - - -RestoreCoordinationLocal::RestoreCoordinationLocal() - : log(&Poco::Logger::get("RestoreCoordination")) -{} - +RestoreCoordinationLocal::RestoreCoordinationLocal() = default; RestoreCoordinationLocal::~RestoreCoordinationLocal() = default; -bool RestoreCoordinationLocal::startCreatingTableInReplicatedDB( - const String & /* host_id */, - const String & /* database_name */, - const String & /* database_zk_path */, - const String & /* table_name */) +void RestoreCoordinationLocal::syncStage(const String &, int, const Strings &, std::chrono::seconds) +{ +} + +void RestoreCoordinationLocal::syncStageError(const String &, const String &) +{ +} + +bool RestoreCoordinationLocal::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) +{ + std::lock_guard lock{mutex}; + return acquired_tables_in_replicated_databases.emplace(std::pair{database_zk_path, table_name}).second; +} + +bool RestoreCoordinationLocal::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) +{ + std::lock_guard lock{mutex}; + return acquired_data_in_replicated_tables.emplace(table_zk_path).second; +} + +bool RestoreCoordinationLocal::acquireReplicatedAccessStorage(const String &) { return true; } -void RestoreCoordinationLocal::finishCreatingTableInReplicatedDB( - const String & /* host_id */, - const String & database_name, - const String & /* database_zk_path */, - const String & table_name, - const String & error_message) -{ - if (error_message.empty()) - LOG_TRACE(log, "Created table {}.{}", database_name, table_name); - else - LOG_TRACE(log, "Failed to created table {}.{}: {}", database_name, table_name, error_message); -} - -/// Wait for another host to create a table in a replicated database. -void RestoreCoordinationLocal::waitForTableCreatedInReplicatedDB( - const String & /* database_name */, - const String & /* database_zk_path */, - const String & /* table_name */, - std::chrono::seconds /* timeout */) -{ -} - -void RestoreCoordinationLocal::finishRestoringMetadata(const String & /* host_id */, const String & error_message) -{ - LOG_TRACE(log, "Finished restoring metadata{}", (error_message.empty() ? 
"" : (" with error " + error_message))); -} - -void RestoreCoordinationLocal::waitForAllHostsRestoredMetadata(const Strings & /* host_ids */, std::chrono::seconds /* timeout */) const -{ -} - -void RestoreCoordinationLocal::addReplicatedTableDataPath(const String & /* host_id */, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & data_path_in_backup) -{ - std::lock_guard lock{mutex}; - auto it = replicated_tables_data_paths.find(table_zk_path); - if (it == replicated_tables_data_paths.end()) - { - ReplicatedTableDataPath new_info; - new_info.table_name = table_name; - new_info.data_path_in_backup = data_path_in_backup; - replicated_tables_data_paths.emplace(table_zk_path, std::move(new_info)); - return; - } - else - { - auto & cur_info = it->second; - if (table_name < cur_info.table_name) - { - cur_info.table_name = table_name; - cur_info.data_path_in_backup = data_path_in_backup; - } - } -} - -String RestoreCoordinationLocal::getReplicatedTableDataPath(const String & table_zk_path) const -{ - std::lock_guard lock{mutex}; - auto it = replicated_tables_data_paths.find(table_zk_path); - if (it == replicated_tables_data_paths.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Replicated data path is not set for zk_path={}", table_zk_path); - return it->second.data_path_in_backup; -} - -bool RestoreCoordinationLocal::startInsertingDataToPartitionInReplicatedTable( - const String & /* host_id */, const DatabaseAndTableName & table_name, const String & table_zk_path, const String & partition_name) -{ - std::lock_guard lock{mutex}; - auto key = std::pair{table_zk_path, partition_name}; - auto it = replicated_tables_partitions.try_emplace(std::move(key), table_name).first; - return it->second == table_name; -} - } diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index a74617f3b60..b73f345df47 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -1,10 +1,9 @@ #pragma once #include -#include -#include #include -#include +#include +#include namespace Poco { class Logger; } @@ -18,64 +17,27 @@ public: RestoreCoordinationLocal(); ~RestoreCoordinationLocal() override; + /// Sets the current stage and waits for other hosts to come to this stage too. + void syncStage(const String & current_host, int stage, const Strings & wait_hosts, std::chrono::seconds timeout) override; + + /// Sets that the current host encountered an error, so other hosts should know that and stop waiting in syncStage(). + void syncStageError(const String & current_host, const String & error_message) override; + /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. - bool startCreatingTableInReplicatedDB( - const String & host_id, const String & database_name, const String & database_zk_path, const String & table_name) override; - - /// Sets that either we have been created a table in a replicated database or failed doing that. - /// In the latter case `error_message` should be set. - /// Calling this function unblocks other hosts waiting for this table to be created (see waitForCreatingTableInReplicatedDB()). - void finishCreatingTableInReplicatedDB( - const String & host_id, - const String & database_name, - const String & database_zk_path, - const String & table_name, - const String & error_message) override; - - /// Wait for another host to create a table in a replicated database. 
- void waitForTableCreatedInReplicatedDB( - const String & database_name, const String & database_zk_path, const String & table_name, std::chrono::seconds timeout) override; - - /// Sets path in backup used by a replicated table. - /// This function can be called multiple times for the same table with different `host_id`, and in that case - /// getReplicatedTableDataPath() will choose `data_path_in_backup` with the lexicographycally first `host_id`. - void addReplicatedTableDataPath( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & data_path_in_backup) override; - - /// Sets that a specified host has finished restoring metadata, successfully or with an error. - /// In the latter case `error_message` should be set. - void finishRestoringMetadata(const String & host_id, const String & error_message) override; - - /// Waits for all hosts to finish restoring their metadata (i.e. to finish creating databases and tables). Returns false if time is out. - void waitForAllHostsRestoredMetadata(const Strings & host_ids, std::chrono::seconds timeout) const override; - - /// Gets path in backup used by a replicated table. - String getReplicatedTableDataPath(const String & table_zk_path) const override; + bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; /// Sets that this replica is going to restore a partition in a replicated table. /// The function returns false if this partition is being already restored by another replica. - bool startInsertingDataToPartitionInReplicatedTable( - const String & host_id, - const DatabaseAndTableName & table_name, - const String & table_zk_path, - const String & partition_name) override; + bool acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) override; + + /// Sets that this replica is going to restore a ReplicatedAccessStorage. + /// The function returns false if this access storage is being already restored by another replica. 
+ bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override; private: - struct ReplicatedTableDataPath - { - DatabaseAndTableName table_name; - String data_path_in_backup; - }; - - std::unordered_map replicated_tables_data_paths; - - std::map, DatabaseAndTableName> replicated_tables_partitions; - + std::set> acquired_tables_in_replicated_databases; + std::unordered_set acquired_data_in_replicated_tables; mutable std::mutex mutex; - const Poco::Logger * log; }; } diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index 486edc6344c..590d39f24f8 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -29,22 +29,43 @@ namespace if (field.getType() == Field::Types::String) { const String & str = field.get(); - if (str == "1" || boost::iequals(str, "true")) + if (str == "1" || boost::iequals(str, "true") || boost::iequals(str, "create")) + { value = RestoreTableCreationMode::kCreate; - else if (str == "0" || boost::iequals(str, "false")) + return; + } + + if (str == "0" || boost::iequals(str, "false") || boost::iequals(str, "must exist") || boost::iequals(str, "must-exist")) + { value = RestoreTableCreationMode::kMustExist; - else if (boost::iequals(str, "if not exists")) + return; + } + + if (boost::iequals(str, "if not exists") || boost::iequals(str, "if-not-exists") + || boost::iequals(str, "create if not exists") || boost::iequals(str, "create-if-not-exists")) + { value = RestoreTableCreationMode::kCreateIfNotExists; - else throw Exception("Cannot parse creation mode from string '" + str + "'", - ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS); + return; + } } - else + + if (field.getType() == Field::Types::UInt64) { - if (applyVisitor(FieldVisitorConvertToNumber(), field)) + UInt64 number = field.get(); + if (number == 1) + { value = RestoreTableCreationMode::kCreate; - else + return; + } + + if (number == 0) + { value = RestoreTableCreationMode::kMustExist; + return; + } } + + throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse creation mode from {}", field); } explicit operator Field() const @@ -60,6 +81,62 @@ namespace }; using SettingFieldRestoreDatabaseCreationMode = SettingFieldRestoreTableCreationMode; + + struct SettingFieldRestoreAccessCreationMode + { + RestoreAccessCreationMode value; + + explicit SettingFieldRestoreAccessCreationMode(RestoreAccessCreationMode value_) : value(value_) {} + + explicit SettingFieldRestoreAccessCreationMode(const Field & field) + { + if (field.getType() == Field::Types::String) + { + const String & str = field.get(); + if (str == "1" || boost::iequals(str, "true") || boost::iequals(str, "create")) + { + value = RestoreAccessCreationMode::kCreate; + return; + } + + if (boost::iequals(str, "if not exists") || boost::iequals(str, "if-not-exists") + || boost::iequals(str, "create if not exists") || boost::iequals(str, "create-if-not-exists")) + { + value = RestoreAccessCreationMode::kCreateIfNotExists; + return; + } + + if (boost::iequals(str, "replace") || boost::iequals(str, "create or replace") || boost::iequals(str, "create-or-replace")) + { + value = RestoreAccessCreationMode::kReplace; + return; + } + } + + if (field.getType() == Field::Types::UInt64) + { + UInt64 number = field.get(); + if (number == 1) + { + value = RestoreAccessCreationMode::kCreate; + return; + } + } + + throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse creation mode from {}", field); + } + + explicit operator Field() const + { + switch (value) + { + case 
RestoreAccessCreationMode::kCreate: return Field{true}; + case RestoreAccessCreationMode::kCreateIfNotExists: return Field{"if not exists"}; + case RestoreAccessCreationMode::kReplace: return Field{"replace"}; + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected value of enum RestoreAccessCreationMode: {}", static_cast(value)); + } + }; } /// List of restore settings except base_backup_name and cluster_host_ids. @@ -76,6 +153,8 @@ namespace M(UInt64, shard_num_in_backup) \ M(UInt64, replica_num_in_backup) \ M(Bool, allow_non_empty_tables) \ + M(RestoreAccessCreationMode, create_access) \ + M(Bool, allow_unresolved_access_dependencies) \ M(Bool, internal) \ M(String, host_id) \ M(String, coordination_zk_path) diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h index 602d75cc283..5e941b79508 100644 --- a/src/Backups/RestoreSettings.h +++ b/src/Backups/RestoreSettings.h @@ -8,13 +8,6 @@ namespace DB { class ASTBackupQuery; -struct StorageRestoreSettings -{ - /// Internal, should not be specified by user. - /// The current host's ID in the format 'escaped_host_name:port'. - String host_id; -}; - /// How the RESTORE command will handle table/database existence. enum class RestoreTableCreationMode { @@ -30,8 +23,21 @@ enum class RestoreTableCreationMode using RestoreDatabaseCreationMode = RestoreTableCreationMode; +/// How the RESTORE command will handle if an user (or role or profile) which it's going to restore already exists. +enum class RestoreAccessCreationMode +{ + /// RESTORE will throw an exception if some user already exists. + kCreate, + + /// RESTORE will skip existing users. + kCreateIfNotExists, + + /// RESTORE will replace existing users with definitions from backup. + kReplace, +}; + /// Settings specified in the "SETTINGS" clause of a RESTORE query. -struct RestoreSettings : public StorageRestoreSettings +struct RestoreSettings { /// Base backup, with this setting we can override the location of the base backup while restoring. /// Any incremental backup keeps inside the information about its base backup, so using this setting is optional. @@ -86,9 +92,20 @@ struct RestoreSettings : public StorageRestoreSettings /// Setting "allow_non_empty_tables=true" thus can cause data duplication in the table, use with caution. bool allow_non_empty_tables = false; + /// How the RESTORE command will handle if an user (or role or profile) which it's going to restore already exists. + RestoreAccessCreationMode create_access = RestoreAccessCreationMode::kCreateIfNotExists; + + /// Skip dependencies of access entities which can't be resolved. + /// For example, if an user has a profile assigned and that profile is not in the backup and doesn't exist locally. + bool allow_unresolved_access_dependencies = false; + /// Internal, should not be specified by user. bool internal = false; + /// Internal, should not be specified by user. + /// The current host's ID in the format 'escaped_host_name:port'. + String host_id; + /// Internal, should not be specified by user. /// Cluster's hosts' IDs in the format 'escaped_host_name:port' for all shards and replicas in a cluster specified in BACKUP ON CLUSTER. 
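/// A minimal sketch (hypothetical values) of what the two new access-related fields amount to once the
/// SETTINGS clause of a RESTORE query has been parsed; in SQL they correspond to
/// create_access = 'if not exists' and allow_unresolved_access_dependencies = 1.
RestoreSettings makeAccessRestoreSettingsSketch()
{
    RestoreSettings settings;
    settings.create_access = RestoreAccessCreationMode::kCreateIfNotExists;  /// skip users/roles/profiles that already exist
    settings.allow_unresolved_access_dependencies = true;                    /// tolerate e.g. a user referencing a profile that is neither in the backup nor on the server
    return settings;
}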
std::vector cluster_host_ids; diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp deleted file mode 100644 index f1242098c93..00000000000 --- a/src/Backups/RestoreUtils.cpp +++ /dev/null @@ -1,1008 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int CANNOT_RESTORE_TABLE; - extern const int CANNOT_RESTORE_DATABASE; - extern const int BACKUP_ENTRY_NOT_FOUND; -} - -namespace -{ - class PathsInBackup - { - public: - explicit PathsInBackup(const IBackup & backup_) : backup(backup_) { } - - std::vector getShards() const - { - std::vector res; - for (const String & shard_index : backup.listFiles("shards/")) - res.push_back(parse(shard_index)); - if (res.empty()) - res.push_back(1); - return res; - } - - std::vector getReplicas(size_t shard_index) const - { - std::vector res; - for (const String & replica_index : backup.listFiles(fmt::format("shards/{}/replicas/", shard_index))) - res.push_back(parse(replica_index)); - if (res.empty()) - res.push_back(1); - return res; - } - - std::vector getDatabases(size_t shard_index, size_t replica_index) const - { - std::vector res; - - insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/replicas/{}/metadata/", shard_index, replica_index))); - insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/metadata/", shard_index))); - insertAtEnd(res, backup.listFiles(fmt::format("metadata/"))); - - boost::range::remove_erase_if( - res, - [](String & str) - { - if (str.ends_with(".sql")) - { - str.resize(str.length() - strlen(".sql")); - str = unescapeForFileName(str); - return false; - } - return true; - }); - - ::sort(res.begin(), res.end()); - res.erase(std::unique(res.begin(), res.end()), res.end()); - return res; - } - - std::vector getTables(const String & database_name, size_t shard_index, size_t replica_index) const - { - std::vector res; - - String escaped_database_name = escapeForFileName(database_name); - insertAtEnd( - res, - backup.listFiles(fmt::format("shards/{}/replicas/{}/metadata/{}/", shard_index, replica_index, escaped_database_name))); - insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/metadata/{}/", shard_index, escaped_database_name))); - insertAtEnd(res, backup.listFiles(fmt::format("metadata/{}/", escaped_database_name))); - - boost::range::remove_erase_if( - res, - [](String & str) - { - if (str.ends_with(".sql")) - { - str.resize(str.length() - strlen(".sql")); - str = unescapeForFileName(str); - return false; - } - return true; - }); - - ::sort(res.begin(), res.end()); - res.erase(std::unique(res.begin(), res.end()), res.end()); - return res; - } - - /// Returns the path to metadata in backup. 
- String getMetadataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const - { - String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); - String path1 = fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name); - if (backup.fileExists(path1)) - return path1; - String path2 = fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_table_name); - if (backup.fileExists(path2)) - return path2; - String path3 = fmt::format("metadata/{}.sql", escaped_table_name); - return path3; - } - - String getMetadataPath(const String & database_name, size_t shard_index, size_t replica_index) const - { - String escaped_database_name = escapeForFileName(database_name); - String path1 = fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_database_name); - if (backup.fileExists(path1)) - return path1; - String path2 = fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_database_name); - if (backup.fileExists(path2)) - return path2; - String path3 = fmt::format("metadata/{}.sql", escaped_database_name); - return path3; - } - - String getDataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const - { - String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); - if (backup.fileExists(fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name))) - return fmt::format("shards/{}/replicas/{}/data/{}/", shard_index, replica_index, escaped_table_name); - if (backup.fileExists(fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_table_name))) - return fmt::format("shards/{}/data/{}/", shard_index, escaped_table_name); - return fmt::format("data/{}/", escaped_table_name); - } - - private: - const IBackup & backup; - }; - - - using Kind = ASTBackupQuery::Kind; - using Element = ASTBackupQuery::Element; - using Elements = ASTBackupQuery::Elements; - using ElementType = ASTBackupQuery::ElementType; - using RestoreSettingsPtr = std::shared_ptr; - - - /// Restores a database (without tables inside), should be executed before executing - /// RestoreTableTask. 
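/// A minimal standalone sketch of the lookup order used by getMetadataPath()/getDataPath() above:
/// prefer the replica-specific path, then the shard-wide path, then fall back to the cluster-wide one.
/// The `exists` callback stands in for IBackup::fileExists(); names are illustrative.
#include <functional>
#include <string>

static std::string resolveMetadataPathSketch(
    const std::function<bool(const std::string &)> & exists,
    const std::string & escaped_name, size_t shard, size_t replica)
{
    std::string replica_path = "shards/" + std::to_string(shard) + "/replicas/" + std::to_string(replica)
        + "/metadata/" + escaped_name + ".sql";
    if (exists(replica_path))
        return replica_path;

    std::string shard_path = "shards/" + std::to_string(shard) + "/metadata/" + escaped_name + ".sql";
    if (exists(shard_path))
        return shard_path;

    return "metadata/" + escaped_name + ".sql";
}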
- class RestoreDatabaseTask : public IRestoreTask - { - public: - RestoreDatabaseTask(ContextMutablePtr context_, const ASTPtr & create_query_, const RestoreSettingsPtr & restore_settings_) - : context(context_) - , create_query(typeid_cast>(create_query_)) - , restore_settings(restore_settings_) - { - } - - RestoreTasks run() override - { - createDatabase(); - getDatabase(); - checkDatabaseCreateQuery(); - return {}; - } - - RestoreKind getRestoreKind() const override { return RestoreKind::METADATA; } - - private: - void createDatabase() - { - if (restore_settings->create_database == RestoreDatabaseCreationMode::kMustExist) - return; - - auto cloned_create_query = typeid_cast>(create_query->clone()); - cloned_create_query->if_not_exists = (restore_settings->create_database == RestoreDatabaseCreationMode::kCreateIfNotExists); - InterpreterCreateQuery create_interpreter{cloned_create_query, context}; - create_interpreter.setInternal(true); - create_interpreter.execute(); - } - - DatabasePtr getDatabase() - { - if (!database) - database = DatabaseCatalog::instance().getDatabase(create_query->getDatabase()); - return database; - } - - ASTPtr getDatabaseCreateQuery() - { - if (!database_create_query) - database_create_query = getDatabase()->getCreateDatabaseQuery(); - return database_create_query; - } - - void checkDatabaseCreateQuery() - { - if (restore_settings->allow_different_database_def) - return; - - getDatabaseCreateQuery(); - if (areDatabaseDefinitionsSame(*create_query, *database_create_query)) - return; - - throw Exception( - ErrorCodes::CANNOT_RESTORE_DATABASE, - "The database {} already exists but has a different definition: {}, " - "compare to its definition in the backup: {}", - backQuoteIfNeed(create_query->getDatabase()), - serializeAST(*database_create_query), - serializeAST(*create_query)); - } - - ContextMutablePtr context; - std::shared_ptr create_query; - RestoreSettingsPtr restore_settings; - DatabasePtr database; - ASTPtr database_create_query; - }; - - - /// Restores a table. 
- class RestoreTableTask : public IRestoreTask - { - public: - RestoreTableTask( - ContextMutablePtr context_, - const ASTPtr & create_query_, - const ASTs & partitions_, - const BackupPtr & backup_, - const DatabaseAndTableName & table_name_in_backup_, - const RestoreSettingsPtr & restore_settings_, - const std::shared_ptr & restore_coordination_, - std::chrono::seconds timeout_for_restoring_metadata_) - : context(context_) - , create_query(typeid_cast>(create_query_)) - , partitions(partitions_) - , backup(backup_) - , table_name_in_backup(table_name_in_backup_) - , restore_settings(restore_settings_) - , restore_coordination(restore_coordination_) - , timeout_for_restoring_metadata(timeout_for_restoring_metadata_) - { - table_name = DatabaseAndTableName{create_query->getDatabase(), create_query->getTable()}; - if (create_query->temporary) - table_name.first = DatabaseCatalog::TEMPORARY_DATABASE; - } - - RestoreTasks run() override - { - getDatabase(); - createStorage(); - getStorage(); - checkStorageCreateQuery(); - checkTableIsEmpty(); - checkTableDataCompatible(); - return insertData(); - } - - RestoreKind getRestoreKind() const override { return RestoreKind::METADATA; } - - private: - void getDatabase() - { - database = DatabaseCatalog::instance().getDatabase(table_name.first); - replicated_database = typeid_cast>(database); - } - - void createStorage() - { - if (restore_settings->create_table == RestoreTableCreationMode::kMustExist) - return; - - auto cloned_create_query = typeid_cast>(create_query->clone()); - cloned_create_query->if_not_exists = (restore_settings->create_table == RestoreTableCreationMode::kCreateIfNotExists); - - /// We need a special processing for tables in replicated databases. - /// Because of the replication multiple nodes can try to restore the same tables again and failed with "Table already exists" - /// because of some table could be restored already on other node and then replicated to this node. - /// To solve this problem we use the restore coordination: the first node calls - /// IRestoreCoordination::startCreatingTableInReplicatedDB() and then for other nodes this function returns false which means - /// this table is already being created by some other node. - bool wait_instead_of_creating = false; - if (replicated_database) - wait_instead_of_creating = !restore_coordination->startCreatingTableInReplicatedDB( - restore_settings->host_id, table_name.first, replicated_database->getZooKeeperPath(), table_name.second); - - if (wait_instead_of_creating) - { - waitForReplicatedDatabaseToSyncTable(); - } - else - { - try - { - InterpreterCreateQuery create_interpreter{cloned_create_query, context}; - create_interpreter.setInternal(true); - create_interpreter.execute(); - } - catch (...) 
- { - if (replicated_database) - { - restore_coordination->finishCreatingTableInReplicatedDB( - restore_settings->host_id, - table_name.first, - replicated_database->getZooKeeperPath(), - table_name.second, - getCurrentExceptionMessage(false)); - } - throw; - } - - if (replicated_database) - restore_coordination->finishCreatingTableInReplicatedDB( - restore_settings->host_id, table_name.first, replicated_database->getZooKeeperPath(), table_name.second); - } - } - - void waitForReplicatedDatabaseToSyncTable() - { - if (!replicated_database) - return; - - restore_coordination->waitForTableCreatedInReplicatedDB( - table_name.first, replicated_database->getZooKeeperPath(), table_name.second); - - /// The table `table_name` was created on other host, must be in the replicated database's queue, - /// we have to wait until the replicated database syncs that. - bool replicated_database_synced = false; - auto start_time = std::chrono::steady_clock::now(); - bool use_timeout = (timeout_for_restoring_metadata.count() > 0); - while (!database->isTableExist(table_name.second, context)) - { - if (replicated_database_synced - || (use_timeout && (std::chrono::steady_clock::now() - start_time) >= timeout_for_restoring_metadata)) - { - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, - "Table {}.{} in the replicated database {} was not synced from another node in {}", - table_name.first, - table_name.second, - table_name.first, - to_string(timeout_for_restoring_metadata)); - } - replicated_database_synced = replicated_database->waitForReplicaToProcessAllEntries(50); - } - } - - void getStorage() - { - storage = database->getTable(table_name.second, context); - storage_create_query = database->getCreateTableQuery(table_name.second, context); - - if (!restore_settings->structure_only) - { - data_path_in_backup = PathsInBackup{*backup}.getDataPath( - table_name_in_backup, restore_settings->shard_num_in_backup, restore_settings->replica_num_in_backup); - has_data = !backup->listFiles(data_path_in_backup).empty(); - - const auto * replicated_table = typeid_cast(storage.get()); - if (replicated_table) - { - /// We need to be consistent when we're restoring replicated tables. - /// It's allowed for a backup to contain multiple replicas of the same replicated table, - /// and when we restore it we need to choose single data path in the backup to restore this table on each replica. - /// That's why we use the restore coordination here: on restoring metadata stage each replica sets its own - /// `data_path_in_backup` for same zookeeper path, and then the restore coordination choose one `data_path_in_backup` - /// to use for restoring data. 
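The coordination protocol described in the comments above ("the first node creates the table, the others wait until the replicated database syncs it") can be illustrated with a tiny in-process stand-in for the coordination object. The real IRestoreCoordination goes through ZooKeeper, so the class and method names below are only a sketch of the first-caller-wins semantics, not the actual API:

    #include <condition_variable>
    #include <mutex>
    #include <set>
    #include <string>

    /// Hypothetical in-process stand-in for the ZooKeeper-based restore coordination.
    class LocalRestoreCoordination
    {
    public:
        /// Returns true only for the first caller for a given table; that caller must create the table.
        bool startCreatingTable(const std::string & table_zk_path)
        {
            std::lock_guard lock{mutex};
            return started.insert(table_zk_path).second;
        }

        /// Called by the creator when the table has been created (or creation failed).
        void finishCreatingTable(const std::string & table_zk_path)
        {
            {
                std::lock_guard lock{mutex};
                finished.insert(table_zk_path);
            }
            created_event.notify_all();
        }

        /// Other replicas block here instead of creating the table themselves.
        void waitForTableCreated(const std::string & table_zk_path)
        {
            std::unique_lock lock{mutex};
            created_event.wait(lock, [&] { return finished.count(table_zk_path) != 0; });
        }

    private:
        std::mutex mutex;
        std::condition_variable created_event;
        std::set<std::string> started;
        std::set<std::string> finished;
    };
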
- restore_coordination->addReplicatedTableDataPath( - restore_settings->host_id, - table_name_in_backup, - replicated_table->getZooKeeperName() + replicated_table->getZooKeeperPath(), - data_path_in_backup); - has_data = true; - } - } - } - - void checkStorageCreateQuery() - { - if (!restore_settings->allow_different_table_def && !areTableDefinitionsSame(*create_query, *storage_create_query)) - { - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, - "The {} already exists but has a different definition: {}, " - "compare to its definition in the backup: {}", - formatTableNameOrTemporaryTableName(table_name), - serializeAST(*storage_create_query), - serializeAST(*create_query)); - } - } - - void checkTableIsEmpty() - { - if (restore_settings->allow_non_empty_tables || restore_settings->structure_only || !has_data) - return; - - bool empty = true; - if (auto total_rows = storage->totalRows(context->getSettingsRef())) - empty = (*total_rows == 0); - else if (auto total_bytes = storage->totalBytes(context->getSettingsRef())) - empty = (*total_bytes == 0); - - if (empty) - { - /// If this is a replicated table new parts could be in its queue but not fetched yet. - /// In that case we consider the table as not empty. - if (auto * replicated_table = typeid_cast(storage.get())) - { - StorageReplicatedMergeTree::Status status; - replicated_table->getStatus(status, /* with_zk_fields = */ false); - - if (status.queue.inserts_in_queue) - { - empty = false; - } - else - { - /// Check total_rows again to be sure. - if (auto total_rows = storage->totalRows(context->getSettingsRef()); *total_rows != 0) - empty = false; - } - } - } - - if (!empty) - { - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, - "Cannot restore {} because it already contains some data. You can set structure_only=true or " - "allow_non_empty_tables=true to overcome that in the way you want", - formatTableNameOrTemporaryTableName(table_name)); - } - } - - void checkTableDataCompatible() - { - if (restore_settings->structure_only || !has_data) - return; - - if (!areTableDataCompatible(*create_query, *storage_create_query)) - { - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, - "Cannot attach data of the {} in the backup to the existing {} because of they are not compatible. 
" - "Here is the definition of the {} in the backup: {}, and here is the definition of the existing {}: {}", - formatTableNameOrTemporaryTableName(table_name_in_backup), - formatTableNameOrTemporaryTableName(table_name), - formatTableNameOrTemporaryTableName(table_name_in_backup), - serializeAST(*create_query), - formatTableNameOrTemporaryTableName(table_name), - serializeAST(*storage_create_query)); - } - } - - RestoreTasks insertData() - { - if (restore_settings->structure_only || !has_data) - return {}; - - RestoreTasks tasks; - tasks.emplace_back( - storage->restoreData(context, partitions, backup, data_path_in_backup, *restore_settings, restore_coordination)); - return tasks; - } - - ContextMutablePtr context; - std::shared_ptr create_query; - DatabaseAndTableName table_name; - ASTs partitions; - BackupPtr backup; - DatabaseAndTableName table_name_in_backup; - RestoreSettingsPtr restore_settings; - std::shared_ptr restore_coordination; - std::chrono::seconds timeout_for_restoring_metadata; - DatabasePtr database; - std::shared_ptr replicated_database; - StoragePtr storage; - ASTPtr storage_create_query; - bool has_data = false; - String data_path_in_backup; - }; - - - /// Makes tasks for restoring databases and tables according to the elements of ASTBackupQuery. - /// Keep this class consistent with BackupEntriesBuilder. - class RestoreTasksBuilder - { - public: - RestoreTasksBuilder( - ContextMutablePtr context_, - const BackupPtr & backup_, - const RestoreSettings & restore_settings_, - const std::shared_ptr & restore_coordination_, - std::chrono::seconds timeout_for_restoring_metadata_) - : context(context_) - , backup(backup_) - , restore_settings(restore_settings_) - , restore_coordination(restore_coordination_) - , timeout_for_restoring_metadata(timeout_for_restoring_metadata_) - { - } - - /// Prepares internal structures for making tasks for restoring. - void prepare(const ASTBackupQuery::Elements & elements) - { - calculateShardNumAndReplicaNumInBackup(); - renaming_settings.setFromBackupQuery(elements); - - for (const auto & element : elements) - { - switch (element.type) - { - case ElementType::TABLE: { - prepareToRestoreTable(element.name, element.partitions); - break; - } - - case ElementType::DATABASE: { - const String & database_name = element.name.first; - prepareToRestoreDatabase(database_name, element.except_list); - break; - } - - case ElementType::ALL_DATABASES: { - prepareToRestoreAllDatabases(element.except_list); - break; - } - } - } - } - - /// Makes tasks for restoring, should be called after prepare(). - RestoreTasks makeTasks() const - { - auto restore_settings_ptr = std::make_shared(restore_settings); - - RestoreTasks res; - for (const auto & info : databases | boost::adaptors::map_values) - res.push_back(std::make_unique(context, info.create_query, restore_settings_ptr)); - - /// TODO: We need to restore tables according to their dependencies. 
- for (const auto & info : tables | boost::adaptors::map_values) - res.push_back(std::make_unique( - context, - info.create_query, - info.partitions, - backup, - info.name_in_backup, - restore_settings_ptr, - restore_coordination, - timeout_for_restoring_metadata)); - - return res; - } - - private: - void calculateShardNumAndReplicaNumInBackup() - { - size_t shard_num = 0; - size_t replica_num = 0; - if (!restore_settings.host_id.empty()) - { - std::tie(shard_num, replica_num) - = BackupSettings::Util::findShardNumAndReplicaNum(restore_settings.cluster_host_ids, restore_settings.host_id); - } - - auto shards_in_backup = PathsInBackup{*backup}.getShards(); - if (!restore_settings.shard_num_in_backup) - { - if (shards_in_backup.size() == 1) - restore_settings.shard_num_in_backup = shards_in_backup[0]; - else - restore_settings.shard_num_in_backup = shard_num; - } - - if (std::find(shards_in_backup.begin(), shards_in_backup.end(), restore_settings.shard_num_in_backup) == shards_in_backup.end()) - throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", restore_settings.shard_num_in_backup); - - auto replicas_in_backup = PathsInBackup{*backup}.getReplicas(restore_settings.shard_num_in_backup); - if (!restore_settings.replica_num_in_backup) - { - if (replicas_in_backup.size() == 1) - restore_settings.replica_num_in_backup = replicas_in_backup[0]; - else if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), replica_num) != replicas_in_backup.end()) - restore_settings.replica_num_in_backup = replica_num; - else - restore_settings.replica_num_in_backup = replicas_in_backup[0]; - } - } - - /// Prepares to restore a single table and probably its database's definition. - void prepareToRestoreTable(const DatabaseAndTableName & table_name_, const ASTs & partitions_) - { - /// Check that we are not trying to restore the same table again. - DatabaseAndTableName new_table_name = renaming_settings.getNewTableName(table_name_); - if (tables.contains(new_table_name)) - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, "Cannot restore the {} twice", formatTableNameOrTemporaryTableName(new_table_name)); - - /// Make a create query for this table. - auto create_query = renameInCreateQuery(readCreateQueryFromBackup(table_name_)); - - CreateTableInfo info; - info.create_query = create_query; - info.name_in_backup = table_name_; - info.partitions = partitions_; - tables[new_table_name] = std::move(info); - } - - /// Prepares to restore a database and all tables in it. - void prepareToRestoreDatabase(const String & database_name_, const std::set & except_list_) - { - /// Check that we are not trying to restore the same database again. 
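calculateShardNumAndReplicaNumInBackup() above encodes a small decision rule: an explicitly requested shard number wins, otherwise a single-shard backup is unambiguous, otherwise the restoring host's own shard number is used, and whatever is chosen must actually exist in the backup. A hedged sketch of just that rule (the function name and the plain std::runtime_error are illustrative, not the real code):

    #include <algorithm>
    #include <stdexcept>
    #include <vector>

    /// Chooses which shard's data to read from the backup.
    /// `requested` is the shard_num_in_backup setting (0 = not set),
    /// `own_shard_num` is the shard number of the restoring host.
    size_t chooseShardInBackup(size_t requested, size_t own_shard_num, const std::vector<size_t> & shards_in_backup)
    {
        size_t chosen = requested;
        if (!chosen)
            chosen = (shards_in_backup.size() == 1) ? shards_in_backup.front() : own_shard_num;

        if (std::find(shards_in_backup.begin(), shards_in_backup.end(), chosen) == shards_in_backup.end())
            throw std::runtime_error("No shard #" + std::to_string(chosen) + " in backup");
        return chosen;
    }

The replica is picked the same way, except that when the host's own replica number is not present in the backup the first available replica is used instead of failing.
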
- String new_database_name = renaming_settings.getNewDatabaseName(database_name_); - if (databases.contains(new_database_name)) - throw Exception( - ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} twice", backQuoteIfNeed(new_database_name)); - - Strings table_names = PathsInBackup{*backup}.getTables( - database_name_, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); - bool has_tables_in_backup = !table_names.empty(); - bool has_create_query_in_backup = hasCreateQueryInBackup(database_name_); - - if (!has_create_query_in_backup && !has_tables_in_backup) - throw Exception( - ErrorCodes::CANNOT_RESTORE_DATABASE, - "Cannot restore the database {} because there is no such database in the backup", - backQuoteIfNeed(database_name_)); - - /// Of course we're not going to restore the definition of the system or the temporary database. - if (!isSystemOrTemporaryDatabase(new_database_name)) - { - /// Make a create query for this database. - std::shared_ptr create_query; - if (has_create_query_in_backup) - { - create_query = renameInCreateQuery(readCreateQueryFromBackup(database_name_)); - } - else - { - create_query = std::make_shared(); - create_query->setDatabase(database_name_); - } - - CreateDatabaseInfo info; - info.create_query = create_query; - databases[new_database_name] = std::move(info); - } - - /// Restore tables in this database. - for (const String & table_name : table_names) - { - if (except_list_.contains(table_name)) - continue; - prepareToRestoreTable(DatabaseAndTableName{database_name_, table_name}, ASTs{}); - } - } - - /// Prepares to restore all the databases contained in the backup. - void prepareToRestoreAllDatabases(const std::set & except_list_) - { - for (const String & database_name : - PathsInBackup{*backup}.getDatabases(restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup)) - { - if (except_list_.contains(database_name)) - continue; - prepareToRestoreDatabase(database_name, std::set{}); - } - } - - /// Reads a create query for creating a specified table from the backup. - std::shared_ptr readCreateQueryFromBackup(const DatabaseAndTableName & table_name) const - { - String create_query_path = PathsInBackup{*backup}.getMetadataPath( - table_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); - if (!backup->fileExists(create_query_path)) - throw Exception( - ErrorCodes::CANNOT_RESTORE_TABLE, - "Cannot restore the {} because there is no such table in the backup", - formatTableNameOrTemporaryTableName(table_name)); - auto read_buffer = backup->readFile(create_query_path)->getReadBuffer(); - String create_query_str; - readStringUntilEOF(create_query_str, *read_buffer); - read_buffer.reset(); - ParserCreateQuery create_parser; - return typeid_cast>( - parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH)); - } - - /// Reads a create query for creating a specified database from the backup. 
- std::shared_ptr readCreateQueryFromBackup(const String & database_name) const - { - String create_query_path = PathsInBackup{*backup}.getMetadataPath( - database_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); - if (!backup->fileExists(create_query_path)) - throw Exception( - ErrorCodes::CANNOT_RESTORE_DATABASE, - "Cannot restore the database {} because there is no such database in the backup", - backQuoteIfNeed(database_name)); - auto read_buffer = backup->readFile(create_query_path)->getReadBuffer(); - String create_query_str; - readStringUntilEOF(create_query_str, *read_buffer); - read_buffer.reset(); - ParserCreateQuery create_parser; - return typeid_cast>( - parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH)); - } - - /// Whether there is a create query for creating a specified database in the backup. - bool hasCreateQueryInBackup(const String & database_name) const - { - String create_query_path = PathsInBackup{*backup}.getMetadataPath( - database_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); - return backup->fileExists(create_query_path); - } - - /// Do renaming in the create query according to the renaming config. - std::shared_ptr renameInCreateQuery(const ASTPtr & ast) const - { - ASTPtr query = ast; - ::DB::renameInCreateQuery(query, context, renaming_settings); - auto create_query = typeid_cast>(query); - return create_query; - } - - static bool isSystemOrTemporaryDatabase(const String & database_name) - { - return (database_name == DatabaseCatalog::SYSTEM_DATABASE) || (database_name == DatabaseCatalog::TEMPORARY_DATABASE); - } - - /// Information which is used to make an instance of RestoreTableTask. - struct CreateTableInfo - { - ASTPtr create_query; - DatabaseAndTableName name_in_backup; - ASTs partitions; - }; - - /// Information which is used to make an instance of RestoreDatabaseTask. - struct CreateDatabaseInfo - { - ASTPtr create_query; - }; - - ContextMutablePtr context; - BackupPtr backup; - RestoreSettings restore_settings; - std::shared_ptr restore_coordination; - std::chrono::seconds timeout_for_restoring_metadata; - DDLRenamingSettings renaming_settings; - std::map databases; - std::map tables; - }; - - - RestoreTasks makeRestoreTasksImpl( - ContextMutablePtr context, - const BackupPtr & backup, - const Elements & elements, - const RestoreSettings & restore_settings, - const std::shared_ptr & restore_coordination, - std::chrono::seconds timeout_for_restoring_metadata) - { - RestoreTasksBuilder builder{context, backup, restore_settings, restore_coordination, timeout_for_restoring_metadata}; - builder.prepare(elements); - return builder.makeTasks(); - } - - - void restoreMetadataImpl(RestoreTasks & restore_tasks) - { - /// There are two kinds of restore tasks: sequential and non-sequential ones. - /// Sequential tasks are executed first and always in one thread. - std::deque> restore_metadata_tasks; - boost::range::remove_erase_if( - restore_tasks, - [&restore_metadata_tasks](RestoreTaskPtr & task) - { - if (task->getRestoreKind() == IRestoreTask::RestoreKind::METADATA) - { - restore_metadata_tasks.push_back(std::move(task)); - return true; - } - return false; - }); - - /// Sequential tasks. 
- while (!restore_metadata_tasks.empty()) - { - auto current_task = std::move(restore_metadata_tasks.front()); - restore_metadata_tasks.pop_front(); - - RestoreTasks new_tasks = current_task->run(); - - for (auto & task : new_tasks) - { - if (task->getRestoreKind() == IRestoreTask::RestoreKind::METADATA) - restore_metadata_tasks.push_back(std::move(task)); - else - restore_tasks.push_back(std::move(task)); - } - } - } -} - - -RestoreTasks makeRestoreTasks( - ContextMutablePtr context, - const BackupPtr & backup, - const Elements & elements, - const RestoreSettings & restore_settings, - const std::shared_ptr & restore_coordination, - std::chrono::seconds timeout_for_restoring_metadata) -{ - try - { - return makeRestoreTasksImpl(context, backup, elements, restore_settings, restore_coordination, timeout_for_restoring_metadata); - } - catch (...) - { - restore_coordination->finishRestoringMetadata(restore_settings.host_id, getCurrentExceptionMessage(false)); - throw; - } -} - - -void restoreMetadata( - RestoreTasks & restore_tasks, - const RestoreSettings & restore_settings, - const std::shared_ptr & restore_coordination, - std::chrono::seconds timeout_for_restoring_metadata) -{ - try - { - restoreMetadataImpl(restore_tasks); - } - catch (...) - { - restore_coordination->finishRestoringMetadata(restore_settings.host_id, getCurrentExceptionMessage(false)); - throw; - } - - /// We've finished restoring metadata, now we will wait for other replicas and shards to finish too. - /// We need this waiting because we're going to call some functions which requires data collected from other nodes too, - /// see IRestoreCoordination::checkTablesNotExistedInReplicatedDBs(), IRestoreCoordination::getReplicatedTableDataPath(). - restore_coordination->finishRestoringMetadata(restore_settings.host_id); - - restore_coordination->waitForAllHostsRestoredMetadata( - BackupSettings::Util::filterHostIDs( - restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num), - timeout_for_restoring_metadata); -} - - -void restoreData(RestoreTasks & restore_tasks, ThreadPool & thread_pool) -{ - std::deque> tasks(std::make_move_iterator(restore_tasks.begin()), std::make_move_iterator(restore_tasks.end())); - restore_tasks.clear(); - - /// Non-sequential tasks. - size_t num_active_jobs = 0; - std::mutex mutex; - std::condition_variable event; - std::exception_ptr exception; - - while (true) - { - std::unique_ptr current_task; - { - std::unique_lock lock{mutex}; - event.wait(lock, [&] { return !tasks.empty() || exception || !num_active_jobs; }); - if ((tasks.empty() && !num_active_jobs) || exception) - break; - current_task = std::move(tasks.front()); - tasks.pop_front(); - ++num_active_jobs; - } - - auto job = [current_task = std::shared_ptr(std::move(current_task)), &tasks, &num_active_jobs, &exception, &mutex, &event]() mutable - { - SCOPE_EXIT({ - --num_active_jobs; - event.notify_all(); - }); - - { - std::lock_guard lock{mutex}; - if (exception) - return; - } - - RestoreTasks new_tasks; - try - { - new_tasks = current_task->run(); - } - catch (...) 
- { - std::lock_guard lock{mutex}; - if (!exception) - exception = std::current_exception(); - } - - { - std::lock_guard lock{mutex}; - tasks.insert(tasks.end(), std::make_move_iterator(new_tasks.begin()), std::make_move_iterator(new_tasks.end())); - } - }; - - if (!thread_pool.trySchedule(job)) - job(); - } - - { - std::unique_lock lock{mutex}; - event.wait(lock, [&] { return !num_active_jobs; }); - } - - if (exception) - std::rethrow_exception(exception); -} - - -/// Returns access required to execute RESTORE query. -AccessRightsElements getRequiredAccessToRestore(const ASTBackupQuery::Elements & elements, const RestoreSettings & restore_settings) -{ - AccessRightsElements required_access; - for (const auto & element : elements) - { - switch (element.type) - { - case ASTBackupQuery::TABLE: - { - if (element.is_temp_db) - { - if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) - required_access.emplace_back(AccessType::CREATE_TEMPORARY_TABLE); - break; - } - AccessFlags flags = AccessType::SHOW_TABLES; - if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) - flags |= AccessType::CREATE_TABLE; - if (!restore_settings.structure_only) - flags |= AccessType::INSERT; - required_access.emplace_back(flags, element.new_name.first, element.new_name.second); - break; - } - case ASTBackupQuery::DATABASE: - { - if (element.is_temp_db) - { - if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) - required_access.emplace_back(AccessType::CREATE_TEMPORARY_TABLE); - break; - } - AccessFlags flags = AccessType::SHOW_TABLES | AccessType::SHOW_DATABASES; - if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) - flags |= AccessType::CREATE_TABLE; - if (restore_settings.create_database != RestoreDatabaseCreationMode::kMustExist) - flags |= AccessType::CREATE_DATABASE; - if (!restore_settings.structure_only) - flags |= AccessType::INSERT; - required_access.emplace_back(flags, element.new_name.first); - break; - } - case ASTBackupQuery::ALL_DATABASES: - { - AccessFlags flags = AccessType::SHOW_TABLES | AccessType::SHOW_DATABASES; - if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) - flags |= AccessType::CREATE_TABLE; - if (restore_settings.create_database != RestoreDatabaseCreationMode::kMustExist) - flags |= AccessType::CREATE_DATABASE; - if (!restore_settings.structure_only) - flags |= AccessType::INSERT; - required_access.emplace_back(flags); - break; - } - } - } - return required_access; -} - -} diff --git a/src/Backups/RestoreUtils.h b/src/Backups/RestoreUtils.h deleted file mode 100644 index 1c9632a903b..00000000000 --- a/src/Backups/RestoreUtils.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ - -class IBackup; -using BackupPtr = std::shared_ptr; -class IRestoreTask; -using RestoreTaskPtr = std::unique_ptr; -using RestoreTasks = std::vector; -struct RestoreSettings; -class IRestoreCoordination; -class AccessRightsElements; -class Context; -using ContextPtr = std::shared_ptr; -using ContextMutablePtr = std::shared_ptr; - -/// Prepares restore tasks. -RestoreTasks makeRestoreTasks(ContextMutablePtr context, const BackupPtr & backup, const ASTBackupQuery::Elements & elements, const RestoreSettings & restore_settings, const std::shared_ptr & restore_coordination, std::chrono::seconds timeout_for_restoring_metadata); - -/// Executes restore tasks. 
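restoreData() above is a small scheduler: tasks are pulled from a deque, run on a thread pool (or inline when the pool refuses the job), may produce follow-up tasks, and the first exception stops further scheduling and is rethrown at the end. A self-contained sketch of the same pattern under the assumption that plain std::thread replaces ClickHouse's bounded ThreadPool; the Task/runTasks names are illustrative only:

    #include <condition_variable>
    #include <deque>
    #include <exception>
    #include <functional>
    #include <memory>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Task
    {
        /// Running a task may produce follow-up tasks (e.g. one per part to restore).
        std::function<std::vector<std::shared_ptr<Task>>()> run;
    };

    void runTasks(std::deque<std::shared_ptr<Task>> tasks)
    {
        std::mutex mutex;
        std::condition_variable event;
        size_t num_active_jobs = 0;
        std::exception_ptr exception;
        std::vector<std::thread> threads; /// the real code uses a bounded ThreadPool with an inline fallback

        while (true)
        {
            std::shared_ptr<Task> current;
            {
                std::unique_lock lock{mutex};
                event.wait(lock, [&] { return !tasks.empty() || exception || !num_active_jobs; });
                if ((tasks.empty() && !num_active_jobs) || exception)
                    break;
                current = std::move(tasks.front());
                tasks.pop_front();
                ++num_active_jobs;
            }

            threads.emplace_back([current, &tasks, &mutex, &event, &num_active_jobs, &exception]
            {
                std::vector<std::shared_ptr<Task>> new_tasks;
                try
                {
                    new_tasks = current->run();
                }
                catch (...)
                {
                    std::lock_guard lock{mutex};
                    if (!exception)
                        exception = std::current_exception();
                }
                {
                    std::lock_guard lock{mutex};
                    tasks.insert(tasks.end(), new_tasks.begin(), new_tasks.end());
                    --num_active_jobs;
                }
                event.notify_all();
            });
        }

        for (auto & thread : threads)
            thread.join();

        if (exception)
            std::rethrow_exception(exception);
    }
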
-void restoreMetadata( - RestoreTasks & restore_tasks, - const RestoreSettings & restore_settings, - const std::shared_ptr & restore_coordination, - std::chrono::seconds timeout_for_restoring_metadata); - -void restoreData(RestoreTasks & restore_tasks, ThreadPool & thread_pool); - - -/// Returns access required to execute RESTORE query. -AccessRightsElements getRequiredAccessToRestore(const ASTBackupQuery::Elements & elements, const RestoreSettings & restore_settings); - -} diff --git a/src/Backups/RestorerFromBackup.cpp b/src/Backups/RestorerFromBackup.cpp new file mode 100644 index 00000000000..14f5b7f48f0 --- /dev/null +++ b/src/Backups/RestorerFromBackup.cpp @@ -0,0 +1,817 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BACKUP_ENTRY_NOT_FOUND; + extern const int CANNOT_RESTORE_TABLE; + extern const int CANNOT_RESTORE_DATABASE; + extern const int LOGICAL_ERROR; +} + + +namespace +{ + constexpr const std::string_view sql_ext = ".sql"; + + String tryGetTableEngine(const IAST & ast) + { + const ASTCreateQuery * create = ast.as(); + if (!create) + return {}; + if (!create->storage || !create->storage->engine) + return {}; + return create->storage->engine->name; + } + + bool hasSystemTableEngine(const IAST & ast) + { + return tryGetTableEngine(ast).starts_with("System"); + } + + bool hasSystemAccessTableEngine(const IAST & ast) + { + String engine_name = tryGetTableEngine(ast); + return (engine_name == "SystemUsers") || (engine_name == "SystemRoles") || (engine_name == "SystemSettingsProfiles") + || (engine_name == "SystemRowPolicies") || (engine_name == "SystemQuotas"); + } +} + +bool RestorerFromBackup::TableKey::operator ==(const TableKey & right) const +{ + return (name == right.name) && (is_temporary == right.is_temporary); +} + +bool RestorerFromBackup::TableKey::operator <(const TableKey & right) const +{ + return (name < right.name) || ((name == right.name) && (is_temporary < right.is_temporary)); +} + +std::string_view RestorerFromBackup::toString(Stage stage) +{ + switch (stage) + { + case Stage::kPreparing: return "Preparing"; + case Stage::kFindingTablesInBackup: return "Finding tables in backup"; + case Stage::kCreatingDatabases: return "Creating databases"; + case Stage::kCreatingTables: return "Creating tables"; + case Stage::kInsertingDataToTables: return "Inserting data to tables"; + case Stage::kError: return "Error"; + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown restore stage: {}", static_cast(stage)); +} + + +RestorerFromBackup::RestorerFromBackup( + const ASTBackupQuery::Elements & restore_query_elements_, + const RestoreSettings & restore_settings_, + std::shared_ptr restore_coordination_, + const BackupPtr & backup_, + const ContextMutablePtr & context_, + std::chrono::seconds timeout_) + : restore_query_elements(restore_query_elements_) + , restore_settings(restore_settings_) + , restore_coordination(restore_coordination_) + , backup(backup_) + , context(context_) + , timeout(timeout_) + , log(&Poco::Logger::get("RestorerFromBackup")) +{ +} + +RestorerFromBackup::~RestorerFromBackup() = default; + +void RestorerFromBackup::restoreMetadata() +{ + run(/* only_check_access= */ false); +} + +void RestorerFromBackup::checkAccessOnly() +{ + run(/* only_check_access= */ 
true); +} + +void RestorerFromBackup::run(bool only_check_access) +{ + try + { + /// restoreMetadata() must not be called multiple times. + if (current_stage != Stage::kPreparing) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring"); + + /// Calculate the root path in the backup for restoring, it's either empty or has the format "shards//replicas//". + findRootPathsInBackup(); + + /// Do renaming in the create queries according to the renaming config. + renaming_map = makeRenamingMapFromBackupQuery(restore_query_elements); + + /// Find all the databases and tables which we will read from the backup. + setStage(Stage::kFindingTablesInBackup); + collectDatabaseAndTableInfos(); + + /// Check access rights. + checkAccessForCollectedInfos(); + if (only_check_access) + return; + + /// Create databases using the create queries read from the backup. + setStage(Stage::kCreatingDatabases); + createDatabases(); + + /// Create tables using the create queries read from the backup. + setStage(Stage::kCreatingTables); + createTables(); + + /// All what's left is to insert data to tables. + /// No more data restoring tasks are allowed after this point. + setStage(Stage::kInsertingDataToTables); + } + catch (...) + { + try + { + /// Other hosts should know that we've encountered an error. + setStage(Stage::kError, getCurrentExceptionMessage(false)); + } + catch (...) + { + } + throw; + } +} + + +RestorerFromBackup::DataRestoreTasks RestorerFromBackup::getDataRestoreTasks() +{ + if (current_stage != Stage::kInsertingDataToTables) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Metadata wasn't restored"); + + if (data_restore_tasks.empty() && !access_restore_task) + return {}; + + LOG_TRACE(log, "Will insert data to tables"); + + /// Storages and table locks must exist while we're executing data restoring tasks. 
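getDataRestoreTasks() below relies on the returned closures keeping the storages and their table locks alive: every task captures shared_ptrs to the shared vectors, so the last finished task releases them. A minimal illustration of that ownership pattern, with a generic Resource type standing in for the real StoragePtr/TableLockHolder pair:

    #include <functional>
    #include <memory>
    #include <vector>

    struct Resource { /* stands in for a storage and its table lock */ };

    std::vector<std::function<void()>> makeTasks(std::vector<std::shared_ptr<Resource>> resources)
    {
        /// One shared vector is captured (as a shared_ptr) by every task,
        /// so the resources stay alive until the last task is destroyed.
        auto keep_alive = std::make_shared<std::vector<std::shared_ptr<Resource>>>(std::move(resources));

        std::vector<std::function<void()>> tasks;
        for (size_t i = 0; i < keep_alive->size(); ++i)
            tasks.push_back([keep_alive, i] { /* restore data of (*keep_alive)[i] here */ });
        return tasks;
    }
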
+ auto storages = std::make_shared>(); + auto table_locks = std::make_shared>(); + storages->reserve(table_infos.size()); + table_locks->reserve(table_infos.size()); + for (const auto & table_info : table_infos | boost::adaptors::map_values) + { + storages->push_back(table_info.storage); + table_locks->push_back(table_info.table_lock); + } + + DataRestoreTasks res_tasks; + for (const auto & task : data_restore_tasks) + res_tasks.push_back([task, storages, table_locks] { task(); }); + + if (access_restore_task) + res_tasks.push_back([task = access_restore_task, access_control = &context->getAccessControl()] { task->restore(*access_control); }); + + return res_tasks; +} + +void RestorerFromBackup::setStage(Stage new_stage, const String & error_message) +{ + if (new_stage == Stage::kError) + LOG_ERROR(log, "{} failed with error: {}", toString(current_stage), error_message); + else + LOG_TRACE(log, "{}", toString(new_stage)); + + current_stage = new_stage; + + if (!restore_coordination) + return; + + if (new_stage == Stage::kError) + { + restore_coordination->syncStageError(restore_settings.host_id, error_message); + } + else + { + auto all_hosts + = BackupSettings::Util::filterHostIDs(restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); + restore_coordination->syncStage(restore_settings.host_id, static_cast(new_stage), all_hosts, timeout); + } +} + +void RestorerFromBackup::findRootPathsInBackup() +{ + size_t shard_num = 1; + size_t replica_num = 1; + if (!restore_settings.host_id.empty()) + { + std::tie(shard_num, replica_num) + = BackupSettings::Util::findShardNumAndReplicaNum(restore_settings.cluster_host_ids, restore_settings.host_id); + } + + root_paths_in_backup.clear(); + + /// Start with "" as the root path and then we will add shard- and replica-related part to it. + fs::path root_path = "/"; + root_paths_in_backup.push_back(root_path); + + /// Add shard-related part to the root path. + Strings shards_in_backup = backup->listFiles(root_path / "shards"); + if (shards_in_backup.empty()) + { + if (restore_settings.shard_num_in_backup > 1) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", restore_settings.shard_num_in_backup); + } + else + { + String shard_name; + if (restore_settings.shard_num_in_backup) + shard_name = std::to_string(restore_settings.shard_num_in_backup); + else if (shards_in_backup.size() == 1) + shard_name = shards_in_backup.front(); + else + shard_name = std::to_string(shard_num); + if (std::find(shards_in_backup.begin(), shards_in_backup.end(), shard_name) == shards_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", shard_name); + root_path = root_path / "shards" / shard_name; + root_paths_in_backup.push_back(root_path); + } + + /// Add replica-related part to the root path. 
+ Strings replicas_in_backup = backup->listFiles(root_path / "replicas"); + if (replicas_in_backup.empty()) + { + if (restore_settings.replica_num_in_backup > 1) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No replica #{} in backup", restore_settings.replica_num_in_backup); + } + else + { + String replica_name; + if (restore_settings.replica_num_in_backup) + { + replica_name = std::to_string(restore_settings.replica_num_in_backup); + if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), replica_name) == replicas_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No replica #{} in backup", replica_name); + } + else + { + replica_name = std::to_string(replica_num); + if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), replica_name) == replicas_in_backup.end()) + replica_name = replicas_in_backup.front(); + } + root_path = root_path / "replicas" / replica_name; + root_paths_in_backup.push_back(root_path); + } + + /// Revert the list of root paths, because we need it in the following order: + /// "/shards//replicas//" (first we search tables here) + /// "/shards//" (then here) + /// "/" (and finally here) + std::reverse(root_paths_in_backup.begin(), root_paths_in_backup.end()); + + LOG_TRACE( + log, + "Will use paths in backup: {}", + boost::algorithm::join( + root_paths_in_backup + | boost::adaptors::transformed([](const fs::path & path) -> String { return doubleQuoteString(String{path}); }), + ", ")); +} + +void RestorerFromBackup::collectDatabaseAndTableInfos() +{ + database_infos.clear(); + table_infos.clear(); + for (const auto & element : restore_query_elements) + { + switch (element.type) + { + case ASTBackupQuery::ElementType::TABLE: + { + collectTableInfo({element.database_name, element.table_name}, false, element.partitions); + break; + } + case ASTBackupQuery::ElementType::TEMPORARY_TABLE: + { + collectTableInfo({element.database_name, element.table_name}, true, element.partitions); + break; + } + case ASTBackupQuery::ElementType::DATABASE: + { + collectDatabaseInfo(element.database_name, element.except_tables, /* throw_if_no_database_metadata_in_backup= */ true); + break; + } + case ASTBackupQuery::ElementType::ALL: + { + collectAllDatabasesInfo(element.except_databases, element.except_tables); + break; + } + } + } + + LOG_INFO(log, "Will restore {} databases and {} tables", database_infos.size(), table_infos.size()); +} + +void RestorerFromBackup::collectTableInfo(const QualifiedTableName & table_name_in_backup, bool is_temporary_table, const std::optional & partitions) +{ + String database_name_in_backup = is_temporary_table ? 
DatabaseCatalog::TEMPORARY_DATABASE : table_name_in_backup.database; + + std::optional metadata_path; + std::optional root_path_in_use; + for (const auto & root_path_in_backup : root_paths_in_backup) + { + fs::path try_metadata_path; + if (is_temporary_table) + { + try_metadata_path + = root_path_in_backup / "temporary_tables" / "metadata" / (escapeForFileName(table_name_in_backup.table) + ".sql"); + } + else + { + try_metadata_path = root_path_in_backup / "metadata" / escapeForFileName(table_name_in_backup.database) + / (escapeForFileName(table_name_in_backup.table) + ".sql"); + } + + if (backup->fileExists(try_metadata_path)) + { + metadata_path = try_metadata_path; + root_path_in_use = root_path_in_backup; + break; + } + } + + if (!metadata_path) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Table {} not found in backup", table_name_in_backup.getFullName()); + + TableKey table_key; + fs::path data_path_in_backup; + if (is_temporary_table) + { + data_path_in_backup = *root_path_in_use / "temporary_tables" / "data" / escapeForFileName(table_name_in_backup.table); + table_key.name.table = renaming_map.getNewTemporaryTableName(table_name_in_backup.table); + table_key.is_temporary = true; + } + else + { + data_path_in_backup + = *root_path_in_use / "data" / escapeForFileName(table_name_in_backup.database) / escapeForFileName(table_name_in_backup.table); + table_key.name = renaming_map.getNewTableName(table_name_in_backup); + } + + auto read_buffer = backup->readFile(*metadata_path)->getReadBuffer(); + String create_query_str; + readStringUntilEOF(create_query_str, *read_buffer); + read_buffer.reset(); + ParserCreateQuery create_parser; + ASTPtr create_table_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); + renameDatabaseAndTableNameInCreateQuery(context->getGlobalContext(), renaming_map, create_table_query); + + if (auto it = table_infos.find(table_key); it != table_infos.end()) + { + const TableInfo & table_info = it->second; + if (table_info.create_table_query && (serializeAST(*table_info.create_table_query) != serializeAST(*create_table_query))) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_TABLE, + "Extracted two different create queries for the same {}table {}: {} and {}", + (is_temporary_table ? 
"temporary " : ""), + table_key.name.getFullName(), + serializeAST(*table_info.create_table_query), + serializeAST(*create_table_query)); + } + } + + TableInfo & res_table_info = table_infos[table_key]; + res_table_info.create_table_query = create_table_query; + res_table_info.data_path_in_backup = data_path_in_backup; + res_table_info.dependencies = getDependenciesSetFromCreateQuery(context->getGlobalContext(), table_key.name, create_table_query); + + if (partitions) + { + if (!res_table_info.partitions) + res_table_info.partitions.emplace(); + insertAtEnd(*res_table_info.partitions, *partitions); + } + + if (hasSystemAccessTableEngine(*create_table_query)) + { + if (!access_restore_task) + access_restore_task = std::make_shared(backup, restore_settings, restore_coordination); + access_restore_task->addDataPath(data_path_in_backup); + } +} + +void RestorerFromBackup::collectDatabaseInfo(const String & database_name_in_backup, const std::set & except_table_names, bool throw_if_no_database_metadata_in_backup) +{ + std::optional metadata_path; + std::unordered_set table_names_in_backup; + for (const auto & root_path_in_backup : root_paths_in_backup) + { + fs::path try_metadata_path = root_path_in_backup / "metadata" / (escapeForFileName(database_name_in_backup) + ".sql"); + if (!metadata_path && backup->fileExists(try_metadata_path)) + metadata_path = try_metadata_path; + + Strings file_names = backup->listFiles(root_path_in_backup / "metadata" / escapeForFileName(database_name_in_backup)); + for (const String & file_name : file_names) + { + if (!file_name.ends_with(sql_ext)) + continue; + String file_name_without_ext = file_name.substr(0, file_name.length() - sql_ext.length()); + table_names_in_backup.insert(unescapeForFileName(file_name_without_ext)); + } + } + + if (!metadata_path && throw_if_no_database_metadata_in_backup) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Database {} not found in backup", backQuoteIfNeed(database_name_in_backup)); + + if (metadata_path) + { + auto read_buffer = backup->readFile(*metadata_path)->getReadBuffer(); + String create_query_str; + readStringUntilEOF(create_query_str, *read_buffer); + read_buffer.reset(); + ParserCreateQuery create_parser; + ASTPtr create_database_query = parseQuery(create_parser, create_query_str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); + renameDatabaseAndTableNameInCreateQuery(context->getGlobalContext(), renaming_map, create_database_query); + + String database_name = renaming_map.getNewDatabaseName(database_name_in_backup); + DatabaseInfo & database_info = database_infos[database_name]; + + if (database_info.create_database_query && (serializeAST(*database_info.create_database_query) != serializeAST(*create_database_query))) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_DATABASE, + "Extracted two different create queries for the same database {}: {} and {}", + backQuoteIfNeed(database_name), + serializeAST(*database_info.create_database_query), + serializeAST(*create_database_query)); + } + + database_info.create_database_query = create_database_query; + } + + for (const String & table_name_in_backup : table_names_in_backup) + { + if (except_table_names.contains({database_name_in_backup, table_name_in_backup})) + continue; + + collectTableInfo({database_name_in_backup, table_name_in_backup}, /* is_temporary_table= */ false, /* partitions= */ {}); + } +} + +void RestorerFromBackup::collectAllDatabasesInfo(const std::set & except_database_names, const std::set & except_table_names) +{ + std::unordered_set 
database_names_in_backup; + std::unordered_set temporary_table_names_in_backup; + + for (const auto & root_path_in_backup : root_paths_in_backup) + { + Strings file_names = backup->listFiles(root_path_in_backup / "metadata"); + for (String & file_name : file_names) + { + if (file_name.ends_with(sql_ext)) + file_name.resize(file_name.length() - sql_ext.length()); + database_names_in_backup.emplace(unescapeForFileName(file_name)); + } + + file_names = backup->listFiles(root_path_in_backup / "temporary_tables" / "metadata"); + for (String & file_name : file_names) + { + if (!file_name.ends_with(sql_ext)) + continue; + file_name.resize(file_name.length() - sql_ext.length()); + temporary_table_names_in_backup.emplace(unescapeForFileName(file_name)); + } + } + + for (const String & database_name_in_backup : database_names_in_backup) + { + if (except_database_names.contains(database_name_in_backup)) + continue; + + collectDatabaseInfo(database_name_in_backup, except_table_names, /* throw_if_no_database_metadata_in_backup= */ false); + } + + for (const String & temporary_table_name_in_backup : temporary_table_names_in_backup) + collectTableInfo({"", temporary_table_name_in_backup}, /* is_temporary_table= */ true, /* partitions= */ {}); +} + +void RestorerFromBackup::checkAccessForCollectedInfos() const +{ + AccessRightsElements required_access; + for (const auto & database_name : database_infos | boost::adaptors::map_keys) + { + if (DatabaseCatalog::isPredefinedDatabaseName(database_name)) + continue; + + AccessFlags flags; + + if (restore_settings.create_database != RestoreDatabaseCreationMode::kMustExist) + flags |= AccessType::CREATE_DATABASE; + + if (!flags) + flags = AccessType::SHOW_DATABASES; + + required_access.emplace_back(flags, database_name); + } + + for (const auto & [table_name, table_info] : table_infos) + { + if (hasSystemTableEngine(*table_info.create_table_query)) + continue; + + if (table_name.is_temporary) + { + if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) + required_access.emplace_back(AccessType::CREATE_TEMPORARY_TABLE); + continue; + } + + AccessFlags flags; + const ASTCreateQuery & create = table_info.create_table_query->as(); + + if (restore_settings.create_table != RestoreTableCreationMode::kMustExist) + { + if (create.is_dictionary) + flags |= AccessType::CREATE_DICTIONARY; + else if (create.is_ordinary_view || create.is_materialized_view || create.is_live_view) + flags |= AccessType::CREATE_VIEW; + else + flags |= AccessType::CREATE_TABLE; + } + + if (!restore_settings.structure_only && !create.is_dictionary && !create.is_ordinary_view + && backup->hasFiles(table_info.data_path_in_backup)) + { + flags |= AccessType::INSERT; + } + + if (!flags) + { + if (create.is_dictionary) + flags = AccessType::SHOW_DICTIONARIES; + else + flags = AccessType::SHOW_TABLES; + } + + required_access.emplace_back(flags, table_name.name.database, table_name.name.table); + } + + if (access_restore_task) + insertAtEnd(required_access, access_restore_task->getRequiredAccess()); + + /// We convert to AccessRights and back to check access rights in a predictable way + /// (some elements could be duplicated or not sorted). 
+ required_access = AccessRights{required_access}.getElements(); + + context->checkAccess(required_access); + +} + +void RestorerFromBackup::createDatabases() +{ + for (const auto & [database_name, database_info] : database_infos) + { + bool need_create_database = (restore_settings.create_database != RestoreDatabaseCreationMode::kMustExist); + if (need_create_database && DatabaseCatalog::isPredefinedDatabaseName(database_name)) + need_create_database = false; /// Predefined databases always exist. + + if (need_create_database) + { + /// Execute CREATE DATABASE query. + auto create_database_query = database_info.create_database_query; + if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists) + { + create_database_query = create_database_query->clone(); + create_database_query->as().if_not_exists = true; + } + LOG_TRACE(log, "Creating database {}: {}", backQuoteIfNeed(database_name), serializeAST(*create_database_query)); + executeCreateQuery(create_database_query); + } + + DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name); + + if (!restore_settings.allow_different_database_def) + { + /// Check that the database's definition is the same as expected. + ASTPtr create_database_query = database->getCreateDatabaseQueryForBackup(); + ASTPtr expected_create_query = database_info.create_database_query; + if (serializeAST(*create_database_query) != serializeAST(*expected_create_query)) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_DATABASE, + "The database {} has a different definition: {} " + "comparing to its definition in the backup: {}", + backQuoteIfNeed(database_name), + serializeAST(*create_database_query), + serializeAST(*expected_create_query)); + } + } + } +} + +void RestorerFromBackup::createTables() +{ + while (true) + { + /// We need to create tables considering their dependencies. + auto tables_to_create = findTablesWithoutDependencies(); + if (tables_to_create.empty()) + break; /// We've already created all the tables. + + for (const auto & table_key : tables_to_create) + { + auto & table_info = table_infos.at(table_key); + + DatabasePtr database; + if (table_key.is_temporary) + database = DatabaseCatalog::instance().getDatabaseForTemporaryTables(); + else + database = DatabaseCatalog::instance().getDatabase(table_key.name.database); + + bool need_create_table = (restore_settings.create_table != RestoreTableCreationMode::kMustExist); + if (need_create_table && hasSystemTableEngine(*table_info.create_table_query)) + need_create_table = false; /// Tables with System* table engine already exist or can't be created by SQL anyway. + + if (need_create_table) + { + /// Execute CREATE TABLE query (we call IDatabase::createTableRestoredFromBackup() to allow the database to do some + /// database-specific things). + auto create_table_query = table_info.create_table_query; + if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists) + { + create_table_query = create_table_query->clone(); + create_table_query->as().if_not_exists = true; + } + LOG_TRACE( + log, + "Creating {}table {}: {}", + (table_key.is_temporary ? "temporary " : ""), + table_key.name.getFullName(), + serializeAST(*create_table_query)); + + database->createTableRestoredFromBackup(create_table_query, *this); + } + + table_info.created = true; + + auto resolved_id = table_key.is_temporary + ? 
context->resolveStorageID(StorageID{"", table_key.name.table}, Context::ResolveExternal) + : context->resolveStorageID(StorageID{table_key.name.database, table_key.name.table}, Context::ResolveGlobal); + + auto storage = database->getTable(resolved_id.table_name, context); + table_info.storage = storage; + table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); + + if (!restore_settings.allow_different_table_def) + { + ASTPtr create_table_query = storage->getCreateQueryForBackup(context, nullptr); + ASTPtr expected_create_query = table_info.create_table_query; + if (serializeAST(*create_table_query) != serializeAST(*expected_create_query)) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_TABLE, + "The {}table {} has a different definition: {} " + "comparing to its definition in the backup: {}", + (table_key.is_temporary ? "temporary " : ""), + table_key.name.getFullName(), + serializeAST(*create_table_query), + serializeAST(*expected_create_query)); + } + } + + if (!restore_settings.structure_only) + { + const auto & data_path_in_backup = table_info.data_path_in_backup; + const auto & partitions = table_info.partitions; + storage->restoreDataFromBackup(*this, data_path_in_backup, partitions); + } + } + } +} + +/// Returns the list of tables without dependencies or those which dependencies have been created before. +std::vector RestorerFromBackup::findTablesWithoutDependencies() const +{ + std::vector tables_without_dependencies; + bool all_tables_created = true; + + for (const auto & [key, table_info] : table_infos) + { + if (table_info.created) + continue; + + /// Found a table which is not created yet. + all_tables_created = false; + + /// Check if all dependencies have been created before. + bool all_dependencies_met = true; + for (const auto & dependency : table_info.dependencies) + { + auto it = table_infos.find(TableKey{dependency, false}); + if ((it != table_infos.end()) && !it->second.created) + { + all_dependencies_met = false; + break; + } + } + + if (all_dependencies_met) + tables_without_dependencies.push_back(key); + } + + if (!tables_without_dependencies.empty()) + return tables_without_dependencies; + + if (all_tables_created) + return {}; + + /// Cyclic dependency? We'll try to create those tables anyway but probably it's going to fail. + std::vector tables_with_cyclic_dependencies; + for (const auto & [key, table_info] : table_infos) + { + if (!table_info.created) + tables_with_cyclic_dependencies.push_back(key); + } + + /// Only show a warning here, proper exception will be thrown later on creating those tables. 
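createTables()/findTablesWithoutDependencies() above repeatedly create every table whose dependencies have already been created, and fall back to "create anyway" when only cyclically-dependent tables remain. The same round-based ordering in isolation, a sketch using plain strings instead of TableKey/QualifiedTableName:

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    struct TableToCreate
    {
        std::set<std::string> dependencies;
        bool created = false;
    };

    /// Returns tables that can be created now: every dependency we know about is already created.
    std::vector<std::string> findReadyTables(const std::map<std::string, TableToCreate> & tables)
    {
        std::vector<std::string> ready;
        bool all_created = true;
        for (const auto & [name, info] : tables)
        {
            if (info.created)
                continue;
            all_created = false;

            bool deps_met = true;
            for (const auto & dep : info.dependencies)
            {
                auto it = tables.find(dep);
                if (it != tables.end() && !it->second.created)
                {
                    deps_met = false;
                    break;
                }
            }
            if (deps_met)
                ready.push_back(name);
        }

        if (!ready.empty() || all_created)
            return ready;

        /// Only cyclic dependencies are left: return them all and let creation fail with a proper error.
        for (const auto & [name, info] : tables)
            if (!info.created)
                ready.push_back(name);
        return ready;
    }

The caller loops: take a batch from findReadyTables(), stop when it is empty, otherwise create each table and mark it created before asking for the next batch.
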
+    LOG_WARNING(
+        log,
+        "Some tables have cyclic dependencies on each other: {}",
+        boost::algorithm::join(
+            tables_with_cyclic_dependencies
+                | boost::adaptors::transformed([](const TableKey & key) -> String { return key.name.getFullName(); }),
+            ", "));
+
+    return tables_with_cyclic_dependencies;
+}
+
+void RestorerFromBackup::addDataRestoreTask(DataRestoreTask && new_task)
+{
+    if (current_stage == Stage::kInsertingDataToTables)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding data-restoring tasks is not allowed");
+    data_restore_tasks.push_back(std::move(new_task));
+}
+
+void RestorerFromBackup::addDataRestoreTasks(DataRestoreTasks && new_tasks)
+{
+    if (current_stage == Stage::kInsertingDataToTables)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding data-restoring tasks is not allowed");
+    insertAtEnd(data_restore_tasks, std::move(new_tasks));
+}
+
+void RestorerFromBackup::checkPathInBackupToRestoreAccess(const String & path)
+{
+    if (!access_restore_task || !access_restore_task->hasDataPath(path))
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Path to restore access was not added");
+}
+
+void RestorerFromBackup::executeCreateQuery(const ASTPtr & create_query) const
+{
+    InterpreterCreateQuery interpreter{create_query, context};
+    interpreter.setInternal(true);
+    interpreter.execute();
+}
+
+void RestorerFromBackup::throwPartitionsNotSupported(const StorageID & storage_id, const String & table_engine)
+{
+    throw Exception(
+        ErrorCodes::CANNOT_RESTORE_TABLE,
+        "Table engine {} doesn't support partitions, cannot restore table {}",
+        table_engine,
+        storage_id.getFullTableName());
+}
+
+void RestorerFromBackup::throwTableIsNotEmpty(const StorageID & storage_id)
+{
+    throw Exception(
+        ErrorCodes::CANNOT_RESTORE_TABLE,
+        "Cannot restore the table {} because it already contains some data. You can set structure_only=true or "
+        "allow_non_empty_tables=true to overcome that in the way you want",
+        storage_id.getFullTableName());
+}
+}
diff --git a/src/Backups/RestorerFromBackup.h b/src/Backups/RestorerFromBackup.h
new file mode 100644
index 00000000000..65139e0b946
--- /dev/null
+++ b/src/Backups/RestorerFromBackup.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+class IBackup;
+using BackupPtr = std::shared_ptr;
+class IRestoreCoordination;
+struct StorageID;
+class AccessRestoreTask;
+
+/// Restores the definition of databases and tables and prepares tasks to restore the data of the tables.
+class RestorerFromBackup : private boost::noncopyable
+{
+public:
+    RestorerFromBackup(
+        const ASTBackupQuery::Elements & restore_query_elements_,
+        const RestoreSettings & restore_settings_,
+        std::shared_ptr restore_coordination_,
+        const BackupPtr & backup_,
+        const ContextMutablePtr & context_,
+        std::chrono::seconds timeout_);
+
+    ~RestorerFromBackup();
+
+    /// Restores the definition of databases and tables and prepares tasks to restore the data of the tables.
+    /// restoreMetadata() checks access rights internally so checkAccessOnly() shouldn't be called first.
+    void restoreMetadata();
+
+    /// Only checks access rights without restoring anything. 
+ void checkAccessOnly(); + + using DataRestoreTask = std::function; + using DataRestoreTasks = std::vector; + DataRestoreTasks getDataRestoreTasks(); + + BackupPtr getBackup() const { return backup; } + const RestoreSettings & getRestoreSettings() const { return restore_settings; } + bool isNonEmptyTableAllowed() const { return getRestoreSettings().allow_non_empty_tables; } + std::shared_ptr getRestoreCoordination() const { return restore_coordination; } + std::chrono::seconds getTimeout() const { return timeout; } + ContextMutablePtr getContext() const { return context; } + void executeCreateQuery(const ASTPtr & create_query) const; + + /// Adds a data restore task which will be later returned by getDataRestoreTasks(). + /// This function can be called by implementations of IStorage::restoreFromBackup() in inherited storage classes. + void addDataRestoreTask(DataRestoreTask && new_task); + void addDataRestoreTasks(DataRestoreTasks && new_tasks); + + /// Adds a new data path to restore access control. + void checkPathInBackupToRestoreAccess(const String & path); + + /// Reading a backup includes a few stages: + enum class Stage + { + /// Initial stage. + kPreparing, + + /// Finding databases and tables in the backup which we're going to restore. + kFindingTablesInBackup, + + /// Creating databases or finding them and checking their definitions. + kCreatingDatabases, + + /// Creating tables or finding them and checking their definition. + kCreatingTables, + + /// Inserting restored data to tables. + kInsertingDataToTables, + + /// An error happens during any of the stages above, the backup is not restored properly. + kError = -1, + }; + static std::string_view toString(Stage stage); + + /// Throws an exception that a specified table engine doesn't support partitions. + [[noreturn]] static void throwPartitionsNotSupported(const StorageID & storage_id, const String & table_engine); + + /// Throws an exception that a specified table is already non-empty. 
+ [[noreturn]] static void throwTableIsNotEmpty(const StorageID & storage_id); + +private: + const ASTBackupQuery::Elements restore_query_elements; + const RestoreSettings restore_settings; + std::shared_ptr restore_coordination; + BackupPtr backup; + ContextMutablePtr context; + std::chrono::seconds timeout; + Poco::Logger * log; + + Stage current_stage = Stage::kPreparing; + std::vector root_paths_in_backup; + DDLRenamingMap renaming_map; + + void run(bool only_check_access); + void setStage(Stage new_stage, const String & error_message = {}); + void findRootPathsInBackup(); + void collectDatabaseAndTableInfos(); + void collectTableInfo(const QualifiedTableName & table_name_in_backup, bool is_temporary_table, const std::optional & partitions); + void collectDatabaseInfo(const String & database_name_in_backup, const std::set & except_table_names, bool throw_if_no_database_metadata_in_backup); + void collectAllDatabasesInfo(const std::set & except_database_names, const std::set & except_table_names); + void checkAccessForCollectedInfos() const; + void createDatabases(); + void createTables(); + + struct DatabaseInfo + { + ASTPtr create_database_query; + }; + + struct TableInfo + { + ASTPtr create_table_query; + std::optional partitions; + std::filesystem::path data_path_in_backup; + std::unordered_set dependencies; + bool created = false; + StoragePtr storage; + TableLockHolder table_lock; + }; + + struct TableKey + { + QualifiedTableName name; + bool is_temporary = false; + bool operator ==(const TableKey & right) const; + bool operator <(const TableKey & right) const; + }; + + std::vector findTablesWithoutDependencies() const; + + std::unordered_map database_infos; + std::map table_infos; + std::vector data_restore_tasks; + std::shared_ptr access_restore_task; +}; + +} diff --git a/src/Backups/formatTableNameOrTemporaryTableName.cpp b/src/Backups/formatTableNameOrTemporaryTableName.cpp deleted file mode 100644 index 7338e1dab23..00000000000 --- a/src/Backups/formatTableNameOrTemporaryTableName.cpp +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include -#include - - -namespace DB -{ - -String formatTableNameOrTemporaryTableName(const DatabaseAndTableName & table_name) -{ - if (table_name.first == DatabaseCatalog::TEMPORARY_DATABASE) - return "temporary table " + backQuoteIfNeed(table_name.second); - else - return "table " + backQuoteIfNeed(table_name.first) + "." + backQuoteIfNeed(table_name.second); -} - -} diff --git a/src/Backups/formatTableNameOrTemporaryTableName.h b/src/Backups/formatTableNameOrTemporaryTableName.h deleted file mode 100644 index a6b94cd4077..00000000000 --- a/src/Backups/formatTableNameOrTemporaryTableName.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ -using DatabaseAndTableName = std::pair; - -/// Outputs either "table db_name.table_name" or "temporary table table_name". 
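Back in RestorerFromBackup.h, the TableKey struct only needs equality and ordering so that it can serve as the std::map key for table_infos. A plausible implementation is plain lexicographic comparison (the real definitions live in RestorerFromBackup.cpp, outside the part of the patch shown here):

    bool RestorerFromBackup::TableKey::operator ==(const TableKey & right) const
    {
        return (name == right.name) && (is_temporary == right.is_temporary);
    }

    bool RestorerFromBackup::TableKey::operator <(const TableKey & right) const
    {
        return (name < right.name) || ((name == right.name) && (is_temporary < right.is_temporary));
    }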
-String formatTableNameOrTemporaryTableName(const DatabaseAndTableName & table_name); - -} diff --git a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp deleted file mode 100644 index cf6190c4c27..00000000000 --- a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include -#include - - -namespace DB -{ - -void replaceTableUUIDWithMacroInReplicatedTableDef(ASTCreateQuery & create_query, const UUID & table_uuid) -{ - if (create_query.getTable().empty() || !create_query.storage || !create_query.storage->engine || (table_uuid == UUIDHelpers::Nil)) - return; - - auto & engine = *(create_query.storage->engine); - if (!engine.name.starts_with("Replicated") || !engine.arguments) - return; - - auto * args = typeid_cast(engine.arguments.get()); - - size_t zookeeper_path_arg_pos = engine.name.starts_with("ReplicatedGraphite") ? 1 : 0; - - if (!args || (args->children.size() <= zookeeper_path_arg_pos)) - return; - - auto * zookeeper_path_arg = typeid_cast(args->children[zookeeper_path_arg_pos].get()); - if (!zookeeper_path_arg || (zookeeper_path_arg->value.getType() != Field::Types::String)) - return; - - String & zookeeper_path = zookeeper_path_arg->value.get(); - - String table_uuid_str = toString(table_uuid); - if (size_t uuid_pos = zookeeper_path.find(table_uuid_str); uuid_pos != String::npos) - zookeeper_path.replace(uuid_pos, table_uuid_str.size(), "{uuid}"); -} - -} diff --git a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h deleted file mode 100644 index e339b1c4536..00000000000 --- a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -class ASTCreateQuery; - -/// While making a replicated table it replaces "{uuid}" in zookeeper path with the real table UUID. -/// This function reverts this replacement.d -void replaceTableUUIDWithMacroInReplicatedTableDef(ASTCreateQuery & create_query, const UUID & table_uuid); - -} diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 733c2d6b4df..396fd97368e 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1125,7 +1125,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des if (need_render_progress && have_data_in_stdin) { /// Set total_bytes_to_read for current fd. - FileProgress file_progress(0, std_in.size()); + FileProgress file_progress(0, std_in.getFileSize()); progress_indication.updateProgress(Progress(file_progress)); /// Set callback to be called on file progress. 
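For context on the std_in.getFileSize() change above: FileProgress carries a pair of counters (bytes read so far, total bytes to read), so reporting the total once up front lets later increments be rendered as a percentage. A rough sketch of the idea, with the per-chunk callback wiring assumed rather than copied from this patch:

    /// Report the total size of stdin once, before any data is sent.
    progress_indication.updateProgress(Progress(FileProgress(0, std_in.getFileSize())));

    /// Afterwards each chunk read from stdin reports only its own size as an increment;
    /// total_bytes_to_read stays 0 so the previously reported total is kept.
    progress_indication.updateProgress(Progress(FileProgress(bytes_read_in_chunk, 0)));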
@@ -1275,7 +1275,7 @@ try } /// Check if server send Log packet - receiveLogs(parsed_query); + receiveLogsAndProfileEvents(parsed_query); /// Check if server send Exception packet auto packet_type = connection->checkPacket(0); @@ -1328,11 +1328,11 @@ void ClientBase::sendDataFromStdin(Block & sample, const ColumnsDescription & co /// Process Log packets, used when inserting data by blocks -void ClientBase::receiveLogs(ASTPtr parsed_query) +void ClientBase::receiveLogsAndProfileEvents(ASTPtr parsed_query) { auto packet_type = connection->checkPacket(0); - while (packet_type && *packet_type == Protocol::Server::Log) + while (packet_type && (*packet_type == Protocol::Server::Log || *packet_type == Protocol::Server::ProfileEvents)) { receiveAndProcessPacket(parsed_query, false); packet_type = connection->checkPacket(0); diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index d11977e984a..d34fe282839 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -117,7 +117,7 @@ protected: private: void receiveResult(ASTPtr parsed_query); bool receiveAndProcessPacket(ASTPtr parsed_query, bool cancelled_); - void receiveLogs(ASTPtr parsed_query); + void receiveLogsAndProfileEvents(ASTPtr parsed_query); bool receiveSampleBlock(Block & out, ColumnsDescription & columns_description, ASTPtr parsed_query); bool receiveEndOfQuery(); void cancelQuery(); diff --git a/src/Client/ConnectionPool.cpp b/src/Client/ConnectionPool.cpp index c5f398c899f..4ec87127318 100644 --- a/src/Client/ConnectionPool.cpp +++ b/src/Client/ConnectionPool.cpp @@ -22,7 +22,7 @@ ConnectionPoolPtr ConnectionPoolFactory::get( Key key{ max_connections, host, port, default_database, user, password, cluster, cluster_secret, client_name, compression, secure, priority}; - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); auto [it, inserted] = pools.emplace(key, ConnectionPoolPtr{}); if (!inserted) if (auto res = it->second.lock()) diff --git a/src/Client/ConnectionPool.h b/src/Client/ConnectionPool.h index ec1233df1b5..d1ee844358b 100644 --- a/src/Client/ConnectionPool.h +++ b/src/Client/ConnectionPool.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -179,7 +180,7 @@ public: private: mutable std::mutex mutex; using ConnectionPoolWeakPtr = std::weak_ptr; - std::unordered_map pools; + std::unordered_map pools TSA_GUARDED_BY(mutex); }; inline bool operator==(const ConnectionPoolFactory::Key & lhs, const ConnectionPoolFactory::Key & rhs) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 13d39980e1c..f2a07b64432 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -20,6 +20,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int ALL_CONNECTION_TRIES_FAILED; } @@ -45,6 +46,9 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts const Settings * settings, bool /*force_connected*/) { + if (nested_pools.empty()) + throw DB::Exception(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get connection from ConnectionPoolWithFailover cause nested pools are empty"); + TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message) { return tryGetEntry(pool, timeouts, fail_message, settings); @@ -167,6 +171,9 @@ std::vector ConnectionPoolWithFailover::g PoolMode pool_mode, const TryGetEntryFunc & try_get_entry) { + if (nested_pools.empty()) + throw DB::Exception(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED, 
"Cannot get connection from ConnectionPoolWithFailover cause nested pools are empty"); + size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; size_t max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 0707b0bcdc0..425e54fb392 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -18,6 +18,7 @@ namespace ErrorCodes extern const int UNKNOWN_PACKET_FROM_SERVER; extern const int UNKNOWN_EXCEPTION; extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } LocalConnection::LocalConnection(ContextPtr context_, bool send_progress_, bool send_profile_events_, const String & server_display_name_) @@ -62,9 +63,13 @@ void LocalConnection::updateProgress(const Progress & value) state->progress.incrementPiecewiseAtomically(value); } -void LocalConnection::getProfileEvents(Block & block) +void LocalConnection::sendProfileEvents() { - ProfileEvents::getProfileEvents(server_display_name, state->profile_queue, block, last_sent_snapshots); + Block profile_block; + state->after_send_profile_events.restart(); + next_packet_type = Protocol::Server::ProfileEvents; + ProfileEvents::getProfileEvents(server_display_name, state->profile_queue, profile_block, last_sent_snapshots); + state->block.emplace(std::move(profile_block)); } void LocalConnection::sendQuery( @@ -192,13 +197,14 @@ void LocalConnection::sendData(const Block & block, const String &, bool) return; if (state->pushing_async_executor) - { state->pushing_async_executor->push(block); - } else if (state->pushing_executor) - { state->pushing_executor->push(block); - } + else + throw Exception("Unknown executor", ErrorCodes::LOGICAL_ERROR); + + if (send_profile_events) + sendProfileEvents(); } void LocalConnection::sendCancel() @@ -264,11 +270,7 @@ bool LocalConnection::poll(size_t) if (send_profile_events && (state->after_send_profile_events.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay)) { - Block block; - state->after_send_profile_events.restart(); - next_packet_type = Protocol::Server::ProfileEvents; - getProfileEvents(block); - state->block.emplace(std::move(block)); + sendProfileEvents(); return true; } @@ -349,11 +351,7 @@ bool LocalConnection::poll(size_t) if (send_profile_events && state->executor) { - Block block; - state->after_send_profile_events.restart(); - next_packet_type = Protocol::Server::ProfileEvents; - getProfileEvents(block); - state->block.emplace(std::move(block)); + sendProfileEvents(); return true; } } diff --git a/src/Client/LocalConnection.h b/src/Client/LocalConnection.h index 1ad6ad73238..1ebe4a1d901 100644 --- a/src/Client/LocalConnection.h +++ b/src/Client/LocalConnection.h @@ -142,7 +142,7 @@ private: void updateProgress(const Progress & value); - void getProfileEvents(Block & block); + void sendProfileEvents(); bool pollImpl(); diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 6966dae3be3..6f2ac41cc08 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -631,6 +631,8 @@ M(660, HDFS_ERROR) \ M(661, CANNOT_SEND_SIGNAL) \ M(662, FS_METADATA_ERROR) \ + M(663, CANNOT_COLLECT_OBJECTS_FOR_BACKUP) \ + M(664, ACCESS_STORAGE_DOESNT_ALLOW_BACKUP) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/ErrorCodes.h b/src/Common/ErrorCodes.h index 4d5fa632e45..8879779a5e2 100644 --- a/src/Common/ErrorCodes.h +++ b/src/Common/ErrorCodes.h @@ -6,6 +6,7 @@ #include 
#include #include +#include #include /** Allows to count number of simultaneously happening error codes. @@ -57,7 +58,7 @@ namespace ErrorCodes void increment(bool remote, const std::string & message, const FramePointers & trace); private: - ErrorPair value; + ErrorPair value TSA_GUARDED_BY(mutex); std::mutex mutex; }; diff --git a/src/Common/FileCache.h b/src/Common/FileCache.h deleted file mode 100644 index cefa193bacb..00000000000 --- a/src/Common/FileCache.h +++ /dev/null @@ -1,430 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "FileCache_fwd.h" -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -class IFileCache; -using FileCachePtr = std::shared_ptr; - -/** - * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. - */ -class IFileCache : private boost::noncopyable -{ -friend class FileSegment; -friend struct FileSegmentsHolder; -friend class FileSegmentRangeWriter; - -public: - using Key = UInt128; - using Downloader = std::unique_ptr; - - IFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); - - virtual ~IFileCache() = default; - - /// Restore cache from local filesystem. - virtual void initialize() = 0; - - virtual void remove(const Key & key) = 0; - - virtual void remove() = 0; - - static bool isReadOnly(); - - /// Cache capacity in bytes. - size_t capacity() const { return max_size; } - - static Key hash(const String & path); - - String getPathInLocalCache(const Key & key, size_t offset); - - String getPathInLocalCache(const Key & key); - - const String & getBasePath() const { return cache_base_path; } - - virtual std::vector tryGetCachePaths(const Key & key) = 0; - - /** - * Given an `offset` and `size` representing [offset, offset + size) bytes interval, - * return list of cached non-overlapping non-empty - * file segments `[segment1, ..., segmentN]` which intersect with given interval. - * - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * As long as pointers to returned file segments are hold - * it is guaranteed that these file segments are not removed from cache. - */ - virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) = 0; - - /** - * Segments in returned list are ordered in ascending order and represent a full contiguous - * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. - * - * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" - * from cache (not owned by cache), and as a result will never change it's state and will be destructed - * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change - * it's state (and become DOWNLOADED). - */ - virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0; - - virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) = 0; - - virtual FileSegments getSnapshot() const = 0; - - /// For debug. 
- virtual String dumpStructure(const Key & key) = 0; - - virtual size_t getUsedCacheSize() const = 0; - - virtual size_t getFileSegmentsNum() const = 0; - -protected: - String cache_base_path; - size_t max_size; - size_t max_element_size; - size_t max_file_segment_size; - - bool is_initialized = false; - - mutable std::mutex mutex; - - class LRUQueue - { - public: - struct FileKeyAndOffset - { - Key key; - size_t offset; - size_t size; - size_t hits = 0; - - FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {} - }; - - using Iterator = typename std::list::iterator; - - size_t getTotalCacheSize(std::lock_guard & /* cache_lock */) const { return cache_size; } - - size_t getElementsNum(std::lock_guard & /* cache_lock */) const { return queue.size(); } - - Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); - - void remove(Iterator queue_it, std::lock_guard & cache_lock); - - void moveToEnd(Iterator queue_it, std::lock_guard & cache_lock); - - /// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry. - void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard & cache_lock); - - String toString(std::lock_guard & cache_lock) const; - - bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) const; - - Iterator begin() { return queue.begin(); } - - Iterator end() { return queue.end(); } - - void removeAll(std::lock_guard & cache_lock); - - private: - std::list queue; - size_t cache_size = 0; - }; - - using AccessKeyAndOffset = std::pair; - - struct KeyAndOffsetHash - { - std::size_t operator()(const AccessKeyAndOffset & key) const - { - return std::hash()(key.first) ^ std::hash()(key.second); - } - }; - - using AccessRecord = std::unordered_map; - - /// Used to track and control the cache access of each query. - /// Through it, we can realize the processing of different queries by the cache layer. 
- struct QueryContext - { - LRUQueue lru_queue; - AccessRecord records; - - size_t cache_size = 0; - size_t max_cache_size; - - bool skip_download_if_exceeds_query_cache; - - QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) - : max_cache_size(max_cache_size_) - , skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {} - - void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) - { - if (cache_size < size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record != records.end()) - { - lru_queue.remove(record->second, cache_lock); - records.erase({key, offset}); - } - } - cache_size -= size; - } - - void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) - { - if (cache_size + size > max_cache_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size"); - - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record == records.end()) - { - auto queue_iter = lru_queue.add(key, offset, 0, cache_lock); - record = records.insert({{key, offset}, queue_iter}).first; - } - record->second->size += size; - } - cache_size += size; - } - - void use(const Key & key, size_t offset, std::lock_guard & cache_lock) - { - if (!skip_download_if_exceeds_query_cache) - { - auto record = records.find({key, offset}); - if (record != records.end()) - lru_queue.moveToEnd(record->second, cache_lock); - } - } - - size_t getMaxCacheSize() { return max_cache_size; } - - size_t getCacheSize() { return cache_size; } - - LRUQueue & queue() { return lru_queue; } - - bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; } - }; - - using QueryContextPtr = std::shared_ptr; - using QueryContextMap = std::unordered_map; - - QueryContextMap query_map; - - bool enable_filesystem_query_cache_limit; - - QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); - - QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); - - void removeQueryContext(const String & query_id); - - QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); - - virtual bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) = 0; - - virtual void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - virtual bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - /// If file segment was partially downloaded and then space reservation fails (because of no - /// space left), then update corresponding cache cell metadata (file segment size). - virtual void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) = 0; - - void assertInitialized() const; - -public: - /// Save a query context information, and adopt different cache policies - /// for different queries through the context cache layer. 
- struct QueryContextHolder : private boost::noncopyable - { - explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_); - - QueryContextHolder() = default; - - ~QueryContextHolder(); - - String query_id {}; - IFileCache * cache = nullptr; - QueryContextPtr context = nullptr; - }; - - QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); -}; - -class LRUFileCache final : public IFileCache -{ -public: - LRUFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_); - - FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override; - - FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; - - FileSegments getSnapshot() const override; - - void initialize() override; - - void remove(const Key & key) override; - - void remove() override; - - std::vector tryGetCachePaths(const Key & key) override; - - size_t getUsedCacheSize() const override; - - size_t getFileSegmentsNum() const override; - -private: - struct FileSegmentCell : private boost::noncopyable - { - FileSegmentPtr file_segment; - - /// Iterator is put here on first reservation attempt, if successful. - std::optional queue_iterator; - - /// Pointer to file segment is always hold by the cache itself. - /// Apart from pointer in cache, it can be hold by cache users, when they call - /// getorSet(), but cache users always hold it via FileSegmentsHolder. - bool releasable() const { return file_segment.unique(); } - - size_t size() const { return file_segment->reserved_size; } - - FileSegmentCell(FileSegmentPtr file_segment_, LRUFileCache * cache, std::lock_guard & cache_lock); - - FileSegmentCell(FileSegmentCell && other) noexcept - : file_segment(std::move(other.file_segment)) - , queue_iterator(other.queue_iterator) {} - }; - - using FileSegmentsByOffset = std::map; - using CachedFiles = std::unordered_map; - - CachedFiles files; - LRUQueue queue; - - LRUQueue stash_queue; - AccessRecord records; - - size_t max_stash_element_size; - size_t enable_cache_hits_threshold; - - Poco::Logger * log; - - FileSegments getImpl( - const Key & key, const FileSegment::Range & range, - std::lock_guard & cache_lock); - - FileSegmentCell * getCell( - const Key & key, size_t offset, std::lock_guard & cache_lock); - - FileSegmentCell * addCell( - const Key & key, size_t offset, size_t size, - FileSegment::State state, std::lock_guard & cache_lock); - - void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); - - bool tryReserve( - const Key & key, size_t offset, size_t size, - std::lock_guard & cache_lock) override; - - bool tryReserveForMainList( - const Key & key, size_t offset, size_t size, - QueryContextPtr query_context, - std::lock_guard & cache_lock); - - void remove( - Key key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - bool isLastFileSegmentHolder( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - void reduceSizeToDownloaded( - const Key & key, size_t offset, - std::lock_guard & cache_lock, - std::lock_guard & segment_lock) override; - - size_t getAvailableCacheSize() const; - - void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); - - FileSegments splitRangeIntoCells( - const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard & cache_lock); - - String dumpStructureUnlocked(const Key & key_, 
std::lock_guard & cache_lock); - - void fillHolesWithEmptyFileSegments( - FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, std::lock_guard & cache_lock); - - FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) override; - - size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; - - size_t getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const; - - size_t getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const; - - void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard & cache_lock); - -public: - String dumpStructure(const Key & key_) override; - - void assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock); - - void assertCacheCorrectness(std::lock_guard & cache_lock); - - void assertQueueCorrectness(std::lock_guard & cache_lock); -}; - -} diff --git a/src/Common/FileCacheFactory.cpp b/src/Common/FileCacheFactory.cpp index 9eadea05547..e126ac014f2 100644 --- a/src/Common/FileCacheFactory.cpp +++ b/src/Common/FileCacheFactory.cpp @@ -1,5 +1,6 @@ #include "FileCacheFactory.h" -#include "FileCache.h" +#include "IFileCache.h" +#include "LRUFileCache.h" namespace DB { diff --git a/src/Common/FileCacheSettings.cpp b/src/Common/FileCacheSettings.cpp index 23cb418bc68..88ca6e3ce6b 100644 --- a/src/Common/FileCacheSettings.cpp +++ b/src/Common/FileCacheSettings.cpp @@ -13,6 +13,8 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false); enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false); enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD); + do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true); + allow_to_remove_persistent_segments_from_cache_by_default = config.getUInt64(config_prefix + ".allow_to_remove_persistent_segments_from_cache_by_default", true); } } diff --git a/src/Common/FileCacheSettings.h b/src/Common/FileCacheSettings.h index 989db16bd7e..1d8b613bedd 100644 --- a/src/Common/FileCacheSettings.h +++ b/src/Common/FileCacheSettings.h @@ -12,10 +12,14 @@ struct FileCacheSettings size_t max_size = 0; size_t max_elements = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS; size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE; + bool cache_on_write_operations = false; - bool enable_filesystem_query_cache_limit = false; size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD; + bool enable_filesystem_query_cache_limit = false; + + bool do_not_evict_index_and_mark_files = true; + bool allow_to_remove_persistent_segments_from_cache_by_default = true; void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); }; diff --git a/src/Common/FileSegment.cpp b/src/Common/FileSegment.cpp index 27a111c1297..1183abc0e22 100644 --- a/src/Common/FileSegment.cpp +++ b/src/Common/FileSegment.cpp @@ -1,11 +1,12 @@ #include "FileSegment.h" #include -#include #include +#include #include #include #include + namespace CurrentMetrics { extern const Metric CacheDetachedFileSegments; @@ -25,7 +26,8 @@ FileSegment::FileSegment( size_t size_, const Key & key_, IFileCache * cache_, - State 
download_state_) + State download_state_, + bool is_persistent_) : segment_range(offset_, offset_ + size_ - 1) , download_state(download_state_) , file_key(key_) @@ -35,6 +37,7 @@ FileSegment::FileSegment( #else , log(&Poco::Logger::get("FileSegment")) #endif + , is_persistent(is_persistent_) /// Not really used for now, see PR 36171 { /// On creation, file segment state can be EMPTY, DOWNLOADED, DOWNLOADING. switch (download_state) @@ -241,7 +244,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset_) "Cache writer was finalized (downloaded size: {}, state: {})", downloaded_size, stateToString(download_state)); - auto download_path = cache->getPathInLocalCache(key(), offset()); + auto download_path = getPathInLocalCache(); cache_writer = std::make_unique(download_path); } @@ -271,6 +274,11 @@ void FileSegment::write(const char * from, size_t size, size_t offset_) assert(getDownloadOffset() == offset_ + size); } +String FileSegment::getPathInLocalCache() const +{ + return cache->getPathInLocalCache(key(), offset(), isPersistent()); +} + void FileSegment::writeInMemory(const char * from, size_t size) { if (!size) @@ -287,7 +295,7 @@ void FileSegment::writeInMemory(const char * from, size_t size) if (cache_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache writer already initialized"); - auto download_path = cache->getPathInLocalCache(key(), offset()); + auto download_path = getPathInLocalCache(); cache_writer = std::make_unique(download_path, size + 1); try @@ -677,7 +685,7 @@ void FileSegment::assertCorrectnessImpl(std::lock_guard & /* segment { assert(downloader_id.empty() == (download_state != FileSegment::State::DOWNLOADING)); assert(!downloader_id.empty() == (download_state == FileSegment::State::DOWNLOADING)); - assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(cache->getPathInLocalCache(key(), offset())) > 0); + assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(getPathInLocalCache()) > 0); } void FileSegment::throwIfDetached() const @@ -729,6 +737,7 @@ FileSegmentPtr FileSegment::getSnapshot(const FileSegmentPtr & file_segment, std snapshot->ref_count = file_segment.use_count(); snapshot->downloaded_size = file_segment->getDownloadedSize(); snapshot->download_state = file_segment->state(); + snapshot->is_persistent = file_segment->isPersistent(); return snapshot; } diff --git a/src/Common/FileSegment.h b/src/Common/FileSegment.h index 307ff167942..750aa6a1cb2 100644 --- a/src/Common/FileSegment.h +++ b/src/Common/FileSegment.h @@ -1,8 +1,9 @@ #pragma once #include -#include +#include #include +#include #include #include @@ -31,7 +32,7 @@ friend struct FileSegmentsHolder; friend class FileSegmentRangeWriter; public: - using Key = UInt128; + using Key = IFileCache::Key; using RemoteFileReaderPtr = std::shared_ptr; using LocalCacheWriterPtr = std::unique_ptr; @@ -70,8 +71,12 @@ public: }; FileSegment( - size_t offset_, size_t size_, const Key & key_, - IFileCache * cache_, State download_state_); + size_t offset_, + size_t size_, + const Key & key_, + IFileCache * cache_, + State download_state_, + bool is_persistent_ = false); ~FileSegment(); @@ -100,6 +105,8 @@ public: size_t offset() const { return range().left; } + bool isPersistent() const { return is_persistent; } + State wait(); bool reserve(size_t size); @@ -161,6 +168,8 @@ public: [[noreturn]] void throwIfDetached() const; + String getPathInLocalCache() const; + private: size_t availableSize() const { return reserved_size - 
downloaded_size; } @@ -237,6 +246,9 @@ private: std::atomic hits_count = 0; /// cache hits. std::atomic ref_count = 0; /// Used for getting snapshot state + /// Currently no-op. (will be added in PR 36171) + /// Defined if a file comply by the eviction policy. + bool is_persistent; CurrentMetrics::Increment metric_increment{CurrentMetrics::CacheFileSegments}; }; diff --git a/src/Common/IFileCache.cpp b/src/Common/IFileCache.cpp new file mode 100644 index 00000000000..fb120ae5902 --- /dev/null +++ b/src/Common/IFileCache.cpp @@ -0,0 +1,201 @@ +#include "IFileCache.h" + +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int REMOTE_FS_OBJECT_CACHE_ERROR; + extern const int LOGICAL_ERROR; +} + +IFileCache::IFileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_) + : cache_base_path(cache_base_path_) + , max_size(cache_settings_.max_size) + , max_element_size(cache_settings_.max_elements) + , max_file_segment_size(cache_settings_.max_file_segment_size) + , enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit) +{ +} + +String IFileCache::Key::toString() const +{ + return getHexUIntLowercase(key); +} + +IFileCache::Key IFileCache::hash(const String & path) +{ + return Key(sipHash128(path.data(), path.size())); +} + +String IFileCache::getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const +{ + auto key_str = key.toString(); + return fs::path(cache_base_path) + / key_str.substr(0, 3) + / key_str + / (std::to_string(offset) + (is_persistent ? "_persistent" : "")); +} + +String IFileCache::getPathInLocalCache(const Key & key) const +{ + auto key_str = key.toString(); + return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str; +} + +static bool isQueryInitialized() +{ + return CurrentThread::isInitialized() + && CurrentThread::get().getQueryContext() + && CurrentThread::getQueryId().size != 0; +} + +bool IFileCache::isReadOnly() +{ + return !isQueryInitialized(); +} + +void IFileCache::assertInitialized() const +{ + if (!is_initialized) + throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized"); +} + +IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard & cache_lock) +{ + if (!isQueryInitialized()) + return nullptr; + + return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock); +} + +IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard & /* cache_lock */) +{ + auto query_iter = query_map.find(query_id); + return (query_iter == query_map.end()) ? 
nullptr : query_iter->second; +} + +void IFileCache::removeQueryContext(const String & query_id) +{ + std::lock_guard cache_lock(mutex); + auto query_iter = query_map.find(query_id); + + if (query_iter == query_map.end()) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to release query context that does not exist (query_id: {})", + query_id); + } + + query_map.erase(query_iter); +} + +IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext( + const String & query_id, const ReadSettings & settings, std::lock_guard & cache_lock) +{ + if (query_id.empty()) + return nullptr; + + auto context = getQueryContext(query_id, cache_lock); + if (context) + return context; + + auto query_context = std::make_shared(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache); + auto query_iter = query_map.emplace(query_id, query_context).first; + return query_iter->second; +} + +IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings) +{ + std::lock_guard cache_lock(mutex); + + if (!enable_filesystem_query_cache_limit || settings.max_query_cache_size == 0) + return {}; + + /// if enable_filesystem_query_cache_limit is true, and max_query_cache_size large than zero, + /// we create context query for current query. + auto context = getOrSetQueryContext(query_id, settings, cache_lock); + return QueryContextHolder(query_id, this, context); +} + +void IFileCache::QueryContext::remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) +{ + if (cache_size < size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size"); + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record != records.end()) + { + lru_queue.remove(record->second, cache_lock); + records.erase({key, offset}); + } + } + cache_size -= size; +} + +void IFileCache::QueryContext::reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock) +{ + if (cache_size + size > max_cache_size) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Reserved cache size exceeds the remaining cache size (key: {}, offset: {})", + key.toString(), offset); + } + + if (!skip_download_if_exceeds_query_cache) + { + auto record = records.find({key, offset}); + if (record == records.end()) + { + auto queue_iter = lru_queue.add(key, offset, 0, cache_lock); + record = records.insert({{key, offset}, queue_iter}).first; + } + record->second->size += size; + } + cache_size += size; +} + +void IFileCache::QueryContext::use(const Key & key, size_t offset, std::lock_guard & cache_lock) +{ + if (skip_download_if_exceeds_query_cache) + return; + + auto record = records.find({key, offset}); + if (record != records.end()) + lru_queue.moveToEnd(record->second, cache_lock); +} + +IFileCache::QueryContextHolder::QueryContextHolder( + const String & query_id_, + IFileCache * cache_, + IFileCache::QueryContextPtr context_) + : query_id(query_id_) + , cache(cache_) + , context(context_) +{ +} + +IFileCache::QueryContextHolder::~QueryContextHolder() +{ + /// If only the query_map and the current holder hold the context_query, + /// the query has been completed and the query_context is released. 
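The use_count() == 2 test just below relies on there being exactly two owners of a live query context: the entry in query_map and the holder returned to the query. A hedged sketch of the intended call-site pattern (the real call site is elsewhere and not shown in this patch):

    {
        auto holder = cache->getQueryContextHolder(CurrentThread::getQueryId().toString(), read_settings);
        /// While `holder` is alive, cache reservations made by this query are charged
        /// against read_settings.max_query_cache_size through the per-query QueryContext.
    }
    /// Leaving the scope runs ~QueryContextHolder(): if nothing else references the
    /// context, the query has finished and the context is dropped from query_map.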
+ if (context && context.use_count() == 2) + cache->removeQueryContext(query_id); +} + +} diff --git a/src/Common/IFileCache.h b/src/Common/IFileCache.h new file mode 100644 index 00000000000..c820d18cb95 --- /dev/null +++ b/src/Common/IFileCache.h @@ -0,0 +1,267 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ + +class FileSegment; +using FileSegmentPtr = std::shared_ptr; +using FileSegments = std::list; +struct FileSegmentsHolder; +struct ReadSettings; + +/** + * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. + */ +class IFileCache : private boost::noncopyable +{ +friend class FileSegment; +friend struct FileSegmentsHolder; +friend class FileSegmentRangeWriter; + +public: + struct Key + { + UInt128 key; + String toString() const; + + Key() = default; + explicit Key(const UInt128 & key_) : key(key_) {} + + bool operator==(const Key & other) const { return key == other.key; } + }; + + IFileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_); + + virtual ~IFileCache() = default; + + /// Restore cache from local filesystem. + virtual void initialize() = 0; + + virtual void removeIfExists(const Key & key) = 0; + + virtual void removeIfReleasable(bool remove_persistent_files) = 0; + + static bool isReadOnly(); + + /// Cache capacity in bytes. + size_t capacity() const { return max_size; } + + static Key hash(const String & path); + + String getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const; + + String getPathInLocalCache(const Key & key) const; + + const String & getBasePath() const { return cache_base_path; } + + virtual std::vector tryGetCachePaths(const Key & key) = 0; + + /** + * Given an `offset` and `size` representing [offset, offset + size) bytes interval, + * return list of cached non-overlapping non-empty + * file segments `[segment1, ..., segmentN]` which intersect with given interval. + * + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * As long as pointers to returned file segments are hold + * it is guaranteed that these file segments are not removed from cache. + */ + virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) = 0; + + /** + * Segments in returned list are ordered in ascending order and represent a full contiguous + * interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY. + * + * If file segment has state EMPTY, then it is also marked as "detached". E.g. it is "detached" + * from cache (not owned by cache), and as a result will never change it's state and will be destructed + * with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change + * it's state (and become DOWNLOADED). + */ + virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0; + + virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) = 0; + + virtual FileSegments getSnapshot() const = 0; + + /// For debug. 
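The new IFileCache::Key wrapper together with the is_persistent flag determines where a segment lands on disk: getPathInLocalCache() in IFileCache.cpp above composes the path from the first three characters of the hex key, the full hex key, and the offset with an optional "_persistent" suffix. A small usage sketch (the remote path is illustrative):

    auto key = IFileCache::hash("store/123/all_1_1_0/data.bin");   /// sipHash128 of the remote object path
    /// If key.toString() starts with "a1b", the segment files live under
    ///   <cache_base_path>/a1b/<full hex key>/<offset>             for ordinary segments
    ///   <cache_base_path>/a1b/<full hex key>/<offset>_persistent  for persistent ones
    String dir  = cache->getPathInLocalCache(key);
    String file = cache->getPathInLocalCache(key, /* offset */ 0, /* is_persistent */ true);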
+ virtual String dumpStructure(const Key & key) = 0; + + virtual size_t getUsedCacheSize() const = 0; + + virtual size_t getFileSegmentsNum() const = 0; + +protected: + String cache_base_path; + size_t max_size; + size_t max_element_size; + size_t max_file_segment_size; + + bool is_initialized = false; + + mutable std::mutex mutex; + + virtual bool tryReserve( + const Key & key, size_t offset, size_t size, + std::lock_guard & cache_lock) = 0; + + virtual void remove( + Key key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) = 0; + + virtual bool isLastFileSegmentHolder( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) = 0; + + virtual void reduceSizeToDownloaded( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & /* segment_lock */) = 0; + + void assertInitialized() const; + + class LRUQueue + { + public: + struct FileKeyAndOffset + { + Key key; + size_t offset; + size_t size; + size_t hits = 0; + + FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {} + }; + + using Iterator = typename std::list::iterator; + + size_t getTotalCacheSize(std::lock_guard & /* cache_lock */) const { return cache_size; } + + size_t getElementsNum(std::lock_guard & /* cache_lock */) const { return queue.size(); } + + Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void remove(Iterator queue_it, std::lock_guard & cache_lock); + + void moveToEnd(Iterator queue_it, std::lock_guard & cache_lock); + + /// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry. + void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard & cache_lock); + + String toString(std::lock_guard & cache_lock) const; + + bool contains(const Key & key, size_t offset, std::lock_guard & cache_lock) const; + + Iterator begin() { return queue.begin(); } + + Iterator end() { return queue.end(); } + + void removeAll(std::lock_guard & cache_lock); + + private: + std::list queue; + size_t cache_size = 0; + }; + + using AccessKeyAndOffset = std::pair; + struct KeyAndOffsetHash + { + std::size_t operator()(const AccessKeyAndOffset & key) const + { + return std::hash()(key.first.key) ^ std::hash()(key.second); + } + }; + + using AccessRecord = std::unordered_map; + + /// Used to track and control the cache access of each query. + /// Through it, we can realize the processing of different queries by the cache layer. 
+ struct QueryContext + { + LRUQueue lru_queue; + AccessRecord records; + + size_t cache_size = 0; + size_t max_cache_size; + + bool skip_download_if_exceeds_query_cache; + + QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_) + : max_cache_size(max_cache_size_) + , skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {} + + void remove(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void reserve(const Key & key, size_t offset, size_t size, std::lock_guard & cache_lock); + + void use(const Key & key, size_t offset, std::lock_guard & cache_lock); + + size_t getMaxCacheSize() const { return max_cache_size; } + + size_t getCacheSize() const { return cache_size; } + + LRUQueue & queue() { return lru_queue; } + + bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; } + }; + + using QueryContextPtr = std::shared_ptr; + using QueryContextMap = std::unordered_map; + + QueryContextMap query_map; + + bool enable_filesystem_query_cache_limit; + + QueryContextPtr getCurrentQueryContext(std::lock_guard & cache_lock); + + QueryContextPtr getQueryContext(const String & query_id, std::lock_guard & cache_lock); + + void removeQueryContext(const String & query_id); + + QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard &); + +public: + /// Save a query context information, and adopt different cache policies + /// for different queries through the context cache layer. + struct QueryContextHolder : private boost::noncopyable + { + QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_); + + QueryContextHolder() = default; + + ~QueryContextHolder(); + + String query_id; + IFileCache * cache = nullptr; + QueryContextPtr context; + }; + + QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings); + +}; + +using FileCachePtr = std::shared_ptr; + +} + +namespace std +{ +template <> struct hash +{ + std::size_t operator()(const DB::IFileCache::Key & k) const { return hash()(k.key); } +}; + +} diff --git a/src/Common/LRUCache.h b/src/Common/LRUCache.h index 7210412a5e1..c149ee184ab 100644 --- a/src/Common/LRUCache.h +++ b/src/Common/LRUCache.h @@ -8,6 +8,7 @@ #include #include +#include namespace DB @@ -48,7 +49,7 @@ public: { std::lock_guard lock(mutex); - auto res = getImpl(key, lock); + auto res = getImpl(key); if (res) ++hits; else @@ -61,7 +62,7 @@ public: { std::lock_guard lock(mutex); - setImpl(key, mapped, lock); + setImpl(key, mapped); } void remove(const Key & key) @@ -91,7 +92,7 @@ public: { std::lock_guard cache_lock(mutex); - auto val = getImpl(key, cache_lock); + auto val = getImpl(key); if (val) { ++hits; @@ -129,7 +130,7 @@ public: auto token_it = insert_tokens.find(key); if (token_it != insert_tokens.end() && token_it->second.get() == token) { - setImpl(key, token->value, cache_lock); + setImpl(key, token->value); result = true; } @@ -189,7 +190,7 @@ protected: using Cells = std::unordered_map; - Cells cells; + Cells cells TSA_GUARDED_BY(mutex); mutable std::mutex mutex; private: @@ -200,8 +201,8 @@ private: explicit InsertToken(LRUCache & cache_) : cache(cache_) {} std::mutex mutex; - bool cleaned_up = false; /// Protected by the token mutex - MappedPtr value; /// Protected by the token mutex + bool cleaned_up TSA_GUARDED_BY(mutex) = false; + MappedPtr value TSA_GUARDED_BY(mutex); LRUCache & cache; size_t refcount = 0; /// Protected by the cache mutex @@ 
-221,6 +222,7 @@ private: InsertTokenHolder() = default; void acquire(const Key * key_, const std::shared_ptr & token_, [[maybe_unused]] std::lock_guard & cache_lock) + TSA_NO_THREAD_SAFETY_ANALYSIS // disabled only because we can't reference the parent-level cache mutex from here { key = key_; token = token_; @@ -228,6 +230,7 @@ private: } void cleanup([[maybe_unused]] std::lock_guard & token_lock, [[maybe_unused]] std::lock_guard & cache_lock) + TSA_NO_THREAD_SAFETY_ANALYSIS // disabled only because we can't reference the parent-level cache mutex from here { token->cache.insert_tokens.erase(*key); token->cleaned_up = true; @@ -258,21 +261,21 @@ private: friend struct InsertTokenHolder; - InsertTokenById insert_tokens; + InsertTokenById insert_tokens TSA_GUARDED_BY(mutex); - LRUQueue queue; + LRUQueue queue TSA_GUARDED_BY(mutex); /// Total weight of values. - size_t current_size = 0; + size_t current_size TSA_GUARDED_BY(mutex) = 0; const size_t max_size; const size_t max_elements_size; std::atomic hits {0}; std::atomic misses {0}; - WeightFunction weight_function; + const WeightFunction weight_function; - MappedPtr getImpl(const Key & key, [[maybe_unused]] std::lock_guard & cache_lock) + MappedPtr getImpl(const Key & key) TSA_REQUIRES(mutex) { auto it = cells.find(key); if (it == cells.end()) @@ -288,7 +291,7 @@ private: return cell.value; } - void setImpl(const Key & key, const MappedPtr & mapped, [[maybe_unused]] std::lock_guard & cache_lock) + void setImpl(const Key & key, const MappedPtr & mapped) TSA_REQUIRES(mutex) { auto [it, inserted] = cells.emplace(std::piecewise_construct, std::forward_as_tuple(key), @@ -321,7 +324,7 @@ private: removeOverflow(); } - void removeOverflow() + void removeOverflow() TSA_REQUIRES(mutex) { size_t current_weight_lost = 0; size_t queue_size = cells.size(); diff --git a/src/Common/FileCache.cpp b/src/Common/LRUFileCache.cpp similarity index 85% rename from src/Common/FileCache.cpp rename to src/Common/LRUFileCache.cpp index d8ffcaadd55..0ce76dbdec6 100644 --- a/src/Common/FileCache.cpp +++ b/src/Common/LRUFileCache.cpp @@ -1,4 +1,4 @@ -#include "FileCache.h" +#include "LRUFileCache.h" #include #include @@ -22,130 +22,12 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -namespace -{ - String keyToStr(const IFileCache::Key & key) - { - return getHexUIntLowercase(key); - } -} - -static bool isQueryInitialized() -{ - return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && CurrentThread::getQueryId().size != 0; -} - -IFileCache::IFileCache( - const String & cache_base_path_, - const FileCacheSettings & cache_settings_) - : cache_base_path(cache_base_path_) - , max_size(cache_settings_.max_size) - , max_element_size(cache_settings_.max_elements) - , max_file_segment_size(cache_settings_.max_file_segment_size) - , enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit) -{ -} - -IFileCache::Key IFileCache::hash(const String & path) -{ - return sipHash128(path.data(), path.size()); -} - -String IFileCache::getPathInLocalCache(const Key & key, size_t offset) -{ - auto key_str = keyToStr(key); - return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str / std::to_string(offset); -} - -String IFileCache::getPathInLocalCache(const Key & key) -{ - auto key_str = keyToStr(key); - return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str; -} - -bool IFileCache::isReadOnly() -{ - return (!isQueryInitialized()); -} - -void IFileCache::assertInitialized() const -{ - if (!is_initialized) - 
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized"); -} - -IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard & cache_lock) -{ - if (!isQueryInitialized()) - return nullptr; - - return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock); -} - -IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard &) -{ - auto query_iter = query_map.find(query_id); - return (query_iter == query_map.end()) ? nullptr : query_iter->second; -} - -void IFileCache::removeQueryContext(const String & query_id) -{ - std::lock_guard cache_lock(mutex); - auto query_iter = query_map.find(query_id); - - if (query_iter == query_map.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to release query context that does not exist"); - - query_map.erase(query_iter); -} - -IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard & cache_lock) -{ - if (query_id.empty()) - return nullptr; - - auto context = getQueryContext(query_id, cache_lock); - if (!context) - { - auto query_iter = query_map.insert({query_id, std::make_shared(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache)}).first; - context = query_iter->second; - } - return context; -} - -IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings) -{ - std::lock_guard cache_lock(mutex); - - /// if enable_filesystem_query_cache_limit is true, and max_query_cache_size large than zero, - /// we create context query for current query. - if (enable_filesystem_query_cache_limit && settings.max_query_cache_size) - { - auto context = getOrSetQueryContext(query_id, settings, cache_lock); - return QueryContextHolder(query_id, this, context); - } - else - return QueryContextHolder(); -} - -IFileCache::QueryContextHolder::QueryContextHolder(const String & query_id_, IFileCache * cache_, IFileCache::QueryContextPtr context_) - : query_id(query_id_), cache(cache_), context(context_) -{ -} - -IFileCache::QueryContextHolder::~QueryContextHolder() -{ - /// If only the query_map and the current holder hold the context_query, - /// the query has been completed and the query_context is released. - if (context && context.use_count() == 2) - cache->removeQueryContext(query_id); -} - LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_) : IFileCache(cache_base_path_, cache_settings_) , max_stash_element_size(cache_settings_.max_elements) , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold) , log(&Poco::Logger::get("LRUFileCache")) + , allow_to_remove_persistent_segments_from_cache_by_default(cache_settings_.allow_to_remove_persistent_segments_from_cache_by_default) { } @@ -155,9 +37,20 @@ void LRUFileCache::initialize() if (!is_initialized) { if (fs::exists(cache_base_path)) - loadCacheInfoIntoMemory(cache_lock); + { + try + { + loadCacheInfoIntoMemory(cache_lock); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + return; + } + } else fs::create_directories(cache_base_path); + is_initialized = true; } } @@ -168,7 +61,7 @@ void LRUFileCache::useCell( auto file_segment = cell.file_segment; if (file_segment->isDownloaded() - && fs::file_size(getPathInLocalCache(file_segment->key(), file_segment->offset())) == 0) + && fs::file_size(getPathInLocalCache(file_segment->key(), file_segment->offset(), file_segment->isPersistent())) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot have zero size downloaded file segments. Current file segment: {}", file_segment->range().toString()); @@ -218,8 +111,10 @@ FileSegments LRUFileCache::getImpl( files.erase(key); + /// Note: it is guaranteed that there is no concurrency with files deletion, + /// because cache files are deleted only inside IFileCache and under cache lock. if (fs::exists(key_path)) - fs::remove(key_path); + fs::remove_all(key_path); return {}; } @@ -281,7 +176,7 @@ FileSegments LRUFileCache::getImpl( } FileSegments LRUFileCache::splitRangeIntoCells( - const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard & cache_lock) + const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock) { assert(size > 0); @@ -297,7 +192,7 @@ FileSegments LRUFileCache::splitRangeIntoCells( current_cell_size = std::min(remaining_size, max_file_segment_size); remaining_size -= current_cell_size; - auto * cell = addCell(key, current_pos, current_cell_size, state, cache_lock); + auto * cell = addCell(key, current_pos, current_cell_size, state, is_persistent, cache_lock); if (cell) file_segments.push_back(cell->file_segment); assert(cell); @@ -314,6 +209,7 @@ void LRUFileCache::fillHolesWithEmptyFileSegments( const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, + bool is_persistent, std::lock_guard & cache_lock) { /// There are segments [segment1, ..., segmentN] @@ -369,7 +265,7 @@ void LRUFileCache::fillHolesWithEmptyFileSegments( } else { - file_segments.splice(it, splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, cache_lock)); + file_segments.splice(it, splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, is_persistent, cache_lock)); } current_pos = segment_range.right + 1; @@ -397,12 +293,12 @@ void LRUFileCache::fillHolesWithEmptyFileSegments( else { file_segments.splice( - file_segments.end(), splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, cache_lock)); + file_segments.end(), splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, is_persistent, cache_lock)); } } } -FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t size) +FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) { assertInitialized(); @@ -419,11 +315,11 @@ FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t if (file_segments.empty()) { - file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::EMPTY, cache_lock); + file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::EMPTY, is_persistent, cache_lock); } else { - fillHolesWithEmptyFileSegments(file_segments, key, range, false, cache_lock); + fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */false, is_persistent, cache_lock); } assert(!file_segments.empty()); @@ -456,14 +352,15 @@ FileSegmentsHolder 
LRUFileCache::get(const Key & key, size_t offset, size_t size } else { - fillHolesWithEmptyFileSegments(file_segments, key, range, true, cache_lock); + fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */true, /* is_persistent */false, cache_lock); } return FileSegmentsHolder(std::move(file_segments)); } LRUFileCache::FileSegmentCell * LRUFileCache::addCell( - const Key & key, size_t offset, size_t size, FileSegment::State state, + const Key & key, size_t offset, size_t size, + FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock) { /// Create a file segment cell and put it in `files` map by [key][offset]. @@ -475,10 +372,11 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( throw Exception( ErrorCodes::LOGICAL_ERROR, "Cache already exists for key: `{}`, offset: {}, size: {}.\nCurrent cache structure: {}", - keyToStr(key), offset, size, dumpStructureUnlocked(key, cache_lock)); + key.toString(), offset, size, dumpStructureUnlocked(key, cache_lock)); auto skip_or_download = [&]() -> FileSegmentPtr { + FileSegment::State result_state = state; if (state == FileSegment::State::EMPTY && enable_cache_hits_threshold) { auto record = records.find({key, offset}); @@ -496,7 +394,7 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( } /// For segments that do not reach the download threshold, we do not download them, but directly read them - return std::make_shared(offset, size, key, this, FileSegment::State::SKIP_CACHE); + result_state = FileSegment::State::SKIP_CACHE; } else { @@ -504,12 +402,11 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( queue_iter->hits++; stash_queue.moveToEnd(queue_iter, cache_lock); - state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE; - return std::make_shared(offset, size, key, this, state); + result_state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE; } } - else - return std::make_shared(offset, size, key, this, state); + + return std::make_shared(offset, size, key, this, result_state, is_persistent); }; FileSegmentCell cell(skip_or_download(), this, cache_lock); @@ -527,12 +424,16 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell( throw Exception( ErrorCodes::LOGICAL_ERROR, "Failed to insert into cache key: `{}`, offset: {}, size: {}", - keyToStr(key), offset, size); + key.toString(), offset, size); return &(it->second); } -FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset, size_t size) +FileSegmentsHolder LRUFileCache::setDownloading( + const Key & key, + size_t offset, + size_t size, + bool is_persistent) { std::lock_guard cache_lock(mutex); @@ -545,9 +446,9 @@ FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset, throw Exception( ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache cell already exists for key `{}` and offset {}", - keyToStr(key), offset); + key.toString(), offset); - auto file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::DOWNLOADING, cache_lock); + auto file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::DOWNLOADING, is_persistent, cache_lock); return FileSegmentsHolder(std::move(file_segments)); } @@ -708,7 +609,7 @@ bool LRUFileCache::tryReserveForMainList( throw Exception( ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache became inconsistent. 
Key: {}, offset: {}", - keyToStr(key), offset); + key.toString(), offset); size_t cell_size = cell->size(); assert(entry_size == cell_size); @@ -790,7 +691,7 @@ bool LRUFileCache::tryReserveForMainList( return true; } -void LRUFileCache::remove(const Key & key) +void LRUFileCache::removeIfExists(const Key & key) { assertInitialized(); @@ -825,6 +726,7 @@ void LRUFileCache::remove(const Key & key) if (file_segment) { std::lock_guard segment_lock(file_segment->mutex); + file_segment->detach(cache_lock, segment_lock); remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock); } } @@ -836,14 +738,16 @@ void LRUFileCache::remove(const Key & key) files.erase(key); if (fs::exists(key_path)) - fs::remove(key_path); + fs::remove_all(key_path); } } -void LRUFileCache::remove() +void LRUFileCache::removeIfReleasable(bool remove_persistent_files) { /// Try remove all cached files by cache_base_path. /// Only releasable file segments are evicted. + /// `remove_persistent_files` defines whether non-evictable by some criteria files + /// (they do not comply with the cache eviction policy) should also be removed. std::lock_guard cache_lock(mutex); @@ -860,7 +764,10 @@ void LRUFileCache::remove() if (cell->releasable()) { auto file_segment = cell->file_segment; - if (file_segment) + if (file_segment + && (!file_segment->isPersistent() + || remove_persistent_files + || allow_to_remove_persistent_segments_from_cache_by_default)) { std::lock_guard segment_lock(file_segment->mutex); file_segment->detach(cache_lock, segment_lock); @@ -872,17 +779,23 @@ void LRUFileCache::remove() /// Remove all access information. records.clear(); stash_queue.removeAll(cache_lock); + +#ifndef NDEBUG + assertCacheCorrectness(cache_lock); +#endif } void LRUFileCache::remove( Key key, size_t offset, std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) { - LOG_TEST(log, "Remove. Key: {}, offset: {}", keyToStr(key), offset); + LOG_TEST(log, "Remove. Key: {}, offset: {}", key.toString(), offset); auto * cell = getCell(key, offset, cache_lock); if (!cell) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No cache cell for key: {}, offset: {}", keyToStr(key), offset); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No cache cell for key: {}, offset: {}", key.toString(), offset); + + bool is_persistent_file_segment = cell->file_segment->isPersistent(); if (cell->queue_iterator) { @@ -892,7 +805,7 @@ void LRUFileCache::remove( auto & offsets = files[key]; offsets.erase(offset); - auto cache_file_path = getPathInLocalCache(key, offset); + auto cache_file_path = getPathInLocalCache(key, offset, is_persistent_file_segment); if (fs::exists(cache_file_path)) { try @@ -906,14 +819,14 @@ void LRUFileCache::remove( files.erase(key); if (fs::exists(key_path)) - fs::remove(key_path); + fs::remove_all(key_path); } } catch (...) { throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Removal of cached file failed. 
Key: {}, offset: {}, path: {}, error: {}", - keyToStr(key), offset, cache_file_path, getCurrentExceptionMessage(false)); + key.toString(), offset, cache_file_path, getCurrentExceptionMessage(false)); } } } @@ -927,18 +840,33 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_l /// cache_base_path / key_prefix / key / offset + if (!files.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache already initialized"); + fs::directory_iterator key_prefix_it{cache_base_path}; for (; key_prefix_it != fs::directory_iterator(); ++key_prefix_it) { fs::directory_iterator key_it{key_prefix_it->path()}; for (; key_it != fs::directory_iterator(); ++key_it) { - key = unhexUInt(key_it->path().filename().string().data()); + key = Key(unhexUInt(key_it->path().filename().string().data())); fs::directory_iterator offset_it{key_it->path()}; for (; offset_it != fs::directory_iterator(); ++offset_it) { - bool parsed = tryParse(offset, offset_it->path().filename()); + auto offset_with_suffix = offset_it->path().filename().string(); + auto delim_pos = offset_with_suffix.find('_'); + bool parsed; + bool is_persistent = false; + + if (delim_pos == std::string::npos) + parsed = tryParse(offset, offset_with_suffix); + else + { + parsed = tryParse(offset, offset_with_suffix.substr(0, delim_pos)); + is_persistent = offset_with_suffix.substr(delim_pos+1) == "persistent"; + } + if (!parsed) { LOG_WARNING(log, "Unexpected file: ", offset_it->path().string()); @@ -954,7 +882,7 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_l if (tryReserve(key, offset, size, cache_lock)) { - auto * cell = addCell(key, offset, size, FileSegment::State::DOWNLOADED, cache_lock); + auto * cell = addCell(key, offset, size, FileSegment::State::DOWNLOADED, is_persistent, cache_lock); if (cell) queue_entries.emplace_back(*cell->queue_iterator, cell->file_segment); } @@ -1003,7 +931,7 @@ void LRUFileCache::reduceSizeToDownloaded( throw Exception( ErrorCodes::LOGICAL_ERROR, "No cell found for key: {}, offset: {}", - keyToStr(key), offset); + key.toString(), offset); } const auto & file_segment = cell->file_segment; @@ -1014,7 +942,7 @@ void LRUFileCache::reduceSizeToDownloaded( throw Exception( ErrorCodes::LOGICAL_ERROR, "Nothing to reduce, file segment fully downloaded, key: {}, offset: {}", - keyToStr(key), offset); + key.toString(), offset); } cell->file_segment = std::make_shared(offset, downloaded_size, key, this, FileSegment::State::DOWNLOADED); @@ -1027,7 +955,7 @@ bool LRUFileCache::isLastFileSegmentHolder( auto * cell = getCell(key, offset, cache_lock); if (!cell) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No cell found for key: {}, offset: {}", keyToStr(key), offset); + throw Exception(ErrorCodes::LOGICAL_ERROR, "No cell found for key: {}, offset: {}", key.toString(), offset); /// The caller of this method is last file segment holder if use count is 2 (the second pointer is cache itself) return cell->file_segment.use_count() == 2; @@ -1058,7 +986,7 @@ std::vector LRUFileCache::tryGetCachePaths(const Key & key) for (const auto & [offset, cell] : cells_by_offset) { if (cell.file_segment->state() == FileSegment::State::DOWNLOADED) - cache_paths.push_back(getPathInLocalCache(key, offset)); + cache_paths.push_back(getPathInLocalCache(key, offset, cell.file_segment->isPersistent())); } return cache_paths; @@ -1139,7 +1067,7 @@ IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add( throw Exception( ErrorCodes::LOGICAL_ERROR, "Attempt to add duplicate queue entry to queue. 
(Key: {}, offset: {}, size: {})", - keyToStr(key), offset, size); } #endif @@ -1190,7 +1118,7 @@ String IFileCache::LRUQueue::toString(std::lock_guard & /* cache_loc { if (!result.empty()) result += ", "; - result += fmt::format("{}: [{}, {}]", keyToStr(key), offset, offset + size - 1); + result += fmt::format("{}: [{}, {}]", key.toString(), offset, offset + size - 1); } return result; } diff --git a/src/Common/LRUFileCache.h b/src/Common/LRUFileCache.h new file mode 100644 index 00000000000..059fc0c22c9 --- /dev/null +++ b/src/Common/LRUFileCache.h @@ -0,0 +1,157 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace DB +{ + +/** + * Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments. + * Implements LRU eviction policy. + */ +class LRUFileCache final : public IFileCache +{ +public: + LRUFileCache( + const String & cache_base_path_, + const FileCacheSettings & cache_settings_); + + FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) override; + + FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override; + + FileSegments getSnapshot() const override; + + void initialize() override; + + void removeIfExists(const Key & key) override; + + void removeIfReleasable(bool remove_persistent_files) override; + + std::vector tryGetCachePaths(const Key & key) override; + + size_t getUsedCacheSize() const override; + + size_t getFileSegmentsNum() const override; + +private: + struct FileSegmentCell : private boost::noncopyable + { + FileSegmentPtr file_segment; + + /// Iterator is put here on first reservation attempt, if successful. + std::optional queue_iterator; + + /// Pointer to file segment is always held by the cache itself. + /// Apart from pointer in cache, it can be held by cache users, when they call + /// getOrSet(), but cache users always hold it via FileSegmentsHolder.
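To make the ownership rule in the comment above concrete: a cell is releasable only while the cache holds the sole reference to its file segment, which is what a shared_ptr use count expresses (the header checks it via file_segment.unique()). A minimal, self-contained sketch of that invariant; the Segment type and the names below are illustrative, not the cache's actual classes:

    #include <cassert>
    #include <memory>

    struct Segment {};                                // stand-in for FileSegment

    int main()
    {
        auto segment = std::make_shared<Segment>();   // the reference held by the cache itself
        assert(segment.use_count() == 1);             // releasable: nobody else references it

        {
            auto holder = segment;                    // a user obtained it, e.g. via getOrSet()
            assert(segment.use_count() == 2);         // not releasable while the holder is alive
        }

        assert(segment.use_count() == 1);             // holder destroyed, releasable again
    }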
+ bool releasable() const {return file_segment.unique(); } + + size_t size() const { return file_segment->reserved_size; } + + FileSegmentCell(FileSegmentPtr file_segment_, LRUFileCache * cache, std::lock_guard & cache_lock); + + FileSegmentCell(FileSegmentCell && other) noexcept + : file_segment(std::move(other.file_segment)) + , queue_iterator(other.queue_iterator) {} + }; + + using FileSegmentsByOffset = std::map; + using CachedFiles = std::unordered_map; + + CachedFiles files; + LRUQueue queue; + + LRUQueue stash_queue; + AccessRecord records; + + size_t max_stash_element_size; + size_t enable_cache_hits_threshold; + + Poco::Logger * log; + bool allow_to_remove_persistent_segments_from_cache_by_default; + + FileSegments getImpl( + const Key & key, const FileSegment::Range & range, + std::lock_guard & cache_lock); + + FileSegmentCell * getCell( + const Key & key, size_t offset, std::lock_guard & cache_lock); + + FileSegmentCell * addCell( + const Key & key, size_t offset, size_t size, + FileSegment::State state, bool is_persistent, + std::lock_guard & cache_lock); + + void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard & cache_lock); + + bool tryReserve( + const Key & key, size_t offset, size_t size, + std::lock_guard & cache_lock) override; + + bool tryReserveForMainList( + const Key & key, size_t offset, size_t size, + QueryContextPtr query_context, + std::lock_guard & cache_lock); + + void remove( + Key key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) override; + + bool isLastFileSegmentHolder( + const Key & key, size_t offset, + std::lock_guard & cache_lock, + std::lock_guard & segment_lock) override; + + size_t getAvailableCacheSize() const; + + void loadCacheInfoIntoMemory(std::lock_guard & cache_lock); + + FileSegments splitRangeIntoCells( + const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard & cache_lock); + + String dumpStructureUnlocked(const Key & key_, std::lock_guard & cache_lock); + + void fillHolesWithEmptyFileSegments( + FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, bool is_persistent, std::lock_guard & cache_lock); + + FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) override; + + size_t getUsedCacheSizeUnlocked(std::lock_guard & cache_lock) const; + + size_t getAvailableCacheSizeUnlocked(std::lock_guard & cache_lock) const; + + size_t getFileSegmentsNumUnlocked(std::lock_guard & cache_lock) const; + + void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard & cache_lock); + + void reduceSizeToDownloaded( + const Key & key, size_t offset, + std::lock_guard & cache_lock, std::lock_guard & /* segment_lock */) override; + +public: + String dumpStructure(const Key & key_) override; + + void assertCacheCorrectness(const Key & key, std::lock_guard & cache_lock); + + void assertCacheCorrectness(std::lock_guard & cache_lock); + + void assertQueueCorrectness(std::lock_guard & cache_lock); +}; + +} diff --git a/src/Common/MultiVersion.h b/src/Common/MultiVersion.h index a36a1f8ca5f..8a6eac0abad 100644 --- a/src/Common/MultiVersion.h +++ b/src/Common/MultiVersion.h @@ -2,6 +2,7 @@ #include #include +#include /** Allow to store and read-only usage of an object in several threads, @@ -51,6 +52,6 @@ public: } private: - Version current_version; + Version current_version TSA_GUARDED_BY(mutex); mutable 
std::mutex mutex; }; diff --git a/src/Common/ProgressIndication.cpp b/src/Common/ProgressIndication.cpp index 315080115a6..7bea00f5b1e 100644 --- a/src/Common/ProgressIndication.cpp +++ b/src/Common/ProgressIndication.cpp @@ -53,8 +53,11 @@ void ProgressIndication::resetProgress() show_progress_bar = false; written_progress_chars = 0; write_progress_on_update = false; - host_cpu_usage.clear(); - thread_data.clear(); + { + std::lock_guard lock(profile_events_mutex); + host_cpu_usage.clear(); + thread_data.clear(); + } } void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, bool write_progress_on_update_) @@ -71,6 +74,8 @@ void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, bool void ProgressIndication::addThreadIdToList(String const & host, UInt64 thread_id) { + std::lock_guard lock(profile_events_mutex); + auto & thread_to_times = thread_data[host]; if (thread_to_times.contains(thread_id)) return; @@ -79,6 +84,8 @@ void ProgressIndication::addThreadIdToList(String const & host, UInt64 thread_id void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread_data, UInt64 elapsed_time) { + std::lock_guard lock(profile_events_mutex); + for (auto & new_host_map : new_thread_data) { host_cpu_usage[new_host_map.first] = calculateCPUUsage(new_host_map.second, elapsed_time); @@ -88,6 +95,8 @@ void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread size_t ProgressIndication::getUsedThreadsCount() const { + std::lock_guard lock(profile_events_mutex); + return std::accumulate(thread_data.cbegin(), thread_data.cend(), 0, [] (size_t acc, auto const & threads) { @@ -97,6 +106,8 @@ size_t ProgressIndication::getUsedThreadsCount() const double ProgressIndication::getCPUUsage() const { + std::lock_guard lock(profile_events_mutex); + double res = 0; for (const auto & elem : host_cpu_usage) res += elem.second; @@ -105,6 +116,8 @@ double ProgressIndication::getCPUUsage() const ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const { + std::lock_guard lock(profile_events_mutex); + return std::accumulate(thread_data.cbegin(), thread_data.cend(), MemoryUsage{}, [](MemoryUsage const & acc, auto const & host_data) { @@ -137,6 +150,8 @@ void ProgressIndication::writeFinalProgress() void ProgressIndication::writeProgress() { + std::lock_guard lock(progress_mutex); + /// Output all progress bar commands to stderr at once to avoid flicker. 
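The locking added throughout this file follows a single pattern: every member function that touches thread_data or host_cpu_usage takes profile_events_mutex first, and writeProgress() below additionally serializes on its own progress_mutex. A reduced sketch of that guarded-accessor pattern; the class and member names here are illustrative, not the real ProgressIndication:

    #include <map>
    #include <mutex>
    #include <string>

    class ProgressStats
    {
    public:
        void setCPUUsage(const std::string & host, double value)
        {
            std::lock_guard lock(profile_events_mutex);   // writers lock the shared map
            host_cpu_usage[host] = value;
        }

        double getCPUUsage() const
        {
            std::lock_guard lock(profile_events_mutex);   // const readers take the same mutex,
            double res = 0;                               // which is why it is declared mutable
            for (const auto & elem : host_cpu_usage)
                res += elem.second;
            return res;
        }

    private:
        mutable std::mutex profile_events_mutex;
        std::map<std::string, double> host_cpu_usage;
    };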
WriteBufferFromFileDescriptor message(STDERR_FILENO, 1024); diff --git a/src/Common/ProgressIndication.h b/src/Common/ProgressIndication.h index d44becc416a..9ce29ef0d3c 100644 --- a/src/Common/ProgressIndication.h +++ b/src/Common/ProgressIndication.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -92,6 +93,16 @@ private: std::unordered_map host_cpu_usage; HostToThreadTimesMap thread_data; + /// In case of all of the above: + /// - clickhouse-local + /// - input_format_parallel_parsing=true + /// - write_progress_on_update=true + /// + /// It is possible concurrent access to the following: + /// - writeProgress() (class properties) (guarded with progress_mutex) + /// - thread_data/host_cpu_usage (guarded with profile_events_mutex) + mutable std::mutex profile_events_mutex; + mutable std::mutex progress_mutex; }; } diff --git a/src/Common/RemoteHostFilter.h b/src/Common/RemoteHostFilter.h index cea1a781308..2b91306f405 100644 --- a/src/Common/RemoteHostFilter.h +++ b/src/Common/RemoteHostFilter.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace Poco { class URI; } @@ -28,8 +29,8 @@ private: std::atomic_bool is_initialized = false; mutable std::mutex hosts_mutex; - std::unordered_set primary_hosts; /// Allowed primary () URL from config.xml - std::vector regexp_hosts; /// Allowed regexp () URL from config.xml + std::unordered_set primary_hosts TSA_GUARDED_BY(hosts_mutex); /// Allowed primary () URL from config.xml + std::vector regexp_hosts TSA_GUARDED_BY(hosts_mutex); /// Allowed regexp () URL from config.xml /// Checks if the primary_hosts and regexp_hosts contain str. If primary_hosts and regexp_hosts are empty return true. bool checkForDirectEntry(const std::string & str) const; diff --git a/src/Common/TLDListsHolder.h b/src/Common/TLDListsHolder.h index b41ad4e6f45..708d049d5a6 100644 --- a/src/Common/TLDListsHolder.h +++ b/src/Common/TLDListsHolder.h @@ -59,7 +59,7 @@ protected: TLDListsHolder(); std::mutex tld_lists_map_mutex; - Map tld_lists_map; + Map tld_lists_map TSA_GUARDED_BY(tld_lists_map_mutex); }; } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 58c5947e8ea..262d16427b3 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -228,13 +229,13 @@ private: using Operations = std::map; - Operations operations; + Operations operations TSA_GUARDED_BY(operations_mutex); std::mutex operations_mutex; using WatchCallbacks = std::vector; using Watches = std::map; - Watches watches; + Watches watches TSA_GUARDED_BY(watches_mutex); std::mutex watches_mutex; ThreadFromGlobalPool send_thread; diff --git a/src/Common/filesystemHelpers.cpp b/src/Common/filesystemHelpers.cpp index 7c10b24eb9c..3b05e671384 100644 --- a/src/Common/filesystemHelpers.cpp +++ b/src/Common/filesystemHelpers.cpp @@ -17,6 +17,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -29,6 +30,7 @@ namespace ErrorCodes extern const int SYSTEM_ERROR; extern const int NOT_IMPLEMENTED; extern const int CANNOT_STAT; + extern const int CANNOT_FSTAT; extern const int CANNOT_STATVFS; extern const int PATH_ACCESS_DENIED; extern const int CANNOT_CREATE_FILE; @@ -215,6 +217,20 @@ bool fileOrSymlinkPathStartsWith(const String & path, const String & prefix_path return fileOrSymlinkPathStartsWith(filesystem_path, filesystem_prefix_path); } +size_t getSizeFromFileDescriptor(int fd, const String & file_name) +{ + struct 
stat buf; + int res = fstat(fd, &buf); + if (-1 == res) + { + throwFromErrnoWithPath( + "Cannot execute fstat" + (file_name.empty() ? "" : " file: " + file_name), + file_name, + ErrorCodes::CANNOT_FSTAT); + } + return buf.st_size; +} + } diff --git a/src/Common/filesystemHelpers.h b/src/Common/filesystemHelpers.h index 751c6a57e1c..1b8aa9bcdf9 100644 --- a/src/Common/filesystemHelpers.h +++ b/src/Common/filesystemHelpers.h @@ -64,6 +64,8 @@ bool pathStartsWith(const String & path, const String & prefix_path); /// (Path is made absolute and normalized.) bool fileOrSymlinkPathStartsWith(const String & path, const String & prefix_path); +size_t getSizeFromFileDescriptor(int fd, const String & file_name = ""); + } namespace FS diff --git a/src/Common/noexcept_scope.h b/src/Common/noexcept_scope.h new file mode 100644 index 00000000000..56fb44ff0bf --- /dev/null +++ b/src/Common/noexcept_scope.h @@ -0,0 +1,36 @@ +#pragma once +#include +#include +#include + + +#define NOEXCEPT_SCOPE_IMPL_CONCAT(n, expected) \ + LockMemoryExceptionInThread lock_memory_tracker##n(VariableContext::Global); \ + SCOPE_EXIT( \ + { \ + const auto uncaught = std::uncaught_exceptions(); \ + assert((expected) == uncaught || (expected) + 1 == uncaught); \ + if ((expected) < uncaught) \ + { \ + tryLogCurrentException("NOEXCEPT_SCOPE"); \ + abort(); \ + } \ + } \ + ) + +#define NOEXCEPT_SCOPE_IMPL(n, expected) NOEXCEPT_SCOPE_IMPL_CONCAT(n, expected) + +#define NOEXCEPT_SCOPE_CONCAT(n) \ + const auto num_curr_exceptions##n = std::uncaught_exceptions(); \ + NOEXCEPT_SCOPE_IMPL(n, num_curr_exceptions##n) + +#define NOEXCEPT_SCOPE_FWD(n) NOEXCEPT_SCOPE_CONCAT(n) + + +/// It can be used in critical places to exit on unexpected exceptions. +/// SIGABRT is usually better that broken in-memory state with unpredictable consequences. +/// It also temporarily disables exception from memory tracker in current thread. +/// Strict version does not take into account nested exception (i.e. it aborts even when we're in catch block). + +#define NOEXCEPT_SCOPE_STRICT NOEXCEPT_SCOPE_IMPL(__LINE__, 0) +#define NOEXCEPT_SCOPE NOEXCEPT_SCOPE_FWD(__LINE__) diff --git a/src/Common/quoteString.cpp b/src/Common/quoteString.cpp index 15901643f47..e3e6e0b3249 100644 --- a/src/Common/quoteString.cpp +++ b/src/Common/quoteString.cpp @@ -43,4 +43,5 @@ String backQuoteIfNeed(const StringRef & x) } return res; } + } diff --git a/src/Common/quoteString.h b/src/Common/quoteString.h index 0364efbdf14..73c0de03d45 100644 --- a/src/Common/quoteString.h +++ b/src/Common/quoteString.h @@ -23,4 +23,5 @@ String backQuote(const StringRef & x); /// Quote the identifier with backquotes, if required. 
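Returning to the noexcept_scope.h helper introduced above: the macros are meant to be dropped in as a statement at the point after which an escaping exception would leave broken in-memory state, so that the process logs the exception and aborts instead of continuing. A hedged usage sketch; the functions called here are invented for illustration and the include path is assumed:

    #include <Common/noexcept_scope.h>

    void applyOperation()
    {
        prepareOnDisk();          /// may still throw; nothing irreversible has happened yet

        NOEXCEPT_SCOPE;           /// from here to the end of the scope, an uncaught exception
                                  /// is logged ("NOEXCEPT_SCOPE") and the process calls abort()
        commitInMemoryState();    /// must not throw part-way through
    }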
String backQuoteIfNeed(const StringRef & x); + } diff --git a/src/Common/tests/gtest_lru_file_cache.cpp b/src/Common/tests/gtest_lru_file_cache.cpp index ca3ca25a0e5..2f268e217df 100644 --- a/src/Common/tests/gtest_lru_file_cache.cpp +++ b/src/Common/tests/gtest_lru_file_cache.cpp @@ -1,7 +1,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -46,14 +47,9 @@ std::vector fromHolder(const DB::FileSegmentsHolder & holder return std::vector(holder.file_segments.begin(), holder.file_segments.end()); } -String keyToStr(const DB::IFileCache::Key & key) -{ - return getHexUIntLowercase(key); -} - String getFileSegmentPath(const String & base_path, const DB::IFileCache::Key & key, size_t offset) { - auto key_str = keyToStr(key); + auto key_str = key.toString(); return fs::path(base_path) / key_str.substr(0, 3) / key_str / DB::toString(offset); } @@ -62,7 +58,7 @@ void download(DB::FileSegmentPtr file_segment) const auto & key = file_segment->key(); size_t size = file_segment->range().size(); - auto key_str = keyToStr(key); + auto key_str = key.toString(); auto subdir = fs::path(cache_base_path) / key_str.substr(0, 3) / key_str; if (!fs::exists(subdir)) fs::create_directories(subdir); @@ -112,7 +108,7 @@ TEST(LRUFileCache, get) auto key = cache.hash("key1"); { - auto holder = cache.getOrSet(key, 0, 10); /// Add range [0, 9] + auto holder = cache.getOrSet(key, 0, 10, false); /// Add range [0, 9] auto segments = fromHolder(holder); /// Range was not present in cache. It should be added in cache as one while file segment. ASSERT_EQ(segments.size(), 1); @@ -141,7 +137,7 @@ TEST(LRUFileCache, get) { /// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache. - auto holder = cache.getOrSet(key, 5, 10); + auto holder = cache.getOrSet(key, 5, 10, false); auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 2); @@ -161,14 +157,14 @@ TEST(LRUFileCache, get) ASSERT_EQ(cache.getUsedCacheSize(), 15); { - auto holder = cache.getOrSet(key, 9, 1); /// Get [9, 9] + auto holder = cache.getOrSet(key, 9, 1, false); /// Get [9, 9] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(7, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); } { - auto holder = cache.getOrSet(key, 9, 2); /// Get [9, 10] + auto holder = cache.getOrSet(key, 9, 2, false); /// Get [9, 10] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 2); assertRange(8, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); @@ -176,16 +172,15 @@ TEST(LRUFileCache, get) } { - auto holder = cache.getOrSet(key, 10, 1); /// Get [10, 10] + auto holder = cache.getOrSet(key, 10, 1, false); /// Get [10, 10] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(10, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); } - complete(cache.getOrSet(key, 17, 4)); /// Get [17, 20] - complete(cache.getOrSet(key, 24, 3)); /// Get [24, 26] - // complete(cache.getOrSet(key, 27, 1)); /// Get [27, 27] - + complete(cache.getOrSet(key, 17, 4, false)); /// Get [17, 20] + complete(cache.getOrSet(key, 24, 3, false)); /// Get [24, 26] + /// complete(cache.getOrSet(key, 27, 1, false)); /// Get [27, 27] /// Current cache: [__________][_____] [____] [___][] /// ^ ^^ ^ ^ ^ ^ ^^^ @@ -195,7 +190,7 @@ TEST(LRUFileCache, get) ASSERT_EQ(cache.getUsedCacheSize(), 22); { - auto holder = cache.getOrSet(key, 0, 26); /// Get [0, 25] + auto holder = cache.getOrSet(key, 0, 26, 
false); /// Get [0, 25] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 6); @@ -229,14 +224,14 @@ TEST(LRUFileCache, get) /// as max elements size is reached, next attempt to put something in cache should fail. /// This will also check that [27, 27] was indeed evicted. - auto holder1 = cache.getOrSet(key, 27, 1); + auto holder1 = cache.getOrSet(key, 27, 1, false); auto segments_1 = fromHolder(holder1); /// Get [27, 27] ASSERT_EQ(segments_1.size(), 1); assertRange(17, segments_1[0], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); } { - auto holder = cache.getOrSet(key, 12, 10); /// Get [12, 21] + auto holder = cache.getOrSet(key, 12, 10, false); /// Get [12, 21] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 4); @@ -260,7 +255,7 @@ TEST(LRUFileCache, get) ASSERT_EQ(cache.getFileSegmentsNum(), 5); { - auto holder = cache.getOrSet(key, 23, 5); /// Get [23, 28] + auto holder = cache.getOrSet(key, 23, 5, false); /// Get [23, 28] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 3); @@ -281,12 +276,12 @@ TEST(LRUFileCache, get) /// 17 21 2324 26 28 { - auto holder5 = cache.getOrSet(key, 2, 3); /// Get [2, 4] + auto holder5 = cache.getOrSet(key, 2, 3,false); /// Get [2, 4] auto s5 = fromHolder(holder5); ASSERT_EQ(s5.size(), 1); assertRange(25, s5[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::EMPTY); - auto holder1 = cache.getOrSet(key, 30, 2); /// Get [30, 31] + auto holder1 = cache.getOrSet(key, 30, 2, false); /// Get [30, 31] auto s1 = fromHolder(holder1); ASSERT_EQ(s1.size(), 1); assertRange(26, s1[0], DB::FileSegment::Range(30, 31), DB::FileSegment::State::EMPTY); @@ -302,20 +297,20 @@ TEST(LRUFileCache, get) /// ^ ^ ^ ^ ^ ^ ^ ^ /// 2 4 23 24 26 27 30 31 - auto holder2 = cache.getOrSet(key, 23, 1); /// Get [23, 23] + auto holder2 = cache.getOrSet(key, 23, 1, false); /// Get [23, 23] auto s2 = fromHolder(holder2); ASSERT_EQ(s2.size(), 1); - auto holder3 = cache.getOrSet(key, 24, 3); /// Get [24, 26] + auto holder3 = cache.getOrSet(key, 24, 3, false); /// Get [24, 26] auto s3 = fromHolder(holder3); ASSERT_EQ(s3.size(), 1); - auto holder4 = cache.getOrSet(key, 27, 1); /// Get [27, 27] + auto holder4 = cache.getOrSet(key, 27, 1, false); /// Get [27, 27] auto s4 = fromHolder(holder4); ASSERT_EQ(s4.size(), 1); /// All cache is now unreleasable because pointers are still hold - auto holder6 = cache.getOrSet(key, 0, 40); + auto holder6 = cache.getOrSet(key, 0, 40, false); auto f = fromHolder(holder6); ASSERT_EQ(f.size(), 9); @@ -336,7 +331,7 @@ TEST(LRUFileCache, get) } { - auto holder = cache.getOrSet(key, 2, 3); /// Get [2, 4] + auto holder = cache.getOrSet(key, 2, 3, false); /// Get [2, 4] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 1); assertRange(31, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); @@ -347,7 +342,7 @@ TEST(LRUFileCache, get) /// 2 4 23 24 26 27 30 31 { - auto holder = cache.getOrSet(key, 25, 5); /// Get [25, 29] + auto holder = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 3); @@ -371,7 +366,7 @@ TEST(LRUFileCache, get) DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); thread_status_1.attachQueryContext(query_context_1); - auto holder_2 = cache.getOrSet(key, 25, 5); /// Get [25, 29] once again. + auto holder_2 = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] once again. 
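All of the calls in this test pass is_persistent = false, the new argument threaded through getOrSet(). When the flag is true, the segment's file on disk gets a "_persistent" suffix appended to the offset, which loadCacheInfoIntoMemory() above parses back out on startup. A self-contained sketch of that naming convention, simplified from the parsing code above:

    #include <optional>
    #include <string>
    #include <utility>

    /// Cache files are named "<offset>" or "<offset>_persistent" inside the key directory.
    std::optional<std::pair<size_t, bool>> parseOffsetFileName(const std::string & name)
    {
        bool is_persistent = false;
        std::string offset_part = name;

        if (auto delim_pos = name.find('_'); delim_pos != std::string::npos)
        {
            is_persistent = name.substr(delim_pos + 1) == "persistent";
            offset_part = name.substr(0, delim_pos);
        }

        try
        {
            return std::make_pair(static_cast<size_t>(std::stoull(offset_part)), is_persistent);
        }
        catch (...)
        {
            return std::nullopt;   /// unexpected file name, the caller only logs a warning
        }
    }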
auto segments_2 = fromHolder(holder_2); ASSERT_EQ(segments.size(), 3); @@ -414,7 +409,7 @@ TEST(LRUFileCache, get) /// and notify_all() is also called from destructor of holder. std::optional holder; - holder.emplace(cache.getOrSet(key, 3, 23)); /// Get [3, 25] + holder.emplace(cache.getOrSet(key, 3, 23, false)); /// Get [3, 25] auto segments = fromHolder(*holder); ASSERT_EQ(segments.size(), 3); @@ -440,7 +435,7 @@ TEST(LRUFileCache, get) DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); thread_status_1.attachQueryContext(query_context_1); - auto holder_2 = cache.getOrSet(key, 3, 23); /// Get [3, 25] once again + auto holder_2 = cache.getOrSet(key, 3, 23, false); /// Get [3, 25] once again auto segments_2 = fromHolder(*holder); ASSERT_EQ(segments_2.size(), 3); @@ -487,7 +482,8 @@ TEST(LRUFileCache, get) auto cache2 = DB::LRUFileCache(cache_base_path, settings); cache2.initialize(); - auto holder1 = cache2.getOrSet(key, 2, 28); /// Get [2, 29] + auto holder1 = cache2.getOrSet(key, 2, 28, false); /// Get [2, 29] + auto segments1 = fromHolder(holder1); ASSERT_EQ(segments1.size(), 5); @@ -506,7 +502,7 @@ TEST(LRUFileCache, get) auto cache2 = DB::LRUFileCache(caches_dir / "cache2", settings2); cache2.initialize(); - auto holder1 = cache2.getOrSet(key, 0, 25); /// Get [0, 24] + auto holder1 = cache2.getOrSet(key, 0, 25, false); /// Get [0, 24] auto segments1 = fromHolder(holder1); ASSERT_EQ(segments1.size(), 3); diff --git a/src/Compression/getCompressionCodecForFile.cpp b/src/Compression/getCompressionCodecForFile.cpp index 5ef5502f947..f9365862c5b 100644 --- a/src/Compression/getCompressionCodecForFile.cpp +++ b/src/Compression/getCompressionCodecForFile.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -12,9 +13,9 @@ namespace DB using Checksum = CityHash_v1_0_2::uint128; -CompressionCodecPtr getCompressionCodecForFile(const DiskPtr & disk, const String & relative_path) +CompressionCodecPtr getCompressionCodecForFile(const DataPartStoragePtr & data_part_storage, const String & relative_path) { - auto read_buffer = disk->readFile(relative_path); + auto read_buffer = data_part_storage->readFile(relative_path, {}, std::nullopt, std::nullopt); read_buffer->ignore(sizeof(Checksum)); UInt8 header_size = ICompressionCodec::getHeaderSize(); diff --git a/src/Compression/getCompressionCodecForFile.h b/src/Compression/getCompressionCodecForFile.h index 4870de8b3b3..ad855684128 100644 --- a/src/Compression/getCompressionCodecForFile.h +++ b/src/Compression/getCompressionCodecForFile.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -10,6 +11,6 @@ namespace DB /// clickhouse fashion (with checksums, headers for each block, etc). This /// method should be used as fallback when we cannot deduce compression codec /// from metadata. 
-CompressionCodecPtr getCompressionCodecForFile(const DiskPtr & disk, const String & relative_path); +CompressionCodecPtr getCompressionCodecForFile(const DataPartStoragePtr & data_part_storage, const String & relative_path); } diff --git a/src/Coordination/InMemoryLogStore.h b/src/Coordination/InMemoryLogStore.h index ad4466b363f..fc56826c81b 100644 --- a/src/Coordination/InMemoryLogStore.h +++ b/src/Coordination/InMemoryLogStore.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB @@ -39,7 +40,7 @@ public: bool flush() override { return true; } private: - std::map> logs; + std::map> logs TSA_GUARDED_BY(logs_lock); mutable std::mutex logs_lock; std::atomic start_idx; }; diff --git a/src/Coordination/KeeperLogStore.h b/src/Coordination/KeeperLogStore.h index 3e558c0508e..e1c66599e0a 100644 --- a/src/Coordination/KeeperLogStore.h +++ b/src/Coordination/KeeperLogStore.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -70,7 +71,7 @@ public: private: mutable std::mutex changelog_lock; Poco::Logger * log; - Changelog changelog; + Changelog changelog TSA_GUARDED_BY(changelog_lock); }; } diff --git a/src/Core/ProtocolDefines.h b/src/Core/ProtocolDefines.h index 6ee491f3ab5..2df48a79776 100644 --- a/src/Core/ProtocolDefines.h +++ b/src/Core/ProtocolDefines.h @@ -52,6 +52,8 @@ /// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing common with VERSION_REVISION, /// later is just a number for server version (one number instead of commit SHA) /// for simplicity (sometimes it may be more convenient in some use cases). -#define DBMS_TCP_PROTOCOL_VERSION 54455 +#define DBMS_TCP_PROTOCOL_VERSION 54456 #define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449 + +#define DBMS_MIN_PROTOCOL_VERSION_WITH_PROFILE_EVENTS_IN_INSERT 54456 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e3f756c85f5..f1fd9d20f00 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -127,7 +127,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, group_by_two_level_threshold_bytes, 50000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \ M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \ - M(Bool, enable_positional_arguments, false, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ + M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ \ M(UInt64, max_parallel_replicas, 1, "The maximum number of replicas of each shard used when the query is executed. For consistency (to get different parts of the same partition), this option only works for the specified sampling key. 
The lag of the replicas is not controlled.", 0) \ M(UInt64, parallel_replicas_count, 0, "", 0) \ @@ -183,6 +183,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \ \ + M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD (it will not be used for MergeTree family)", 0) \ + \ M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \ M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \ \ @@ -404,6 +406,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, parallel_view_processing, false, "Enables pushing to attached views concurrently instead of sequentially.", 0) \ M(Bool, enable_unaligned_array_join, false, "Allow ARRAY JOIN with multiple arrays that have different sizes. When this settings is enabled, arrays will be resized to the longest one.", 0) \ M(Bool, optimize_read_in_order, true, "Enable ORDER BY optimization for reading data in corresponding order in MergeTree tables.", 0) \ + M(Bool, optimize_read_in_window_order, true, "Enable ORDER BY optimization in window clause for reading data in corresponding order in MergeTree tables.", 0) \ M(Bool, optimize_aggregation_in_order, false, "Enable GROUP BY optimization for aggregating data in corresponding order in MergeTree tables.", 0) \ M(UInt64, aggregation_in_order_max_block_bytes, 50000000, "Maximal size of block in bytes accumulated during aggregation in order of primary key. Lower block size allows to parallelize more final merge stage of aggregation.", 0) \ M(UInt64, read_in_order_two_level_merge_threshold, 100, "Minimal number of parts to read to run preliminary merge step during multithread reading in order of primary key.", 0) \ @@ -481,7 +484,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Seconds, lock_acquire_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "How long locking request should wait before failing", 0) \ M(Bool, materialize_ttl_after_modify, true, "Apply TTL for old data, after ALTER MODIFY TTL query", 0) \ M(String, function_implementation, "", "Choose function implementation for specific target or variant (experimental). 
If empty enable all of them.", 0) \ - M(Bool, allow_experimental_geo_types, false, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \ + M(Bool, allow_experimental_geo_types, true, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \ M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \ M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \ M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error.", 0) \ @@ -494,8 +497,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \ M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \ M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \ - M(Bool, optimize_syntax_fuse_functions, false, "Allow apply syntax optimisation: fuse aggregate functions", 0) \ - M(Bool, optimize_fuse_sum_count_avg, false, "Fuse functions `sum, avg, count` with identical arguments into one `sumCount` (`optimize_syntax_fuse_functions should be enabled)", 0) \ + M(Bool, optimize_syntax_fuse_functions, false, "Not ready for production, do not use. Allow apply syntax optimisation: fuse aggregate functions", 0) \ + M(Bool, optimize_fuse_sum_count_avg, false, "Not ready for production, do not use. Fuse functions `sum, avg, count` with identical arguments into one `sumCount` (`optimize_syntax_fuse_functions should be enabled)", 0) \ M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \ M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \ M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ @@ -550,7 +553,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, function_range_max_elements_in_block, 500000000, "Maximum number of values generated by function 'range' per block of data (sum of array sizes for every row in a block, see also 'max_block_size' and 'min_insert_block_size_rows'). It is a safety threshold.", 0) \ M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. 
Possible values: 'enable' - use short-circuit function evaluation for functions that are suitable for it, 'disable' - disable short-circuit function evaluation, 'force_enable' - use short-circuit function evaluation for all functions.", 0) \ \ - M(String, local_filesystem_read_method, "pread", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \ + M(String, local_filesystem_read_method, "pread_threadpool", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \ M(String, remote_filesystem_read_method, "threadpool", "Method of reading data from remote filesystem, one of: read, threadpool.", 0) \ M(Bool, local_filesystem_read_prefetch, false, "Should use prefetching when reading data from local filesystem.", 0) \ M(Bool, remote_filesystem_read_prefetch, true, "Should use prefetching when reading data from remote filesystem.", 0) \ @@ -600,6 +603,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ + M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h index 3d0f33b181a..3b58923c2c9 100644 --- a/src/Core/SortCursor.h +++ b/src/Core/SortCursor.h @@ -140,8 +140,11 @@ struct SortCursorImpl bool isFirst() const { return pos == 0; } bool isLast() const { return pos + 1 >= rows; } + bool isLast(size_t size) const { return pos + size >= rows; } bool isValid() const { return pos < rows; } void next() { ++pos; } + void next(size_t size) { pos += size; } + size_t getSize() const { return rows; } /// Prevent using pos instead of getRow() private: @@ -168,6 +171,11 @@ struct SortCursorHelper return derived().greaterAt(rhs.derived(), impl->getRow(), rhs.impl->getRow()); } + bool ALWAYS_INLINE greaterWithOffset(const SortCursorHelper & rhs, size_t lhs_offset, size_t rhs_offset) const + { + return derived().greaterAt(rhs.derived(), impl->getRow() + lhs_offset, rhs.impl->getRow() + rhs_offset); + } + /// Inverted so that the priority queue elements are removed in ascending order. bool ALWAYS_INLINE operator< (const SortCursorHelper & rhs) const { @@ -322,66 +330,126 @@ struct SortCursorWithCollation : SortCursorHelper } }; +enum class SortingQueueStrategy +{ + Default, + Batch +}; -/** Allows to fetch data from multiple sort cursors in sorted order (merging sorted data streams). - * TODO: Replace with "Loser Tree", see https://en.wikipedia.org/wiki/K-way_merge_algorithm - */ -template -class SortingHeap +/// Allows to fetch data from multiple sort cursors in sorted order (merging sorted data streams). 
+template +class SortingQueueImpl { public: - SortingHeap() = default; + SortingQueueImpl() = default; template - explicit SortingHeap(Cursors & cursors) + explicit SortingQueueImpl(Cursors & cursors) { size_t size = cursors.size(); queue.reserve(size); + for (size_t i = 0; i < size; ++i) - if (!cursors[i].empty()) - queue.emplace_back(&cursors[i]); + { + if (cursors[i].empty()) + continue; + + queue.emplace_back(&cursors[i]); + } + std::make_heap(queue.begin(), queue.end()); + + if constexpr (strategy == SortingQueueStrategy::Batch) + { + if (!queue.empty()) + updateBatchSize(); + } } bool isValid() const { return !queue.empty(); } - Cursor & current() { return queue.front(); } + Cursor & current() requires (strategy == SortingQueueStrategy::Default) + { + return queue.front(); + } + + std::pair current() requires (strategy == SortingQueueStrategy::Batch) + { + return {&queue.front(), batch_size}; + } size_t size() { return queue.size(); } Cursor & nextChild() { return queue[nextChildIndex()]; } - void ALWAYS_INLINE next() + void ALWAYS_INLINE next() requires (strategy == SortingQueueStrategy::Default) { assert(isValid()); - if (!current()->isLast()) + if (!queue.front()->isLast()) { - current()->next(); - updateTop(); + queue.front()->next(); + updateTop(true /*check_in_order*/); } else + { removeTop(); + } + } + + void ALWAYS_INLINE next(size_t batch_size_value) requires (strategy == SortingQueueStrategy::Batch) + { + assert(isValid()); + assert(batch_size_value <= batch_size); + assert(batch_size_value > 0); + + batch_size -= batch_size_value; + if (batch_size > 0) + { + queue.front()->next(batch_size_value); + return; + } + + if (!queue.front()->isLast(batch_size_value)) + { + queue.front()->next(batch_size_value); + updateTop(false /*check_in_order*/); + } + else + { + removeTop(); + } } void replaceTop(Cursor new_top) { - current() = new_top; - updateTop(); + queue.front() = new_top; + updateTop(true /*check_in_order*/); } void removeTop() { std::pop_heap(queue.begin(), queue.end()); queue.pop_back(); - next_idx = 0; + next_child_idx = 0; + + if constexpr (strategy == SortingQueueStrategy::Batch) + { + if (queue.empty()) + batch_size = 0; + else + updateBatchSize(); + } } void push(SortCursorImpl & cursor) { queue.emplace_back(&cursor); std::push_heap(queue.begin(), queue.end()); - next_idx = 0; + next_child_idx = 0; + + if constexpr (strategy == SortingQueueStrategy::Batch) + updateBatchSize(); } private: @@ -389,26 +457,27 @@ private: Container queue; /// Cache comparison between first and second child if the order in queue has not been changed. - size_t next_idx = 0; + size_t next_child_idx = 0; + size_t batch_size = 0; size_t ALWAYS_INLINE nextChildIndex() { - if (next_idx == 0) + if (next_child_idx == 0) { - next_idx = 1; + next_child_idx = 1; - if (queue.size() > 2 && queue[1] < queue[2]) - ++next_idx; + if (queue.size() > 2 && queue[1].greater(queue[2])) + ++next_child_idx; } - return next_idx; + return next_child_idx; } /// This is adapted version of the function __sift_down from libc++. /// Why cannot simply use std::priority_queue? /// - because it doesn't support updating the top element and requires pop and push instead. /// Also look at "Boost.Heap" library. - void ALWAYS_INLINE updateTop() + void ALWAYS_INLINE updateTop(bool check_in_order) { size_t size = queue.size(); if (size < 2) @@ -420,10 +489,14 @@ private: auto child_it = begin + child_idx; /// Check if we are in order. 
- if (*child_it < *begin) + if (check_in_order && (*child_it).greater(*begin)) + { + if constexpr (strategy == SortingQueueStrategy::Batch) + updateBatchSize(); return; + } - next_idx = 0; + next_child_idx = 0; auto curr_it = begin; auto top(std::move(*begin)); @@ -441,7 +514,7 @@ private: child_it = begin + child_idx; - if ((child_idx + 1) < size && *child_it < *(child_it + 1)) + if ((child_idx + 1) < size && (*child_it).greater(*(child_it + 1))) { /// Right child exists and is greater than left child. ++child_it; @@ -449,13 +522,57 @@ private: } /// Check if we are in order. - } while (!(*child_it < top)); + } while (!((*child_it).greater(top))); *curr_it = std::move(top); + + if constexpr (strategy == SortingQueueStrategy::Batch) + updateBatchSize(); + } + + /// Update batch size of elements that client can extract from current cursor + void updateBatchSize() + { + assert(!queue.empty()); + + auto & begin_cursor = *queue.begin(); + size_t min_cursor_size = begin_cursor->getSize(); + size_t min_cursor_pos = begin_cursor->getPosRef(); + + if (queue.size() == 1) + { + batch_size = min_cursor_size - min_cursor_pos; + return; + } + + batch_size = 1; + size_t child_idx = nextChildIndex(); + auto & next_child_cursor = *(queue.begin() + child_idx); + + if (min_cursor_pos + batch_size < min_cursor_size && next_child_cursor.greaterWithOffset(begin_cursor, 0, batch_size)) + ++batch_size; + else + return; + + if (unlikely(begin_cursor.totallyLessOrEquals(next_child_cursor))) + { + batch_size = min_cursor_size - min_cursor_pos; + return; + } + + while (min_cursor_pos + batch_size < min_cursor_size && next_child_cursor.greaterWithOffset(begin_cursor, 0, batch_size)) + ++batch_size; } }; +template +using SortingQueue = SortingQueueImpl; + +template +using SortingQueueBatch = SortingQueueImpl; + /** SortQueueVariants allow to specialize sorting queue for concrete types and sort description. - * To access queue callOnVariant method must be used. + * To access queue variant callOnVariant method must be used. + * To access batch queue variant callOnBatchVariant method must be used. 
*/ class SortQueueVariants { @@ -476,7 +593,7 @@ public: if (has_collation) { - queue_variants = SortingHeap(); + initializeQueues(); return; } else if (sort_description.size() == 1) @@ -491,16 +608,16 @@ public: using ColumnDataType = typename Types::LeftType; using ColumnType = typename ColumnDataType::ColumnType; - queue_variants = SortingHeap>(); + initializeQueues>(); return true; }); if (!result) - queue_variants = SortingHeap(); + initializeQueues(); } else { - queue_variants = SortingHeap(); + initializeQueues(); } } @@ -512,17 +629,30 @@ public: template decltype(auto) callOnVariant(Func && func) { - return std::visit(func, queue_variants); + return std::visit(func, default_queue_variants); + } + + template + decltype(auto) callOnBatchVariant(Func && func) + { + return std::visit(func, batch_queue_variants); } bool variantSupportJITCompilation() const { - return std::holds_alternative>(queue_variants) - || std::holds_alternative>(queue_variants) - || std::holds_alternative>(queue_variants); + return std::holds_alternative>(default_queue_variants) + || std::holds_alternative>(default_queue_variants) + || std::holds_alternative>(default_queue_variants); } private: + template + void initializeQueues() + { + default_queue_variants = SortingQueue(); + batch_queue_variants = SortingQueueBatch(); + } + static DataTypes extractSortDescriptionTypesFromHeader(const Block & header, const SortDescription & sort_description) { size_t sort_description_size = sort_description.size(); @@ -537,39 +667,45 @@ private: return data_types; } - std::variant< - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, + template + using QueueVariants = std::variant< + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, - SortingHeap>>, - SortingHeap>>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, - SortingHeap>>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, + SortingQueueImpl>, strategy>, - SortingHeap>>, + SortingQueueImpl>, strategy>, - SortingHeap>, - SortingHeap>, + SortingQueueImpl, strategy>, + SortingQueueImpl, strategy>, - SortingHeap, - SortingHeap, - SortingHeap> - queue_variants; + SortingQueueImpl, + SortingQueueImpl, + SortingQueueImpl>; + + using DefaultQueueVariants = QueueVariants; + using BatchQueueVariants = QueueVariants; + + DefaultQueueVariants default_queue_variants; + BatchQueueVariants batch_queue_variants; }; template diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 62fcebb10bb..e731787a5c1 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -397,6 +397,7 @@ extern "C" void __sanitizer_set_death_callback(void (*)()); static void sanitizerDeathCallback() { + DENY_ALLOCATIONS_IN_SCOPE; /// Also need to send data via pipe. Otherwise it may lead to deadlocks or failures in printing diagnostic info. 
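Stepping back to the SortingQueueImpl changes above: the Batch strategy exists so that a merging consumer can take a whole run of rows from the top cursor between heap adjustments, instead of re-comparing after every row. A rough sketch of the intended consumption loop; appendRows is a made-up placeholder and the queue is assumed to be a SortingQueueBatch built from prepared cursors:

    while (queue.isValid())
    {
        auto [current, batch_size] = queue.current();   /// pointer to the top cursor plus how many
                                                        /// rows can be taken before a re-compare
        appendRows(*current, batch_size);               /// hypothetical output step
        queue.next(batch_size);                         /// advance the cursor and fix up the heap once
    }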
char buf[signal_pipe_buf_size]; diff --git a/src/Databases/DDLRenamingVisitor.cpp b/src/Databases/DDLRenamingVisitor.cpp new file mode 100644 index 00000000000..caedfc55f3d --- /dev/null +++ b/src/Databases/DDLRenamingVisitor.cpp @@ -0,0 +1,378 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int WRONG_DDL_RENAMING_SETTINGS; + extern const int LOGICAL_ERROR; +} + +namespace +{ + /// CREATE TABLE or CREATE DICTIONARY or CREATE VIEW or CREATE TEMPORARY TABLE or CREATE DATABASE query. + void visitCreateQuery(ASTCreateQuery & create, const DDLRenamingVisitor::Data & data) + { + if (create.temporary) + { + /// CREATE TEMPORARY TABLE + String table_name = create.getTable(); + const auto & new_table_name = data.renaming_map.getNewTemporaryTableName(table_name); + if (new_table_name != table_name) + create.setTable(new_table_name); + } + else if (create.table) + { + /// CREATE TABLE or CREATE DICTIONARY or CREATE VIEW + QualifiedTableName qualified_name; + qualified_name.table = create.getTable(); + qualified_name.database = create.getDatabase(); + + if (!qualified_name.database.empty() && !qualified_name.table.empty()) + { + auto new_qualified_name = data.renaming_map.getNewTableName(qualified_name); + if (new_qualified_name != qualified_name) + { + create.setTable(new_qualified_name.table); + create.setDatabase(new_qualified_name.database); + } + } + } + else if (create.database) + { + /// CREATE DATABASE + String database_name = create.getDatabase(); + if (!database_name.empty()) + { + String new_database_name = data.renaming_map.getNewDatabaseName(database_name); + create.setDatabase(new_database_name); + } + } + + QualifiedTableName as_table{create.as_database, create.as_table}; + if (!as_table.table.empty() && !as_table.database.empty()) + { + auto as_table_new = data.renaming_map.getNewTableName(as_table); + create.as_database = as_table_new.database; + create.as_table = as_table_new.table; + } + + QualifiedTableName to_table{create.to_table_id.database_name, create.to_table_id.table_name}; + if (!to_table.table.empty() && !to_table.database.empty()) + { + auto to_table_new = data.renaming_map.getNewTableName(to_table); + if (to_table_new != to_table) + create.to_table_id = StorageID{to_table_new.database, to_table_new.table}; + } + } + + /// ASTTableExpression represents a reference to a table in SELECT query. + /// DDLRenamingVisitor should handle ASTTableExpression because some CREATE queries can contain SELECT queries after AS (e.g. CREATE VIEW). + /// We'll try to replace database and table name. 
+ void visitTableExpression(ASTTableExpression & expr, const DDLRenamingVisitor::Data & data) + { + if (!expr.database_and_table_name) + return; + + ASTIdentifier * identifier = dynamic_cast(expr.database_and_table_name.get()); + if (!identifier) + return; + + auto table_identifier = identifier->createTable(); + if (!table_identifier) + return; + + QualifiedTableName qualified_name{table_identifier->getDatabaseName(), table_identifier->shortName()}; + if (qualified_name.table.empty() || qualified_name.database.empty()) + return; + + auto new_qualified_name = data.renaming_map.getNewTableName(qualified_name); + if (new_qualified_name == qualified_name) + return; + + expr.database_and_table_name = std::make_shared(new_qualified_name.database, new_qualified_name.table); + expr.children.push_back(expr.database_and_table_name); + } + + /// ASTDictionary keeps a dictionary definition, for example + /// PRIMARY KEY key_column1, key_column2 + /// SOURCE(CLICKHOUSE(host '127.0.0.1' port 9000 user 'default' password '' db 'default' table 'ids' where 'id=10' query 'SELECT id, value_1, value_2 FROM default.ids')) + /// LAYOUT ... LIFETIME ... RANGE ... + /// + /// We'll try to replace database and table name in SOURCE if the specified `host` is local. + /// TODO: Probably we could try to replace database and table name in `query` too. + void visitDictionaryDef(ASTDictionary & dictionary, const DDLRenamingVisitor::Data & data) + { + if (!dictionary.source || dictionary.source->name != "clickhouse" || !dictionary.source->elements) + return; + + auto config = getDictionaryConfigurationFromAST(data.create_query->as(), data.global_context); + auto info = getInfoIfClickHouseDictionarySource(config, data.global_context); + if (!info || !info->is_local) + return; + + auto * source_list = dictionary.source->elements->as(); + if (!source_list) + return; + + auto & source_elements = source_list->children; + + Field * database_name_field = nullptr; + Field * table_name_field = nullptr; + + for (const auto & source_element : source_elements) + { + if (!source_element) + continue; + + auto * pair = source_element->as(); + if (!pair || !pair->second) + continue; + + auto * literal = pair->second->as(); + if (!literal) + continue; + + if (literal->value.getType() == Field::Types::String) + { + if (pair->first == "db") + database_name_field = &literal->value; + else if (pair->first == "table") + table_name_field = &literal->value; + } + } + + if (database_name_field && table_name_field) + { + QualifiedTableName qualified_name{database_name_field->get(), table_name_field->get()}; + if (!qualified_name.database.empty() && !qualified_name.table.empty()) + { + auto new_qualified_name = data.renaming_map.getNewTableName(qualified_name); + if (new_qualified_name != qualified_name) + { + *database_name_field = new_qualified_name.database; + *table_name_field = new_qualified_name.table; + } + } + } + } + + /// Replaces a qualified table name in a specified function's argument. + /// It can be either a string or an identifier with a dot in the middle. 
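visitDictionaryDef only rewrites the db/table pair of a CLICKHOUSE dictionary source when both string fields are present, and rewrites them through the same table-renaming lookup so the qualified name stays consistent. A simplified sketch of that two-field rewrite over a flat key-value list, assuming plain strings instead of ASTPair/ASTLiteral nodes and a hypothetical renameTable callback in place of DDLRenamingMap::getNewTableName:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct SourceElement { std::string key; std::string value; };

// Hypothetical stand-in for the renaming map lookup.
std::pair<std::string, std::string> renameTable(const std::string & db, const std::string & table)
{
    if (db == "default" && table == "ids")
        return {"restored", "ids"};
    return {db, table};
}

void rewriteClickHouseSource(std::vector<SourceElement> & elements)
{
    SourceElement * db_field = nullptr;
    SourceElement * table_field = nullptr;
    for (auto & element : elements)
    {
        if (element.key == "db")
            db_field = &element;
        else if (element.key == "table")
            table_field = &element;
    }

    // Rewrite only when both parts are known and non-empty.
    if (db_field && table_field && !db_field->value.empty() && !table_field->value.empty())
    {
        auto [new_db, new_table] = renameTable(db_field->value, table_field->value);
        db_field->value = new_db;
        table_field->value = new_table;
    }
}

int main()
{
    std::vector<SourceElement> source{{"host", "127.0.0.1"}, {"db", "default"}, {"table", "ids"}};
    rewriteClickHouseSource(source);
    for (const auto & element : source)
        std::cout << element.key << " = " << element.value << '\n';
}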
+ void replaceTableNameInArgument(const ASTFunction & function, const DDLRenamingVisitor::Data & data, size_t arg_idx) + { + /// Just ignore incorrect arguments, proper exception will be thrown later + if (!function.arguments || function.arguments->children.size() <= arg_idx) + return; + + auto & arg = function.arguments->as()->children[arg_idx]; + if (auto * literal = arg->as()) + { + if (literal->value.getType() != Field::Types::String) + return; + + auto maybe_qualified_name = QualifiedTableName::tryParseFromString(literal->value.get()); + /// Just return if name if invalid + if (!maybe_qualified_name || maybe_qualified_name->database.empty() || maybe_qualified_name->table.empty()) + return; + + auto new_qualified_name = data.renaming_map.getNewTableName(*maybe_qualified_name); + literal->value = new_qualified_name.getFullName(); + return; + } + + if (const auto * identifier = dynamic_cast(arg.get())) + { + /// ASTIdentifier or ASTTableIdentifier + auto table_identifier = identifier->createTable(); + /// Just return if table identified is invalid + if (!table_identifier) + return; + + QualifiedTableName qualified_name{table_identifier->getDatabaseName(), table_identifier->shortName()}; + if (qualified_name.database.empty() || qualified_name.table.empty()) + return; + + auto new_qualified_name = data.renaming_map.getNewTableName(qualified_name); + arg = std::make_shared(new_qualified_name.database, new_qualified_name.table); + return; + } + } + + /// Replaces a qualified database name in a specified function's argument. + void replaceDatabaseNameInArguments(const ASTFunction & function, const DDLRenamingVisitor::Data & data, size_t arg_idx) + { + /// Just ignore incorrect arguments, proper exception will be thrown later + if (!function.arguments || function.arguments->children.size() <= arg_idx) + return; + + auto & arg = function.arguments->as()->children[arg_idx]; + auto * literal = arg->as(); + if (!literal || (literal->value.getType() != Field::Types::String)) + return; + + auto database_name = literal->value.get(); + if (database_name.empty()) + return; + + auto new_database_name = data.renaming_map.getNewDatabaseName(database_name); + literal->value = new_database_name; + } + + void visitTableEngine(ASTStorage & storage, const DDLRenamingVisitor::Data & data) + { + if (!storage.engine) + return; + + if (storage.engine->name == "Dictionary") + { + /// Syntax: CREATE TABLE table_name() engine = Dictionary('dictionary_name') + /// We'll try to replace the dictionary name. + replaceTableNameInArgument(*storage.engine, data, 0); + } + else if (storage.engine->name == "Merge") + { + /// Syntax: CREATE TABLE ... Engine=Merge(db_name, tables_regexp) + /// We'll try to replace the database name but we can do nothing to 'tables_regexp'. + replaceDatabaseNameInArguments(*storage.engine, data, 0); + } + } + + void visitFunction(const ASTFunction & function, const DDLRenamingVisitor::Data & data) + { + if (function.name == "joinGet" || + function.name == "dictHas" || + function.name == "dictIsIn" || + function.name.starts_with("dictGet")) + { + replaceTableNameInArgument(function, data, 0); + } + else if (Poco::toLower(function.name) == "in") + { + replaceTableNameInArgument(function, data, 1); + } + else if (function.name == "merge") + { + /// Syntax: merge('db_name', 'tables_regexp') + /// We'll try to replace the database name but we can do nothing to 'tables_regexp'. 
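replaceTableNameInArgument accepts either an identifier or a single string literal of the form db.table; for the latter it parses the qualified name, renames it, and writes the joined string back. Below is a rough standalone equivalent of that parse-rename-rebuild step, assuming a naive split on the first dot (the real QualifiedTableName::tryParseFromString also handles quoting and validation):

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <utility>

using QualifiedName = std::pair<std::string, std::string>;

std::optional<QualifiedName> tryParseQualifiedName(const std::string & text)
{
    auto dot = text.find('.');
    if (dot == std::string::npos || dot == 0 || dot + 1 == text.size())
        return std::nullopt; // not of the form db.table
    return QualifiedName{text.substr(0, dot), text.substr(dot + 1)};
}

// Rewrites a "db.table" string argument (e.g. the first argument of dictGet)
// using a renaming callback; leaves anything unparsable untouched.
std::string rewriteTableArgument(const std::string & argument,
                                 const std::function<QualifiedName(const QualifiedName &)> & rename)
{
    auto parsed = tryParseQualifiedName(argument);
    if (!parsed)
        return argument;
    auto renamed = rename(*parsed);
    return renamed.first + "." + renamed.second;
}

int main()
{
    auto rename = [](const QualifiedName & name) -> QualifiedName
    {
        return {name.first + "_restored", name.second};
    };
    std::cout << rewriteTableArgument("default.ids", rename) << '\n';   // renamed
    std::cout << rewriteTableArgument("just_a_table", rename) << '\n';  // untouched
}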
+ replaceDatabaseNameInArguments(function, data, 0); + } + } +} + +void DDLRenamingVisitor::visit(ASTPtr ast, const Data & data) +{ + if (auto * create = ast->as()) + visitCreateQuery(*create, data); + else if (auto * expr = ast->as()) + visitTableExpression(*expr, data); + else if (auto * function = ast->as()) + visitFunction(*function, data); + else if (auto * dictionary = ast->as()) + visitDictionaryDef(*dictionary, data); + else if (auto * storage = ast->as()) + visitTableEngine(*storage, data); +} + +bool DDLRenamingVisitor::needChildVisit(const ASTPtr &, const ASTPtr &) { return true; } + + +void renameDatabaseAndTableNameInCreateQuery(const ContextPtr & global_context, const DDLRenamingMap & renaming_map, ASTPtr & ast) +{ + DDLRenamingVisitor::Data data{global_context, renaming_map, ast}; + DDLRenamingVisitor::Visitor{data}.visit(ast); +} + + +void DDLRenamingMap::setNewTableName(const QualifiedTableName & old_table_name, const QualifiedTableName & new_table_name) +{ + if (old_table_name.table.empty() || old_table_name.database.empty() || new_table_name.table.empty() || new_table_name.database.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty names are not allowed"); + + auto it = old_to_new_table_names.find(old_table_name); + if ((it != old_to_new_table_names.end())) + { + if (it->second == new_table_name) + return; + throw Exception( + ErrorCodes::WRONG_DDL_RENAMING_SETTINGS, + "Wrong renaming: it's specified that table {} should be renamed to {} and to {} at the same time", + old_table_name.getFullName(), + it->second.getFullName(), + new_table_name.getFullName()); + } + old_to_new_table_names[old_table_name] = new_table_name; +} + +void DDLRenamingMap::setNewDatabaseName(const String & old_database_name, const String & new_database_name) +{ + if (old_database_name.empty() || new_database_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty names are not allowed"); + + auto it = old_to_new_database_names.find(old_database_name); + if ((it != old_to_new_database_names.end())) + { + if (it->second == new_database_name) + return; + throw Exception(ErrorCodes::WRONG_DDL_RENAMING_SETTINGS, "Wrong renaming: it's specified that database {} should be renamed to {} and to {} at the same time", + backQuoteIfNeed(old_database_name), backQuoteIfNeed(it->second), backQuoteIfNeed(new_database_name)); + } + old_to_new_database_names[old_database_name] = new_database_name; +} + + +const String & DDLRenamingMap::getNewDatabaseName(const String & old_database_name) const +{ + auto it = old_to_new_database_names.find(old_database_name); + if (it != old_to_new_database_names.end()) + return it->second; + return old_database_name; +} + +QualifiedTableName DDLRenamingMap::getNewTableName(const QualifiedTableName & old_table_name) const +{ + auto it = old_to_new_table_names.find(old_table_name); + if (it != old_to_new_table_names.end()) + return it->second; + return {getNewDatabaseName(old_table_name.database), old_table_name.table}; +} + +void DDLRenamingMap::setNewTemporaryTableName(const String & old_table_name, const String & new_table_name) +{ + if (old_table_name.empty() || new_table_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty names are not allowed"); + + auto it = old_to_new_temporary_table_names.find(old_table_name); + if ((it != old_to_new_temporary_table_names.end())) + { + if (it->second == new_table_name) + return; + throw Exception(ErrorCodes::WRONG_DDL_RENAMING_SETTINGS, "Wrong renaming: it's specified that temporary table {} should be renamed 
to {} and to {} at the same time", + backQuoteIfNeed(old_table_name), backQuoteIfNeed(it->second), backQuoteIfNeed(new_table_name)); + } + old_to_new_temporary_table_names[old_table_name] = new_table_name; +} + +const String & DDLRenamingMap::getNewTemporaryTableName(const String & old_table_name) const +{ + auto it = old_to_new_temporary_table_names.find(old_table_name); + if (it != old_to_new_temporary_table_names.end()) + return it->second; + return old_table_name; +} + +} diff --git a/src/Databases/DDLRenamingVisitor.h b/src/Databases/DDLRenamingVisitor.h new file mode 100644 index 00000000000..9d0f770d105 --- /dev/null +++ b/src/Databases/DDLRenamingVisitor.h @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include +#include +#include + + +namespace DB +{ +class IAST; +using ASTPtr = std::shared_ptr; +class Context; +using ContextPtr = std::shared_ptr; +class DDLRenamingMap; + +/// Changes names of databases or tables in a create query according to a specified renaming map. +/// Does not validate AST, works a best-effort way. +void renameDatabaseAndTableNameInCreateQuery(const ContextPtr & global_context, const DDLRenamingMap & renaming_map, ASTPtr & ast); + +/// Renaming map keeps information about new names of databases or tables. +class DDLRenamingMap +{ +public: + void setNewTableName(const QualifiedTableName & old_table_name, const QualifiedTableName & new_table_name); + void setNewDatabaseName(const String & old_database_name, const String & new_database_name); + void setNewTemporaryTableName(const String & old_table_name, const String & new_table_name); + + QualifiedTableName getNewTableName(const QualifiedTableName & old_table_name) const; + const String & getNewDatabaseName(const String & old_database_name) const; + const String & getNewTemporaryTableName(const String & old_table_name) const; + +private: + std::unordered_map old_to_new_table_names; + std::unordered_map old_to_new_database_names; + std::unordered_map old_to_new_temporary_table_names; +}; + +/// Visits ASTCreateQuery and changes names of databases or tables. +class DDLRenamingVisitor +{ +public: + struct Data + { + ContextPtr global_context; + const DDLRenamingMap & renaming_map; + ASTPtr create_query; + }; + + using Visitor = InDepthNodeVisitor; + + static bool needChildVisit(const ASTPtr &, const ASTPtr &); + static void visit(ASTPtr ast, const Data & data); +}; + +} diff --git a/src/Databases/DatabaseMemory.cpp b/src/Databases/DatabaseMemory.cpp index bc14f7b0ba0..6df5b70c827 100644 --- a/src/Databases/DatabaseMemory.cpp +++ b/src/Databases/DatabaseMemory.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_TABLE; + extern const int LOGICAL_ERROR; } DatabaseMemory::DatabaseMemory(const String & name_, ContextPtr context_) @@ -32,7 +34,19 @@ void DatabaseMemory::createTable( { std::unique_lock lock{mutex}; attachTableUnlocked(table_name, table, lock); - create_queries.emplace(table_name, query); + + /// Clean the query from temporary flags. 
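DatabaseMemory::createTable now stores a cleaned copy of the CREATE query instead of mutating the caller's AST: the query is cloned and only the clone is stripped of temporary flags. A small sketch of that clone-then-sanitize pattern over shared ownership, using a toy query struct in place of ASTCreateQuery:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct CreateQuery
{
    std::string table;
    bool if_not_exists = false;
    bool or_replace = false;

    std::shared_ptr<CreateQuery> clone() const { return std::make_shared<CreateQuery>(*this); }
};

// Analogue of cleanupObjectDefinitionFromTemporaryFlags: drop one-shot flags
// that must not survive in the stored definition.
void cleanupTemporaryFlags(CreateQuery & query)
{
    query.if_not_exists = false;
    query.or_replace = false;
}

class InMemoryDatabase
{
public:
    void createTable(const std::string & name, std::shared_ptr<const CreateQuery> query)
    {
        std::shared_ptr<const CreateQuery> to_store = query;
        if (query)
        {
            auto copy = query->clone();          // never mutate the caller's AST
            cleanupTemporaryFlags(*copy);
            to_store = copy;
        }
        create_queries[name] = to_store;
    }

    const CreateQuery & storedQuery(const std::string & name) const { return *create_queries.at(name); }

private:
    std::unordered_map<std::string, std::shared_ptr<const CreateQuery>> create_queries;
};

int main()
{
    auto original = std::make_shared<CreateQuery>(CreateQuery{"t", /*if_not_exists=*/true, /*or_replace=*/true});
    InMemoryDatabase db;
    db.createTable("t", original);
    std::cout << "stored if_not_exists:  " << db.storedQuery("t").if_not_exists << '\n'; // 0
    std::cout << "original untouched:    " << original->if_not_exists << '\n';           // 1
}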
+ ASTPtr query_to_store = query; + if (query) + { + query_to_store = query->clone(); + auto * create = query_to_store->as(); + if (!create) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Query '{}' is not CREATE query", serializeAST(*query)); + cleanupObjectDefinitionFromTemporaryFlags(*create); + } + + create_queries.emplace(table_name, query_to_store); } void DatabaseMemory::dropTable( diff --git a/src/Databases/DatabaseMemory.h b/src/Databases/DatabaseMemory.h index 87fae115b59..b854d9be1f3 100644 --- a/src/Databases/DatabaseMemory.h +++ b/src/Databases/DatabaseMemory.h @@ -50,9 +50,6 @@ public: void alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; - /// This database can contain tables to backup. - bool hasTablesToBackup() const override { return true; } - private: String data_path; using NameToASTCreate = std::unordered_map; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 31739305e8e..9484da8ec2d 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -117,11 +117,10 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query) auto * create = query_clone->as(); if (!create) - { - WriteBufferFromOwnString query_buf; - formatAST(*query, query_buf, true); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Query '{}' is not CREATE query", query_buf.str()); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Query '{}' is not CREATE query", serializeAST(*query)); + + /// Clean the query from temporary flags. + cleanupObjectDefinitionFromTemporaryFlags(*create); if (!create->is_dictionary) create->attach = true; @@ -129,20 +128,6 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query) /// We remove everything that is not needed for ATTACH from the query. assert(!create->temporary); create->database.reset(); - create->as_database.clear(); - create->as_table.clear(); - create->if_not_exists = false; - create->is_populate = false; - create->replace_view = false; - create->replace_table = false; - create->create_or_replace = false; - - /// For views it is necessary to save the SELECT query itself, for the rest - on the contrary - if (!create->isView()) - create->select = nullptr; - - create->format = nullptr; - create->out_file = nullptr; if (create->uuid != UUIDHelpers::Nil) create->setTable(TABLE_WITH_UUID_NAME_PLACEHOLDER); @@ -465,11 +450,11 @@ ASTPtr DatabaseOnDisk::getCreateTableQueryImpl(const String & table_name, Contex if (!has_table && e.code() == ErrorCodes::FILE_DOESNT_EXIST && throw_on_error) throw Exception{"Table " + backQuote(table_name) + " doesn't exist", ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY}; - else if (is_system_storage) - ast = getCreateQueryFromStorage(table_name, storage, throw_on_error); - else if (throw_on_error) + else if (!is_system_storage && throw_on_error) throw; } + if (!ast && is_system_storage) + ast = getCreateQueryFromStorage(table_name, storage, throw_on_error); return ast; } @@ -713,6 +698,7 @@ ASTPtr DatabaseOnDisk::getCreateQueryFromStorage(const String & table_name, cons /// setup create table query storage info. 
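The change in getCreateTableQueryImpl above moves the "synthesize a CREATE query from the storage object" fallback out of the catch block, so it now also fires when reading metadata produced no AST without throwing. A control-flow sketch of that restructuring under assumed helper names (the real code parses .sql metadata files and ClickHouse storages):

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

struct MetadataMissing : std::runtime_error { using std::runtime_error::runtime_error; };

// Assumed helpers standing in for metadata parsing and getCreateQueryFromStorage.
std::optional<std::string> readCreateQueryFromMetadata(bool metadata_exists)
{
    if (!metadata_exists)
        throw MetadataMissing("metadata file doesn't exist");
    return std::nullopt; // file read, but nothing usable came out
}

std::string buildCreateQueryFromStorage() { return "CREATE TABLE system_like (...) ENGINE = ..."; }

std::optional<std::string> getCreateTableQuery(bool metadata_exists, bool is_system_storage, bool throw_on_error)
{
    std::optional<std::string> ast;
    try
    {
        ast = readCreateQueryFromMetadata(metadata_exists);
    }
    catch (const MetadataMissing &)
    {
        if (!is_system_storage && throw_on_error)
            throw;
    }

    // The fallback now lives after the try/catch, so it covers both the
    // "file missing" and the "file read but no AST" cases for system storages.
    if (!ast && is_system_storage)
        ast = buildCreateQueryFromStorage();
    return ast;
}

int main()
{
    auto query = getCreateTableQuery(/*metadata_exists=*/true, /*is_system_storage=*/true, /*throw_on_error=*/true);
    std::cout << (query ? *query : std::string("<none>")) << '\n';
}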
auto ast_engine = std::make_shared(); ast_engine->name = storage->getName(); + ast_engine->no_empty_args = true; auto ast_storage = std::make_shared(); ast_storage->set(ast_storage->engine, ast_engine); diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index 2144f874b03..982be2024ce 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -36,9 +36,6 @@ public: const StorageID & table_id, const StorageInMemoryMetadata & metadata) override; - /// This database can contain tables to backup. - bool hasTablesToBackup() const override { return true; } - protected: virtual void commitAlterTable( const StorageID & table_id, diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 47a7041b3f4..5c701c8d90c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -24,7 +24,10 @@ #include #include #include +#include +#include #include +#include namespace DB { @@ -41,6 +44,7 @@ namespace ErrorCodes extern const int INCORRECT_QUERY; extern const int ALL_CONNECTION_TRIES_FAILED; extern const int NO_ACTIVE_REPLICAS; + extern const int CANNOT_RESTORE_TABLE; } static constexpr const char * DROPPED_MARK = "DROPPED"; @@ -917,4 +921,39 @@ String DatabaseReplicated::readMetadataFile(const String & table_name) const return statement; } + +void DatabaseReplicated::createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup & restorer) +{ + /// Because of the replication multiple nodes can try to restore the same tables again and failed with "Table already exists" + /// because of some table could be restored already on other node and then replicated to this node. + /// To solve this problem we use the restore coordination: the first node calls + /// IRestoreCoordination::acquireCreatingTableInReplicatedDatabase() and then for other nodes this function returns false which means + /// this table is already being created by some other node. + String table_name = create_table_query->as().getTable(); + if (restorer.getRestoreCoordination()->acquireCreatingTableInReplicatedDatabase(getZooKeeperPath(), table_name)) + { + restorer.executeCreateQuery(create_table_query); + } + + /// Wait until the table is actually created no matter if it's created by the current or another node and replicated to the + /// current node afterwards. We have to wait because `RestorerFromBackup` is going to restore data of the table then. + /// TODO: The following code doesn't look very reliable, probably we need to rewrite it somehow. 
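The TODO above concerns the polling loop that waits for the restored table to appear on this replica, either created locally or replicated from another node. Below is a freestanding sketch of that wait-with-optional-timeout shape using std::chrono::steady_clock; the predicate and the sleep are placeholders for isTableExist and waitForReplicaToProcessAllEntries:

#include <chrono>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <thread>

using namespace std::chrono_literals;

// Poll `condition` until it becomes true; a negative timeout means "wait forever".
void waitUntil(const std::function<bool()> & condition, std::chrono::milliseconds timeout)
{
    const bool use_timeout = timeout.count() >= 0;
    const auto start_time = std::chrono::steady_clock::now();

    while (!condition())
    {
        std::this_thread::sleep_for(50ms); // stands in for waitForReplicaToProcessAllEntries(50)

        if (use_timeout && std::chrono::steady_clock::now() - start_time > timeout)
            throw std::runtime_error("couldn't sync the restored table within the timeout");
    }
}

int main()
{
    int attempts = 0;
    waitUntil([&] { return ++attempts >= 3; }, 5s);
    std::cout << "condition satisfied after " << attempts << " checks\n";
}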
+ auto timeout = restorer.getTimeout(); + bool use_timeout = (timeout.count() >= 0); + auto start_time = std::chrono::steady_clock::now(); + while (!isTableExist(table_name, restorer.getContext())) + { + waitForReplicaToProcessAllEntries(50); + + if (use_timeout) + { + auto elapsed = std::chrono::steady_clock::now() - start_time; + if (elapsed > timeout) + throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, + "Couldn't restore table {}.{} on other node or sync it (elapsed {})", + backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(table_name), to_string(elapsed)); + } + } +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 72a4f0d00bb..45a9d12981c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -72,6 +72,8 @@ public: void shutdown() override; + void createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup & restorer) override; + friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 1c3f417b431..13cd841cc6e 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -10,6 +10,8 @@ #include #include #include +#include +#include namespace DB @@ -154,6 +156,26 @@ ASTPtr getCreateQueryFromStorage(const StoragePtr & storage, const ASTPtr & ast_ } +void cleanupObjectDefinitionFromTemporaryFlags(ASTCreateQuery & query) +{ + query.as_database.clear(); + query.as_table.clear(); + query.if_not_exists = false; + query.is_populate = false; + query.is_create_empty = false; + query.replace_view = false; + query.replace_table = false; + query.create_or_replace = false; + + /// For views it is necessary to save the SELECT query itself, for the rest - on the contrary + if (!query.isView()) + query.select = nullptr; + + query.format = nullptr; + query.out_file = nullptr; +} + + DatabaseWithOwnTablesBase::DatabaseWithOwnTablesBase(const String & name_, const String & logger, ContextPtr context_) : IDatabase(name_), WithContext(context_->getGlobalContext()), log(&Poco::Logger::get(logger)) { @@ -300,4 +322,22 @@ StoragePtr DatabaseWithOwnTablesBase::getTableUnlocked(const String & table_name backQuote(database_name), backQuote(table_name)); } +DatabaseTablesIteratorPtr DatabaseWithOwnTablesBase::getTablesIteratorForBackup(const BackupEntriesCollector & backup_entries_collector) const +{ + /// Backup all the tables in this database. + /// Here we skip inner tables of materialized views. + auto skip_internal_tables = [](const String & table_name) { return !table_name.starts_with(".inner_id."); }; + return getTablesIterator(backup_entries_collector.getContext(), skip_internal_tables); +} + +void DatabaseWithOwnTablesBase::checkCreateTableQueryForBackup(const ASTPtr &, const BackupEntriesCollector &) const +{ +} + +void DatabaseWithOwnTablesBase::createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup & restorer) +{ + /// Creates a table by executing a "CREATE TABLE" query. 
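getTablesIteratorForBackup backs up every table of the database except the hidden inner tables of materialized views, which it drops with a name filter. A tiny sketch of that predicate applied to a plain list of names (the real code passes the lambda into getTablesIterator; the UUID suffix below is made up):

#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> tables{"visits", ".inner_id.1b6e7a4c-0000-0000-0000-000000000000", "orders"};

    // Same predicate as in the diff: skip inner tables of materialized views.
    auto skip_internal_tables = [](const std::string & name) { return !name.starts_with(".inner_id."); };

    for (const auto & table : tables)
        if (skip_internal_tables(table))
            std::cout << "backing up " << table << '\n';
}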
+ restorer.executeCreateQuery(create_table_query); +} + } diff --git a/src/Databases/DatabasesCommon.h b/src/Databases/DatabasesCommon.h index dbed4dbeaf1..fcaa4af88bb 100644 --- a/src/Databases/DatabasesCommon.h +++ b/src/Databases/DatabasesCommon.h @@ -15,6 +15,9 @@ namespace DB void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata); ASTPtr getCreateQueryFromStorage(const StoragePtr & storage, const ASTPtr & ast_storage, bool only_ordinary, uint32_t max_parser_depth, bool throw_on_error); +/// Cleans a CREATE QUERY from temporary flags like "IF NOT EXISTS", "OR REPLACE", "AS SELECT" (for non-views), etc. +void cleanupObjectDefinitionFromTemporaryFlags(ASTCreateQuery & query); + class Context; /// A base class for databases that manage their own list of tables. @@ -33,6 +36,10 @@ public: DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override; + DatabaseTablesIteratorPtr getTablesIteratorForBackup(const BackupEntriesCollector & backup_entries_collector) const override; + void checkCreateTableQueryForBackup(const ASTPtr & create_table_query, const BackupEntriesCollector & backup_entries_collector) const override; + void createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup & restorer) override; + void shutdown() override; ~DatabaseWithOwnTablesBase() override; diff --git a/src/Databases/IDatabase.cpp b/src/Databases/IDatabase.cpp index 992598879db..1d5695188b7 100644 --- a/src/Databases/IDatabase.cpp +++ b/src/Databases/IDatabase.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include #include @@ -8,6 +11,8 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_TABLE; + extern const int CANNOT_BACKUP_TABLE; + extern const int CANNOT_RESTORE_TABLE; } StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const @@ -17,4 +22,39 @@ StoragePtr IDatabase::getTable(const String & name, ContextPtr context) const throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuoteIfNeed(database_name), backQuoteIfNeed(name)); } +ASTPtr IDatabase::getCreateDatabaseQueryForBackup() const +{ + auto query = getCreateDatabaseQuery(); + + /// We don't want to see any UUIDs in backup (after RESTORE the table will have another UUID anyway). + auto & create = query->as(); + create.uuid = UUIDHelpers::Nil; + + return query; +} + +DatabaseTablesIteratorPtr IDatabase::getTablesIteratorForBackup(const BackupEntriesCollector &) const +{ + /// IDatabase doesn't own any tables. + return std::make_unique(Tables{}, getDatabaseName()); +} + +void IDatabase::checkCreateTableQueryForBackup(const ASTPtr & create_table_query, const BackupEntriesCollector &) const +{ + /// Cannot restore any table because IDatabase doesn't own any tables. + throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, + "Database engine {} does not support backups, cannot backup table {}.{}", + getEngineName(), backQuoteIfNeed(getDatabaseName()), + backQuoteIfNeed(create_table_query->as().getTable())); +} + +void IDatabase::createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup &) +{ + /// Cannot restore any table because IDatabase doesn't own any tables. 
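IDatabase now ships default backup and restore hooks that simply throw "not supported", and database engines that actually own tables override them with real behaviour. A minimal sketch of that virtual-hook layering; the class and method names below only mirror the structure of the interface, they are not the real declarations:

#include <iostream>
#include <stdexcept>
#include <string>

class IDatabaseLike
{
public:
    virtual ~IDatabaseLike() = default;
    virtual std::string engineName() const = 0;

    // Default: this engine doesn't own tables, so it cannot restore them.
    virtual void createTableRestoredFromBackup(const std::string & table_name)
    {
        throw std::runtime_error("Database engine " + engineName()
            + " does not support restoring tables, cannot restore table " + table_name);
    }
};

class DatabaseWithOwnTables : public IDatabaseLike
{
public:
    std::string engineName() const override { return "Ordinary"; }

    void createTableRestoredFromBackup(const std::string & table_name) override
    {
        std::cout << "executing CREATE TABLE for " << table_name << '\n';
    }
};

int main()
{
    DatabaseWithOwnTables db;
    db.createTableRestoredFromBackup("visits"); // overridden: actually creates the table
}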
+ throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, + "Database engine {} does not support restoring tables, cannot restore table {}.{}", + getEngineName(), backQuoteIfNeed(getDatabaseName()), + backQuoteIfNeed(create_table_query->as().getTable())); +} + } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index a1c05b3d712..38c85cf3d05 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -30,6 +30,8 @@ class SettingsChanges; using DictionariesWithID = std::vector>; struct ParsedTablesMetadata; struct QualifiedTableName; +class BackupEntriesCollector; +class RestorerFromBackup; namespace ErrorCodes { @@ -331,9 +333,17 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "Database engine {} does not run a replication thread!", getEngineName()); } - /// Returns true if the backup of the database is hollow, which means it doesn't contain - /// any tables which can be stored to a backup. - virtual bool hasTablesToBackup() const { return false; } + /// Returns a slightly changed version of the CREATE DATABASE query which must be written to a backup. + virtual ASTPtr getCreateDatabaseQueryForBackup() const; + + /// Returns an iterator that passes through all the tables when an user wants to backup the whole database. + virtual DatabaseTablesIteratorPtr getTablesIteratorForBackup(const BackupEntriesCollector & restorer) const; + + /// Checks a CREATE TABLE query before it will be written to a backup. Called by IStorage::getCreateQueryForBackup(). + virtual void checkCreateTableQueryForBackup(const ASTPtr & create_table_query, const BackupEntriesCollector & backup_entries_collector) const; + + /// Creates a table restored from backup. + virtual void createTableRestoredFromBackup(const ASTPtr & create_table_query, const RestorerFromBackup & restorer); virtual ~IDatabase() = default; diff --git a/src/Disks/FakeDiskTransaction.h b/src/Disks/FakeDiskTransaction.h new file mode 100644 index 00000000000..6d61ac752f2 --- /dev/null +++ b/src/Disks/FakeDiskTransaction.h @@ -0,0 +1,129 @@ +#pragma once + +#include + +namespace DB +{ + +/// Fake disk transaction implementation. +/// Just execute all operations immediately, commit is noop operation. +/// No support for atomicity and rollback. 
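FakeDiskTransaction makes ordinary disks usable through the new transaction interface by forwarding every call straight to the wrapped disk and treating commit() as a no-op, with no atomicity or rollback. A compact sketch of that adapter with a reduced interface (the real one covers many more operations):

#include <iostream>
#include <string>

struct IDiskLike
{
    virtual ~IDiskLike() = default;
    virtual void createFile(const std::string & path) = 0;
    virtual void removeFile(const std::string & path) = 0;
};

struct IDiskTransactionLike
{
    virtual ~IDiskTransactionLike() = default;
    virtual void createFile(const std::string & path) = 0;
    virtual void removeFile(const std::string & path) = 0;
    virtual void commit() = 0;
};

// Executes operations immediately; commit does nothing because everything already happened.
struct PassThroughTransaction final : IDiskTransactionLike
{
    explicit PassThroughTransaction(IDiskLike & disk_) : disk(disk_) {}

    void createFile(const std::string & path) override { disk.createFile(path); }
    void removeFile(const std::string & path) override { disk.removeFile(path); }
    void commit() override {}

private:
    IDiskLike & disk;
};

struct LoggingDisk final : IDiskLike
{
    void createFile(const std::string & path) override { std::cout << "create " << path << '\n'; }
    void removeFile(const std::string & path) override { std::cout << "remove " << path << '\n'; }
};

int main()
{
    LoggingDisk disk;
    PassThroughTransaction tx(disk);
    tx.createFile("data/part_1.bin");
    tx.commit();
}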
+struct FakeDiskTransaction final : public IDiskTransaction +{ +public: + explicit FakeDiskTransaction(IDisk & disk_) + : disk(disk_) + {} + + void commit() override {} + + void createDirectory(const std::string & path) override + { + disk.createDirectory(path); + } + + void createDirectories(const std::string & path) override + { + disk.createDirectories(path); + } + + void createFile(const std::string & path) override + { + disk.createFile(path); + } + + void clearDirectory(const std::string & path) override + { + disk.createDirectory(path); + } + + void moveDirectory(const std::string & from_path, const std::string & to_path) override + { + disk.moveDirectory(from_path, to_path); + } + + void moveFile(const String & from_path, const String & to_path) override + { + disk.moveFile(from_path, to_path); + } + + void replaceFile(const std::string & from_path, const std::string & to_path) override + { + disk.replaceFile(from_path, to_path); + } + + void copyFile(const std::string & from_file_path, const std::string & to_file_path) override + { + disk.copyFile(from_file_path, disk, to_file_path); + } + + std::unique_ptr writeFile( /// NOLINT + const std::string & path, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + WriteMode mode = WriteMode::Rewrite, + const WriteSettings & settings = {}, + bool /*autocommit */ = true) override + { + return disk.writeFile(path, buf_size, mode, settings); + } + + void removeFile(const std::string & path) override + { + disk.removeFile(path); + } + + void removeFileIfExists(const std::string & path) override + { + disk.removeFileIfExists(path); + } + + void removeDirectory(const std::string & path) override + { + disk.removeDirectory(path); + } + + void removeRecursive(const std::string & path) override + { + disk.removeRecursive(path); + } + + void removeSharedFile(const std::string & path, bool keep_shared_data) override + { + disk.removeSharedFile(path, keep_shared_data); + } + + void removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const NameSet & file_names_remove_metadata_only) override + { + disk.removeSharedRecursive(path, keep_all_shared_data, file_names_remove_metadata_only); + } + + void removeSharedFileIfExists(const std::string & path, bool keep_shared_data) override + { + disk.removeSharedFileIfExists(path, keep_shared_data); + } + + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override + { + disk.removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only); + } + + void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override + { + disk.setLastModified(path, timestamp); + } + + void setReadOnly(const std::string & path) override + { + disk.setReadOnly(path); + } + + void createHardLink(const std::string & src_path, const std::string & dst_path) override + { + disk.createHardLink(src_path, dst_path); + } + +private: + IDisk & disk; +}; + +} diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 312e380f695..503e926743a 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -33,6 +34,24 @@ void IDisk::copyFile(const String & from_file_path, IDisk & to_disk, const Strin } +DiskTransactionPtr IDisk::createTransaction() +{ + return std::make_shared(*this); +} + +void IDisk::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) +{ + for (const 
auto & file : files) + { + bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); + if (file.if_exists) + removeSharedFileIfExists(file.path, keep_file); + else + removeSharedFile(file.path, keep_file); + } +} + + using ResultsCollector = std::vector>; void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_path, Executor & exec, ResultsCollector & results, bool copy_root_dir) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index fd4662f9864..f441f0827fb 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,10 @@ class WriteBufferFromFileBase; class MMappedFileCache; class IMetadataStorage; using MetadataStoragePtr = std::shared_ptr; +struct IDiskTransaction; +using DiskTransactionPtr = std::shared_ptr; +struct RemoveRequest; +using RemoveBatchRequest = std::vector; /** @@ -97,6 +102,8 @@ public: { } + virtual DiskTransactionPtr createTransaction(); + /// Root path for all files stored on the disk. /// It's not required to be a local filesystem path. virtual const String & getPath() const = 0; @@ -225,34 +232,11 @@ public: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method `getRemotePathsRecursive() not implemented for disk: {}`", getType()); } - struct RemoveRequest - { - String path; - bool if_exists = false; - - explicit RemoveRequest(String path_, bool if_exists_ = false) - : path(std::move(path_)), if_exists(std::move(if_exists_)) - { - } - }; - - using RemoveBatchRequest = std::vector; - /// Batch request to remove multiple files. /// May be much faster for blob storage. /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. /// Third param determines which files cannot be removed even if second is true. - virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) - { - for (const auto & file : files) - { - bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); - if (file.if_exists) - removeSharedFileIfExists(file.path, keep_file); - else - removeSharedFile(file.path, keep_file); - } - } + virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only); /// Set last modified time to file or directory at `path`. virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0; diff --git a/src/Disks/IDiskTransaction.h b/src/Disks/IDiskTransaction.h new file mode 100644 index 00000000000..e7b1cf3f675 --- /dev/null +++ b/src/Disks/IDiskTransaction.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +struct RemoveRequest +{ + std::string path; + bool if_exists = false; + + explicit RemoveRequest(std::string path_, bool if_exists_ = false) + : path(std::move(path_)), if_exists(std::move(if_exists_)) + { + } +}; + +using RemoveBatchRequest = std::vector; + +/// Simple interface batch execution of write disk operations. +/// Method are almost equal to disk methods. +struct IDiskTransaction : private boost::noncopyable +{ +public: + /// Tries to commit all accumulated operations simultaneously. + /// If something fails rollback and throw exception. + virtual void commit() = 0; + + virtual ~IDiskTransaction() = default; + + /// Create directory. 
+ virtual void createDirectory(const std::string & path) = 0; + + /// Create directory and all parent directories if necessary. + virtual void createDirectories(const std::string & path) = 0; + + /// Remove all files from the directory. Directories are not removed. + virtual void clearDirectory(const std::string & path) = 0; + + /// Move directory from `from_path` to `to_path`. + virtual void moveDirectory(const std::string & from_path, const std::string & to_path) = 0; + + virtual void moveFile(const String & from_path, const String & to_path) = 0; + + virtual void createFile(const String & path) = 0; + + /// Move the file from `from_path` to `to_path`. + /// If a file with `to_path` path already exists, it will be replaced. + virtual void replaceFile(const std::string & from_path, const std::string & to_path) = 0; + + /// Only copy of several files supported now. Disk interface support copy to another disk + /// but it's impossible to implement correctly in transactions because other disk can + /// use different metadata storage. + /// TODO: maybe remove it at all, we don't want copies + virtual void copyFile(const std::string & from_file_path, const std::string & to_file_path) = 0; + + /// Open the file for write and return WriteBufferFromFileBase object. + virtual std::unique_ptr writeFile( /// NOLINT + const std::string & path, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + WriteMode mode = WriteMode::Rewrite, + const WriteSettings & settings = {}, + bool autocommit = true) = 0; + + /// Remove file. Throws exception if file doesn't exists or it's a directory. + virtual void removeFile(const std::string & path) = 0; + + /// Remove file if it exists. + virtual void removeFileIfExists(const std::string & path) = 0; + + /// Remove directory. Throws exception if it's not a directory or if directory is not empty. + virtual void removeDirectory(const std::string & path) = 0; + + /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. + virtual void removeRecursive(const std::string & path) = 0; + + /// Remove file. Throws exception if file doesn't exists or if directory is not empty. + /// Differs from removeFile for S3/HDFS disks + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + virtual void removeSharedFile(const std::string & path, bool /* keep_shared_data */) = 0; + + /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. + /// Differs from removeRecursive for S3/HDFS disks + /// Second bool param is a flag to remove (false) or keep (true) shared data on S3. + /// Third param determines which files cannot be removed even if second is true. + virtual void removeSharedRecursive(const std::string & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) = 0; + + /// Remove file or directory if it exists. + /// Differs from removeFileIfExists for S3/HDFS disks + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + virtual void removeSharedFileIfExists(const std::string & path, bool /* keep_shared_data */) = 0; + + /// Batch request to remove multiple files. + /// May be much faster for blob storage. + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. + /// Third param determines which files cannot be removed even if second is true. 
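The contract stated above ("tries to commit all accumulated operations simultaneously; if something fails, rollback and throw") suggests one obvious implementation strategy: queue operations together with their compensating actions, run them on commit, and undo the already-executed ones in reverse order on failure. The sketch below shows that strategy in isolation; it is only an illustration of the contract, not the actual DiskObjectStorageTransaction (which, as shown later in this diff, uses dedicated operation objects):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>

class QueuedTransaction
{
public:
    void addOperation(std::function<void()> redo, std::function<void()> undo)
    {
        operations.push_back({std::move(redo), std::move(undo)});
    }

    void commit()
    {
        size_t executed = 0;
        try
        {
            for (; executed < operations.size(); ++executed)
                operations[executed].redo();
        }
        catch (...)
        {
            for (size_t i = executed; i > 0; --i) // undo what already ran, in reverse order
                operations[i - 1].undo();
            throw;
        }
    }

private:
    struct Operation { std::function<void()> redo; std::function<void()> undo; };
    std::vector<Operation> operations;
};

int main()
{
    QueuedTransaction tx;
    tx.addOperation([] { std::cout << "create dir\n"; }, [] { std::cout << "undo: remove dir\n"; });
    tx.addOperation([] { throw std::runtime_error("disk is full"); }, [] {});

    try { tx.commit(); }
    catch (const std::exception & e) { std::cout << "rolled back: " << e.what() << '\n'; }
}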
+ virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) = 0; + + /// Set last modified time to file or directory at `path`. + virtual void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) = 0; + + /// Set file at `path` as read-only. + virtual void setReadOnly(const std::string & path) = 0; + + /// Create hardlink from `src_path` to `dst_path`. + virtual void createHardLink(const std::string & src_path, const std::string & dst_path) = 0; + +}; + +using DiskTransactionPtr = std::shared_ptr; + +} diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index 1e2863bcb96..774a7ecaaaa 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -67,7 +67,7 @@ String AsynchronousReadIndirectBufferFromRemoteFS::getInfoForLog() return impl->getInfoForLog(); } -std::optional AsynchronousReadIndirectBufferFromRemoteFS::getFileSize() +size_t AsynchronousReadIndirectBufferFromRemoteFS::getFileSize() { return impl->getFileSize(); } @@ -185,8 +185,11 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() } prefetch_buffer.swap(memory); + /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); } else { @@ -202,7 +205,9 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() if (size) { /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); } } diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index d9bb7849abf..3734509a605 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -27,7 +27,7 @@ struct ReadSettings; * * We pass either `memory` or `prefetch_buffer` through all this chain and return it back. 
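The AsynchronousReadIndirectBufferFromRemoteFS change above replaces setWithBytesToIgnore with explicit adjustment of the internal buffer, the working buffer, and the read position, so that the first `offset` bytes of the prefetched memory are hidden from the reader. A freestanding sketch of that pointer arithmetic on a plain byte buffer, under assumed names (the real class uses BufferBase's Buffer type):

#include <cstddef>
#include <iostream>
#include <vector>

struct BufferWindow
{
    char * begin = nullptr;
    char * end = nullptr;
};

struct ReadBufferState
{
    BufferWindow internal_buffer; // everything we own
    BufferWindow working_buffer;  // the part visible to the reader
    char * pos = nullptr;         // current read position
};

// Mirror of the diff: expose [offset, size) of `memory` to the reader,
// while the internal buffer still spans the whole allocation.
void adjustAfterRead(ReadBufferState & state, std::vector<char> & memory, size_t size, size_t offset)
{
    state.internal_buffer = {memory.data(), memory.data() + memory.size()};
    state.working_buffer = {memory.data() + offset, memory.data() + size};
    state.pos = state.working_buffer.begin;
}

int main()
{
    std::vector<char> memory = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'};
    ReadBufferState state;
    adjustAfterRead(state, memory, /*size=*/6, /*offset=*/2); // 6 bytes read, first 2 must be ignored

    for (char * p = state.pos; p != state.working_buffer.end; ++p)
        std::cout << *p;
    std::cout << '\n'; // prints "cdef"
}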
*/ -class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase, public WithFileSize +class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase { public: explicit AsynchronousReadIndirectBufferFromRemoteFS( @@ -51,7 +51,7 @@ public: String getInfoForLog() override; - std::optional getFileSize() override; + size_t getFileSize() override; private: bool nextImpl() override; diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index d968a87de04..da6718ddeb2 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -56,6 +56,7 @@ CachedReadBufferFromRemoteFS::CachedReadBufferFromRemoteFS( , enable_logging(!query_id.empty() && settings_.enable_filesystem_cache_log) , current_buffer_id(getRandomASCIIString(8)) , query_context_holder(cache_->getQueryContextHolder(query_id, settings_)) + , is_persistent(false) /// Unused for now, see PR 36171 { } @@ -102,7 +103,7 @@ void CachedReadBufferFromRemoteFS::initialize(size_t offset, size_t size) } else { - file_segments_holder.emplace(cache->getOrSet(cache_key, offset, size)); + file_segments_holder.emplace(cache->getOrSet(cache_key, offset, size, is_persistent)); } /** @@ -120,7 +121,7 @@ void CachedReadBufferFromRemoteFS::initialize(size_t offset, size_t size) SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getCacheReadBuffer(size_t offset) const { - auto path = cache->getPathInLocalCache(cache_key, offset); + auto path = cache->getPathInLocalCache(cache_key, offset, is_persistent); ReadSettings local_read_settings{settings}; /// Do not allow to use asynchronous version of LocalFSReadMethod. @@ -128,7 +129,7 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getCacheReadBuffer(size_t of auto buf = createReadBufferFromFileBase(path, local_read_settings); auto * from_fd = dynamic_cast(buf.get()); - if (from_fd && from_fd->size() == 0) + if (from_fd && from_fd->getFileSize() == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read from an empty cache file: {}", path); return buf; @@ -370,7 +371,7 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getImplementationBuffer(File { #ifndef NDEBUG auto * file_reader = dynamic_cast(read_buffer_for_file_segment.get()); - size_t file_size = file_reader->size(); + size_t file_size = file_reader->getFileSize(); if (file_size == 0 || range.left + file_size <= file_offset_of_buffer_end) throw Exception( @@ -802,7 +803,7 @@ bool CachedReadBufferFromRemoteFS::nextImplStep() #ifndef NDEBUG if (auto * cache_file_reader = dynamic_cast(implementation_buffer.get())) { - auto cache_file_size = cache_file_reader->size(); + auto cache_file_size = cache_file_reader->getFileSize(); if (cache_file_size == 0) throw Exception( ErrorCodes::LOGICAL_ERROR, "Attempt to read from an empty cache file: {} (just before actual read)", cache_file_size); @@ -916,7 +917,7 @@ bool CachedReadBufferFromRemoteFS::nextImplStep() { std::optional cache_file_size; if (auto * cache_file_reader = dynamic_cast(implementation_buffer.get())) - cache_file_size = cache_file_reader->size(); + cache_file_size = cache_file_reader->getFileSize(); throw Exception( ErrorCodes::LOGICAL_ERROR, diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.h b/src/Disks/IO/CachedReadBufferFromRemoteFS.h index 5094f1e5047..867b8538260 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.h +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include 
#include #include #include #include +#include namespace CurrentMetrics @@ -125,6 +126,8 @@ private: ProfileEvents::Counters current_file_segment_counters; IFileCache::QueryContextHolder query_context_holder; + + bool is_persistent; }; } diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp index 699f8380cb8..3f7b378dee4 100644 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp @@ -19,6 +19,10 @@ ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS( { } +size_t ReadIndirectBufferFromRemoteFS::getFileSize() +{ + return impl->getFileSize(); +} off_t ReadIndirectBufferFromRemoteFS::getPosition() { diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h index 64495a538e4..f27d67fc5fb 100644 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h @@ -30,6 +30,8 @@ public: void setReadUntilEnd() override; + size_t getFileSize() override; + private: bool nextImpl() override; diff --git a/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp index 77da60ca07d..bf99934dd73 100644 --- a/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp @@ -32,7 +32,6 @@ WriteIndirectBufferFromRemoteFS::~WriteIndirectBufferFromRemoteFS() } } - void WriteIndirectBufferFromRemoteFS::finalizeImpl() { WriteBufferFromFileDecorator::finalizeImpl(); diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 11c29cdc362..0f2c320ed67 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -13,8 +13,9 @@ #include #include #include -#include +#include #include +#include #include namespace DB @@ -23,18 +24,9 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DISK_INDEX; - extern const int UNKNOWN_FORMAT; - extern const int FILE_ALREADY_EXISTS; extern const int FILE_DOESNT_EXIST; - extern const int BAD_FILE_TYPE; extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int CANNOT_READ_ALL_DATA; - extern const int CANNOT_OPEN_FILE; -} - -static String revisionToString(UInt64 revision) -{ - return std::bitset<64>(revision).to_string(); } namespace @@ -86,6 +78,16 @@ private: } + +DiskTransactionPtr DiskObjectStorage::createTransaction() +{ + return std::make_shared( + *object_storage, + *metadata_storage, + remote_fs_root_path, + send_metadata ? 
metadata_helper.get() : nullptr); +} + DiskObjectStorage::DiskObjectStorage( const String & name_, const String & remote_fs_root_path_, @@ -174,9 +176,9 @@ bool DiskObjectStorage::isFile(const String & path) const void DiskObjectStorage::createFile(const String & path) { - auto tx = metadata_storage->createTransaction(); - tx->createEmptyMetadataFile(path); - tx->commit(); + auto transaction = createTransaction(); + transaction->createFile(path); + transaction->commit(); } size_t DiskObjectStorage::getFileSize(const String & path) const @@ -186,11 +188,6 @@ size_t DiskObjectStorage::getFileSize(const String & path) const void DiskObjectStorage::moveFile(const String & from_path, const String & to_path, bool should_send_metadata) { - if (exists(to_path)) - throw Exception("File already exists: " + to_path, ErrorCodes::FILE_ALREADY_EXISTS); - - if (!exists(from_path)) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} doesn't exist, cannot move", to_path); if (should_send_metadata) { @@ -204,9 +201,9 @@ void DiskObjectStorage::moveFile(const String & from_path, const String & to_pat metadata_helper->createFileOperationObject("rename", revision, object_metadata); } - auto tx = metadata_storage->createTransaction(); - tx->moveFile(from_path, to_path); - tx->commit(); + auto transaction = createTransaction(); + transaction->moveFile(from_path, to_path); + transaction->commit(); } void DiskObjectStorage::moveFile(const String & from_path, const String & to_path) @@ -218,13 +215,9 @@ void DiskObjectStorage::replaceFile(const String & from_path, const String & to_ { if (exists(to_path)) { - auto blobs = metadata_storage->getRemotePaths(to_path); - - auto tx = metadata_storage->createTransaction(); - tx->replaceFile(from_path, to_path); - tx->commit(); - - removeFromRemoteFS(blobs); + auto transaction = createTransaction(); + transaction->replaceFile(from_path, to_path); + transaction->commit(); } else moveFile(from_path, to_path); @@ -232,16 +225,9 @@ void DiskObjectStorage::replaceFile(const String & from_path, const String & to_ void DiskObjectStorage::removeSharedFile(const String & path, bool delete_metadata_only) { - std::vector paths_to_remove; - removeMetadata(path, paths_to_remove); - - if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); -} - -void DiskObjectStorage::removeFromRemoteFS(const std::vector & paths) -{ - object_storage->removeObjects(paths); + auto transaction = createTransaction(); + transaction->removeSharedFile(path, delete_metadata_only); + transaction->commit(); } UInt32 DiskObjectStorage::getRefCount(const String & path) const @@ -290,10 +276,9 @@ void DiskObjectStorage::createHardLink(const String & src_path, const String & d metadata_helper->createFileOperationObject("hardlink", revision, object_metadata); } - /// Create FS hardlink to metadata file. - auto tx = metadata_storage->createTransaction(); - tx->createHardLink(src_path, dst_path); - tx->commit(); + auto transaction = createTransaction(); + transaction->createHardLink(src_path, dst_path); + transaction->commit(); } void DiskObjectStorage::createHardLink(const String & src_path, const String & dst_path) @@ -306,9 +291,9 @@ void DiskObjectStorage::setReadOnly(const String & path) { /// We should store read only flag inside metadata file (instead of using FS flag), /// because we modify metadata file when create hard-links from it. 
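After this refactoring almost every mutating DiskObjectStorage method follows the same three-line shape: create a disk-level transaction, apply one operation, commit. The sketch below captures that shape with a small helper; the helper and the minimal transaction interface are inventions for illustration, the real methods simply spell the three lines out:

#include <functional>
#include <iostream>
#include <memory>
#include <string>

struct Transaction
{
    virtual ~Transaction() = default;
    virtual void createDirectory(const std::string & path) = 0;
    virtual void commit() = 0;
};

struct LoggingTransaction final : Transaction
{
    void createDirectory(const std::string & path) override { std::cout << "queued: mkdir " << path << '\n'; }
    void commit() override { std::cout << "committed\n"; }
};

struct DiskLike
{
    std::shared_ptr<Transaction> createTransaction() { return std::make_shared<LoggingTransaction>(); }

    // Generic "single operation, single commit" helper used by the public methods.
    void runInTransaction(const std::function<void(Transaction &)> & op)
    {
        auto transaction = createTransaction();
        op(*transaction);
        transaction->commit();
    }

    void createDirectory(const std::string & path)
    {
        runInTransaction([&](Transaction & tx) { tx.createDirectory(path); });
    }
};

int main()
{
    DiskLike disk;
    disk.createDirectory("store/abc");
}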
- auto tx = metadata_storage->createTransaction(); - tx->setReadOnly(path); - tx->commit(); + auto transaction = createTransaction(); + transaction->setReadOnly(path); + transaction->commit(); } @@ -320,33 +305,33 @@ bool DiskObjectStorage::isDirectory(const String & path) const void DiskObjectStorage::createDirectory(const String & path) { - auto tx = metadata_storage->createTransaction(); - tx->createDirectory(path); - tx->commit(); + auto transaction = createTransaction(); + transaction->createDirectory(path); + transaction->commit(); } void DiskObjectStorage::createDirectories(const String & path) { - auto tx = metadata_storage->createTransaction(); - tx->createDicrectoryRecursive(path); - tx->commit(); + auto transaction = createTransaction(); + transaction->createDirectories(path); + transaction->commit(); } void DiskObjectStorage::clearDirectory(const String & path) { - for (auto it = iterateDirectory(path); it->isValid(); it->next()) - if (isFile(it->path())) - removeFile(it->path()); + auto transaction = createTransaction(); + transaction->clearDirectory(path); + transaction->commit(); } void DiskObjectStorage::removeDirectory(const String & path) { - auto tx = metadata_storage->createTransaction(); - tx->removeDirectory(path); - tx->commit(); + auto transaction = createTransaction(); + transaction->removeDirectory(path); + transaction->commit(); } @@ -365,9 +350,9 @@ void DiskObjectStorage::listFiles(const String & path, std::vector & fil void DiskObjectStorage::setLastModified(const String & path, const Poco::Timestamp & timestamp) { - auto tx = metadata_storage->createTransaction(); - tx->setLastModified(path, timestamp); - tx->commit(); + auto transaction = createTransaction(); + transaction->setLastModified(path, timestamp); + transaction->commit(); } @@ -381,74 +366,6 @@ time_t DiskObjectStorage::getLastChanged(const String & path) const return metadata_storage->getLastChanged(path); } -void DiskObjectStorage::removeMetadata(const String & path, std::vector & paths_to_remove) -{ - LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_storage->getPath() + path)); - - if (!metadata_storage->exists(path)) - throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Metadata path '{}' doesn't exist", path); - - if (!metadata_storage->isFile(path)) - throw Exception(ErrorCodes::BAD_FILE_TYPE, "Path '{}' is not a regular file", path); - - - try - { - uint32_t hardlink_count = metadata_storage->getHardlinkCount(path); - auto remote_objects = metadata_storage->getRemotePaths(path); - - auto tx = metadata_storage->createTransaction(); - tx->unlinkMetadata(path); - tx->commit(); - - if (hardlink_count == 0) - { - paths_to_remove = remote_objects; - for (const auto & path_to_remove : paths_to_remove) - object_storage->removeFromCache(path_to_remove); - } - } - catch (const Exception & e) - { - /// If it's impossible to read meta - just remove it from FS. - if (e.code() == ErrorCodes::UNKNOWN_FORMAT - || e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF - || e.code() == ErrorCodes::CANNOT_READ_ALL_DATA - || e.code() == ErrorCodes::CANNOT_OPEN_FILE) - { - LOG_INFO(log, "Failed to read metadata file {} before removal because it's incomplete or empty. 
" - "It's Ok and can happen after operation interruption (like metadata fetch), so removing as is", path); - - auto tx = metadata_storage->createTransaction(); - tx->unlinkFile(path); - tx->commit(); - } - else - throw; - } -} - - -void DiskObjectStorage::removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove) -{ - checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. - - if (metadata_storage->isFile(path)) - { - removeMetadata(path, paths_to_remove[path]); - } - else - { - for (auto it = iterateDirectory(path); it->isValid(); it->next()) - removeMetadataRecursive(it->path(), paths_to_remove); - - auto tx = metadata_storage->createTransaction(); - tx->removeDirectory(path); - tx->commit(); - } -} - - void DiskObjectStorage::shutdown() { LOG_INFO(log, "Shutting down disk {}", name); @@ -477,32 +394,16 @@ ReservationPtr DiskObjectStorage::reserve(UInt64 bytes) void DiskObjectStorage::removeSharedFileIfExists(const String & path, bool delete_metadata_only) { - std::vector paths_to_remove; - if (metadata_storage->exists(path)) - { - removeMetadata(path, paths_to_remove); - if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); - } + auto transaction = createTransaction(); + transaction->removeSharedFileIfExists(path, delete_metadata_only); + transaction->commit(); } void DiskObjectStorage::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { - std::unordered_map> paths_to_remove; - removeMetadataRecursive(path, paths_to_remove); - - if (!keep_all_batch_data) - { - std::vector remove_from_remote; - for (auto && [local_path, remote_paths] : paths_to_remove) - { - if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename())) - { - remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end()); - } - } - removeFromRemoteFS(remove_from_remote); - } + auto transaction = createTransaction(); + transaction->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only); + transaction->commit(); } std::optional DiskObjectStorage::tryReserve(UInt64 bytes) @@ -546,38 +447,12 @@ std::unique_ptr DiskObjectStorage::writeFile( WriteMode mode, const WriteSettings & settings) { - auto blob_name = getRandomASCIIString(); + auto transaction = createTransaction(); + auto result = transaction->writeFile(path, buf_size, mode, settings); - std::optional object_attributes; - if (send_metadata) - { - auto revision = metadata_helper->revision_counter + 1; - metadata_helper->revision_counter++; - object_attributes = { - {"path", path} - }; - blob_name = "r" + revisionToString(revision) + "-file-" + blob_name; - } - - auto create_metadata_callback = [this, mode, path, blob_name] (size_t count) - { - auto tx = metadata_storage->createTransaction(); - if (mode == WriteMode::Rewrite) - tx->createMetadataFile(path, blob_name, count); - else - tx->addBlobToMetadata(path, blob_name, count); - - tx->commit(); - }; - - /// We always use mode Rewrite because we simulate append using metadata and different files - return object_storage->writeObject( - fs::path(remote_fs_root_path) / blob_name, WriteMode::Rewrite, object_attributes, - std::move(create_metadata_callback), - buf_size, settings); + return result; } - void DiskObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context_, const String &, const DisksMap &) { const auto config_prefix = "storage_configuration.disks." 
+ name; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index 6b1d591d4a3..b1a1d263ede 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace CurrentMetrics @@ -36,6 +37,8 @@ public: bool send_metadata_, uint64_t thread_pool_size); + DiskTransactionPtr createTransaction() override; + DiskType getType() const override { return disk_type; } bool supportZeroCopyReplication() const override { return true; } @@ -89,8 +92,6 @@ public: void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; - void removeFromRemoteFS(const std::vector & paths); - MetadataStoragePtr getMetadataStorage() override { return metadata_storage; } UInt32 getRefCount(const String & path) const override; @@ -174,10 +175,6 @@ private: UInt64 reservation_count = 0; std::mutex reservation_mutex; - void removeMetadata(const String & path, std::vector & paths_to_remove); - - void removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove); - std::optional tryReserve(UInt64 bytes); const bool send_metadata; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp b/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp index eb9d7107d39..9311cb2c12a 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageCommon.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index 0e35963e9cb..96667b8496a 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp new file mode 100644 index 00000000000..b0a180e2c53 --- /dev/null +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -0,0 +1,604 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_FORMAT; + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_READ_ALL_DATA; + extern const int CANNOT_OPEN_FILE; + extern const int FILE_DOESNT_EXIST; + extern const int BAD_FILE_TYPE; + extern const int FILE_ALREADY_EXISTS; +} + +DiskObjectStorageTransaction::DiskObjectStorageTransaction( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & remote_fs_root_path_, + DiskObjectStorageRemoteMetadataRestoreHelper * metadata_helper_) + : object_storage(object_storage_) + , metadata_storage(metadata_storage_) + , metadata_transaction(metadata_storage.createTransaction()) + , remote_fs_root_path(remote_fs_root_path_) + , metadata_helper(metadata_helper_) +{} + +/// Operation which affects only metadata. Simplest way to +/// implement via callback. 
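The transaction defined here is composed of operation objects with three hooks: execute (runs against the metadata transaction), undo (compensates if a later operation fails), and finalize (runs after the metadata commit, e.g. to delete remote blobs). Below is a compact sketch of that structure plus a callback-based "pure metadata" operation like the one introduced above; the types are simplified stand-ins for the real object storage and metadata classes:

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

struct MetadataTransaction { void commit() { std::cout << "metadata committed\n"; } };

struct IOperation
{
    virtual ~IOperation() = default;
    virtual void execute(MetadataTransaction & tx) = 0;
    virtual void undo() = 0;      // compensate if a later execute() fails
    virtual void finalize() = 0;  // after metadata commit, e.g. remove remote blobs
};

// Analogue of PureMetadataOperation: only touches metadata, so it is just a callback.
struct PureMetadataOperationSketch final : IOperation
{
    explicit PureMetadataOperationSketch(std::function<void(MetadataTransaction &)> fn) : on_execute(std::move(fn)) {}
    void execute(MetadataTransaction & tx) override { on_execute(tx); }
    void undo() override {}
    void finalize() override {}
    std::function<void(MetadataTransaction &)> on_execute;
};

struct TransactionSketch
{
    std::vector<std::unique_ptr<IOperation>> operations;
    MetadataTransaction metadata_tx;

    void commit()
    {
        size_t executed = 0;
        try
        {
            for (; executed < operations.size(); ++executed)
                operations[executed]->execute(metadata_tx);
        }
        catch (...)
        {
            for (size_t i = executed; i > 0; --i)
                operations[i - 1]->undo();
            throw;
        }
        metadata_tx.commit();
        for (auto & operation : operations) // blobs are only touched once metadata is durable
            operation->finalize();
    }
};

int main()
{
    TransactionSketch tx;
    tx.operations.push_back(std::make_unique<PureMetadataOperationSketch>(
        [](MetadataTransaction &) { std::cout << "record new metadata file\n"; }));
    tx.commit();
}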
+struct PureMetadataOperation final : public IDiskObjectStorageOperation +{ + std::function on_execute; + + PureMetadataOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + std::function && on_execute_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , on_execute(std::move(on_execute_)) + {} + + void execute(MetadataTransactionPtr transaction) override + { + on_execute(transaction); + } + + void undo() override + { + } + + void finalize() override + { + } +}; + +struct RemoveObjectOperation final : public IDiskObjectStorageOperation +{ + std::string path; + bool delete_metadata_only; + bool remove_from_cache{false}; + std::vector paths_to_remove; + bool if_exists; + + RemoveObjectOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & path_, + bool delete_metadata_only_, + bool if_exists_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , path(path_) + , delete_metadata_only(delete_metadata_only_) + , if_exists(if_exists_) + {} + + void execute(MetadataTransactionPtr tx) override + { + if (!metadata_storage.exists(path)) + { + if (if_exists) + return; + + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Metadata path '{}' doesn't exist", path); + } + + if (!metadata_storage.isFile(path)) + throw Exception(ErrorCodes::BAD_FILE_TYPE, "Path '{}' is not a regular file", path); + + try + { + uint32_t hardlink_count = metadata_storage.getHardlinkCount(path); + auto remote_objects = metadata_storage.getRemotePaths(path); + + tx->unlinkMetadata(path); + + if (hardlink_count == 0) + { + paths_to_remove = remote_objects; + remove_from_cache = true; + } + } + catch (const Exception & e) + { + /// If it's impossible to read meta - just remove it from FS. + if (e.code() == ErrorCodes::UNKNOWN_FORMAT + || e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + || e.code() == ErrorCodes::CANNOT_READ_ALL_DATA + || e.code() == ErrorCodes::CANNOT_OPEN_FILE) + { + tx->unlinkFile(path); + } + else + throw; + } + } + + void undo() override + { + + } + + void finalize() override + { + if (!delete_metadata_only && !paths_to_remove.empty()) + object_storage.removeObjects(paths_to_remove); + + if (remove_from_cache) + { + for (const auto & path_to_remove : paths_to_remove) + object_storage.removeFromCache(path_to_remove); + } + + } +}; + +struct RemoveRecursiveOperation final : public IDiskObjectStorageOperation +{ + std::string path; + std::unordered_map> paths_to_remove; + bool keep_all_batch_data; + NameSet file_names_remove_metadata_only; + std::vector path_to_remove_from_cache; + + RemoveRecursiveOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & path_, + bool keep_all_batch_data_, + const NameSet & file_names_remove_metadata_only_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , path(path_) + , keep_all_batch_data(keep_all_batch_data_) + , file_names_remove_metadata_only(file_names_remove_metadata_only_) + {} + + void removeMetadataRecursive(MetadataTransactionPtr tx, const std::string & path_to_remove) + { + checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. 
+ + if (metadata_storage.isFile(path_to_remove)) + { + try + { + uint32_t hardlink_count = metadata_storage.getHardlinkCount(path_to_remove); + auto remote_objects = metadata_storage.getRemotePaths(path_to_remove); + + tx->unlinkMetadata(path_to_remove); + + if (hardlink_count == 0) + { + paths_to_remove[path_to_remove] = remote_objects; + path_to_remove_from_cache.insert(path_to_remove_from_cache.end(), remote_objects.begin(), remote_objects.end()); + } + + } + catch (const Exception & e) + { + /// If it's impossible to read meta - just remove it from FS. + if (e.code() == ErrorCodes::UNKNOWN_FORMAT + || e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + || e.code() == ErrorCodes::CANNOT_READ_ALL_DATA + || e.code() == ErrorCodes::CANNOT_OPEN_FILE) + { + tx->unlinkFile(path_to_remove); + } + else + throw; + } + } + else + { + for (auto it = metadata_storage.iterateDirectory(path_to_remove); it->isValid(); it->next()) + removeMetadataRecursive(tx, it->path()); + + tx->removeDirectory(path_to_remove); + } + } + + void execute(MetadataTransactionPtr tx) override + { + removeMetadataRecursive(tx, path); + } + + void undo() override + { + + } + + void finalize() override + { + if (!keep_all_batch_data) + { + std::vector remove_from_remote; + for (auto && [local_path, remote_paths] : paths_to_remove) + { + if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename())) + { + remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end()); + } + } + object_storage.removeObjects(remove_from_remote); + } + + for (const auto & path_to_remove : path_to_remove_from_cache) + object_storage.removeFromCache(path_to_remove); + } +}; + + +struct ReplaceFileOperation final : public IDiskObjectStorageOperation +{ + std::string path_from; + std::string path_to; + std::vector blobs_to_remove; + + ReplaceFileOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & path_from_, + const std::string & path_to_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , path_from(path_from_) + , path_to(path_to_) + {} + + void execute(MetadataTransactionPtr tx) override + { + if (metadata_storage.exists(path_to)) + { + blobs_to_remove = metadata_storage.getRemotePaths(path_to); + tx->replaceFile(path_from, path_to); + } + else + tx->moveFile(path_from, path_to); + } + + void undo() override + { + + } + + void finalize() override + { + if (!blobs_to_remove.empty()) + object_storage.removeObjects(blobs_to_remove); + } +}; + +struct WriteFileOperation final : public IDiskObjectStorageOperation +{ + std::string path; + std::string blob_path; + + WriteFileOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & path_, + const std::string & blob_path_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , path(path_) + , blob_path(blob_path_) + {} + + void execute(MetadataTransactionPtr) override + { + + } + + void undo() override + { + if (object_storage.exists(blob_path)) + object_storage.removeObject(blob_path); + } + + void finalize() override + { + } +}; + + +struct CopyFileOperation final : public IDiskObjectStorageOperation +{ + std::string from_path; + std::string to_path; + std::string remote_fs_root_path; + + std::vector created_blobs; + + CopyFileOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & from_path_, + const std::string & to_path_, + const std::string & 
remote_fs_root_path_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , from_path(from_path_) + , to_path(to_path_) + , remote_fs_root_path(remote_fs_root_path_) + {} + + void execute(MetadataTransactionPtr tx) override + { + tx->createEmptyMetadataFile(to_path); + auto source_blobs = metadata_storage.getBlobs(from_path); + for (const auto & [blob_from, size] : source_blobs) + { + auto blob_name = getRandomASCIIString(); + + auto blob_to = fs::path(remote_fs_root_path) / blob_name; + + object_storage.copyObject(fs::path(remote_fs_root_path) / blob_from, blob_to); + + tx->addBlobToMetadata(to_path, blob_name, size); + + created_blobs.push_back(blob_to); + } + } + + void undo() override + { + for (const auto & blob_path : created_blobs) + object_storage.removeObject(blob_path); + } + + void finalize() override + { + } +}; + +void DiskObjectStorageTransaction::createDirectory(const std::string & path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [path](MetadataTransactionPtr tx) + { + tx->createDirectory(path); + })); +} + +void DiskObjectStorageTransaction::createDirectories(const std::string & path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [path](MetadataTransactionPtr tx) + { + tx->createDicrectoryRecursive(path); + })); +} + + +void DiskObjectStorageTransaction::moveDirectory(const std::string & from_path, const std::string & to_path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [from_path, to_path](MetadataTransactionPtr tx) + { + tx->moveDirectory(from_path, to_path); + })); +} + +void DiskObjectStorageTransaction::moveFile(const String & from_path, const String & to_path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [from_path, to_path, this](MetadataTransactionPtr tx) + { + if (metadata_storage.exists(to_path)) + throw Exception("File already exists: " + to_path, ErrorCodes::FILE_ALREADY_EXISTS); + + if (!metadata_storage.exists(from_path)) + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} doesn't exist, cannot move", to_path); + + tx->moveFile(from_path, to_path); + })); +} + +void DiskObjectStorageTransaction::replaceFile(const std::string & from_path, const std::string & to_path) +{ + operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, from_path, to_path)); +} + +void DiskObjectStorageTransaction::clearDirectory(const std::string & path) +{ + for (auto it = metadata_storage.iterateDirectory(path); it->isValid(); it->next()) + { + if (metadata_storage.isFile(it->path())) + removeFile(it->path()); + } +} + +void DiskObjectStorageTransaction::removeFile(const std::string & path) +{ + removeSharedFile(path, false); +} + +void DiskObjectStorageTransaction::removeSharedFile(const std::string & path, bool keep_shared_data) +{ + operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, path, keep_shared_data, false)); +} + +void DiskObjectStorageTransaction::removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const NameSet & file_names_remove_metadata_only) +{ + operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, path, keep_all_shared_data, file_names_remove_metadata_only)); +} + +void DiskObjectStorageTransaction::removeSharedFileIfExists(const std::string & path, bool keep_shared_data) +{ + 
operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, path, keep_shared_data, true)); +} + +void DiskObjectStorageTransaction::removeDirectory(const std::string & path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [path](MetadataTransactionPtr tx) + { + tx->removeDirectory(path); + })); +} + + +void DiskObjectStorageTransaction::removeRecursive(const std::string & path) +{ + removeSharedRecursive(path, false, {}); +} + +void DiskObjectStorageTransaction::removeFileIfExists(const std::string & path) +{ + removeSharedFileIfExists(path, false); +} + + +void DiskObjectStorageTransaction::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) +{ + for (const auto & file : files) + { + bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); + if (file.if_exists) + removeSharedFileIfExists(file.path, keep_file); + else + removeSharedFile(file.path, keep_file); + } +} + +namespace +{ + +String revisionToString(UInt64 revision) +{ + return std::bitset<64>(revision).to_string(); +} + +} + +std::unique_ptr DiskObjectStorageTransaction::writeFile( /// NOLINT + const std::string & path, + size_t buf_size, + WriteMode mode, + const WriteSettings & settings, + bool autocommit) +{ + auto blob_name = getRandomASCIIString(); + + std::optional object_attributes; + if (metadata_helper) + { + auto revision = metadata_helper->revision_counter + 1; + metadata_helper->revision_counter++; + object_attributes = { + {"path", path} + }; + blob_name = "r" + revisionToString(revision) + "-file-" + blob_name; + } + + auto blob_path = fs::path(remote_fs_root_path) / blob_name; + + operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, path, blob_path)); + + auto create_metadata_callback = [tx = shared_from_this(), this, mode, path, blob_name, autocommit] (size_t count) + { + if (mode == WriteMode::Rewrite) + metadata_transaction->createMetadataFile(path, blob_name, count); + else + metadata_transaction->addBlobToMetadata(path, blob_name, count); + + if (autocommit) + metadata_transaction->commit(); + }; + + /// We always use mode Rewrite because we simulate append using metadata and different files + return object_storage.writeObject( + blob_path, WriteMode::Rewrite, object_attributes, + std::move(create_metadata_callback), + buf_size, settings); +} + + +void DiskObjectStorageTransaction::createHardLink(const std::string & src_path, const std::string & dst_path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [src_path, dst_path](MetadataTransactionPtr tx) + { + tx->createHardLink(src_path, dst_path); + })); +} + +void DiskObjectStorageTransaction::setReadOnly(const std::string & path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [path](MetadataTransactionPtr tx) + { + tx->setReadOnly(path); + })); +} + +void DiskObjectStorageTransaction::setLastModified(const std::string & path, const Poco::Timestamp & timestamp) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, [path, timestamp](MetadataTransactionPtr tx) + { + tx->setLastModified(path, timestamp); + })); +} + +void DiskObjectStorageTransaction::createFile(const std::string & path) +{ + operations_to_execute.emplace_back( + std::make_unique(object_storage, metadata_storage, 
[path](MetadataTransactionPtr tx) + { + tx->createEmptyMetadataFile(path); + })); +} + +void DiskObjectStorageTransaction::copyFile(const std::string & from_file_path, const std::string & to_file_path) +{ + operations_to_execute.emplace_back(std::make_unique(object_storage, metadata_storage, from_file_path, to_file_path, remote_fs_root_path)); +} + +void DiskObjectStorageTransaction::commit() +{ + for (size_t i = 0; i < operations_to_execute.size(); ++i) + { + try + { + operations_to_execute[i]->execute(metadata_transaction); + + } + catch (Exception & ex) + { + ex.addMessage(fmt::format("While executing operation #{}", i)); + + for (int64_t j = i; j >= 0; --j) + { + try + { + operations_to_execute[j]->undo(); + } + catch (Exception & rollback_ex) + { + rollback_ex.addMessage(fmt::format("While undoing operation #{}", i)); + throw; + } + } + throw; + } + } + + try + { + metadata_transaction->commit(); + } + catch (...) + { + for (const auto & operation : operations_to_execute | std::views::reverse) + operation->undo(); + + throw; + } + + for (const auto & operation : operations_to_execute) + operation->finalize(); +} + +} diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h new file mode 100644 index 00000000000..24bbadb2cdd --- /dev/null +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + + +/// Basic operation inside disk object storage transaction. +struct IDiskObjectStorageOperation +{ + /// useful for operations with blobs in object storage + IObjectStorage & object_storage; + /// useful for some read operations + IMetadataStorage & metadata_storage; +public: + IDiskObjectStorageOperation(IObjectStorage & object_storage_, IMetadataStorage & metadata_storage_) + : object_storage(object_storage_) + , metadata_storage(metadata_storage_) + {} + + /// Execute the operation and add something to the metadata transaction + virtual void execute(MetadataTransactionPtr transaction) = 0; + /// Revert the operation if possible + virtual void undo() = 0; + /// Action to execute after the metadata transaction is successfully committed. + /// Useful when it's impossible to revert an operation, + /// such as the removal of blobs. Such an implementation can lead to garbage. + virtual void finalize() = 0; + virtual ~IDiskObjectStorageOperation() = default; +}; + +using DiskObjectStorageOperation = std::unique_ptr; + +using DiskObjectStorageOperations = std::vector; + +/// Disk object storage transaction, actually implements some part of the disk object storage +/// logic. Works on top of non-atomic operations with blobs and a possibly atomic implementation +/// of metadata storage. +/// +/// Commit works as follows: +/// 1. Execute all accumulated operations in a loop. +/// 2. Commit the metadata transaction. +/// 3. Finalize all accumulated operations in a loop. +/// +/// If something goes wrong on step 1 or 2, all applied operations are reverted. +/// If finalize fails -- nothing is reverted and garbage is left in blob storage.
+struct DiskObjectStorageTransaction final : public IDiskTransaction, std::enable_shared_from_this +{ +private: + IObjectStorage & object_storage; + IMetadataStorage & metadata_storage; + MetadataTransactionPtr metadata_transaction; + /// TODO: we can get rid of these params + const std::string & remote_fs_root_path; + DiskObjectStorageRemoteMetadataRestoreHelper * metadata_helper; + + DiskObjectStorageOperations operations_to_execute; +public: + DiskObjectStorageTransaction( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const std::string & remote_fs_root_path_, + DiskObjectStorageRemoteMetadataRestoreHelper * metadata_helper_); + + void commit() override; + + void createDirectory(const std::string & path) override; + + void createDirectories(const std::string & path) override; + + void clearDirectory(const std::string & path) override; + void moveDirectory(const std::string & from_path, const std::string & to_path) override; + + void moveFile(const String & from_path, const String & to_path) override; + + void replaceFile(const std::string & from_path, const std::string & to_path) override; + + void createFile(const String & path) override; + + void copyFile(const std::string & from_file_path, const std::string & to_file_path) override; + + /// writeFile is a difficult function for transactions. + /// Now it's almost a noop because the metadata is added to the transaction in the finalize method + /// of the write buffer. Autocommit means that the transaction will be committed immediately + /// after the returned buffer is finalized. + std::unique_ptr writeFile( /// NOLINT + const std::string & path, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + WriteMode mode = WriteMode::Rewrite, + const WriteSettings & settings = {}, + bool autocommit = true) override; + + void removeFile(const std::string & path) override; + void removeFileIfExists(const std::string & path) override; + void removeDirectory(const std::string & path) override; + void removeRecursive(const std::string & path) override; + + void removeSharedFile(const std::string & path, bool keep_shared_data) override; + void removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFileIfExists(const std::string & path, bool keep_shared_data) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + + void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override; + void setReadOnly(const std::string & path) override; + void createHardLink(const std::string & src_path, const std::string & dst_path) override; +}; + +using DiskObjectStorageTransactionPtr = std::shared_ptr; + +} diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 475bcca4ea4..d29ecc24aeb 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -1,5 +1,6 @@ #include #include +#include #include namespace DB @@ -31,7 +32,7 @@ void IObjectStorage::removeFromCache(const std::string & path) if (cache) { auto key = cache->hash(path); - cache->remove(key); + cache->removeIfExists(key); } } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index b9ac497f54f..4921059c6b7 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -13,7 +13,7 @@ #include
#include -#include +#include #include diff --git a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp index db1162ae617..9ba92113a30 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromDisk.cpp @@ -396,7 +396,7 @@ void MetadataStorageFromDiskTransaction::commit() } catch (Exception & ex) { - ex.addMessage(fmt::format("While committing operation #{}", i)); + ex.addMessage(fmt::format("While committing metadata operation #{}", i)); state = MetadataFromDiskTransactionState::FAILED; rollback(i); throw; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 3300b1da746..ffe4d2dd942 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -24,7 +24,7 @@ #include #include -#include +#include #include namespace DB diff --git a/src/Functions/FunctionsJSON.cpp b/src/Functions/FunctionsJSON.cpp index 49546aac92b..fa573ac829a 100644 --- a/src/Functions/FunctionsJSON.cpp +++ b/src/Functions/FunctionsJSON.cpp @@ -447,10 +447,16 @@ public: FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override { + bool has_nothing_argument = false; + for (const auto & arg : arguments) + has_nothing_argument |= isNothing(arg.type); + DataTypePtr json_return_type = Impl::getReturnType(Name::name, createBlockWithNestedColumns(arguments)); NullPresence null_presence = getNullPresense(arguments); DataTypePtr return_type; - if (null_presence.has_null_constant) + if (has_nothing_argument) + return_type = std::make_shared(); + else if (null_presence.has_null_constant) return_type = makeNullable(std::make_shared()); else if (null_presence.has_nullable) return_type = makeNullable(json_return_type); diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h index 518b969d441..9bf5ed2a6fd 100644 --- a/src/Functions/FunctionsRound.h +++ b/src/Functions/FunctionsRound.h @@ -168,7 +168,7 @@ struct IntegerRoundingComputation __builtin_unreachable(); } - static ALWAYS_INLINE void compute(const T * __restrict in, size_t scale, T * __restrict out) + static ALWAYS_INLINE void compute(const T * __restrict in, size_t scale, T * __restrict out) requires std::integral { if constexpr (sizeof(T) <= sizeof(scale) && scale_mode == ScaleMode::Negative) { @@ -181,6 +181,10 @@ struct IntegerRoundingComputation *out = compute(*in, scale); } + static ALWAYS_INLINE void compute(const T * __restrict in, T scale, T * __restrict out) requires(!std::integral) + { + *out = compute(*in, scale); + } }; @@ -432,7 +436,7 @@ public: scale_arg = in_scale - scale_arg; if (scale_arg > 0) { - size_t scale = intExp10(scale_arg); + auto scale = intExp10OfSize(scale_arg); const NativeType * __restrict p_in = reinterpret_cast(in.data()); const NativeType * end_in = reinterpret_cast(in.data()) + in.size(); diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index 171a6f55a39..5be2ea3c5e3 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -233,15 +233,15 @@ ColumnPtr IExecutableFunction::defaultImplementationForNothing( ColumnPtr IExecutableFunction::executeWithoutLowCardinalityColumns( const ColumnsWithTypeAndName & args, const DataTypePtr & result_type, size_t input_rows_count, bool dry_run) const { + if (auto res = defaultImplementationForNothing(args, result_type, input_rows_count)) + return res; + if (auto res = 
defaultImplementationForConstantArguments(args, result_type, input_rows_count, dry_run)) return res; if (auto res = defaultImplementationForNulls(args, result_type, input_rows_count, dry_run)) return res; - if (auto res = defaultImplementationForNothing(args, result_type, input_rows_count)) - return res; - ColumnPtr res; if (dry_run) res = executeDryRunImpl(args, result_type, input_rows_count); @@ -450,6 +450,15 @@ DataTypePtr IFunctionOverloadResolver::getReturnTypeWithoutLowCardinality(const { checkNumberOfArguments(arguments.size()); + if (!arguments.empty() && useDefaultImplementationForNothing()) + { + for (const auto & arg : arguments) + { + if (isNothing(arg.type)) + return std::make_shared(); + } + } + if (!arguments.empty() && useDefaultImplementationForNulls()) { NullPresence null_presence = getNullPresense(arguments); @@ -466,15 +475,6 @@ DataTypePtr IFunctionOverloadResolver::getReturnTypeWithoutLowCardinality(const } } - if (!arguments.empty() && useDefaultImplementationForNothing()) - { - for (const auto & arg : arguments) - { - if (isNothing(arg.type)) - return std::make_shared(); - } - } - return getReturnTypeImpl(arguments); } diff --git a/src/Functions/array/FunctionArrayMapped.h b/src/Functions/array/FunctionArrayMapped.h index 0af68910b70..c4ac89df78e 100644 --- a/src/Functions/array/FunctionArrayMapped.h +++ b/src/Functions/array/FunctionArrayMapped.h @@ -342,16 +342,26 @@ public: else if (lambda_result.column->isNullable()) { auto result_column = IColumn::mutate(std::move(lambda_result.column)); - auto * column_nullable = assert_cast(result_column.get()); - auto & null_map = column_nullable->getNullMapData(); - auto nested_column = IColumn::mutate(std::move(column_nullable->getNestedColumnPtr())); - auto & nested_data = assert_cast(nested_column.get())->getData(); - for (size_t i = 0; i != nested_data.size(); ++i) + + if (isColumnConst(*result_column)) { - if (null_map[i]) - nested_data[i] = 0; + UInt8 value = result_column->empty() ? 
0 : result_column->getBool(0); + auto result_type = std::make_shared(); + lambda_result.column = result_type->createColumnConst(result_column->size(), value); + } + else + { + auto * column_nullable = assert_cast(result_column.get()); + auto & null_map = column_nullable->getNullMapData(); + auto nested_column = IColumn::mutate(std::move(column_nullable->getNestedColumnPtr())); + auto & nested_data = assert_cast(nested_column.get())->getData(); + for (size_t i = 0; i != nested_data.size(); ++i) + { + if (null_map[i]) + nested_data[i] = 0; + } + lambda_result.column = std::move(nested_column); } - lambda_result.column = std::move(nested_column); } } diff --git a/src/IO/Archives/ZipArchiveWriter.cpp b/src/IO/Archives/ZipArchiveWriter.cpp index dbfd66a6293..28a7bacf8d0 100644 --- a/src/IO/Archives/ZipArchiveWriter.cpp +++ b/src/IO/Archives/ZipArchiveWriter.cpp @@ -189,6 +189,8 @@ namespace explicit StreamFromWriteBuffer(std::unique_ptr write_buffer_) : write_buffer(std::move(write_buffer_)), start_offset(write_buffer->count()) {} + ~StreamFromWriteBuffer() { write_buffer->finalize(); } + static int closeFileFunc(void *, void * stream) { delete reinterpret_cast(stream); diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp index 6916f506408..add18d8d12e 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace ProfileEvents @@ -93,7 +94,9 @@ bool AsynchronousReadBufferFromFileDescriptor::nextImpl() { prefetch_buffer.swap(memory); /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); return true; } @@ -109,7 +112,9 @@ bool AsynchronousReadBufferFromFileDescriptor::nextImpl() if (size) { /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); return true; } @@ -196,7 +201,6 @@ off_t AsynchronousReadBufferFromFileDescriptor::seek(off_t offset, int whence) else if (prefetch_future.valid()) { /// Read from prefetch buffer and recheck if the new position is valid inside. 
- if (nextImpl()) continue; } @@ -219,7 +223,8 @@ off_t AsynchronousReadBufferFromFileDescriptor::seek(off_t offset, int whence) file_offset_of_buffer_end = seek_pos; bytes_to_ignore = new_pos - seek_pos; - assert(bytes_to_ignore < internal_buffer.size()); + if (bytes_to_ignore >= internal_buffer.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error in AsynchronousReadBufferFromFileDescriptor, bytes_to_ignore ({}) >= internal_buffer.size() ({})", bytes_to_ignore, internal_buffer.size()); return seek_pos; } @@ -239,4 +244,9 @@ void AsynchronousReadBufferFromFileDescriptor::rewind() file_offset_of_buffer_end = 0; } +size_t AsynchronousReadBufferFromFileDescriptor::getFileSize() +{ + return getSizeFromFileDescriptor(fd, getFileName()); +} + } diff --git a/src/IO/AsynchronousReadBufferFromFileDescriptor.h b/src/IO/AsynchronousReadBufferFromFileDescriptor.h index e5b79ac9503..7ba842997f4 100644 --- a/src/IO/AsynchronousReadBufferFromFileDescriptor.h +++ b/src/IO/AsynchronousReadBufferFromFileDescriptor.h @@ -64,9 +64,10 @@ public: /// Seek to the beginning, discarding already read data if any. Useful to reread file that changes on every read. void rewind(); + size_t getFileSize() override; + private: std::future asyncReadInto(char * data, size_t size); }; } - diff --git a/src/IO/CompressionMethod.cpp b/src/IO/CompressionMethod.cpp index fe4772948ad..0da235c074c 100644 --- a/src/IO/CompressionMethod.cpp +++ b/src/IO/CompressionMethod.cpp @@ -99,7 +99,7 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s } static std::unique_ptr createCompressedWrapper( - std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) { if (method == CompressionMethod::Gzip || method == CompressionMethod::Zlib) return std::make_unique(std::move(nested), method, buf_size, existing_memory, alignment); @@ -110,7 +110,7 @@ static std::unique_ptr createCompressedWrapper( if (method == CompressionMethod::Xz) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); if (method == CompressionMethod::Zstd) - return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); + return std::make_unique(std::move(nested), buf_size, existing_memory, alignment, zstd_window_log_max); if (method == CompressionMethod::Lz4) return std::make_unique(std::move(nested), buf_size, existing_memory, alignment); #if USE_BZIP2 @@ -126,14 +126,13 @@ static std::unique_ptr createCompressedWrapper( } std::unique_ptr wrapReadBufferWithCompressionMethod( - std::unique_ptr nested, CompressionMethod method, size_t buf_size, char * existing_memory, size_t alignment) + std::unique_ptr nested, CompressionMethod method, int zstd_window_log_max, size_t buf_size, char * existing_memory, size_t alignment) { if (method == CompressionMethod::None) return nested; - return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment); + return createCompressedWrapper(std::move(nested), method, buf_size, existing_memory, alignment, zstd_window_log_max); } - std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, int level, size_t buf_size, char * existing_memory, size_t alignment) { diff --git a/src/IO/CompressionMethod.h b/src/IO/CompressionMethod.h index 3953ba9d212..a399a756c13 100644 --- a/src/IO/CompressionMethod.h +++ 
b/src/IO/CompressionMethod.h @@ -5,7 +5,6 @@ #include - namespace DB { class ReadBuffer; @@ -50,10 +49,12 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s std::unique_ptr wrapReadBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, + int zstd_window_log_max = 0, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0); + std::unique_ptr wrapWriteBufferWithCompressionMethod( std::unique_ptr nested, CompressionMethod method, diff --git a/src/IO/ConcatSeekableReadBuffer.h b/src/IO/ConcatSeekableReadBuffer.h index fd9417cef8a..5d7dca82524 100644 --- a/src/IO/ConcatSeekableReadBuffer.h +++ b/src/IO/ConcatSeekableReadBuffer.h @@ -21,7 +21,7 @@ public: off_t seek(off_t off, int whence) override; off_t getPosition() override; - std::optional getFileSize() override { return total_size; } + size_t getFileSize() override { return total_size; } private: bool nextImpl() override; diff --git a/src/IO/MMapReadBufferFromFileDescriptor.cpp b/src/IO/MMapReadBufferFromFileDescriptor.cpp index 463252ca78d..5a636971fa0 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -86,4 +87,8 @@ off_t MMapReadBufferFromFileDescriptor::seek(off_t offset, int whence) return new_pos; } +size_t MMapReadBufferFromFileDescriptor::getFileSize() +{ + return getSizeFromFileDescriptor(getFD(), getFileName()); +} } diff --git a/src/IO/MMapReadBufferFromFileDescriptor.h b/src/IO/MMapReadBufferFromFileDescriptor.h index 1715c2200fb..1a4bcd4f3ed 100644 --- a/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/src/IO/MMapReadBufferFromFileDescriptor.h @@ -33,9 +33,12 @@ public: void finish(); off_t getPosition() override; + std::string getFileName() const override; + int getFD() const; + + size_t getFileSize() override; }; } - diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 926d10bda5b..e7bb3dc72a8 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -150,7 +150,7 @@ off_t ParallelReadBuffer::seek(off_t offset, int whence) return offset; } -std::optional ParallelReadBuffer::getFileSize() +size_t ParallelReadBuffer::getFileSize() { return reader_factory->getFileSize(); } diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 83b978848f8..9881d463ed4 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -43,7 +43,7 @@ public: ~ParallelReadBuffer() override { finishAndWait(); } off_t seek(off_t off, int whence) override; - std::optional getFileSize(); + size_t getFileSize(); off_t getPosition() override; const ReadBufferFactory & getReadBufferFactory() const { return *reader_factory; } diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index fd05ec67114..8d697710081 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -52,29 +52,6 @@ public: // FIXME: behavior differs greately from `BufferBase::set()` and it's very confusing. void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); working_buffer.resize(0); } - /// Set buffer to given piece of memory but with certain bytes ignored from beginning. - /// - /// internal_buffer: |__________________| - /// working_buffer: |xxxxx|____________| - /// ^ ^ - /// bytes_to_ignore - /// - /// It's used for lazy seek. We also have another lazy seek mechanism that uses - /// `nextimpl_working_buffer_offset` to set offset in `next` method. 
It's important that we - /// don't do double lazy seek, which means `nextimpl_working_buffer_offset` should be zero. It's - /// useful to keep internal_buffer points to the real span of the underlying memory, because its - /// size might be used to allocate other buffers. It's also important to have pos starts at - /// working_buffer.begin(), because some buffers assume this condition to be true and uses - /// offset() to check read bytes. - void setWithBytesToIgnore(Position ptr, size_t size, size_t bytes_to_ignore) - { - assert(bytes_to_ignore < size); - assert(nextimpl_working_buffer_offset == 0); - internal_buffer = Buffer(ptr, ptr + size); - working_buffer = Buffer(ptr + bytes_to_ignore, ptr + size); - pos = ptr + bytes_to_ignore; - } - /** read next data and fill a buffer with it; set position to the beginning; * return `false` in case of end, `true` otherwise; throw an exception, if something is wrong */ diff --git a/src/IO/ReadBufferFromEmptyFile.h b/src/IO/ReadBufferFromEmptyFile.h index 0a14c07dd5c..f21f2f507dc 100644 --- a/src/IO/ReadBufferFromEmptyFile.h +++ b/src/IO/ReadBufferFromEmptyFile.h @@ -19,6 +19,7 @@ private: std::string getFileName() const override { return ""; } off_t seek(off_t /*off*/, int /*whence*/) override { return 0; } off_t getPosition() override { return 0; } + size_t getFileSize() override { return 0; } }; } diff --git a/src/IO/ReadBufferFromEncryptedFile.h b/src/IO/ReadBufferFromEncryptedFile.h index cacd194c336..2b9464114c9 100644 --- a/src/IO/ReadBufferFromEncryptedFile.h +++ b/src/IO/ReadBufferFromEncryptedFile.h @@ -30,6 +30,8 @@ public: void setReadUntilEnd() override { in->setReadUntilEnd(); } + size_t getFileSize() override { return in->getFileSize(); } + private: bool nextImpl() override; diff --git a/src/IO/ReadBufferFromFileBase.cpp b/src/IO/ReadBufferFromFileBase.cpp index 4db64755abf..1152804b770 100644 --- a/src/IO/ReadBufferFromFileBase.cpp +++ b/src/IO/ReadBufferFromFileBase.cpp @@ -3,6 +3,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int UNKNOWN_FILE_SIZE; +} + ReadBufferFromFileBase::ReadBufferFromFileBase() : BufferWithOwnMemory(0) { } @@ -19,4 +24,11 @@ ReadBufferFromFileBase::ReadBufferFromFileBase( ReadBufferFromFileBase::~ReadBufferFromFileBase() = default; +size_t ReadBufferFromFileBase::getFileSize() +{ + if (file_size) + return *file_size; + throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for read buffer"); +} + } diff --git a/src/IO/ReadBufferFromFileBase.h b/src/IO/ReadBufferFromFileBase.h index b076510a0d5..d28be034eb5 100644 --- a/src/IO/ReadBufferFromFileBase.h +++ b/src/IO/ReadBufferFromFileBase.h @@ -20,7 +20,8 @@ namespace DB { -class ReadBufferFromFileBase : public BufferWithOwnMemory, public WithFileName + +class ReadBufferFromFileBase : public BufferWithOwnMemory, public WithFileName, public WithFileSize { public: ReadBufferFromFileBase(); @@ -48,6 +49,8 @@ public: clock_type = clock_type_; } + size_t getFileSize() override; + protected: std::optional file_size; ProfileCallback profile_callback; diff --git a/src/IO/ReadBufferFromFileDecorator.cpp b/src/IO/ReadBufferFromFileDecorator.cpp index f4a996fc278..6e803586cd6 100644 --- a/src/IO/ReadBufferFromFileDecorator.cpp +++ b/src/IO/ReadBufferFromFileDecorator.cpp @@ -53,4 +53,9 @@ bool ReadBufferFromFileDecorator::nextImpl() return result; } +size_t ReadBufferFromFileDecorator::getFileSize() +{ + return getFileSizeFromReadBuffer(*impl); +} + } diff --git a/src/IO/ReadBufferFromFileDecorator.h b/src/IO/ReadBufferFromFileDecorator.h 
index 4b12bf96c26..6e62c7f741b 100644 --- a/src/IO/ReadBufferFromFileDecorator.h +++ b/src/IO/ReadBufferFromFileDecorator.h @@ -27,6 +27,8 @@ public: ReadBuffer & getWrappedReadBuffer() { return *impl; } + size_t getFileSize() override; + protected: std::unique_ptr impl; String file_name; diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index 3f6d11eb9e2..406b519df79 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -38,7 +39,6 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; extern const int CANNOT_SEEK_THROUGH_FILE; extern const int CANNOT_SELECT; - extern const int CANNOT_FSTAT; extern const int CANNOT_ADVISE; } @@ -249,13 +249,9 @@ bool ReadBufferFromFileDescriptor::poll(size_t timeout_microseconds) } -off_t ReadBufferFromFileDescriptor::size() +size_t ReadBufferFromFileDescriptor::getFileSize() { - struct stat buf; - int res = fstat(fd, &buf); - if (-1 == res) - throwFromErrnoWithPath("Cannot execute fstat " + getFileName(), getFileName(), ErrorCodes::CANNOT_FSTAT); - return buf.st_size; + return getSizeFromFileDescriptor(fd, getFileName()); } diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index a1d19c08087..96961e5a451 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -57,7 +57,7 @@ public: /// Seek to the beginning, discarding already read data if any. Useful to reread file that changes on every read. void rewind(); - off_t size(); + size_t getFileSize() override; void setProgressCallback(ContextPtr context); diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 1a358f79a8b..53831e02cb2 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -222,20 +222,15 @@ off_t ReadBufferFromS3::seek(off_t offset_, int whence) return offset; } -std::optional ReadBufferFromS3::getFileSize() +size_t ReadBufferFromS3::getFileSize() { if (file_size) - return file_size; + return *file_size; auto object_size = S3::getObjectSize(client_ptr, bucket, key, version_id, false); - if (!object_size) - { - return std::nullopt; - } - file_size = object_size; - return file_size; + return *file_size; } off_t ReadBufferFromS3::getPosition() @@ -339,7 +334,7 @@ off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence) return off; } -std::optional ReadBufferS3Factory::getFileSize() +size_t ReadBufferS3Factory::getFileSize() { return object_size; } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index c5f72c7414f..7e6d408ec9f 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -65,7 +65,7 @@ public: off_t getPosition() override; - std::optional getFileSize() override; + size_t getFileSize() override; void setReadUntilPosition(size_t position) override; @@ -120,7 +120,7 @@ public: off_t seek(off_t off, [[maybe_unused]] int whence) override; - std::optional getFileSize() override; + size_t getFileSize() override; String getFileName() const override { return bucket + "/" + key; } diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 2bb56bfe4fa..c450ffe1747 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -199,7 +199,7 @@ namespace detail } } - std::optional getFileSize() override + size_t getFileSize() override { if (read_range.end) return *read_range.end - getRangeBegin(); @@ -221,7 
+221,7 @@ namespace detail if (response.hasContentLength()) read_range.end = getRangeBegin() + response.getContentLength(); - return read_range.end; + return *read_range.end; } String getFileName() const override { return uri.toString(); } @@ -749,7 +749,7 @@ public: return off; } - std::optional getFileSize() override { return total_object_size; } + size_t getFileSize() override { return total_object_size; } String getFileName() const override { return uri.toString(); } diff --git a/src/IO/WithFileSize.cpp b/src/IO/WithFileSize.cpp index c05a32291e3..28542db7a73 100644 --- a/src/IO/WithFileSize.cpp +++ b/src/IO/WithFileSize.cpp @@ -7,18 +7,23 @@ namespace DB { +namespace ErrorCodes +{ + extern const int UNKNOWN_FILE_SIZE; +} + template -static std::optional getFileSize(T & in) +static size_t getFileSize(T & in) { if (auto * with_file_size = dynamic_cast(&in)) { return with_file_size->getFileSize(); } - return std::nullopt; + throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size"); } -std::optional getFileSizeFromReadBuffer(ReadBuffer & in) +size_t getFileSizeFromReadBuffer(ReadBuffer & in) { if (auto * delegate = dynamic_cast(&in)) { diff --git a/src/IO/WithFileSize.h b/src/IO/WithFileSize.h index b0d0517b23a..060626faed2 100644 --- a/src/IO/WithFileSize.h +++ b/src/IO/WithFileSize.h @@ -10,12 +10,12 @@ class ReadBuffer; class WithFileSize { public: - virtual std::optional getFileSize() = 0; + virtual size_t getFileSize() = 0; virtual ~WithFileSize() = default; }; bool isBufferWithFileSize(const ReadBuffer & in); -std::optional getFileSizeFromReadBuffer(ReadBuffer & in); +size_t getFileSizeFromReadBuffer(ReadBuffer & in); } diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index a21c15820cb..432304d6d5d 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -3,7 +3,7 @@ #if USE_AWS_S3 #include -#include +#include #include #include @@ -95,7 +95,7 @@ void WriteBufferFromS3::nextImpl() { auto cache_key = cache->hash(key); - file_segments_holder.emplace(cache->setDownloading(cache_key, current_download_offset, size)); + file_segments_holder.emplace(cache->setDownloading(cache_key, current_download_offset, size, /* is_persistent */false)); current_download_offset += size; size_t remaining_size = size; @@ -294,7 +294,7 @@ void WriteBufferFromS3::writePart() ++num_finished_bg_tasks; /// Notification under mutex is important here. - /// Othervies, WriteBuffer could be destroyed in between + /// Otherwise, WriteBuffer could be destroyed in between /// Releasing lock and condvar notification. 
bg_tasks_condvar.notify_one(); } diff --git a/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp b/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp index 85da576b417..459f486af18 100644 --- a/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingAppendableWriteBuffer.cpp @@ -179,7 +179,7 @@ void ZstdDeflatingAppendableWriteBuffer::addEmptyBlock() bool ZstdDeflatingAppendableWriteBuffer::isNeedToAddEmptyBlock() { ReadBufferFromFile reader(out->getFileName()); - auto fsize = reader.size(); + auto fsize = reader.getFileSize(); if (fsize > 3) { std::array result; diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 712ea6960ef..0d026cdab9a 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -8,7 +8,7 @@ namespace ErrorCodes extern const int ZSTD_DECODER_FAILED; } -ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment) +ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_, size_t buf_size, char * existing_memory, size_t alignment, int zstd_window_log_max) : CompressedReadBufferWrapper(std::move(in_), buf_size, existing_memory, alignment) { dctx = ZSTD_createDCtx(); @@ -19,6 +19,12 @@ ZstdInflatingReadBuffer::ZstdInflatingReadBuffer(std::unique_ptr in_ { throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: zstd version: {}", ZSTD_VERSION_STRING); } + + size_t ret = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, zstd_window_log_max); + if (ZSTD_isError(ret)) + { + throw Exception(ErrorCodes::ZSTD_DECODER_FAILED, "zstd_stream_decoder init failed: {}", ZSTD_getErrorName(ret)); + } } ZstdInflatingReadBuffer::~ZstdInflatingReadBuffer() diff --git a/src/IO/ZstdInflatingReadBuffer.h b/src/IO/ZstdInflatingReadBuffer.h index a0c20b79d80..faa6231d4e2 100644 --- a/src/IO/ZstdInflatingReadBuffer.h +++ b/src/IO/ZstdInflatingReadBuffer.h @@ -20,7 +20,8 @@ public: std::unique_ptr in_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0); + size_t alignment = 0, + int zstd_window_log_max = 0); ~ZstdInflatingReadBuffer() override; diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index f9d18cda777..e25a6260787 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -467,6 +467,15 @@ public: } bool contains(const std::string & name) const { return map.contains(name); } + + std::vector getAllNames() const + { + std::vector result; + result.reserve(map.size()); + for (auto const & e : map) + result.emplace_back(e.first); + return result; + } }; ActionsMatcher::Data::Data( @@ -481,7 +490,8 @@ ActionsMatcher::Data::Data( bool no_makeset_, bool only_consts_, bool create_source_for_in_, - AggregationKeysInfo aggregation_keys_info_) + AggregationKeysInfo aggregation_keys_info_, + bool build_expression_with_window_functions_) : WithContext(context_) , set_size_limit(set_size_limit_) , subquery_depth(subquery_depth_) @@ -495,6 +505,7 @@ ActionsMatcher::Data::Data( , visit_depth(0) , actions_stack(std::move(actions_dag), context_) , aggregation_keys_info(aggregation_keys_info_) + , build_expression_with_window_functions(build_expression_with_window_functions_) , next_unique_suffix(actions_stack.getLastActions().getIndex().size() + 1) { } @@ -504,6 +515,12 @@ bool ActionsMatcher::Data::hasColumn(const String & column_name) const return actions_stack.getLastActionsIndex().contains(column_name); } 
+std::vector ActionsMatcher::Data::getAllColumnNames() const +{ + const auto & index = actions_stack.getLastActionsIndex(); + return index.getAllNames(); +} + ScopeStack::ScopeStack(ActionsDAGPtr actions_dag, ContextPtr context_) : WithContext(context_) { auto & level = stack.emplace_back(); @@ -803,8 +820,9 @@ void ActionsMatcher::visit(const ASTIdentifier & identifier, const ASTPtr &, Dat { if (column_name_type.name == column_name) { - throw Exception("Column " + backQuote(column_name) + " is not under aggregate function and not in GROUP BY", - ErrorCodes::NOT_AN_AGGREGATE); + throw Exception(ErrorCodes::NOT_AN_AGGREGATE, + "Column {} is not under aggregate function and not in GROUP BY. Have columns: {}", + backQuote(column_name), toString(data.getAllColumnNames())); } } @@ -921,6 +939,7 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & return; } + // Now we need to correctly process window functions and any expression which depend on them. if (node.is_window_function) { // Also add columns from PARTITION BY and ORDER BY of window functions. @@ -928,7 +947,6 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & { visit(node.window_definition, data); } - // Also manually add columns for arguments of the window function itself. // ActionVisitor is written in such a way that this method must itself // descend into all needed function children. Window functions can't have @@ -945,12 +963,45 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & // Don't need to do anything more for window functions here -- the // resulting column is added in ExpressionAnalyzer, similar to the // aggregate functions. + if (data.window_dependancy_state == WindowDependancyState::MAY_DEPEND) + data.window_function_in_subtree = true; return; } + else if (node.compute_after_window_functions) + { + // In this case we have window function call in subtree + // Add this function to actions index only if Data::build_expression_with_window_functions is set. + data.window_dependancy_state = WindowDependancyState::MAY_DEPEND; + for (const auto & arg : node.arguments->children) + { + data.window_function_in_subtree = false; + visit(arg, data); + // There is no point to check value of window_function_in_subtree here, + // because after window functions are computed, this variable is always false. + } + data.window_dependancy_state = WindowDependancyState::NONE; + if (!data.build_expression_with_window_functions) + return; + } + else if (data.window_dependancy_state == WindowDependancyState::MAY_DEPEND) + { + // This function may depend on evaluation of window function. + // We need to check it and add it to the index only if Data::build_expression_with_window_functions is set. + bool subtree_contains_window_call = false; + for (const auto & arg : node.arguments->children) + { + data.window_function_in_subtree = false; + visit(arg, data); + subtree_contains_window_call = subtree_contains_window_call || data.window_function_in_subtree; + } + data.window_function_in_subtree = subtree_contains_window_call; + if (subtree_contains_window_call && !data.build_expression_with_window_functions) + return; + } // An aggregate function can also be calculated as a window function, but we // checked for it above, so no need to do anything more. 
- if (AggregateFunctionFactory::instance().isAggregateFunctionName(node.name)) + if (AggregateUtils::isAggregateFunction(node)) return; FunctionOverloadResolverPtr function_builder; diff --git a/src/Interpreters/ActionsVisitor.h b/src/Interpreters/ActionsVisitor.h index 5a74124192c..afdf2948d47 100644 --- a/src/Interpreters/ActionsVisitor.h +++ b/src/Interpreters/ActionsVisitor.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -120,6 +121,12 @@ class ActionsMatcher public: using Visitor = ConstInDepthNodeVisitor; + enum class WindowDependancyState + { + NONE, + MAY_DEPEND, + }; + struct Data : public WithContext { SizeLimits set_size_limit; @@ -134,6 +141,7 @@ public: size_t visit_depth; ScopeStack actions_stack; AggregationKeysInfo aggregation_keys_info; + bool build_expression_with_window_functions; /* * Remember the last unique column suffix to avoid quadratic behavior @@ -142,6 +150,9 @@ public: */ int next_unique_suffix; + WindowDependancyState window_dependancy_state = WindowDependancyState::NONE; + bool window_function_in_subtree = false; + Data( ContextPtr context_, SizeLimits set_size_limit_, @@ -154,10 +165,13 @@ public: bool no_makeset_, bool only_consts_, bool create_source_for_in_, - AggregationKeysInfo aggregation_keys_info_); + AggregationKeysInfo aggregation_keys_info_, + bool build_expression_with_window_functions_ = false); /// Does result of the calculation already exists in the block. bool hasColumn(const String & column_name) const; + std::vector getAllColumnNames() const; + void addColumn(ColumnWithTypeAndName column) { actions_stack.addColumn(std::move(column)); diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 1f01f1091e2..8a93dc5fd77 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -408,6 +408,29 @@ Block Aggregator::Params::getHeader( return materializeBlock(res); } +ColumnRawPtrs Aggregator::Params::makeRawKeyColumns(const Block & block) const +{ + ColumnRawPtrs key_columns(keys_size); + + for (size_t i = 0; i < keys_size; ++i) + key_columns[i] = block.safeGetByPosition(i).column.get(); + + return key_columns; +} + +Aggregator::AggregateColumnsConstData Aggregator::Params::makeAggregateColumnsData(const Block & block) const +{ + AggregateColumnsConstData aggregate_columns(aggregates_size); + + for (size_t i = 0; i < aggregates_size; ++i) + { + const auto & aggregate_column_name = aggregates[i].column_name; + aggregate_columns[i] = &typeid_cast(*block.getByName(aggregate_column_name).column).getData(); + } + + return aggregate_columns; +} + void Aggregator::Params::explain(WriteBuffer & out, size_t indent) const { Strings res; @@ -865,6 +888,38 @@ void Aggregator::executeOnBlockSmall( executeImpl(result, row_begin, row_end, key_columns, aggregate_instructions); } +void Aggregator::mergeOnBlockSmall( + AggregatedDataVariants & result, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const +{ + /// `result` will destroy the states of aggregate functions in the destructor + result.aggregator = this; + + /// How to perform the aggregation? 
+ if (result.empty()) + { + initDataVariantsWithSizeHint(result, method_chosen, params); + result.keys_size = params.keys_size; + result.key_sizes = key_sizes; + } + + if (false) {} // NOLINT +#define M(NAME, IS_TWO_LEVEL) \ + else if (result.type == AggregatedDataVariants::Type::NAME) \ + mergeStreamsImpl(result.aggregates_pool, *result.NAME, result.NAME->data, \ + result.without_key, /* no_more_keys= */ false, \ + row_begin, row_end, \ + aggregate_columns_data, key_columns); + + APPLY_FOR_AGGREGATED_VARIANTS(M) +#undef M + else + throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); +} + void Aggregator::executeImpl( AggregatedDataVariants & result, size_t row_begin, @@ -1181,8 +1236,7 @@ void NO_INLINE Aggregator::executeOnIntervalWithoutKeyImpl( AggregatedDataVariants & data_variants, size_t row_begin, size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const + AggregateFunctionInstruction * aggregate_instructions) const { /// `data_variants` will destroy the states of aggregate functions in the destructor data_variants.aggregator = this; @@ -1198,17 +1252,30 @@ void NO_INLINE Aggregator::executeOnIntervalWithoutKeyImpl( inst->offsets[static_cast(row_begin) - 1], inst->offsets[row_end - 1], res + inst->state_offset, - inst->batch_arguments, arena); + inst->batch_arguments, data_variants.aggregates_pool); else inst->batch_that->addBatchSinglePlaceFromInterval( row_begin, row_end, res + inst->state_offset, inst->batch_arguments, - arena); + data_variants.aggregates_pool); } } +void NO_INLINE Aggregator::mergeOnIntervalWithoutKeyImpl( + AggregatedDataVariants & data_variants, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data) const +{ + /// `data_variants` will destroy the states of aggregate functions in the destructor + data_variants.aggregator = this; + data_variants.init(AggregatedDataVariants::Type::without_key); + + mergeWithoutKeyStreamsImpl(data_variants, row_begin, row_end, aggregate_columns_data); +} + void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns & aggregate_columns, Columns & materialized_columns, AggregateFunctionInstructions & aggregate_functions_instructions, NestedColumnsHolder & nested_columns_holder) const @@ -2569,33 +2636,20 @@ ManyAggregatedDataVariants Aggregator::prepareVariantsToMerge(ManyAggregatedData template void NO_INLINE Aggregator::mergeStreamsImplCase( - Block & block, Arena * aggregates_pool, Method & method [[maybe_unused]], Table & data, - AggregateDataPtr overflow_row) const + AggregateDataPtr overflow_row, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const { - ColumnRawPtrs key_columns(params.keys_size); - AggregateColumnsConstData aggregate_columns(params.aggregates_size); - - /// Remember the columns we will work with - for (size_t i = 0; i < params.keys_size; ++i) - key_columns[i] = block.safeGetByPosition(i).column.get(); - - for (size_t i = 0; i < params.aggregates_size; ++i) - { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = &typeid_cast(*block.getByName(aggregate_column_name).column).getData(); - } - typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - /// For all rows. 
- size_t rows = block.rows(); + std::unique_ptr places(new AggregateDataPtr[row_end]); - std::unique_ptr places(new AggregateDataPtr[rows]); - - for (size_t i = 0; i < rows; ++i) + for (size_t i = row_begin; i < row_end; ++i) { AggregateDataPtr aggregate_data = nullptr; @@ -2631,45 +2685,69 @@ void NO_INLINE Aggregator::mergeStreamsImplCase( { /// Merge state of aggregate functions. aggregate_functions[j]->mergeBatch( - 0, rows, + row_begin, row_end, places.get(), offsets_of_aggregate_states[j], - aggregate_columns[j]->data(), + aggregate_columns_data[j]->data(), aggregates_pool); } - - /// Early release memory. - block.clear(); } template void NO_INLINE Aggregator::mergeStreamsImpl( - Block & block, + Block block, Arena * aggregates_pool, Method & method, Table & data, AggregateDataPtr overflow_row, bool no_more_keys) const +{ + const AggregateColumnsConstData & aggregate_columns_data = params.makeAggregateColumnsData(block); + const ColumnRawPtrs & key_columns = params.makeRawKeyColumns(block); + + mergeStreamsImpl( + aggregates_pool, + method, + data, + overflow_row, + no_more_keys, + 0, + block.rows(), + aggregate_columns_data, + key_columns); +} + +template +void NO_INLINE Aggregator::mergeStreamsImpl( + Arena * aggregates_pool, + Method & method, + Table & data, + AggregateDataPtr overflow_row, + bool no_more_keys, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const { if (!no_more_keys) - mergeStreamsImplCase(block, aggregates_pool, method, data, overflow_row); + mergeStreamsImplCase(aggregates_pool, method, data, overflow_row, row_begin, row_end, aggregate_columns_data, key_columns); else - mergeStreamsImplCase(block, aggregates_pool, method, data, overflow_row); + mergeStreamsImplCase(aggregates_pool, method, data, overflow_row, row_begin, row_end, aggregate_columns_data, key_columns); } -void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl( - Block & block, +void NO_INLINE Aggregator::mergeBlockWithoutKeyStreamsImpl( + Block block, AggregatedDataVariants & result) const { - AggregateColumnsConstData aggregate_columns(params.aggregates_size); - - /// Remember the columns we will work with - for (size_t i = 0; i < params.aggregates_size; ++i) - { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = &typeid_cast(*block.getByName(aggregate_column_name).column).getData(); - } - + AggregateColumnsConstData aggregate_columns = params.makeAggregateColumnsData(block); + mergeWithoutKeyStreamsImpl(result, 0, block.rows(), aggregate_columns); +} +void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl( + AggregatedDataVariants & result, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data) const +{ AggregatedDataWithoutKey & res = result.without_key; if (!res) { @@ -2678,17 +2756,15 @@ void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl( res = place; } - for (size_t row = 0, rows = block.rows(); row < rows; ++row) + for (size_t row = row_begin; row < row_end; ++row) { /// Adding Values for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i], (*aggregate_columns[i])[row], result.aggregates_pool); + aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i], (*aggregate_columns_data[i])[row], result.aggregates_pool); } - - /// Early release memory. 
- block.clear(); } + bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const { /// `result` will destroy the states of aggregate functions in the destructor @@ -2704,11 +2780,10 @@ bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool } if (result.type == AggregatedDataVariants::Type::without_key || block.info.is_overflows) - mergeWithoutKeyStreamsImpl(block, result); - + mergeBlockWithoutKeyStreamsImpl(std::move(block), result); #define M(NAME, IS_TWO_LEVEL) \ else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); + mergeStreamsImpl(std::move(block), result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M @@ -2824,7 +2899,7 @@ void Aggregator::mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVari { #define M(NAME) \ else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, aggregates_pool, *result.NAME, result.NAME->data.impls[bucket], nullptr, false); + mergeStreamsImpl(std::move(block), aggregates_pool, *result.NAME, result.NAME->data.impls[bucket], nullptr, false); if (false) {} // NOLINT APPLY_FOR_VARIANTS_TWO_LEVEL(M) @@ -2875,11 +2950,11 @@ void Aggregator::mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVari break; if (result.type == AggregatedDataVariants::Type::without_key || block.info.is_overflows) - mergeWithoutKeyStreamsImpl(block, result); + mergeBlockWithoutKeyStreamsImpl(std::move(block), result); #define M(NAME, IS_TWO_LEVEL) \ else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); + mergeStreamsImpl(std::move(block), result.aggregates_pool, *result.NAME, result.NAME->data, result.without_key, no_more_keys); APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M @@ -2942,11 +3017,11 @@ Block Aggregator::mergeBlocks(BlocksList & blocks, bool final) bucket_num = -1; if (result.type == AggregatedDataVariants::Type::without_key || is_overflows) - mergeWithoutKeyStreamsImpl(block, result); + mergeBlockWithoutKeyStreamsImpl(std::move(block), result); #define M(NAME, IS_TWO_LEVEL) \ else if (result.type == AggregatedDataVariants::Type::NAME) \ - mergeStreamsImpl(block, result.aggregates_pool, *result.NAME, result.NAME->data, nullptr, false); + mergeStreamsImpl(std::move(block), result.aggregates_pool, *result.NAME, result.NAME->data, nullptr, false); APPLY_FOR_AGGREGATED_VARIANTS(M) #undef M diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 1806465db4a..475fcd9e249 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -890,6 +890,11 @@ class NativeWriter; class Aggregator final { public: + using AggregateColumns = std::vector; + using AggregateColumnsData = std::vector; + using AggregateColumnsConstData = std::vector; + using AggregateFunctionsPlainPtrs = std::vector; + struct Params { /// Data structure of source blocks. 
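A minimal sketch of how the new Params helpers can be driven, assuming a block that is already in the intermediate aggregation format (key columns first, then one AggregateFunction state column per aggregate); the function and variable names below are illustrative only:

#include <Interpreters/Aggregator.h>

using namespace DB;

/// `params` describes the aggregation; `block` holds partially aggregated data.
void inspectIntermediateBlock(const Aggregator::Params & params, const Block & block)
{
    /// Raw pointers to the first params.keys_size columns of the block.
    ColumnRawPtrs key_columns = params.makeRawKeyColumns(block);

    /// Pointers to the aggregate-state containers, looked up by the aggregates' column names.
    Aggregator::AggregateColumnsConstData aggregate_columns = params.makeAggregateColumnsData(block);

    /// With these, a caller can merge any row range [row_begin, row_end) of the block,
    /// which is what the new mergeOnBlockSmall / mergeStreamsImpl overloads do internally.
    (void)key_columns;
    (void)aggregate_columns;
}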
@@ -1015,6 +1020,10 @@ public: return getHeader(src_header, intermediate_header, keys, aggregates, final); } + /// Remember the columns we will work with + ColumnRawPtrs makeRawKeyColumns(const Block & block) const; + AggregateColumnsConstData makeAggregateColumnsData(const Block & block) const; + /// Returns keys and aggregated for EXPLAIN query void explain(WriteBuffer & out, size_t indent) const; void explain(JSONBuilder::JSONMap & map) const; @@ -1022,11 +1031,6 @@ public: explicit Aggregator(const Params & params_); - using AggregateColumns = std::vector; - using AggregateColumnsData = std::vector; - using AggregateColumnsConstData = std::vector; - using AggregateFunctionsPlainPtrs = std::vector; - /// Process one block. Return false if the processing should be aborted (with group_by_overflow_mode = 'break'). bool executeOnBlock(const Block & block, AggregatedDataVariants & result, @@ -1181,6 +1185,14 @@ private: size_t row_end, ColumnRawPtrs & key_columns, AggregateFunctionInstruction * aggregate_instructions) const; + void mergeOnBlockSmall( + AggregatedDataVariants & result, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const; + + void mergeOnBlockImpl(Block block, AggregatedDataVariants & result, bool no_more_keys) const; void executeImpl( AggregatedDataVariants & result, @@ -1227,8 +1239,12 @@ private: AggregatedDataVariants & data_variants, size_t row_begin, size_t row_end, - AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const; + AggregateFunctionInstruction * aggregate_instructions) const; + void mergeOnIntervalWithoutKeyImpl( + AggregatedDataVariants & data_variants, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data) const; template void writeToTemporaryFileImpl( @@ -1338,24 +1354,43 @@ private: template void mergeStreamsImplCase( - Block & block, Arena * aggregates_pool, Method & method, Table & data, - AggregateDataPtr overflow_row) const; + AggregateDataPtr overflow_row, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const; template void mergeStreamsImpl( - Block & block, + Block block, Arena * aggregates_pool, Method & method, Table & data, AggregateDataPtr overflow_row, bool no_more_keys) const; + template + void mergeStreamsImpl( + Arena * aggregates_pool, + Method & method, + Table & data, + AggregateDataPtr overflow_row, + bool no_more_keys, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data, + const ColumnRawPtrs & key_columns) const; - void mergeWithoutKeyStreamsImpl( - Block & block, + void mergeBlockWithoutKeyStreamsImpl( + Block block, AggregatedDataVariants & result) const; + void mergeWithoutKeyStreamsImpl( + AggregatedDataVariants & result, + size_t row_begin, + size_t row_end, + const AggregateColumnsConstData & aggregate_columns_data) const; template void mergeBucketImpl( diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 649c6b4e4ab..4ac5acfd60f 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/Cluster.cpp b/src/Interpreters/Cluster.cpp index 54f55c7b1f6..ddf3b46f102 100644 --- a/src/Interpreters/Cluster.cpp +++ b/src/Interpreters/Cluster.cpp @@ 
-27,6 +27,8 @@ namespace ErrorCodes extern const int SHARD_HAS_NO_CONNECTIONS; extern const int NO_ELEMENTS_IN_CONFIG; extern const int SYNTAX_ERROR; + extern const int INVALID_SHARD_ID; + extern const int NO_SUCH_REPLICA; } namespace @@ -749,6 +751,41 @@ std::vector Cluster::getHostIDs() const return host_ids; } +std::vector Cluster::filterAddressesByShardOrReplica(size_t only_shard_num, size_t only_replica_num) const +{ + std::vector res; + + auto enumerate_replicas = [&](size_t shard_index) + { + if (shard_index > addresses_with_failover.size()) + throw Exception(ErrorCodes::INVALID_SHARD_ID, "Cluster {} doesn't have shard #{}", name, shard_index); + const auto & replicas = addresses_with_failover[shard_index - 1]; + if (only_replica_num) + { + if (only_replica_num > replicas.size()) + throw Exception(ErrorCodes::NO_SUCH_REPLICA, "Cluster {} doesn't have replica #{} in shard #{}", name, only_replica_num, shard_index); + res.emplace_back(&replicas[only_replica_num - 1]); + } + else + { + for (const auto & addr : replicas) + res.emplace_back(&addr); + } + }; + + if (only_shard_num) + { + enumerate_replicas(only_shard_num); + } + else + { + for (size_t shard_index = 1; shard_index <= addresses_with_failover.size(); ++shard_index) + enumerate_replicas(shard_index); + } + + return res; +} + const std::string & Cluster::ShardInfo::insertPathForInternalReplication(bool prefer_localhost_replica, bool use_compact_format) const { if (!has_internal_replication) diff --git a/src/Interpreters/Cluster.h b/src/Interpreters/Cluster.h index 5ce011782fc..72958703d0e 100644 --- a/src/Interpreters/Cluster.h +++ b/src/Interpreters/Cluster.h @@ -215,6 +215,11 @@ public: const ShardsInfo & getShardsInfo() const { return shards_info; } const AddressesWithFailover & getShardsAddresses() const { return addresses_with_failover; } + /// Returns addresses of some replicas according to specified `only_shard_num` and `only_replica_num`. + /// `only_shard_num` is 1-based index of a shard, 0 means all shards. + /// `only_replica_num` is 1-based index of a replica, 0 means all replicas. + std::vector filterAddressesByShardOrReplica(size_t only_shard_num, size_t only_replica_num) const; + const ShardInfo & getAnyShardInfo() const { if (shards_info.empty()) diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index c5ca5748066..fa9444a7e66 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -205,6 +205,12 @@ void DatabaseCatalog::shutdownImpl() view_dependencies.clear(); } +bool DatabaseCatalog::isPredefinedDatabaseName(const std::string_view & database_name) +{ + return database_name == TEMPORARY_DATABASE || database_name == SYSTEM_DATABASE || database_name == INFORMATION_SCHEMA + || database_name == INFORMATION_SCHEMA_UPPERCASE; +} + DatabaseAndTable DatabaseCatalog::tryGetByUUID(const UUID & uuid) const { assert(uuid != UUIDHelpers::Nil && getFirstLevelIdx(uuid) < uuid_map.size()); diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 34b42a3397c..79ba4052038 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -122,11 +122,15 @@ class BackgroundSchedulePoolTaskHolder; class DatabaseCatalog : boost::noncopyable, WithMutableContext { public: + /// Names of predefined databases. 
static constexpr const char * TEMPORARY_DATABASE = "_temporary_and_external_tables"; static constexpr const char * SYSTEM_DATABASE = "system"; static constexpr const char * INFORMATION_SCHEMA = "information_schema"; static constexpr const char * INFORMATION_SCHEMA_UPPERCASE = "INFORMATION_SCHEMA"; + /// Returns true if a passed string is one of the predefined databases' names + static bool isPredefinedDatabaseName(const std::string_view & database_name); + static DatabaseCatalog & init(ContextMutablePtr global_context_); static DatabaseCatalog & instance(); static void shutdown(); diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 00333503db1..8d0c4dee023 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -137,7 +137,7 @@ bool checkPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_q { if (const auto * function = typeid_cast(node.get())) { - auto is_aggregate_function = AggregateFunctionFactory::instance().isAggregateFunctionName(function->name); + auto is_aggregate_function = AggregateUtils::isAggregateFunction(*function); if (is_aggregate_function) { throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, @@ -615,7 +615,6 @@ void ExpressionAnalyzer::getRootActions(const ASTPtr & ast, bool no_makeset_for_ actions = visitor_data.getActions(); } - void ExpressionAnalyzer::getRootActionsNoMakeSet(const ASTPtr & ast, ActionsDAGPtr & actions, bool only_consts) { LogAST log; @@ -659,6 +658,28 @@ void ExpressionAnalyzer::getRootActionsForHaving( } +void ExpressionAnalyzer::getRootActionsForWindowFunctions(const ASTPtr & ast, bool no_makeset_for_subqueries, ActionsDAGPtr & actions) +{ + LogAST log; + ActionsVisitor::Data visitor_data( + getContext(), + settings.size_limits_for_set, + subquery_depth, + sourceColumns(), + std::move(actions), + prepared_sets, + subqueries_for_sets, + no_makeset_for_subqueries, + false /* no_makeset */, + false /*only_consts */, + !isRemoteStorage() /* create_source_for_in */, + getAggregationKeysInfo(), + true); + ActionsVisitor(visitor_data, log.stream()).visit(ast); + actions = visitor_data.getActions(); +} + + void ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions, AggregateDescriptions & descriptions) { for (const ASTFunction * node : aggregates()) @@ -696,7 +717,7 @@ void ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions, Aggr } } -void makeWindowDescriptionFromAST(const Context & context, +void ExpressionAnalyzer::makeWindowDescriptionFromAST(const Context & context_, const WindowDescriptions & existing_descriptions, WindowDescription & desc, const IAST * ast) { @@ -765,6 +786,10 @@ void makeWindowDescriptionFromAST(const Context & context, desc.partition_by.push_back(SortColumnDescription( with_alias->getColumnName(), 1 /* direction */, 1 /* nulls_direction */)); + + auto actions_dag = std::make_shared(columns_after_join); + getRootActions(column_ast, false, actions_dag); + desc.partition_by_actions.push_back(std::move(actions_dag)); } } @@ -782,6 +807,10 @@ void makeWindowDescriptionFromAST(const Context & context, order_by_element.children.front()->getColumnName(), order_by_element.direction, order_by_element.nulls_direction)); + + auto actions_dag = std::make_shared(columns_after_join); + getRootActions(column_ast, false, actions_dag); + desc.order_by_actions.push_back(std::move(actions_dag)); } } @@ -808,14 +837,14 @@ void makeWindowDescriptionFromAST(const Context & context, if 
(definition.frame_end_type == WindowFrame::BoundaryType::Offset) { auto [value, _] = evaluateConstantExpression(definition.frame_end_offset, - context.shared_from_this()); + context_.shared_from_this()); desc.frame.end_offset = value; } if (definition.frame_begin_type == WindowFrame::BoundaryType::Offset) { auto [value, _] = evaluateConstantExpression(definition.frame_begin_offset, - context.shared_from_this()); + context_.shared_from_this()); desc.frame.begin_offset = value; } } @@ -895,7 +924,6 @@ void ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr actions) window_function.argument_types, window_function.function_parameters, properties); - // Find the window corresponding to this function. It may be either // referenced by name and previously defined in WINDOW clause, or it // may be defined inline. @@ -1388,6 +1416,15 @@ void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments( } } +void SelectQueryExpressionAnalyzer::appendExpressionsAfterWindowFunctions(ExpressionActionsChain & chain, bool /* only_types */) +{ + ExpressionActionsChain::Step & step = chain.lastStep(columns_after_window); + for (const auto & expression : syntax->expressions_with_window_function) + { + getRootActionsForWindowFunctions(expression->clone(), true, step.actions()); + } +} + bool SelectQueryExpressionAnalyzer::appendHaving(ExpressionActionsChain & chain, bool only_types) { const auto * select_query = getAggregatingQuery(); @@ -1415,7 +1452,7 @@ void SelectQueryExpressionAnalyzer::appendSelect(ExpressionActionsChain & chain, { if (const auto * function = typeid_cast(child.get()); function - && function->is_window_function) + && (function->is_window_function || function->compute_after_window_functions)) { // Skip window function columns here -- they are calculated after // other SELECT expressions by a special step. @@ -1891,6 +1928,12 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( before_window = chain.getLastActions(); finalize_chain(chain); + query_analyzer.appendExpressionsAfterWindowFunctions(chain, only_types || !first_stage); + for (const auto & x : chain.getLastActions()->getNamesAndTypesList()) + { + query_analyzer.columns_after_window.push_back(x); + } + auto & step = chain.lastStep(query_analyzer.columns_after_window); // The output of this expression chain is the result of diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 80c664832e5..278415f6429 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -140,6 +140,7 @@ public: /// A list of windows for window functions. const WindowDescriptions & windowDescriptions() const { return window_descriptions; } + void makeWindowDescriptionFromAST(const Context & context, const WindowDescriptions & existing_descriptions, WindowDescription & desc, const IAST * ast); void makeWindowDescriptions(ActionsDAGPtr actions); /** @@ -191,6 +192,8 @@ protected: void getRootActionsForHaving(const ASTPtr & ast, bool no_makeset_for_subqueries, ActionsDAGPtr & actions, bool only_consts = false); + void getRootActionsForWindowFunctions(const ASTPtr & ast, bool no_makeset_for_subqueries, ActionsDAGPtr & actions); + /** Add aggregation keys to aggregation_keys, aggregate functions to aggregate_descriptions, * Create a set of columns aggregated_columns resulting after the aggregation, if any, * or after all the actions that are normally performed before aggregation. 
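With these changes the SELECT list is produced in phases: ordinary expressions first, then window functions, then the expressions marked as depending on them (appended by appendExpressionsAfterWindowFunctions). A minimal sketch of the predicate that decides a column has to wait for the window step; the helper name is an assumption, not part of the patch:

#include <Parsers/ASTFunction.h>

/// True if `node` cannot be computed before the window step: it is either a window
/// function itself or was marked by the collector as depending on one.
static bool computedAfterWindowStep(const DB::ASTFunction & node)
{
    return node.is_window_function || node.compute_after_window_functions;
}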
@@ -406,6 +409,8 @@ private: void appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types); void appendWindowFunctionsArguments(ExpressionActionsChain & chain, bool only_types); + void appendExpressionsAfterWindowFunctions(ExpressionActionsChain & chain, bool only_types); + /// After aggregation: bool appendHaving(ExpressionActionsChain & chain, bool only_types); /// appendSelect diff --git a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp index 33829aceb31..8c7220a85da 100644 --- a/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp +++ b/src/Interpreters/ExternalUserDefinedExecutableFunctionsLoader.cpp @@ -148,6 +148,9 @@ ExternalLoader::LoadablePtr ExternalUserDefinedExecutableFunctionsLoader::create String command_value = config.getString(key_in_config + ".command"); std::vector parameters = extractParametersFromCommand(command_value); + if (!execute_direct && !parameters.empty()) + throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Parameters are not supported if executable user defined function is not direct"); + std::vector command_arguments; if (execute_direct) diff --git a/src/Interpreters/GetAggregatesVisitor.cpp b/src/Interpreters/GetAggregatesVisitor.cpp new file mode 100644 index 00000000000..a9d96a6d15a --- /dev/null +++ b/src/Interpreters/GetAggregatesVisitor.cpp @@ -0,0 +1,90 @@ +#include + +namespace DB +{ + +struct WindowExpressionsCollectorChildInfo +{ + void update(const WindowExpressionsCollectorChildInfo & other) + { + window_function_in_subtree = window_function_in_subtree || other.window_function_in_subtree; + } + + bool window_function_in_subtree = false; +}; + +// This visitor traverses the AST and collects the list of expressions which depend on +// the evaluation of window functions. An expression is collected only if +// it's not a part of another expression. +// +// Also, all collected AST nodes are marked as dependent on a window function. +// This information is used during the ActionsDAG building process.
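+//
+// For example, in `SELECT sum(x) OVER () + 1 FROM t` the top-level expression
+// `plus(sum(x) OVER (), 1)` is collected and marked with compute_after_window_functions,
+// while the window function itself is left for the window step to compute.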
+struct WindowExpressionsCollectorMatcher +{ + using ChildInfo = WindowExpressionsCollectorChildInfo; + + static bool needVisitChild(ASTPtr & node, const ASTPtr & child) + { + if (child->as() || child->as()) + return false; + if (auto * select = node->as()) + { + // We don't analyze the WITH statement because it might contain useless aggregates + if (child == select->with()) + return false; + } + // We process every expression manually + if (auto * func = node->as()) + return false; + return true; + } + + WindowExpressionsCollectorChildInfo visitNode( + ASTPtr & ast, + const ASTPtr & parent, + WindowExpressionsCollectorChildInfo const &) + { + return visitNode(ast, parent); + } + + WindowExpressionsCollectorChildInfo visitNode( + ASTPtr & ast, + const ASTPtr & parent) + { + if (auto * func = ast->as()) + { + if (func->is_window_function) + return { .window_function_in_subtree = true }; + + WindowExpressionsCollectorChildInfo result; + for (auto & arg : func->arguments->children) + { + auto subtree_result = visitNode(arg, ast); + result.update(subtree_result); + } + + // We mark functions only at the top of the AST + if ((!parent || !parent->as()) && result.window_function_in_subtree) + { + expressions_with_window_functions.push_back(func); + func->compute_after_window_functions = true; + } + + return result; + } + return {}; + } + + std::vector expressions_with_window_functions {}; +}; + +using WindowExpressionsCollectorVisitor = InDepthNodeVisitorWithChildInfo; + +std::vector getExpressionsWithWindowFunctions(ASTPtr & ast) +{ + WindowExpressionsCollectorVisitor visitor; + visitor.visit(ast); + return std::move(visitor.expressions_with_window_functions); +} + +} diff --git a/src/Interpreters/GetAggregatesVisitor.h b/src/Interpreters/GetAggregatesVisitor.h index 3966653235a..036d50ba4d6 100644 --- a/src/Interpreters/GetAggregatesVisitor.h +++ b/src/Interpreters/GetAggregatesVisitor.h @@ -95,9 +95,7 @@ private: { // Aggregate functions can also be calculated as window functions, but // here we are interested in aggregate functions calculated in GROUP BY. - return !node.is_window_function - && AggregateFunctionFactory::instance().isAggregateFunctionName( - node.name); + return !node.is_window_function && AggregateUtils::isAggregateFunction(node); } }; @@ -116,4 +114,6 @@ inline void assertNoAggregates(const ASTPtr & ast, const char * description) GetAggregatesVisitor(data).visit(ast); } +std::vector getExpressionsWithWindowFunctions(ASTPtr & ast); + } diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index fef741446bf..0cec83e964b 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -1003,15 +1003,13 @@ public: /// If it's joinGetOrNull, we need to wrap not-nullable columns in StorageJoin.
for (size_t j = 0, size = right_indexes.size(); j < size; ++j) { - auto column_from_block = block.getByPosition(right_indexes[j]); - if (type_name[j].type->lowCardinality() != column_from_block.type->lowCardinality()) - { - JoinCommon::changeLowCardinalityInplace(column_from_block); - } - + const auto & column_from_block = block.getByPosition(right_indexes[j]); if (auto * nullable_col = typeid_cast(columns[j].get()); nullable_col && !column_from_block.column->isNullable()) nullable_col->insertFromNotNullable(*column_from_block.column, row_num); + else if (auto * lowcard_col = typeid_cast(columns[j].get()); + lowcard_col && !typeid_cast(column_from_block.column.get())) + lowcard_col->insertFromFullColumn(*column_from_block.column, row_num); else columns[j]->insertFrom(*column_from_block.column, row_num); } @@ -1020,12 +1018,12 @@ public: { for (size_t j = 0, size = right_indexes.size(); j < size; ++j) { - auto column_from_block = block.getByPosition(right_indexes[j]); - if (type_name[j].type->lowCardinality() != column_from_block.type->lowCardinality()) - { - JoinCommon::changeLowCardinalityInplace(column_from_block); - } - columns[j]->insertFrom(*column_from_block.column, row_num); + const auto & column_from_block = block.getByPosition(right_indexes[j]); + if (auto * lowcard_col = typeid_cast(columns[j].get()); + lowcard_col && !typeid_cast(column_from_block.column.get())) + lowcard_col->insertFromFullColumn(*column_from_block.column, row_num); + else + columns[j]->insertFrom(*column_from_block.column, row_num); } } } diff --git a/src/Interpreters/InDepthNodeVisitor.h b/src/Interpreters/InDepthNodeVisitor.h index b7353f2c243..736a764e8e9 100644 --- a/src/Interpreters/InDepthNodeVisitor.h +++ b/src/Interpreters/InDepthNodeVisitor.h @@ -95,4 +95,33 @@ public: template using ConstOneTypeMatcher = OneTypeMatcher; +template +struct InDepthNodeVisitorWithChildInfo : Visitor +{ + using ChildInfo = typename Visitor::ChildInfo; + + ChildInfo visit(T & ast, const T & parent = {}) + { + ChildInfo all_children_info; + for (auto & child : ast->children) + { + if (Visitor::needVisitChild(ast, child)) + { + ChildInfo child_info = visit(child, ast); + all_children_info.update(child_info); + } + } + + try + { + return Visitor::visitNode(ast, parent, all_children_info); + } + catch (Exception & e) + { + e.addMessage("While processing {}", ast->formatForErrorMessage()); + throw; + } + } +}; + } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 304cfa2f3f4..b29f7372d38 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1455,7 +1455,7 @@ BlockIO InterpreterCreateQuery::doCreateOrReplaceTable(ASTCreateQuery & create, BlockIO InterpreterCreateQuery::fillTableIfNeeded(const ASTCreateQuery & create) { /// If the query is a CREATE SELECT, insert the data into the table. 
- if (create.select && !create.attach + if (create.select && !create.attach && !create.is_create_empty && !create.is_ordinary_view && !create.is_live_view && (!(create.is_materialized_view || create.is_window_view) || create.is_populate)) { diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 241ab1b0f75..4ed293e8530 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -162,7 +162,7 @@ Block InterpreterInsertQuery::getSampleBlock( static bool hasAggregateFunctions(const IAST * ast) { if (const auto * func = typeid_cast(ast)) - if (AggregateFunctionFactory::instance().isAggregateFunctionName(func->name)) + if (AggregateUtils::isAggregateFunction(*func)) return true; for (const auto & child : ast->children) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 94ac7c26183..1bf188596d7 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -232,6 +232,8 @@ static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & table CrossToInnerJoinVisitor(cross_to_inner).visit(query); JoinToSubqueryTransformVisitor::Data join_to_subs_data{tables, aliases}; + join_to_subs_data.try_to_keep_original_names = settings.multiple_joins_try_to_keep_original_names; + JoinToSubqueryTransformVisitor(join_to_subs_data).visit(query); } @@ -884,7 +886,7 @@ static FillColumnDescription getWithFillDescription(const ASTOrderByElement & or return descr; } -static SortDescription getSortDescription(const ASTSelectQuery & query, ContextPtr context) +SortDescription InterpreterSelectQuery::getSortDescription(const ASTSelectQuery & query, ContextPtr context_) { SortDescription order_descr; order_descr.reserve(query.orderBy()->children.size()); @@ -898,15 +900,15 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, ContextP collator = std::make_shared(order_by_elem.collation->as().value.get()); if (order_by_elem.with_fill) { - FillColumnDescription fill_desc = getWithFillDescription(order_by_elem, context); + FillColumnDescription fill_desc = getWithFillDescription(order_by_elem, context_); order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator, true, fill_desc); } else order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator); } - order_descr.compile_sort_description = context->getSettingsRef().compile_sort_description; - order_descr.min_count_to_compile_sort_description = context->getSettingsRef().min_count_to_compile_sort_description; + order_descr.compile_sort_description = context_->getSettingsRef().compile_sort_description; + order_descr.min_count_to_compile_sort_description = context_->getSettingsRef().min_count_to_compile_sort_description; return order_descr; } @@ -1031,12 +1033,12 @@ static std::pair getLimitLengthAndOffset(const ASTSelectQuery & } -static UInt64 getLimitForSorting(const ASTSelectQuery & query, ContextPtr context) +UInt64 InterpreterSelectQuery::getLimitForSorting(const ASTSelectQuery & query, ContextPtr context_) { /// Partial sort can be done if there is LIMIT but no DISTINCT or LIMIT BY, neither ARRAY JOIN. 
if (!query.distinct && !query.limitBy() && !query.limit_with_ties && !query.arrayJoinExpressionList().first && query.limitLength()) { - auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context); + auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context_); if (limit_length > std::numeric_limits::max() - limit_offset) return 0; @@ -1615,7 +1617,10 @@ static void executeMergeAggregatedImpl( Aggregator::Params params(header_before_merge, keys, aggregates, overflow_row, settings.max_threads); - auto transform_params = std::make_shared(params, final); + auto transform_params = std::make_shared( + params, + final, + /* only_merge_= */ false); auto merging_aggregated = std::make_unique( query_plan.getCurrentDataStream(), @@ -2287,6 +2292,7 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac std::move(aggregator_params), std::move(grouping_sets_params), final, + /* only_merge_= */ false, settings.max_block_size, settings.aggregation_in_order_max_block_bytes, merge_threads, @@ -2360,7 +2366,10 @@ void InterpreterSelectQuery::executeRollupOrCube(QueryPlan & query_plan, Modific keys.push_back(header_before_transform.getPositionByName(key.name)); auto params = getAggregatorParams(query_ptr, *query_analyzer, *context, header_before_transform, keys, query_analyzer->aggregates(), false, settings, 0, 0); - auto transform_params = std::make_shared(std::move(params), true); + auto transform_params = std::make_shared( + std::move(params), + /* final_= */ true, + /* only_merge_= */ false); QueryPlanStepPtr step; if (modificator == Modificator::ROLLUP) diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 40afaaaeed0..e6bd81b93fe 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -128,6 +128,9 @@ public: /// It will set shard_num and shard_count to the client_info void setProperClientInfo(size_t replica_num, size_t replica_count); + static SortDescription getSortDescription(const ASTSelectQuery & query, ContextPtr context); + static UInt64 getLimitForSorting(const ASTSelectQuery & query, ContextPtr context); + private: InterpreterSelectQuery( const ASTPtr & query_ptr_, diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index b73943e0186..9196a5222a2 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -316,12 +316,12 @@ BlockIO InterpreterSystemQuery::execute() { auto caches = FileCacheFactory::instance().getAll(); for (const auto & [_, cache_data] : caches) - cache_data.cache->remove(); + cache_data.cache->removeIfReleasable(/* remove_persistent_files */false); } else { auto cache = FileCacheFactory::instance().get(query.filesystem_cache_path); - cache->remove(); + cache->removeIfReleasable(/* remove_persistent_files */false); } break; } diff --git a/src/Interpreters/InterpreterTransactionControlQuery.cpp b/src/Interpreters/InterpreterTransactionControlQuery.cpp index 1e4868788ba..bdb523de880 100644 --- a/src/Interpreters/InterpreterTransactionControlQuery.cpp +++ b/src/Interpreters/InterpreterTransactionControlQuery.cpp @@ -67,6 +67,7 @@ BlockIO InterpreterTransactionControlQuery::executeCommit(ContextMutablePtr sess if (e.code() == ErrorCodes::UNKNOWN_STATUS_OF_TRANSACTION) { /// Detach transaction from current context if connection was lost 
and its status is unknown + /// (so it will be possible to start new one) session_context->setCurrentTransaction(NO_TRANSACTION_PTR); } throw; @@ -80,6 +81,16 @@ BlockIO InterpreterTransactionControlQuery::executeCommit(ContextMutablePtr sess /// It's useful for testing. It allows to enable fault injection (after commit) without breaking tests. txn->waitStateChange(Tx::CommittingCSN); + CSN csn_changed_state = txn->getCSN(); + if (csn_changed_state == Tx::UnknownCSN) + { + /// CommittingCSN -> UnknownCSN -> RolledBackCSN + /// It's possible if connection was lost before commit + /// (maybe we should get rid of intermediate UnknownCSN in this transition) + txn->waitStateChange(Tx::UnknownCSN); + chassert(txn->getCSN() == Tx::RolledBackCSN); + } + if (txn->getState() == MergeTreeTransaction::ROLLED_BACK) throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction {} was rolled back", txn->tid); if (txn->getState() != MergeTreeTransaction::COMMITTED) diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index a3ffaafa4db..e07430c0feb 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -361,6 +361,7 @@ struct CheckAliasDependencyVisitorData dependency = &ident; } }; + using CheckAliasDependencyMatcher = OneTypeMatcher; using CheckAliasDependencyVisitor = InDepthNodeVisitor; @@ -500,6 +501,7 @@ void restoreName(ASTIdentifier & ident, const String & original_name, NameSet & { if (!ident.tryGetAlias().empty()) return; + if (original_name.empty()) return; @@ -509,7 +511,9 @@ void restoreName(ASTIdentifier & ident, const String & original_name, NameSet & restored_names.emplace(original_name); } else + { ident.setShortName(original_name); + } } /// Find clashes and normalize names @@ -527,12 +531,12 @@ std::vector normalizeColumnNamesExtractNeeded( { size_t last_table_pos = tables.size() - 1; - NameSet restored_names; std::vector needed_columns; needed_columns.reserve(tables.size()); for (const auto & table : tables) needed_columns.push_back(TableNeededColumns{table.table}); + NameSet restored_names; for (ASTIdentifier * ident : identifiers) { bool got_alias = aliases.contains(ident->name()); @@ -729,7 +733,10 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast std::unordered_set public_identifiers; for (auto & top_level_child : select.select()->children) if (auto * ident = top_level_child->as()) - public_identifiers.insert(ident); + { + if (!data.try_to_keep_original_names || startsWith(ident->name(), UniqueShortNames::pattern)) + public_identifiers.insert(ident); + } UniqueShortNames unique_names; std::vector needed_columns = diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.h b/src/Interpreters/JoinToSubqueryTransformVisitor.h index a024a168509..96420512ae6 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.h +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.h @@ -21,6 +21,7 @@ public: const std::vector & tables; const Aliases & aliases; bool done = false; + bool try_to_keep_original_names = false; }; static bool needChildVisit(ASTPtr &, const ASTPtr &); diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index d0bf4939c90..3dd6b7de574 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -31,9 +31,12 @@ namespace ErrorCodes namespace { -String deriveTempName(const String & name) +String deriveTempName(const String & name, JoinTableSide block_side) 
{ - return "--" + name; + if (block_side == JoinTableSide::Left) + return "--pmj_cond_left_" + name; + else + return "--pmj_cond_right_" + name; } /* @@ -42,7 +45,7 @@ String deriveTempName(const String & name) * 0 converted to NULL and such rows won't be joined, * 1 converted to 0 (any constant non-NULL value to join) */ -ColumnWithTypeAndName condtitionColumnToJoinable(const Block & block, const String & src_column_name) +ColumnWithTypeAndName condtitionColumnToJoinable(const Block & block, const String & src_column_name, JoinTableSide block_side) { size_t res_size = block.rows(); auto data_col = ColumnUInt8::create(res_size, 0); @@ -60,10 +63,10 @@ ColumnWithTypeAndName condtitionColumnToJoinable(const Block & block, const Stri ColumnPtr res_col = ColumnNullable::create(std::move(data_col), std::move(null_map)); DataTypePtr res_col_type = std::make_shared(std::make_shared()); - String res_name = deriveTempName(src_column_name); + String res_name = deriveTempName(src_column_name, block_side); if (block.has(res_name)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Conflicting column name '{}'", res_name); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Conflicting column name '{}' in block {}", res_name, block.dumpStructure()); return {res_col, res_col_type, res_name}; } @@ -517,8 +520,8 @@ MergeJoin::MergeJoin(std::shared_ptr table_join_, const Block & right { JoinCommon::checkTypesOfMasks({}, "", right_sample_block, mask_column_name_right); - key_names_left.push_back(deriveTempName(mask_column_name_left)); - key_names_right.push_back(deriveTempName(mask_column_name_right)); + key_names_left.push_back(deriveTempName(mask_column_name_left, JoinTableSide::Left)); + key_names_right.push_back(deriveTempName(mask_column_name_right, JoinTableSide::Right)); } key_names_left.insert(key_names_left.end(), onexpr.key_names_left.begin(), onexpr.key_names_left.end()); @@ -735,7 +738,7 @@ void MergeJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed) not_processed = std::make_shared(NotProcessed{{}, 0, 0, 0}); if (needConditionJoinColumn()) - block.erase(deriveTempName(mask_column_name_left)); + block.erase(deriveTempName(mask_column_name_left, JoinTableSide::Left)); JoinCommon::restoreLowCardinalityInplace(block, lowcard_keys); } @@ -1126,9 +1129,9 @@ void MergeJoin::addConditionJoinColumn(Block & block, JoinTableSide block_side) if (needConditionJoinColumn()) { if (block_side == JoinTableSide::Left) - block.insert(condtitionColumnToJoinable(block, mask_column_name_left)); + block.insert(condtitionColumnToJoinable(block, mask_column_name_left, block_side)); else - block.insert(condtitionColumnToJoinable(block, mask_column_name_right)); + block.insert(condtitionColumnToJoinable(block, mask_column_name_right, block_side)); } } diff --git a/src/Interpreters/MergeTreeTransaction.cpp b/src/Interpreters/MergeTreeTransaction.cpp index cab40f3c6db..e6b4818b4d7 100644 --- a/src/Interpreters/MergeTreeTransaction.cpp +++ b/src/Interpreters/MergeTreeTransaction.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -146,8 +147,8 @@ void MergeTreeTransaction::removeOldPart(const StoragePtr & storage, const DataP std::lock_guard lock{mutex}; checkIsNotCancelled(); - LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); part_to_remove->version.lockRemovalTID(tid, context); + NOEXCEPT_SCOPE; storages.insert(storage); if (maybe_lock) table_read_locks_for_ordinary_db.emplace_back(std::move(maybe_lock)); diff --git a/src/Interpreters/MonotonicityCheckVisitor.h 
b/src/Interpreters/MonotonicityCheckVisitor.h index 00347388de2..4b9f36ab72d 100644 --- a/src/Interpreters/MonotonicityCheckVisitor.h +++ b/src/Interpreters/MonotonicityCheckVisitor.h @@ -47,8 +47,7 @@ public: /// if ORDER BY contains aggregate function or window functions, it /// shouldn't be optimized if (ast_function.is_window_function - || AggregateFunctionFactory::instance().isAggregateFunctionName( - ast_function.name)) + || AggregateUtils::isAggregateFunction(ast_function)) { return false; } diff --git a/src/Interpreters/PartLog.cpp b/src/Interpreters/PartLog.cpp index 13b74f3d00a..f79be6a67e0 100644 --- a/src/Interpreters/PartLog.cpp +++ b/src/Interpreters/PartLog.cpp @@ -191,8 +191,8 @@ bool PartLog::addNewParts( elem.table_name = table_id.table_name; elem.partition_id = part->info.partition_id; elem.part_name = part->name; - elem.disk_name = part->volume->getDisk()->getName(); - elem.path_on_disk = part->getFullPath(); + elem.disk_name = part->data_part_storage->getDiskName(); + elem.path_on_disk = part->data_part_storage->getFullPath(); elem.part_type = part->getType(); elem.bytes_compressed_on_disk = part->getBytesOnDisk(); diff --git a/src/Interpreters/RewriteAnyFunctionVisitor.cpp b/src/Interpreters/RewriteAnyFunctionVisitor.cpp index 33362648ae1..163e117f93d 100644 --- a/src/Interpreters/RewriteAnyFunctionVisitor.cpp +++ b/src/Interpreters/RewriteAnyFunctionVisitor.cpp @@ -41,8 +41,7 @@ bool extractIdentifiers(const ASTFunction & func, std::unordered_set & // be inside `any`, but this check in GetAggregatesMatcher happens // later, so we have to explicitly skip these nested functions here. if (arg_func->is_window_function - || AggregateFunctionFactory::instance().isAggregateFunctionName( - arg_func->name)) + || AggregateUtils::isAggregateFunction(*arg_func)) { return false; } diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 2c83a3bfcdb..42db91f47c0 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #if defined(OS_LINUX) @@ -343,7 +343,7 @@ void ThreadStatus::finalizeQueryProfiler() void ThreadStatus::detachQuery(bool exit_if_already_detached, bool thread_exits) { - LockMemoryExceptionInThread lock(VariableContext::Global); + NOEXCEPT_SCOPE; if (exit_if_already_detached && thread_state == ThreadState::DetachedFromQuery) { diff --git a/src/Interpreters/TransactionLog.cpp b/src/Interpreters/TransactionLog.cpp index 0ddc726ff7f..e6bd47eed44 100644 --- a/src/Interpreters/TransactionLog.cpp +++ b/src/Interpreters/TransactionLog.cpp @@ -9,12 +9,9 @@ #include #include #include +#include -/// It's used in critical places to exit on unexpected exceptions. -/// SIGABRT is usually better that broken state in memory with unpredictable consequences. 
-#define NOEXCEPT_SCOPE SCOPE_EXIT({ if (std::uncaught_exceptions()) { tryLogCurrentException("NOEXCEPT_SCOPE"); abort(); } }) - namespace DB { @@ -146,8 +143,7 @@ void TransactionLog::loadEntries(Strings::const_iterator beg, Strings::const_ite } futures.clear(); - NOEXCEPT_SCOPE; - LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); + NOEXCEPT_SCOPE_STRICT; { std::lock_guard lock{mutex}; for (const auto & entry : loaded) @@ -453,7 +449,7 @@ CSN TransactionLog::commitTransaction(const MergeTreeTransactionPtr & txn, bool /// Do not allow exceptions between commit point and the and of transaction finalization /// (otherwise it may stuck in COMMITTING state holding snapshot). - NOEXCEPT_SCOPE; + NOEXCEPT_SCOPE_STRICT; /// FIXME Transactions: Sequential node numbers in ZooKeeper are Int32, but 31 bit is not enough for production use /// (overflow is possible in a several weeks/months of active usage) allocated_csn = deserializeCSN(csn_path_created.substr(zookeeper_path_log.size() + 1)); diff --git a/src/Interpreters/TransactionVersionMetadata.cpp b/src/Interpreters/TransactionVersionMetadata.cpp index 36a4fb9cc5b..5f46b86508c 100644 --- a/src/Interpreters/TransactionVersionMetadata.cpp +++ b/src/Interpreters/TransactionVersionMetadata.cpp @@ -95,12 +95,8 @@ bool VersionMetadata::tryLockRemovalTID(const TransactionID & tid, const Transac bool locked = removal_tid_lock.compare_exchange_strong(expected_removal_lock_value, removal_lock_value); if (!locked) { - if (tid == Tx::PrehistoricTID && expected_removal_lock_value == Tx::PrehistoricTID.getHash()) - { - /// Don't need to lock part for queries without transaction - LOG_TEST(log, "Assuming removal_tid is locked by {}, table: {}, part: {}", tid, context.table.getNameForLogs(), context.part_name); - return true; - } + if (expected_removal_lock_value == removal_lock_value) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Tried to lock part {} for removal second time by {}", context.part_name, tid); if (locked_by_id) *locked_by_id = expected_removal_lock_value; diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 70aa859e741..3d14955c16a 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -511,7 +511,7 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const new_elements.push_back(elem); /// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong - if (func && AggregateFunctionFactory::instance().isAggregateFunctionName(func->name) && !select_query->groupBy()) + if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy()) new_elements.push_back(elem); } } @@ -1248,6 +1248,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( result.aggregates = getAggregates(query, *select_query); result.window_function_asts = getWindowFunctions(query, *select_query); + result.expressions_with_window_function = getExpressionsWithWindowFunctions(query); result.collectUsedColumns(query, true); result.required_source_columns_before_expanding_alias_columns = result.required_source_columns.getNames(); @@ -1271,6 +1272,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( { result.aggregates = getAggregates(query, *select_query); result.window_function_asts = getWindowFunctions(query, *select_query); + result.expressions_with_window_function = getExpressionsWithWindowFunctions(query); result.collectUsedColumns(query, true); } } diff --git a/src/Interpreters/TreeRewriter.h 
b/src/Interpreters/TreeRewriter.h index 2c246455ade..b84756260a8 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -20,7 +20,7 @@ using Scalars = std::map; struct StorageInMemoryMetadata; using StorageMetadataPtr = std::shared_ptr; struct StorageSnapshot; -using StorageSnapshotPtr = std::shared_ptr; +using StorageSnapshotPtr = std::shared_ptr; struct TreeRewriterResult { @@ -44,6 +44,8 @@ struct TreeRewriterResult std::vector window_function_asts; + std::vector expressions_with_window_function; + /// Which column is needed to be ARRAY-JOIN'ed to get the specified. /// For example, for `SELECT s.v ... ARRAY JOIN a AS s` will get "s.v" -> "a.v". NameToNameMap array_join_result_to_source; diff --git a/src/Interpreters/UserDefinedSQLObjectsLoader.cpp b/src/Interpreters/UserDefinedSQLObjectsLoader.cpp index c38df5b855f..75b91f3a817 100644 --- a/src/Interpreters/UserDefinedSQLObjectsLoader.cpp +++ b/src/Interpreters/UserDefinedSQLObjectsLoader.cpp @@ -86,28 +86,33 @@ void UserDefinedSQLObjectsLoader::loadObjects(ContextPtr context) if (unlikely(!enable_persistence)) return; - LOG_DEBUG(log, "loading user defined objects"); + LOG_DEBUG(log, "Loading user defined objects"); String dir_path = context->getUserDefinedPath(); Poco::DirectoryIterator dir_end; for (Poco::DirectoryIterator it(dir_path); it != dir_end; ++it) { - if (it->isLink()) + if (it->isDirectory()) continue; - const auto & file_name = it.name(); + const std::string & file_name = it.name(); /// For '.svn', '.gitignore' directory and similar. if (file_name.at(0) == '.') continue; - if (!it->isDirectory() && endsWith(file_name, ".sql")) - { - std::string_view object_name = file_name; - object_name.remove_suffix(strlen(".sql")); - object_name.remove_prefix(strlen("function_")); - loadUserDefinedObject(context, UserDefinedSQLObjectType::Function, object_name, dir_path + it.name()); - } + if (!startsWith(file_name, "function_") || !endsWith(file_name, ".sql")) + continue; + + std::string_view object_name = file_name; + + object_name.remove_prefix(strlen("function_")); + object_name.remove_suffix(strlen(".sql")); + + if (object_name.empty()) + continue; + + loadUserDefinedObject(context, UserDefinedSQLObjectType::Function, object_name, dir_path + it.name()); } } diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index bb0130b4d4e..65c8cb9423c 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -90,6 +91,9 @@ struct WindowDescription // then by ORDER BY. This field holds this combined sort order. SortDescription full_sort_description; + std::vector partition_by_actions; + std::vector order_by_actions; + WindowFrame frame; // The window functions that are calculated for this window. 
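TreeRewriterResult now carries the collected window-dependent expressions next to the window function ASTs. A minimal sketch of inspecting that output on an analyzed query AST, assuming only the declarations added in GetAggregatesVisitor.h; the function and variable names below are illustrative:

#include <Interpreters/GetAggregatesVisitor.h>
#include <iostream>

/// Prints every top-level expression of `query` that depends on a window function;
/// the collector has also flagged each of them with compute_after_window_functions.
void dumpWindowDependentExpressions(DB::ASTPtr & query)
{
    auto expressions = DB::getExpressionsWithWindowFunctions(query);
    for (const auto * function : expressions)
        std::cerr << function->formatForErrorMessage() << '\n';
}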
diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index f1661549c61..387f4e892a1 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -32,8 +32,6 @@ namespace ErrorCodes extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; extern const int QUERY_IS_PROHIBITED; - extern const int INVALID_SHARD_ID; - extern const int NO_SUCH_REPLICA; extern const int LOGICAL_ERROR; } @@ -81,49 +79,28 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, } } - query->cluster = context->getMacros()->expand(query->cluster); + ClusterPtr cluster = params.cluster; + if (!cluster) + { + query->cluster = context->getMacros()->expand(query->cluster); + cluster = context->getCluster(query->cluster); + } /// TODO: support per-cluster grant context->checkAccess(AccessType::CLUSTER); - ClusterPtr cluster = params.cluster ? params.cluster : context->getCluster(query->cluster); DDLWorker & ddl_worker = context->getDDLWorker(); /// Enumerate hosts which will be used to send query. - std::vector hosts; - Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); - - auto collect_hosts_from_replicas = [&](size_t shard_index) - { - if (shard_index > shards.size()) - throw Exception(ErrorCodes::INVALID_SHARD_ID, "Cluster {} doesn't have shard #{}", query->cluster, shard_index); - const auto & replicas = shards[shard_index - 1]; - if (params.only_replica_num) - { - if (params.only_replica_num > replicas.size()) - throw Exception(ErrorCodes::NO_SUCH_REPLICA, "Cluster {} doesn't have replica #{} in shard #{}", query->cluster, params.only_replica_num, shard_index); - hosts.emplace_back(replicas[params.only_replica_num - 1]); - } - else - { - for (const auto & addr : replicas) - hosts.emplace_back(addr); - } - }; - - if (params.only_shard_num) - { - collect_hosts_from_replicas(params.only_shard_num); - } - else - { - for (size_t shard_index = 1; shard_index <= shards.size(); ++shard_index) - collect_hosts_from_replicas(shard_index); - } - - if (hosts.empty()) + auto addresses = cluster->filterAddressesByShardOrReplica(params.only_shard_num, params.only_replica_num); + if (addresses.empty()) throw Exception("No hosts defined to execute distributed DDL query", ErrorCodes::LOGICAL_ERROR); + std::vector hosts; + hosts.reserve(addresses.size()); + for (const auto * address : addresses) + hosts.emplace_back(*address); + /// The current database in a distributed query need to be replaced with either /// the local current database or a shard's default database. 
AccessRightsElements access_to_check = params.access_to_check; @@ -137,22 +114,19 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, if (need_replace_current_database) { - Strings shard_default_databases; - for (const auto & shard : shards) + Strings host_default_databases; + for (const auto * address : addresses) { - for (const auto & addr : shard) - { - if (!addr.default_database.empty()) - shard_default_databases.push_back(addr.default_database); - else - use_local_default_database = true; - } + if (!address->default_database.empty()) + host_default_databases.push_back(address->default_database); + else + use_local_default_database = true; } - ::sort(shard_default_databases.begin(), shard_default_databases.end()); - shard_default_databases.erase(std::unique(shard_default_databases.begin(), shard_default_databases.end()), shard_default_databases.end()); - assert(use_local_default_database || !shard_default_databases.empty()); + ::sort(host_default_databases.begin(), host_default_databases.end()); + host_default_databases.erase(std::unique(host_default_databases.begin(), host_default_databases.end()), host_default_databases.end()); + assert(use_local_default_database || !host_default_databases.empty()); - if (use_local_default_database && !shard_default_databases.empty()) + if (use_local_default_database && !host_default_databases.empty()) throw Exception("Mixed local default DB and shard default DB in DDL query", ErrorCodes::NOT_IMPLEMENTED); if (use_local_default_database) @@ -166,10 +140,10 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, auto & element = access_to_check[i]; if (element.isEmptyDatabase()) { - access_to_check.insert(access_to_check.begin() + i + 1, shard_default_databases.size() - 1, element); - for (size_t j = 0; j != shard_default_databases.size(); ++j) - access_to_check[i + j].replaceEmptyDatabase(shard_default_databases[j]); - i += shard_default_databases.size(); + access_to_check.insert(access_to_check.begin() + i + 1, host_default_databases.size() - 1, element); + for (size_t j = 0; j != host_default_databases.size(); ++j) + access_to_check[i + j].replaceEmptyDatabase(host_default_databases[j]); + i += host_default_databases.size(); } else ++i; diff --git a/src/Interpreters/getHeaderForProcessingStage.h b/src/Interpreters/getHeaderForProcessingStage.h index 6ada136030e..a578c414e3b 100644 --- a/src/Interpreters/getHeaderForProcessingStage.h +++ b/src/Interpreters/getHeaderForProcessingStage.h @@ -11,7 +11,7 @@ namespace DB class IStorage; struct StorageSnapshot; -using StorageSnapshotPtr = std::shared_ptr; +using StorageSnapshotPtr = std::shared_ptr; struct SelectQueryInfo; struct TreeRewriterResult; class ASTSelectQuery; diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index f4711c8a6fb..6f6c871fceb 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -14,68 +14,9 @@ namespace using Element = ASTBackupQuery::Element; using ElementType = ASTBackupQuery::ElementType; - void formatType(ElementType type, bool is_temp_db, const IAST::FormatSettings & format) - { - switch (type) - { - case ElementType::TABLE: - { - format.ostr << (format.hilite ? IAST::hilite_keyword : ""); - if (is_temp_db) - format.ostr << " TEMPORARY TABLE"; - else - format.ostr << " TABLE"; - format.ostr << (format.hilite ? IAST::hilite_none : ""); - break; - } - case ElementType::DATABASE: - { - format.ostr << (format.hilite ? 
IAST::hilite_keyword : ""); - if (is_temp_db) - format.ostr << " ALL TEMPORARY TABLES"; - else - format.ostr << " DATABASE"; - format.ostr << (format.hilite ? IAST::hilite_none : ""); - break; - } - case ElementType::ALL_DATABASES: - { - format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " ALL DATABASES" << (format.hilite ? IAST::hilite_none : ""); - break; - } - } - } - - void formatName(const DatabaseAndTableName & name, ElementType type, bool is_temp_db, const IAST::FormatSettings & format) - { - switch (type) - { - case ElementType::TABLE: - { - format.ostr << " "; - if (!is_temp_db && !name.first.empty()) - format.ostr << backQuoteIfNeed(name.first) << "."; - format.ostr << backQuoteIfNeed(name.second); - break; - } - case ElementType::DATABASE: - { - if (!is_temp_db) - format.ostr << " " << backQuoteIfNeed(name.first); - break; - } - case ElementType::ALL_DATABASES: - { - break; - } - } - } - void formatPartitions(const ASTs & partitions, const IAST::FormatSettings & format) { - if (partitions.empty()) - return; - format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " " << ((partitions.size() == 1) ? "PARTITION" : "PARTITIONS") << " " + format.ostr << " " << (format.hilite ? IAST::hilite_keyword : "") << ((partitions.size() == 1) ? "PARTITION" : "PARTITIONS") << " " << (format.hilite ? IAST::hilite_none : ""); bool need_comma = false; for (const auto & partition : partitions) @@ -87,40 +28,106 @@ namespace } } - void formatExceptList(const std::set & except_list, bool show_except_tables, const IAST::FormatSettings & format) + void formatExceptDatabases(const std::set & except_databases, const IAST::FormatSettings & format) { - if (except_list.empty()) + if (except_databases.empty()) return; - format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " EXCEPT " << (format.hilite ? IAST::hilite_none : ""); - if (show_except_tables) - format.ostr << (format.hilite ? IAST::hilite_keyword : "") << "TABLES " << (format.hilite ? IAST::hilite_none : ""); + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " EXCEPT " + << (except_databases.size() == 1 ? "DATABASE" : "DATABASES") << " " << (format.hilite ? IAST::hilite_none : ""); bool need_comma = false; - for (const auto & item : except_list) + for (const auto & database_name : except_databases) { if (std::exchange(need_comma, true)) format.ostr << ","; - format.ostr << " " << backQuoteIfNeed(item); + format.ostr << backQuoteIfNeed(database_name); + } + } + + void formatExceptTables(const std::set & except_tables, const IAST::FormatSettings & format) + { + if (except_tables.empty()) + return; + + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " EXCEPT " << (except_tables.size() == 1 ? "TABLE" : "TABLES") << " " + << (format.hilite ? IAST::hilite_none : ""); + + bool need_comma = false; + for (const auto & table_name : except_tables) + { + if (std::exchange(need_comma, true)) + format.ostr << ", "; + + if (!table_name.first.empty()) + format.ostr << backQuoteIfNeed(table_name.first) << "."; + format.ostr << backQuoteIfNeed(table_name.second); } } void formatElement(const Element & element, const IAST::FormatSettings & format) { - formatType(element.type, element.is_temp_db, format); - formatName(element.name, element.type, element.is_temp_db, format); - - bool new_name_is_different = (element.new_name != element.name); - if (new_name_is_different) + switch (element.type) { - format.ostr << " " << (format.hilite ? IAST::hilite_keyword : "") << "AS" << (format.hilite ? 
IAST::hilite_none : ""); - formatName(element.new_name, element.type, element.is_temp_db, format); + case ElementType::TABLE: + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << "TABLE " << (format.hilite ? IAST::hilite_none : ""); + + if (!element.database_name.empty()) + format.ostr << backQuoteIfNeed(element.database_name) << "."; + format.ostr << backQuoteIfNeed(element.table_name); + + if ((element.new_table_name != element.table_name) || (element.new_database_name != element.database_name)) + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " AS " << (format.hilite ? IAST::hilite_none : ""); + if (!element.new_database_name.empty()) + format.ostr << backQuoteIfNeed(element.new_database_name) << "."; + format.ostr << backQuoteIfNeed(element.new_table_name); + } + + if (element.partitions) + formatPartitions(*element.partitions, format); + break; + } + + case ElementType::TEMPORARY_TABLE: + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << "TEMPORARY TABLE " << (format.hilite ? IAST::hilite_none : ""); + format.ostr << backQuoteIfNeed(element.table_name); + + if (element.new_table_name != element.table_name) + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " AS " << (format.hilite ? IAST::hilite_none : ""); + format.ostr << backQuoteIfNeed(element.new_table_name); + } + break; + } + + case ElementType::DATABASE: + { + format.ostr << (format.hilite ? IAST::hilite_keyword : ""); + format.ostr << "DATABASE "; + format.ostr << (format.hilite ? IAST::hilite_none : ""); + format.ostr << backQuoteIfNeed(element.database_name); + + if (element.new_database_name != element.database_name) + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << " AS " << (format.hilite ? IAST::hilite_none : ""); + format.ostr << backQuoteIfNeed(element.new_database_name); + } + + formatExceptTables(element.except_tables, format); + break; + } + + case ElementType::ALL: + { + format.ostr << (format.hilite ? IAST::hilite_keyword : "") << "ALL" << (format.hilite ? 
IAST::hilite_none : ""); + formatExceptDatabases(element.except_databases, format); + formatExceptTables(element.except_tables, format); + break; + } } - - formatPartitions(element.partitions, format); - - bool show_except_tables = ((element.type == ASTBackupQuery::DATABASE) || !element.is_temp_db); - formatExceptList(element.except_list, show_except_tables, format); } void formatElements(const std::vector & elements, const IAST::FormatSettings & format) @@ -129,7 +136,7 @@ namespace for (const auto & element : elements) { if (std::exchange(need_comma, true)) - format.ostr << ","; + format.ostr << ", "; formatElement(element, format); } } @@ -192,22 +199,41 @@ namespace } -void ASTBackupQuery::Element::setDatabase(const String & new_database) +void ASTBackupQuery::Element::setCurrentDatabase(const String & current_database) { - if ((type == ASTBackupQuery::TABLE) && !is_temp_db) + if (current_database.empty()) + return; + + if (type == ASTBackupQuery::TABLE) { - if (name.first.empty()) - name.first = new_database; - if (new_name.first.empty()) - new_name.first = new_database; + if (database_name.empty()) + database_name = current_database; + if (new_database_name.empty()) + new_database_name = current_database; + } + else if (type == ASTBackupQuery::ALL) + { + for (auto it = except_tables.begin(); it != except_tables.end();) + { + const auto & except_table = *it; + if (except_table.first.empty()) + { + except_tables.emplace(DatabaseAndTableName{current_database, except_table.second}); + it = except_tables.erase(it); + } + else + { + ++it; + } + } } } -void ASTBackupQuery::setDatabase(ASTBackupQuery::Elements & elements, const String & new_database) +void ASTBackupQuery::setCurrentDatabase(ASTBackupQuery::Elements & elements, const String & current_database) { for (auto & element : elements) - element.setDatabase(new_database); + element.setCurrentDatabase(current_database); } @@ -225,7 +251,7 @@ ASTPtr ASTBackupQuery::clone() const void ASTBackupQuery::formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const { - format.ostr << (format.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? "BACKUP" : "RESTORE") + format.ostr << (format.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? "BACKUP " : "RESTORE ") << (format.hilite ? hilite_none : ""); formatElements(elements, format); @@ -243,7 +269,7 @@ ASTPtr ASTBackupQuery::getRewrittenASTWithoutOnCluster(const WithoutOnClusterAST auto new_query = std::static_pointer_cast(clone()); new_query->cluster.clear(); new_query->settings = rewriteSettingsWithoutOnCluster(new_query->settings, params); - new_query->setDatabase(new_query->elements, params.default_database); + new_query->setCurrentDatabase(new_query->elements, params.default_database); return new_query; } diff --git a/src/Parsers/ASTBackupQuery.h b/src/Parsers/ASTBackupQuery.h index 53060b6f232..491f192042b 100644 --- a/src/Parsers/ASTBackupQuery.h +++ b/src/Parsers/ASTBackupQuery.h @@ -14,7 +14,6 @@ using DatabaseAndTableName = std::pair; * DICTIONARY [db.]dictionary_name [AS [db.]dictionary_name_in_backup] | * DATABASE database_name [AS database_name_in_backup] [EXCEPT TABLES ...] | * TEMPORARY TABLE table_name [AS table_name_in_backup] | - * ALL TEMPORARY TABLES [EXCEPT ...] | * ALL DATABASES [EXCEPT ...] } [,...] 
* [ON CLUSTER 'cluster_name'] * TO { File('path/') | @@ -25,7 +24,6 @@ using DatabaseAndTableName = std::pair; * DICTIONARY [db.]dictionary_name_in_backup [AS [db.]dictionary_name] | * DATABASE database_name_in_backup [AS database_name] [EXCEPT TABLES ...] | * TEMPORARY TABLE table_name_in_backup [AS table_name] | - * ALL TEMPORARY TABLES [EXCEPT ...] | * ALL DATABASES [EXCEPT ...] } [,...] * [ON CLUSTER 'cluster_name'] * FROM {File(...) | Disk(...)} @@ -52,25 +50,28 @@ public: enum ElementType { TABLE, + TEMPORARY_TABLE, DATABASE, - ALL_DATABASES, + ALL, }; struct Element { ElementType type; - DatabaseAndTableName name; - DatabaseAndTableName new_name; - bool is_temp_db = false; - ASTs partitions; - std::set except_list; + String table_name; + String database_name; + String new_table_name; /// usually the same as `table_name`, can be different in case of using AS + String new_database_name; /// usually the same as `database_name`, can be different in case of using AS + std::optional partitions; + std::set except_tables; + std::set except_databases; - void setDatabase(const String & new_database); + void setCurrentDatabase(const String & current_database); }; using Elements = std::vector; - static void setDatabase(Elements & elements, const String & new_database); - void setDatabase(const String & new_database) { setDatabase(elements, new_database); } + static void setCurrentDatabase(Elements & elements, const String & current_database); + void setCurrentDatabase(const String & current_database) { setCurrentDatabase(elements, current_database); } Elements elements; diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index 9bae29d47ff..e676be63921 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -428,6 +428,8 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat if (is_populate) settings.ostr << (settings.hilite ? hilite_keyword : "") << " POPULATE" << (settings.hilite ? hilite_none : ""); + else if (is_create_empty) + settings.ostr << (settings.hilite ? hilite_keyword : "") << " EMPTY" << (settings.hilite ? hilite_none : ""); if (select) { diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 596baa3eb3c..f9f57183a64 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -70,6 +70,7 @@ public: bool is_live_view{false}; bool is_window_view{false}; bool is_populate{false}; + bool is_create_empty{false}; /// CREATE TABLE ... EMPTY AS SELECT ... bool replace_view{false}; /// CREATE OR REPLACE VIEW ASTColumns * columns_list = nullptr; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index b86929b054c..69927c430dc 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -89,6 +89,24 @@ void ASTFunction::appendColumnNameImpl(WriteBuffer & ostr) const } } +void ASTFunction::finishFormatWithWindow(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const +{ + if (!is_window_function) + return; + + settings.ostr << " OVER "; + if (!window_name.empty()) + { + settings.ostr << backQuoteIfNeed(window_name); + } + else + { + settings.ostr << "("; + window_definition->formatImpl(settings, state, frame); + settings.ostr << ")"; + } +} + /** Get the text that identifies this element. 
*/ String ASTFunction::getID(char delim) const { @@ -563,7 +581,7 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format if (written) { - return; + return finishFormatWithWindow(settings, state, frame); } settings.ostr << (settings.hilite ? hilite_function : "") << name; @@ -603,22 +621,7 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format settings.ostr << (settings.hilite ? hilite_none : ""); - if (!is_window_function) - { - return; - } - - settings.ostr << " OVER "; - if (!window_name.empty()) - { - settings.ostr << backQuoteIfNeed(window_name); - } - else - { - settings.ostr << "("; - window_definition->formatImpl(settings, state, frame); - settings.ostr << ")"; - } + return finishFormatWithWindow(settings, state, frame); } String getFunctionName(const IAST * ast) diff --git a/src/Parsers/ASTFunction.h b/src/Parsers/ASTFunction.h index 6efbe512cf4..6d5089f802e 100644 --- a/src/Parsers/ASTFunction.h +++ b/src/Parsers/ASTFunction.h @@ -22,6 +22,8 @@ public: bool is_window_function = false; + bool compute_after_window_functions = false; + // We have to make these fields ASTPtr because this is what the visitors // expect. Some of them take const ASTPtr & (makes no sense), and some // take ASTPtr & and modify it. I don't understand how the latter is @@ -54,6 +56,8 @@ public: protected: void formatImplWithoutAlias(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; void appendColumnNameImpl(WriteBuffer & ostr) const override; +private: + void finishFormatWithWindow(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const; }; diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index d06816cb951..649304b1dab 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -21,75 +21,7 @@ namespace using Element = ASTBackupQuery::Element; using ElementType = ASTBackupQuery::ElementType; - bool parseType(IParser::Pos & pos, Expected & expected, ElementType & type, bool & is_temp_db) - { - is_temp_db = false; - if (ParserKeyword{"TABLE"}.ignore(pos, expected) || ParserKeyword{"DICTIONARY"}.ignore(pos, expected)) - { - type = ElementType::TABLE; - return true; - } - if (ParserKeyword{"TEMPORARY TABLE"}.ignore(pos, expected)) - { - type = ElementType::TABLE; - is_temp_db = true; - return true; - } - if (ParserKeyword{"DATABASE"}.ignore(pos, expected)) - { - type = ElementType::DATABASE; - return true; - } - if (ParserKeyword{"ALL TEMPORARY TABLES"}.ignore(pos, expected)) - { - type = ElementType::DATABASE; - is_temp_db = true; - return true; - } - if (ParserKeyword{"ALL DATABASES"}.ignore(pos, expected)) - { - type = ElementType::ALL_DATABASES; - return true; - } - return false; - } - - bool parseName(IParser::Pos & pos, Expected & expected, ElementType type, bool is_temp_db, DatabaseAndTableName & name) - { - name.first.clear(); - name.second.clear(); - switch (type) - { - case ElementType::TABLE: - { - if (is_temp_db) - { - ASTPtr ast; - if (!ParserIdentifier{}.parse(pos, ast, expected)) - return false; - name.second = getIdentifierName(ast); - return true; - } - return parseDatabaseAndTableName(pos, expected, name.first, name.second); - } - - case ElementType::DATABASE: - { - if (is_temp_db) - return false; - ASTPtr ast; - if (!ParserIdentifier{}.parse(pos, ast, expected)) - return false; - name.first = getIdentifierName(ast); - return true; - } - - default: - return false; - } - } - - bool 
parsePartitions(IParser::Pos & pos, Expected & expected, ASTs & partitions) + bool parsePartitions(IParser::Pos & pos, Expected & expected, std::optional & partitions) { if (!ParserKeyword{"PARTITION"}.ignore(pos, expected) && !ParserKeyword{"PARTITIONS"}.ignore(pos, expected)) return false; @@ -110,75 +42,145 @@ namespace return true; } - bool parseExceptList(IParser::Pos & pos, Expected & expected, bool parse_except_tables, std::set & except_list) - { - if (!ParserKeyword{parse_except_tables ? "EXCEPT TABLES" : "EXCEPT"}.ignore(pos, expected)) - return false; - - std::set result; - auto parse_list_element = [&] - { - ASTPtr ast; - if (!ParserIdentifier{}.parse(pos, ast, expected)) - return false; - result.insert(getIdentifierName(ast)); - return true; - }; - if (!ParserList::parseUtil(pos, expected, parse_list_element, false)) - return false; - - except_list = std::move(result); - return true; - } - - bool parseElement(IParser::Pos & pos, Expected & expected, Element & entry) + bool parseExceptDatabases(IParser::Pos & pos, Expected & expected, std::set & except_databases) { return IParserBase::wrapParseImpl(pos, [&] { - ElementType type; - bool is_temp_db = false; - if (!parseType(pos, expected, type, is_temp_db)) + if (!ParserKeyword{"EXCEPT DATABASE"}.ignore(pos, expected) && !ParserKeyword{"EXCEPT DATABASES"}.ignore(pos, expected)) return false; - DatabaseAndTableName name; - if ((type == ElementType::TABLE) || (type == ElementType::DATABASE && !is_temp_db)) + std::set result; + auto parse_list_element = [&] { - if (!parseName(pos, expected, type, is_temp_db, name)) + ASTPtr ast; + if (!ParserIdentifier{}.parse(pos, ast, expected)) return false; - } + result.insert(getIdentifierName(ast)); + return true; + }; + if (!ParserList::parseUtil(pos, expected, parse_list_element, false)) + return false; - DatabaseAndTableName new_name = name; - if (ParserKeyword{"AS"}.ignore(pos, expected)) - { - if ((type == ElementType::TABLE) || (type == ElementType::DATABASE && !is_temp_db)) - { - if (!parseName(pos, expected, type, is_temp_db, new_name)) - return false; - } - } - - ASTs partitions; - if (type == ElementType::TABLE) - parsePartitions(pos, expected, partitions); - - std::set except_list; - if ((type == ElementType::DATABASE) || (type == ElementType::ALL_DATABASES)) - { - bool parse_except_tables = ((type == ElementType::DATABASE) && !is_temp_db); - parseExceptList(pos, expected, parse_except_tables, except_list); - } - - entry.type = type; - entry.name = std::move(name); - entry.new_name = std::move(new_name); - entry.is_temp_db = is_temp_db; - entry.partitions = std::move(partitions); - entry.except_list = std::move(except_list); + except_databases = std::move(result); return true; }); } - bool parseElements(IParser::Pos & pos, Expected & expected, std::vector & elements) + bool parseExceptTables(IParser::Pos & pos, Expected & expected, const std::optional & database_name, std::set & except_tables) + { + return IParserBase::wrapParseImpl(pos, [&] + { + if (!ParserKeyword{"EXCEPT TABLE"}.ignore(pos, expected) && !ParserKeyword{"EXCEPT TABLES"}.ignore(pos, expected)) + return false; + + std::set result; + auto parse_list_element = [&] + { + DatabaseAndTableName table_name; + if (database_name) + { + ASTPtr ast; + if (!ParserIdentifier{}.parse(pos, ast, expected)) + return false; + table_name.first = *database_name; + table_name.second = getIdentifierName(ast); + } + else + { + if (!parseDatabaseAndTableName(pos, expected, table_name.first, table_name.second)) + return false; + } + + 
result.emplace(std::move(table_name)); + return true; + }; + if (!ParserList::parseUtil(pos, expected, parse_list_element, false)) + return false; + + except_tables = std::move(result); + return true; + }); + } + + bool parseElement(IParser::Pos & pos, Expected & expected, bool allow_all, Element & element) + { + return IParserBase::wrapParseImpl(pos, [&] + { + if (ParserKeyword{"TABLE"}.ignore(pos, expected) || ParserKeyword{"DICTIONARY"}.ignore(pos, expected) || + ParserKeyword{"VIEW"}.ignore(pos, expected)) + { + element.type = ElementType::TABLE; + if (!parseDatabaseAndTableName(pos, expected, element.database_name, element.table_name)) + return false; + + element.new_database_name = element.database_name; + element.new_table_name = element.table_name; + if (ParserKeyword("AS").ignore(pos, expected)) + { + if (!parseDatabaseAndTableName(pos, expected, element.new_database_name, element.new_table_name)) + return false; + } + + parsePartitions(pos, expected, element.partitions); + return true; + } + + if (ParserKeyword{"TEMPORARY TABLE"}.ignore(pos, expected)) + { + element.type = ElementType::TEMPORARY_TABLE; + + ASTPtr ast; + if (!ParserIdentifier{}.parse(pos, ast, expected)) + return false; + element.table_name = getIdentifierName(ast); + element.new_table_name = element.table_name; + + if (ParserKeyword("AS").ignore(pos, expected)) + { + ast = nullptr; + if (!ParserIdentifier{}.parse(pos, ast, expected)) + return false; + element.new_table_name = getIdentifierName(ast); + } + + return true; + } + + if (ParserKeyword{"DATABASE"}.ignore(pos, expected)) + { + element.type = ElementType::DATABASE; + + ASTPtr ast; + if (!ParserIdentifier{}.parse(pos, ast, expected)) + return false; + element.database_name = getIdentifierName(ast); + element.new_database_name = element.database_name; + + if (ParserKeyword("AS").ignore(pos, expected)) + { + ast = nullptr; + if (!ParserIdentifier{}.parse(pos, ast, expected)) + return false; + element.new_database_name = getIdentifierName(ast); + } + + parseExceptTables(pos, expected, element.database_name, element.except_tables); + return true; + } + + if (allow_all && ParserKeyword{"ALL"}.ignore(pos, expected)) + { + element.type = ElementType::ALL; + parseExceptDatabases(pos, expected, element.except_databases); + parseExceptTables(pos, expected, {}, element.except_tables); + return true; + } + + return false; + }); + } + + bool parseElements(IParser::Pos & pos, Expected & expected, bool allow_all, std::vector & elements) { return IParserBase::wrapParseImpl(pos, [&] { @@ -187,7 +189,7 @@ namespace auto parse_element = [&] { Element element; - if (parseElement(pos, expected, element)) + if (parseElement(pos, expected, allow_all, element)) { result.emplace_back(std::move(element)); return true; @@ -327,8 +329,11 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) else return false; + /// Disable "ALL" if this is a RESTORE command. + bool allow_all = (kind == Kind::RESTORE); + std::vector elements; - if (!parseElements(pos, expected, elements)) + if (!parseElements(pos, expected, allow_all, elements)) return false; String cluster; diff --git a/src/Parsers/ParserBackupQuery.h b/src/Parsers/ParserBackupQuery.h index ec63ea9ffa4..4c74666382c 100644 --- a/src/Parsers/ParserBackupQuery.h +++ b/src/Parsers/ParserBackupQuery.h @@ -10,8 +10,7 @@ namespace DB * DICTIONARY [db.]dictionary_name [AS [db.]dictionary_name_in_backup] | * DATABASE database_name [AS database_name_in_backup] [EXCEPT TABLES ...] 
| * TEMPORARY TABLE table_name [AS table_name_in_backup] | - * ALL TEMPORARY TABLES [EXCEPT ...] | - * ALL DATABASES [EXCEPT ...] } [,...] + * ALL [EXCEPT {TABLES|DATABASES}...] } [,...] * [ON CLUSTER 'cluster_name'] * TO { File('path/') | * Disk('disk_name', 'path/') @@ -21,8 +20,7 @@ namespace DB * DICTIONARY [db.]dictionary_name_in_backup [AS [db.]dictionary_name] | * DATABASE database_name_in_backup [AS database_name] [EXCEPT TABLES ...] | * TEMPORARY TABLE table_name_in_backup [AS table_name] | - * ALL TEMPORARY TABLES [EXCEPT ...] | - * ALL DATABASES [EXCEPT ...] } [,...] + * ALL [EXCEPT {TABLES|DATABASES} ...] } [,...] * [ON CLUSTER 'cluster_name'] * FROM {File(...) | Disk(...)} */ diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index e57cfece806..e4a3f87f288 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -466,7 +466,6 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe ParserCompoundIdentifier table_name_p(true, true); ParserKeyword s_from("FROM"); ParserKeyword s_on("ON"); - ParserKeyword s_as("AS"); ParserToken s_dot(TokenType::Dot); ParserToken s_lparen(TokenType::OpeningRoundBracket); ParserToken s_rparen(TokenType::ClosingRoundBracket); @@ -492,6 +491,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe bool or_replace = false; bool if_not_exists = false; bool is_temporary = false; + bool is_create_empty = false; if (s_create.ignore(pos, expected)) { @@ -557,6 +557,17 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe return true; } + auto need_parse_as_select = [&is_create_empty, &pos, &expected]() + { + if (ParserKeyword{"EMPTY AS"}.ignore(pos, expected)) + { + is_create_empty = true; + return true; + } + + return ParserKeyword{"AS"}.ignore(pos, expected); + }; + /// List of columns. if (s_lparen.ignore(pos, expected)) { @@ -568,7 +579,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe auto storage_parse_result = storage_p.parse(pos, storage, expected); - if (storage_parse_result && s_as.ignore(pos, expected)) + if (storage_parse_result && need_parse_as_select()) { if (!select_p.parse(pos, select, expected)) return false; @@ -576,7 +587,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe if (!storage_parse_result && !is_temporary) { - if (s_as.ignore(pos, expected) && !table_function_p.parse(pos, as_table_function, expected)) + if (need_parse_as_select() && !table_function_p.parse(pos, as_table_function, expected)) return false; } @@ -591,7 +602,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe storage_p.parse(pos, storage, expected); /// CREATE|ATTACH TABLE ... AS ... - if (s_as.ignore(pos, expected)) + if (need_parse_as_select()) { if (!select_p.parse(pos, select, expected)) /// AS SELECT ... 
{ @@ -660,6 +671,7 @@ bool ParserCreateTableQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expe tryGetIdentifierNameInto(as_database, query->as_database); tryGetIdentifierNameInto(as_table, query->as_table); query->set(query->select, select); + query->is_create_empty = is_create_empty; if (from_path) query->attach_from_path = from_path->as().value.get(); @@ -861,6 +873,7 @@ bool ParserCreateWindowViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & bool allowed_lateness = false; bool if_not_exists = false; bool is_populate = false; + bool is_create_empty = false; if (!s_create.ignore(pos, expected)) { @@ -944,6 +957,8 @@ bool ParserCreateWindowViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & if (s_populate.ignore(pos, expected)) is_populate = true; + else if (ParserKeyword{"EMPTY"}.ignore(pos, expected)) + is_create_empty = true; /// AS SELECT ... if (!s_as.ignore(pos, expected)) @@ -979,6 +994,7 @@ bool ParserCreateWindowViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & query->allowed_lateness = allowed_lateness; query->lateness_function = lateness; query->is_populate = is_populate; + query->is_create_empty = is_create_empty; tryGetIdentifierNameInto(as_database, query->as_database); tryGetIdentifierNameInto(as_table, query->as_table); @@ -1239,6 +1255,7 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec bool is_ordinary_view = false; bool is_materialized_view = false; bool is_populate = false; + bool is_create_empty = false; bool replace_view = false; if (!s_create.ignore(pos, expected)) @@ -1309,6 +1326,8 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec if (s_populate.ignore(pos, expected)) is_populate = true; + else if (ParserKeyword{"EMPTY"}.ignore(pos, expected)) + is_create_empty = true; } /// AS SELECT ... 
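The parser hunks above introduce an is_create_empty flag and accept an optional EMPTY keyword before AS SELECT in CREATE TABLE, [MATERIALIZED] VIEW and WINDOW VIEW. For orientation, a minimal standalone sketch of that parsing pattern follows (illustrative names only; this is not ClickHouse parser code, just the "try EMPTY AS first, fall back to AS" idea):

#include <cassert>
#include <string_view>

/// Tiny cursor over a query string; ignore() consumes a keyword plus trailing spaces.
struct Cursor
{
    std::string_view text;
    size_t pos = 0;

    bool ignore(std::string_view keyword)
    {
        if (text.substr(pos, keyword.size()) == keyword)
        {
            pos += keyword.size();
            while (pos < text.size() && text[pos] == ' ')
                ++pos;
            return true;
        }
        return false;
    }
};

int main()
{
    bool is_create_empty = false;
    Cursor cursor{"EMPTY AS SELECT 1"};

    /// Mirrors the need_parse_as_select lambda in the hunk above: "EMPTY AS" sets the flag,
    /// plain "AS" leaves it false; either way the caller then parses the SELECT.
    auto need_parse_as_select = [&]
    {
        if (cursor.ignore("EMPTY AS"))
        {
            is_create_empty = true;
            return true;
        }
        return cursor.ignore("AS");
    };

    assert(need_parse_as_select());
    assert(is_create_empty);
    assert(cursor.text.substr(cursor.pos) == "SELECT 1");
}

Either branch leaves the parser positioned at the SELECT, so the existing select_p.parse() call works unchanged; only the new flag is recorded on the AST.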
@@ -1328,6 +1347,7 @@ bool ParserCreateViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expec query->is_ordinary_view = is_ordinary_view; query->is_materialized_view = is_materialized_view; query->is_populate = is_populate; + query->is_create_empty = is_create_empty; query->replace_view = replace_view; auto * table_id = table->as(); diff --git a/src/Processors/Executors/ExecutionThreadContext.cpp b/src/Processors/Executors/ExecutionThreadContext.cpp index d77fe6138cd..5a5c1826c61 100644 --- a/src/Processors/Executors/ExecutionThreadContext.cpp +++ b/src/Processors/Executors/ExecutionThreadContext.cpp @@ -103,7 +103,6 @@ bool ExecutionThreadContext::executeTask() #endif span.addAttribute("thread_number", thread_number); - span.addAttribute("processor.description", node->processor->getDescription()); return node->exception == nullptr; } diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp index c8e8cf900f4..5232d9166af 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp @@ -22,7 +22,6 @@ namespace DB namespace ErrorCodes { - extern const int UNKNOWN_FILE_SIZE; extern const int INCORRECT_DATA; } @@ -64,8 +63,6 @@ arrow::Result RandomAccessFileFromSeekableReadBuffer::GetSize() { if (isBufferWithFileSize(in)) file_size = getFileSizeFromReadBuffer(in); - if (!file_size) - throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out size of file"); } return arrow::Result(*file_size); } diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp index 1d0be726c16..68170924840 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp @@ -22,7 +22,7 @@ void IMergingAlgorithmWithDelayedChunk::initializeQueue(Inputs inputs) header, current_inputs[source_num].chunk.getColumns(), description, source_num, current_inputs[source_num].permutation); } - queue = SortingHeap(cursors); + queue = SortingQueue(cursors); } void IMergingAlgorithmWithDelayedChunk::updateCursor(Input & input, size_t source_num) diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h index e9f735f4a71..a8c5730cdb3 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h @@ -13,7 +13,7 @@ public: IMergingAlgorithmWithDelayedChunk(Block header_, size_t num_inputs, SortDescription description_); protected: - SortingHeap queue; + SortingQueue queue; SortDescription description; /// Previous row. May refer to last_chunk_sort_columns or row from source_chunks. 
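The merging-algorithm hunks below replace the row-at-a-time SortingHeap with a batch-oriented SortingQueue: the cursor at the top of the queue reports how many of its rows can be emitted in one go, and the merger copies that whole range at once instead of one row per iteration. A toy, self-contained sketch of the idea (plain integer runs and illustrative helper names, not ClickHouse cursors or the SortingQueue API):

#include <algorithm>
#include <cassert>
#include <limits>
#include <vector>

/// Number of leading elements of `run` (starting at `offset`) that are <= `bound`.
static size_t batchSize(const std::vector<int> & run, size_t offset, int bound)
{
    size_t n = offset;
    while (n < run.size() && run[n] <= bound)
        ++n;
    return n - offset;
}

int main()
{
    std::vector<std::vector<int>> runs = {{1, 2, 3, 10}, {4, 5, 6}, {7, 8, 9}};
    std::vector<size_t> pos(runs.size(), 0);
    std::vector<int> merged;

    while (true)
    {
        /// Pick the run with the smallest current head (a real implementation keeps a heap/queue).
        size_t best = runs.size();
        for (size_t i = 0; i < runs.size(); ++i)
            if (pos[i] < runs[i].size() && (best == runs.size() || runs[i][pos[i]] < runs[best][pos[best]]))
                best = i;
        if (best == runs.size())
            break;

        /// The smallest head among the other runs bounds how far this run may advance in one batch.
        int bound = std::numeric_limits<int>::max();
        for (size_t i = 0; i < runs.size(); ++i)
            if (i != best && pos[i] < runs[i].size())
                bound = std::min(bound, runs[i][pos[i]]);

        size_t batch = std::max<size_t>(1, batchSize(runs[best], pos[best], bound));
        merged.insert(merged.end(), runs[best].begin() + pos[best], runs[best].begin() + pos[best] + batch);
        pos[best] += batch;
    }

    assert(std::is_sorted(merged.begin(), merged.end()));
    assert(merged.size() == 10);
}

The same bound-by-the-next-cursor reasoning is what lets mergeBatchImpl below call insertRows()/insertChunk() with a whole range, trimming the batch only for skip_last_row, max_block_size and LIMIT.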
diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index 2e87de1ae29..8c94a017271 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -43,7 +43,7 @@ void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) source.chunk->sort_columns = cursors[source_num].sort_columns; } - queue = SortingHeap(cursors); + queue = SortingQueue(cursors); } void IMergingAlgorithmWithSharedChunks::consume(Input & input, size_t source_num) diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h index 32ef23ab6e5..58c445b6ac4 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h @@ -36,7 +36,7 @@ protected: using Sources = std::vector; Sources sources; - SortingHeap queue; + SortingQueue queue; /// Used in Vertical merge algorithm to gather non-PK/non-index columns (on next step) /// If it is not nullptr then it should be populated during execution diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index 89da346980d..6029809f0f2 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -38,27 +38,41 @@ public: sum_blocks_granularity += block_size; } - void insertFromChunk(Chunk && chunk, size_t limit_rows) + void insertRows(const ColumnRawPtrs & raw_columns, size_t start_index, size_t length, size_t block_size) + { + size_t num_columns = raw_columns.size(); + for (size_t i = 0; i < num_columns; ++i) + { + if (length == 1) + columns[i]->insertFrom(*raw_columns[i], start_index); + else + columns[i]->insertRangeFrom(*raw_columns[i], start_index, length); + } + + total_merged_rows += length; + merged_rows += length; + sum_blocks_granularity += (block_size * length); + } + + void insertChunk(Chunk && chunk, size_t rows_size) { if (merged_rows) throw Exception("Cannot insert to MergedData from Chunk because MergedData is not empty.", ErrorCodes::LOGICAL_ERROR); - auto num_rows = chunk.getNumRows(); + UInt64 num_rows = chunk.getNumRows(); columns = chunk.mutateColumns(); - if (limit_rows && num_rows > limit_rows) + + if (rows_size < num_rows) { - num_rows = limit_rows; + size_t pop_size = num_rows - rows_size; for (auto & column : columns) - column = IColumn::mutate(column->cut(0, num_rows)); + column->popBack(pop_size); } need_flush = true; - total_merged_rows += num_rows; - merged_rows = num_rows; - - /// We don't care about granularity here. Because, for fast-forward optimization, chunk will be moved as-is. 
- /// sum_blocks_granularity += block_size * num_rows; + total_merged_rows += rows_size; + merged_rows = rows_size; } Chunk pull() @@ -107,6 +121,7 @@ public: UInt64 totalMergedRows() const { return total_merged_rows; } UInt64 totalChunks() const { return total_chunks; } UInt64 totalAllocatedBytes() const { return total_allocated_bytes; } + UInt64 maxBlockSize() const { return max_block_size; } protected: MutableColumns columns; diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index 627ac3f873c..d35d267731e 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -7,11 +7,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - MergingSortedAlgorithm::MergingSortedAlgorithm( Block header_, size_t num_inputs, @@ -74,7 +69,7 @@ void MergingSortedAlgorithm::initialize(Inputs inputs) cursors[source_num] = SortCursorImpl(header, chunk.getColumns(), description, source_num); } - queue_variants.callOnVariant([&](auto & queue) + queue_variants.callOnBatchVariant([&](auto & queue) { using QueueType = std::decay_t; queue = QueueType(cursors); @@ -87,7 +82,7 @@ void MergingSortedAlgorithm::consume(Input & input, size_t source_num) current_inputs[source_num].swap(input); cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), header); - queue_variants.callOnVariant([&](auto & queue) + queue_variants.callOnBatchVariant([&](auto & queue) { queue.push(cursors[source_num]); }); @@ -95,16 +90,17 @@ void MergingSortedAlgorithm::consume(Input & input, size_t source_num) IMergingAlgorithm::Status MergingSortedAlgorithm::merge() { - IMergingAlgorithm::Status result = queue_variants.callOnVariant([&](auto & queue) + IMergingAlgorithm::Status result = queue_variants.callOnBatchVariant([&](auto & queue) { - return mergeImpl(queue); + return mergeBatchImpl(queue); }); return result; } -template -IMergingAlgorithm::Status MergingSortedAlgorithm::mergeImpl(TSortingHeap & queue) + +template +IMergingAlgorithm::Status MergingSortedAlgorithm::mergeBatchImpl(TSortingQueue & queue) { /// Take rows in required order and put them into `merged_data`, while the rows are no more than `max_block_size` while (queue.isValid()) @@ -112,64 +108,100 @@ IMergingAlgorithm::Status MergingSortedAlgorithm::mergeImpl(TSortingHeap & queue if (merged_data.hasEnoughRows()) return Status(merged_data.pull()); - auto current = queue.current(); + auto [current_ptr, initial_batch_size] = queue.current(); + auto current = *current_ptr; - if (current.impl->isLast() && current_inputs[current.impl->order].skip_last_row) + bool batch_skip_last_row = false; + if (current.impl->isLast(initial_batch_size) && current_inputs[current.impl->order].skip_last_row) { - /// Get the next block from the corresponding source, if there is one. - queue.removeTop(); - return Status(current.impl->order); + batch_skip_last_row = true; + + if (initial_batch_size == 1) + { + /// Get the next block from the corresponding source, if there is one. + queue.removeTop(); + return Status(current.impl->order); + } } - /** And what if the block is totally less or equal than the rest for the current cursor? - * Or is there only one data source left in the queue? Then you can take the entire block on current cursor. - */ - if (current.impl->isFirst() - && !current_inputs[current.impl->order].skip_last_row /// Ignore optimization if last row should be skipped. 
- && (queue.size() == 1 - || (queue.size() >= 2 && current.totallyLessOrEquals(queue.nextChild())))) - { - //std::cerr << "current block is totally less or equals\n"; + UInt64 merged_rows = merged_data.mergedRows(); + size_t updated_batch_size = initial_batch_size; - /// If there are already data in the current block, we first return it. - /// We'll get here again the next time we call the merge function. + if (merged_rows + updated_batch_size > merged_data.maxBlockSize()) + { + batch_skip_last_row = false; + updated_batch_size -= merged_rows + updated_batch_size - merged_data.maxBlockSize(); + } + + bool limit_reached = false; + if (limit && merged_rows + updated_batch_size > limit) + { + batch_skip_last_row = false; + updated_batch_size -= merged_rows + updated_batch_size - limit; + limit_reached = true; + } + + if (unlikely(current.impl->isFirst() && current.impl->isLast(initial_batch_size))) + { + /** This is special optimization if current cursor is totally less than next cursor. + * We want to insert current cursor chunk directly in merged data. + * + * First if merged_data is not empty we need to flush it. + * We will get into the same condition on next mergeBatch call. + * + * Then we can insert chunk directly in merged data. + */ if (merged_data.mergedRows() != 0) - { - //std::cerr << "merged rows is non-zero\n"; - // merged_data.flush(); return Status(merged_data.pull()); + + size_t source_num = current.impl->order; + size_t insert_rows_size = initial_batch_size - static_cast(batch_skip_last_row); + merged_data.insertChunk(std::move(current_inputs[source_num].chunk), insert_rows_size); + current_inputs[source_num].chunk = Chunk(); + + if (out_row_sources_buf) + { + RowSourcePart row_source(current.impl->order); + + for (size_t i = 0; i < insert_rows_size; ++i) + out_row_sources_buf->write(row_source.data); } - /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) - size_t source_num = current.impl->order; + if (limit_reached) + break; + + /// We will get the next block from the corresponding source, if there is one. queue.removeTop(); - return insertFromChunk(source_num); + + auto result = Status(merged_data.pull(), limit_reached); + if (!limit_reached) + result.required_source = source_num; + + return result; } - //std::cerr << "total_merged_rows: " << total_merged_rows << ", merged_rows: " << merged_rows << "\n"; - //std::cerr << "Inserting row\n"; - merged_data.insertRow(current->all_columns, current->getRow(), current->rows); + size_t insert_rows_size = updated_batch_size - static_cast(batch_skip_last_row); + merged_data.insertRows(current->all_columns, current->getRow(), insert_rows_size, current->rows); if (out_row_sources_buf) { - /// Actually, current.impl->order stores source number (i.e. cursors[current.impl->order] == current.impl) RowSourcePart row_source(current.impl->order); - out_row_sources_buf->write(row_source.data); + + for (size_t i = 0; i < insert_rows_size; ++i) + out_row_sources_buf->write(row_source.data); } - if (limit && merged_data.totalMergedRows() >= limit) - return Status(merged_data.pull(), true); + if (limit_reached) + break; - if (!current->isLast()) + if (!current->isLast(updated_batch_size)) { - //std::cerr << "moving to next row\n"; - queue.next(); + queue.next(updated_batch_size); } else { /// We will get the next block from the corresponding source, if there is one. 
queue.removeTop(); - //std::cerr << "It was last row, fetching next block\n"; return Status(current.impl->order); } } @@ -177,43 +209,4 @@ IMergingAlgorithm::Status MergingSortedAlgorithm::mergeImpl(TSortingHeap & queue return Status(merged_data.pull(), true); } -IMergingAlgorithm::Status MergingSortedAlgorithm::insertFromChunk(size_t source_num) -{ - if (source_num >= cursors.size()) - throw Exception("Logical error in MergingSortedTransform", ErrorCodes::LOGICAL_ERROR); - - //std::cerr << "copied columns\n"; - - auto num_rows = current_inputs[source_num].chunk.getNumRows(); - - UInt64 total_merged_rows_after_insertion = merged_data.mergedRows() + num_rows; - bool is_finished = limit && total_merged_rows_after_insertion >= limit; - - if (limit && total_merged_rows_after_insertion > limit) - { - num_rows -= total_merged_rows_after_insertion - limit; - merged_data.insertFromChunk(std::move(current_inputs[source_num].chunk), num_rows); - } - else - merged_data.insertFromChunk(std::move(current_inputs[source_num].chunk), 0); - - current_inputs[source_num].chunk = Chunk(); - - /// Write order of rows for other columns - /// this data will be used in gather stream - if (out_row_sources_buf) - { - RowSourcePart row_source(source_num); - for (size_t i = 0; i < num_rows; ++i) - out_row_sources_buf->write(row_source.data); - } - - auto status = Status(merged_data.pull(), is_finished); - - if (!is_finished) - status.required_source = source_num; - - return status; -} - } diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h index 44041775a74..9e517120f38 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h @@ -51,10 +51,9 @@ private: SortQueueVariants queue_variants; - Status insertFromChunk(size_t source_num); + template + Status mergeBatchImpl(TSortingQueue & queue); - template - Status mergeImpl(TSortingHeap & queue); }; } diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp index 17a0498fb7e..28f821d6f3f 100644 --- a/src/Processors/QueryPlan/AggregatingStep.cpp +++ b/src/Processors/QueryPlan/AggregatingStep.cpp @@ -69,6 +69,7 @@ AggregatingStep::AggregatingStep( Aggregator::Params params_, GroupingSetsParamsList grouping_sets_params_, bool final_, + bool only_merge_, size_t max_block_size_, size_t aggregation_in_order_max_block_bytes_, size_t merge_threads_, @@ -79,7 +80,8 @@ AggregatingStep::AggregatingStep( : ITransformingStep(input_stream_, appendGroupingColumn(params_.getHeader(final_), grouping_sets_params_), getTraits(), false) , params(std::move(params_)) , grouping_sets_params(std::move(grouping_sets_params_)) - , final(std::move(final_)) + , final(final_) + , only_merge(only_merge_) , max_block_size(max_block_size_) , aggregation_in_order_max_block_bytes(aggregation_in_order_max_block_bytes_) , merge_threads(merge_threads_) @@ -119,7 +121,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B * 1. Parallel aggregation is done, and the results should be merged in parallel. * 2. An aggregation is done with store of temporary data on the disk, and they need to be merged in a memory efficient way. 
*/ - auto transform_params = std::make_shared(std::move(params), final); + auto transform_params = std::make_shared(std::move(params), final, only_merge); if (!grouping_sets_params.empty()) { @@ -169,7 +171,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B transform_params->params.intermediate_header, transform_params->params.stats_collecting_params }; - auto transform_params_for_set = std::make_shared(std::move(params_for_set), final); + auto transform_params_for_set = std::make_shared(std::move(params_for_set), final, only_merge); if (streams > 1) { diff --git a/src/Processors/QueryPlan/AggregatingStep.h b/src/Processors/QueryPlan/AggregatingStep.h index 4dd3d956350..1be74da583a 100644 --- a/src/Processors/QueryPlan/AggregatingStep.h +++ b/src/Processors/QueryPlan/AggregatingStep.h @@ -7,9 +7,6 @@ namespace DB { -struct AggregatingTransformParams; -using AggregatingTransformParamsPtr = std::shared_ptr; - struct GroupingSetsParams { GroupingSetsParams() = default; @@ -36,6 +33,7 @@ public: Aggregator::Params params_, GroupingSetsParamsList grouping_sets_params_, bool final_, + bool only_merge_, size_t max_block_size_, size_t aggregation_in_order_max_block_bytes_, size_t merge_threads_, @@ -59,6 +57,7 @@ private: Aggregator::Params params; GroupingSetsParamsList grouping_sets_params; bool final; + bool only_merge; size_t max_block_size; size_t aggregation_in_order_max_block_bytes; size_t merge_threads; diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 1d5b83dc9d0..d6ae3a276b2 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -48,15 +48,20 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes /// May split ExpressionStep and lift up only a part of it. size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); +/// Utilize storage sorting when sorting for window functions. +/// Update information about prefix sort description in SortingStep. 
+size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); + inline const auto & getOptimizations() { - static const std::array optimizations = {{ + static const std::array optimizations = {{ {tryLiftUpArrayJoin, "liftUpArrayJoin", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownLimit, "pushDownLimit", &QueryPlanOptimizationSettings::optimize_plan}, {trySplitFilter, "splitFilter", &QueryPlanOptimizationSettings::optimize_plan}, {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, + {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_plan} }}; return optimizations; diff --git a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp new file mode 100644 index 00000000000..4d3c268ab7e --- /dev/null +++ b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp @@ -0,0 +1,113 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB::QueryPlanOptimizations +{ + +size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, QueryPlan::Nodes & /*nodes*/) +{ + /// Find the following sequence of steps, add InputOrderInfo and apply prefix sort description to + /// SortingStep: + /// WindowStep <- SortingStep <- [Expression] <- [SettingQuotaAndLimits] <- ReadFromMergeTree + + auto * window_node = parent_node; + auto * window = typeid_cast(window_node->step.get()); + if (!window) + return 0; + if (window_node->children.size() != 1) + return 0; + + auto * sorting_node = window_node->children.front(); + auto * sorting = typeid_cast(sorting_node->step.get()); + if (!sorting) + return 0; + if (sorting_node->children.size() != 1) + return 0; + + auto * possible_read_from_merge_tree_node = sorting_node->children.front(); + + if (typeid_cast(possible_read_from_merge_tree_node->step.get())) + { + if (possible_read_from_merge_tree_node->children.size() != 1) + return 0; + + possible_read_from_merge_tree_node = possible_read_from_merge_tree_node->children.front(); + } + + auto * read_from_merge_tree = typeid_cast(possible_read_from_merge_tree_node->step.get()); + if (!read_from_merge_tree) + { + return 0; + } + + auto context = read_from_merge_tree->getContext(); + if (!context->getSettings().optimize_read_in_window_order) + { + return 0; + } + + const auto & query_info = read_from_merge_tree->getQueryInfo(); + const auto * select_query = query_info.query->as(); + + ManyExpressionActions order_by_elements_actions; + const auto & window_desc = window->getWindowDescription(); + + for (const auto & actions_dag : window_desc.partition_by_actions) + { + order_by_elements_actions.emplace_back( + std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(context, CompileExpressions::yes))); + } + + for (const auto & actions_dag : window_desc.order_by_actions) + { + order_by_elements_actions.emplace_back( + std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(context, CompileExpressions::yes))); + } + + 
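/// Build a read-in-order optimizer from the window's PARTITION BY / ORDER BY
+    /// expressions gathered above and push the resulting sort order down to the
+    /// reading step, so rows already arrive in the order the window functions need.
+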
auto order_optimizer = std::make_shared( + *select_query, + order_by_elements_actions, + window->getWindowDescription().full_sort_description, + query_info.syntax_analyzer_result); + + read_from_merge_tree->setQueryInfoOrderOptimizer(order_optimizer); + + /// If we don't have filtration, we can pushdown limit to reading stage for optimizations. + UInt64 limit = (select_query->hasFiltration() || select_query->groupBy()) ? 0 : InterpreterSelectQuery::getLimitForSorting(*select_query, context); + + auto order_info = order_optimizer->getInputOrder( + query_info.projection ? query_info.projection->desc->metadata : read_from_merge_tree->getStorageMetadata(), + context, + limit); + + if (order_info) + { + read_from_merge_tree->setQueryInfoInputOrderInfo(order_info); + sorting->convertToFinishSorting(order_info->order_key_prefix_descr); + } + + return 0; +} + +} diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 8adaf2f1027..cc9f6a0faef 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -982,6 +982,30 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( return std::make_shared(MergeTreeDataSelectAnalysisResult{.result = std::move(result)}); } +void ReadFromMergeTree::setQueryInfoOrderOptimizer(std::shared_ptr order_optimizer) +{ + if (query_info.projection) + { + query_info.projection->order_optimizer = order_optimizer; + } + else + { + query_info.order_optimizer = order_optimizer; + } +} + +void ReadFromMergeTree::setQueryInfoInputOrderInfo(InputOrderInfoPtr order_info) +{ + if (query_info.projection) + { + query_info.projection->input_order_info = order_info; + } + else + { + query_info.input_order_info = order_info; + } +} + ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const { auto result_ptr = analyzed_result_ptr ? 
analyzed_result_ptr : selectRangesToRead(prepared_parts); @@ -1065,7 +1089,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons column_names_to_read, result_projection); } - else if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order) && input_order_info) + else if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order || settings.optimize_read_in_window_order) && input_order_info) { pipe = spreadMarkRangesAmongStreamsWithOrder( std::move(result.parts_with_ranges), diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 6846506f260..5a543497ed0 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -129,6 +129,13 @@ public: bool sample_factor_column_queried, Poco::Logger * log); + ContextPtr getContext() const { return context; } + const SelectQueryInfo & getQueryInfo() const { return query_info; } + StorageMetadataPtr getStorageMetadata() const { return metadata_for_reading; } + + void setQueryInfoOrderOptimizer(std::shared_ptr read_in_order_optimizer); + void setQueryInfoInputOrderInfo(InputOrderInfoPtr order_info); + private: const MergeTreeReaderSettings reader_settings; diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 2d8fcf3d6cc..d7e2e673f0f 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -112,6 +112,12 @@ void SortingStep::updateLimit(size_t limit_) } } +void SortingStep::convertToFinishSorting(SortDescription prefix_description_) +{ + type = Type::FinishSorting; + prefix_description = std::move(prefix_description_); +} + void SortingStep::transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { if (type == Type::FinishSorting) diff --git a/src/Processors/QueryPlan/SortingStep.h b/src/Processors/QueryPlan/SortingStep.h index 1738d8d4e45..23a31b14093 100644 --- a/src/Processors/QueryPlan/SortingStep.h +++ b/src/Processors/QueryPlan/SortingStep.h @@ -54,6 +54,8 @@ public: SortDescription getSortDescription() const { return result_description; } + void convertToFinishSorting(SortDescription prefix_description); + private: enum class Type diff --git a/src/Processors/QueryPlan/WindowStep.cpp b/src/Processors/QueryPlan/WindowStep.cpp index df42ca9e60f..48d16ed321f 100644 --- a/src/Processors/QueryPlan/WindowStep.cpp +++ b/src/Processors/QueryPlan/WindowStep.cpp @@ -138,4 +138,9 @@ void WindowStep::describeActions(JSONBuilder::JSONMap & map) const map.add("Functions", std::move(functions_array)); } +const WindowDescription & WindowStep::getWindowDescription() const +{ + return window_description; +} + } diff --git a/src/Processors/QueryPlan/WindowStep.h b/src/Processors/QueryPlan/WindowStep.h index a65b157f481..9b58cceb972 100644 --- a/src/Processors/QueryPlan/WindowStep.h +++ b/src/Processors/QueryPlan/WindowStep.h @@ -25,6 +25,8 @@ public: void describeActions(JSONBuilder::JSONMap & map) const override; void describeActions(FormatSettings & settings) const override; + const WindowDescription & getWindowDescription() const; + private: WindowDescription window_description; std::vector window_functions; diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 7491dda8164..f435d46a066 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ 
b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -31,6 +31,7 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( , max_block_size(max_block_size_) , max_block_bytes(max_block_bytes_) , params(std::move(params_)) + , aggregates_mask(getAggregatesMask(params->getHeader(), params->params.aggregates)) , group_by_info(group_by_info_) , sort_description(group_by_description_) , aggregate_columns(params->params.aggregates_size) @@ -66,6 +67,7 @@ static Int64 getCurrentMemoryUsage() void AggregatingInOrderTransform::consume(Chunk chunk) { + const Columns & columns = chunk.getColumns(); Int64 initial_memory_usage = getCurrentMemoryUsage(); size_t rows = chunk.getNumRows(); @@ -85,7 +87,7 @@ void AggregatingInOrderTransform::consume(Chunk chunk) Columns key_columns(params->params.keys_size); for (size_t i = 0; i < params->params.keys_size; ++i) { - materialized_columns.push_back(chunk.getColumns().at(params->params.keys[i])->convertToFullColumnIfConst()); + materialized_columns.push_back(columns.at(params->params.keys[i])->convertToFullColumnIfConst()); key_columns[i] = materialized_columns.back(); if (group_by_key) key_columns_raw[i] = materialized_columns.back().get(); @@ -93,7 +95,7 @@ void AggregatingInOrderTransform::consume(Chunk chunk) Aggregator::NestedColumnsHolder nested_columns_holder; Aggregator::AggregateFunctionInstructions aggregate_function_instructions; - params->aggregator.prepareAggregateInstructions(chunk.getColumns(), aggregate_columns, materialized_columns, aggregate_function_instructions, nested_columns_holder); + params->aggregator.prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_function_instructions, nested_columns_holder); size_t key_end = 0; size_t key_begin = 0; @@ -120,6 +122,17 @@ void AggregatingInOrderTransform::consume(Chunk chunk) Int64 current_memory_usage = 0; + Aggregator::AggregateColumnsConstData aggregate_columns_data(params->params.aggregates_size); + if (params->only_merge) + { + for (size_t i = 0, j = 0; i < columns.size(); ++i) + { + if (!aggregates_mask[i]) + continue; + aggregate_columns_data[j++] = &typeid_cast(*columns[i]).getData(); + } + } + /// Will split block into segments with the same key while (key_end != rows) { @@ -136,10 +149,20 @@ void AggregatingInOrderTransform::consume(Chunk chunk) /// Add data to aggr. state if interval is not empty. Empty when haven't found current key in new block. 
if (key_begin != key_end) { - if (group_by_key) - params->aggregator.executeOnBlockSmall(variants, key_begin, key_end, key_columns_raw, aggregate_function_instructions.data()); + if (params->only_merge) + { + if (group_by_key) + params->aggregator.mergeOnBlockSmall(variants, key_begin, key_end, aggregate_columns_data, key_columns_raw); + else + params->aggregator.mergeOnIntervalWithoutKeyImpl(variants, key_begin, key_end, aggregate_columns_data); + } else - params->aggregator.executeOnIntervalWithoutKeyImpl(variants, key_begin, key_end, aggregate_function_instructions.data(), variants.aggregates_pool); + { + if (group_by_key) + params->aggregator.executeOnBlockSmall(variants, key_begin, key_end, key_columns_raw, aggregate_function_instructions.data()); + else + params->aggregator.executeOnIntervalWithoutKeyImpl(variants, key_begin, key_end, aggregate_function_instructions.data()); + } } current_memory_usage = getCurrentMemoryUsage() - initial_memory_usage; diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.h b/src/Processors/Transforms/AggregatingInOrderTransform.h index 9632b107463..ee9ab0f4b79 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.h +++ b/src/Processors/Transforms/AggregatingInOrderTransform.h @@ -56,6 +56,7 @@ private: MutableColumns res_aggregate_columns; AggregatingTransformParamsPtr params; + ColumnsMask aggregates_mask; InputOrderInfoPtr group_by_info; /// For sortBlock() diff --git a/src/Processors/Transforms/AggregatingTransform.h b/src/Processors/Transforms/AggregatingTransform.h index bfc3904e5d8..8d62664da59 100644 --- a/src/Processors/Transforms/AggregatingTransform.h +++ b/src/Processors/Transforms/AggregatingTransform.h @@ -34,21 +34,24 @@ struct AggregatingTransformParams AggregatorListPtr aggregator_list_ptr; Aggregator & aggregator; bool final; + /// Merge data for aggregate projections. 
bool only_merge = false; - AggregatingTransformParams(const Aggregator::Params & params_, bool final_) + AggregatingTransformParams(const Aggregator::Params & params_, bool final_, bool only_merge_) : params(params_) , aggregator_list_ptr(std::make_shared()) , aggregator(*aggregator_list_ptr->emplace(aggregator_list_ptr->end(), params)) , final(final_) + , only_merge(only_merge_) { } - AggregatingTransformParams(const Aggregator::Params & params_, const AggregatorListPtr & aggregator_list_ptr_, bool final_) + AggregatingTransformParams(const Aggregator::Params & params_, const AggregatorListPtr & aggregator_list_ptr_, bool final_, bool only_merge_) : params(params_) , aggregator_list_ptr(aggregator_list_ptr_) , aggregator(*aggregator_list_ptr->emplace(aggregator_list_ptr->end(), params)) , final(final_) + , only_merge(only_merge_) { } diff --git a/src/Processors/Transforms/PostgreSQLSource.cpp b/src/Processors/Transforms/PostgreSQLSource.cpp index 6926ac26bbc..77c2fc41aa1 100644 --- a/src/Processors/Transforms/PostgreSQLSource.cpp +++ b/src/Processors/Transforms/PostgreSQLSource.cpp @@ -1,4 +1,5 @@ #include "PostgreSQLSource.h" +#include "Common/Exception.h" #if USE_LIBPQXX #include @@ -22,6 +23,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TOO_MANY_COLUMNS; +} template PostgreSQLSource::PostgreSQLSource( @@ -123,6 +128,11 @@ Chunk PostgreSQLSource::generate() if (!row) break; + if (row->size() > description.sample_block.columns()) + throw Exception(ErrorCodes::TOO_MANY_COLUMNS, + "Row has too many columns: {}, expected structure: {}", + row->size(), description.sample_block.dumpStructure()); + for (const auto idx : collections::range(0, row->size())) { const auto & sample = description.sample_block.getByPosition(idx); diff --git a/src/Processors/Transforms/SortingTransform.cpp b/src/Processors/Transforms/SortingTransform.cpp index 97935bffd0c..603ee06b203 100644 --- a/src/Processors/Transforms/SortingTransform.cpp +++ b/src/Processors/Transforms/SortingTransform.cpp @@ -26,8 +26,11 @@ MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & : chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_), queue_variants(header, description) { Chunks nonempty_chunks; - for (auto & chunk : chunks) + size_t chunks_size = chunks.size(); + + for (size_t chunk_index = 0; chunk_index < chunks_size; ++chunk_index) { + auto & chunk = chunks[chunk_index]; if (chunk.getNumRows() == 0) continue; @@ -36,7 +39,7 @@ MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & /// which can be inefficient. 
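Aside on the MergeSorter changes that follow: mergeImpl() becomes mergeBatchImpl(), taking whole runs of rows from the winning cursor and clamping each batch so an output block never exceeds max_merged_block_size and the total never exceeds the LIMIT. A standalone sketch of that clamping arithmetic with assumed example values (not part of the patch):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const size_t max_merged_block_size = 8192;
        const size_t limit = 10000;          // 0 would mean "no LIMIT"

        size_t merged_rows = 8000;           // rows already in the current output block
        size_t total_merged_rows = 9990;     // rows produced so far for the whole stream
        size_t batch_size = 500;             // rows offered by the current cursor

        if (merged_rows + batch_size > max_merged_block_size)
            batch_size -= merged_rows + batch_size - max_merged_block_size;

        bool limit_reached = false;
        if (limit && total_merged_rows + batch_size > limit)
        {
            batch_size -= total_merged_rows + batch_size - limit;
            limit_reached = true;
        }

        assert(batch_size == 10 && limit_reached);
        return 0;
    }
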
convertToFullIfSparse(chunk); - cursors.emplace_back(header, chunk.getColumns(), description); + cursors.emplace_back(header, chunk.getColumns(), description, chunk_index); has_collation |= cursors.back().has_collation; nonempty_chunks.emplace_back(std::move(chunk)); @@ -44,7 +47,7 @@ MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & chunks.swap(nonempty_chunks); - queue_variants.callOnVariant([&](auto & queue) + queue_variants.callOnBatchVariant([&](auto & queue) { using QueueType = std::decay_t; queue = QueueType(cursors); @@ -64,17 +67,17 @@ Chunk MergeSorter::read() return res; } - Chunk result = queue_variants.callOnVariant([&](auto & queue) + Chunk result = queue_variants.callOnBatchVariant([&](auto & queue) { - return mergeImpl(queue); + return mergeBatchImpl(queue); }); return result; } -template -Chunk MergeSorter::mergeImpl(TSortingHeap & queue) +template +Chunk MergeSorter::mergeBatchImpl(TSortingQueue & queue) { size_t num_columns = chunks[0].getNumColumns(); MutableColumns merged_columns = chunks[0].cloneEmptyColumns(); @@ -82,38 +85,53 @@ Chunk MergeSorter::mergeImpl(TSortingHeap & queue) /// Reserve if (queue.isValid()) { - /// The expected size of output block is the same as input block - size_t size_to_reserve = chunks[0].getNumRows(); + /// The size of output block will not be larger than the `max_merged_block_size`. + /// If redundant memory space is reserved, `MemoryTracker` will count more memory usage than actual usage. + size_t size_to_reserve = std::min(static_cast(chunks[0].getNumRows()), max_merged_block_size); for (auto & column : merged_columns) column->reserve(size_to_reserve); } - /// TODO: Optimization when a single block left. - /// Take rows from queue in right order and push to 'merged'. size_t merged_rows = 0; while (queue.isValid()) { - auto current = queue.current(); + auto [current_ptr, batch_size] = queue.current(); + auto & current = *current_ptr; - /// Append a row from queue. + if (merged_rows + batch_size > max_merged_block_size) + batch_size -= merged_rows + batch_size - max_merged_block_size; + + bool limit_reached = false; + if (limit && total_merged_rows + batch_size > limit) + { + batch_size -= total_merged_rows + batch_size - limit; + limit_reached = true; + } + + /// Append rows from queue. for (size_t i = 0; i < num_columns; ++i) - merged_columns[i]->insertFrom(*current->all_columns[i], current->getRow()); + { + if (batch_size == 1) + merged_columns[i]->insertFrom(*current->all_columns[i], current->getRow()); + else + merged_columns[i]->insertRangeFrom(*current->all_columns[i], current->getRow(), batch_size); + } - ++total_merged_rows; - ++merged_rows; + total_merged_rows += batch_size; + merged_rows += batch_size; /// We don't need more rows because of limit has reached. - if (limit && total_merged_rows == limit) + if (limit_reached) { chunks.clear(); break; } - queue.next(); + queue.next(batch_size); /// It's enough for current output block but we will continue. - if (merged_rows == max_merged_block_size) + if (merged_rows >= max_merged_block_size) break; } diff --git a/src/Processors/Transforms/SortingTransform.h b/src/Processors/Transforms/SortingTransform.h index a5945ed39fc..a607e52550d 100644 --- a/src/Processors/Transforms/SortingTransform.h +++ b/src/Processors/Transforms/SortingTransform.h @@ -32,10 +32,11 @@ private: bool has_collation = false; /** Two different cursors are supported - with and without Collation. - * Templates are used (instead of virtual functions in SortCursor) for zero-overhead. 
+ * Templates are used (instead of virtual functions in SortCursor) for zero-overhead. */ - template - Chunk mergeImpl(TSortingHeap & queue); + template + Chunk mergeBatchImpl(TSortingQueue & queue); + }; diff --git a/src/QueryPipeline/RemoteInserter.cpp b/src/QueryPipeline/RemoteInserter.cpp index d5cef72b020..380594d46cf 100644 --- a/src/QueryPipeline/RemoteInserter.cpp +++ b/src/QueryPipeline/RemoteInserter.cpp @@ -47,10 +47,26 @@ RemoteInserter::RemoteInserter( } } + Settings settings = settings_; + /// With current protocol it is impossible to avoid deadlock in case of send_logs_level!=none. + /// + /// RemoteInserter send Data blocks/packets to the remote shard, + /// while remote side can send Log packets to the initiator (this RemoteInserter instance). + /// + /// But it is not enough to pull Log packets just before writing the next block + /// since there is no way to ensure that all Log packets had been consumed. + /// + /// And if enough Log packets will be queued by the remote side, + /// it will wait send_timeout until initiator will consume those packets, + /// while initiator already starts writing Data blocks, + /// and will not consume Log packets. + /// + /// So that is why send_logs_level had been disabled here. + settings.send_logs_level = "none"; /** Send query and receive "header", that describes table structure. * Header is needed to know, what structure is required for blocks to be passed to 'write' method. */ - connection.sendQuery(timeouts, query, "", QueryProcessingStage::Complete, &settings_, &modified_client_info, false, {}); + connection.sendQuery(timeouts, query, "", QueryProcessingStage::Complete, &settings, &modified_client_info, false, {}); while (true) { @@ -72,6 +88,10 @@ RemoteInserter::RemoteInserter( if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) log_queue->pushBlock(std::move(packet.block)); } + else if (Protocol::Server::ProfileEvents == packet.type) + { + // Do nothing + } else if (Protocol::Server::TableColumns == packet.type) { /// Server could attach ColumnsDescription in front of stream for column defaults. 
There's no need to pass it through cause @@ -132,6 +152,10 @@ void RemoteInserter::onFinish() { // Do nothing } + else if (Protocol::Server::ProfileEvents == packet.type) + { + // Do nothing + } else throw NetException( ErrorCodes::UNEXPECTED_PACKET_FROM_SERVER, diff --git a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp index feae7127349..f9eca5f1ee0 100644 --- a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp +++ b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp @@ -93,29 +93,24 @@ TEST(MergingSortedTest, SimpleBlockSizeTest) size_t total_rows = 0; Block block1; Block block2; - Block block3; executor.pull(block1); executor.pull(block2); - executor.pull(block3); Block tmp_block; ASSERT_FALSE(executor.pull(tmp_block)); - for (const auto & block : {block1, block2, block3}) + for (const auto & block : {block1, block2}) total_rows += block.rows(); + /** * First block consists of 1 row from block3 with 21 rows + 2 rows from block2 with 10 rows * + 5 rows from block 1 with 5 rows granularity */ EXPECT_EQ(block1.rows(), 8); /** - * Combination of 10 and 21 rows blocks + * Second block consists of 8 rows from block2 + 20 rows from block3 */ - EXPECT_EQ(block2.rows(), 14); - /** - * Combination of 10 and 21 rows blocks - */ - EXPECT_EQ(block3.rows(), 14); + EXPECT_EQ(block2.rows(), 28); EXPECT_EQ(total_rows, 5 + 10 + 21); } diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 39870fc91dc..cdf856e87d5 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -649,7 +649,7 @@ void HTTPHandler::processQuery( /// Request body can be compressed using algorithm specified in the Content-Encoding header. String http_request_compression_method_str = request.get("Content-Encoding", ""); auto in_post = wrapReadBufferWithCompressionMethod( - wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str)); + wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str), context->getSettingsRef().zstd_window_log_max); /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter. diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index cc51901ac40..eff91ae2302 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -357,7 +357,7 @@ void TCPHandler::runImpl() return true; sendProgress(); - sendProfileEvents(); + sendSelectProfileEvents(); sendLogs(); return false; @@ -586,7 +586,10 @@ bool TCPHandler::readDataNext() } if (read_ok) + { sendLogs(); + sendInsertProfileEvents(); + } else state.read_all_data = true; @@ -659,6 +662,8 @@ void TCPHandler::processInsertQuery() PushingPipelineExecutor executor(state.io.pipeline); run_executor(executor); } + + sendInsertProfileEvents(); } @@ -701,7 +706,7 @@ void TCPHandler::processOrdinaryQueryWithProcessors() /// Some time passed and there is a progress. 
after_send_progress.restart(); sendProgress(); - sendProfileEvents(); + sendSelectProfileEvents(); } sendLogs(); @@ -727,7 +732,7 @@ void TCPHandler::processOrdinaryQueryWithProcessors() sendProfileInfo(executor.getProfileInfo()); sendProgress(); sendLogs(); - sendProfileEvents(); + sendSelectProfileEvents(); } if (state.is_connection_closed) @@ -861,9 +866,6 @@ void TCPHandler::sendExtremes(const Block & extremes) void TCPHandler::sendProfileEvents() { - if (client_tcp_protocol_version < DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS) - return; - Block block; ProfileEvents::getProfileEvents(server_display_name, state.profile_queue, block, last_sent_snapshots); if (block.rows() != 0) @@ -878,6 +880,21 @@ void TCPHandler::sendProfileEvents() } } +void TCPHandler::sendSelectProfileEvents() +{ + if (client_tcp_protocol_version < DBMS_MIN_PROTOCOL_VERSION_WITH_INCREMENTAL_PROFILE_EVENTS) + return; + + sendProfileEvents(); +} + +void TCPHandler::sendInsertProfileEvents() +{ + if (client_tcp_protocol_version < DBMS_MIN_PROTOCOL_VERSION_WITH_PROFILE_EVENTS_IN_INSERT) + return; + + sendProfileEvents(); +} bool TCPHandler::receiveProxyHeader() { diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 4f2516e7923..a873f9ba75c 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -251,6 +251,8 @@ private: void sendTotals(const Block & totals); void sendExtremes(const Block & extremes); void sendProfileEvents(); + void sendSelectProfileEvents(); + void sendInsertProfileEvents(); /// Creates state.block_in/block_out for blocks read/write, depending on whether compression is enabled. void initBlockInput(); diff --git a/src/Storages/Cache/ExternalDataSourceCache.h b/src/Storages/Cache/ExternalDataSourceCache.h index 18d3d5ca699..937801c4767 100644 --- a/src/Storages/Cache/ExternalDataSourceCache.h +++ b/src/Storages/Cache/ExternalDataSourceCache.h @@ -53,7 +53,7 @@ public: bool nextImpl() override; off_t seek(off_t off, int whence) override; off_t getPosition() override; - std::optional getFileSize() override { return remote_file_size; } + size_t getFileSize() override { return remote_file_size; } private: std::unique_ptr local_file_holder; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 7a43ae7af4b..1a90312e076 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -173,16 +173,16 @@ static auto getNameRange(const ColumnsDescription::ColumnsContainer & columns, c { String name_with_dot = name_without_dot + "."; - auto begin = columns.begin(); - for (; begin != columns.end(); ++begin) + /// First we need to check if we have column with name name_without_dot + /// and if not - check if we have names that start with name_with_dot + for (auto it = columns.begin(); it != columns.end(); ++it) { - if (begin->name == name_without_dot) - return std::make_pair(begin, std::next(begin)); - - if (startsWith(begin->name, name_with_dot)) - break; + if (it->name == name_without_dot) + return std::make_pair(it, std::next(it)); } + auto begin = std::find_if(columns.begin(), columns.end(), [&](const auto & column){ return startsWith(column.name, name_with_dot); }); + if (begin == columns.end()) return std::make_pair(begin, begin); diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp index d4a8ac9f7a8..121a22e764c 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp +++ 
b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.cpp @@ -90,7 +90,7 @@ void AsynchronousReadBufferFromHDFS::prefetch() } -std::optional AsynchronousReadBufferFromHDFS::getFileSize() +size_t AsynchronousReadBufferFromHDFS::getFileSize() { return impl->getFileSize(); } @@ -134,7 +134,9 @@ bool AsynchronousReadBufferFromHDFS::nextImpl() prefetch_buffer.swap(memory); /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); } else { @@ -150,7 +152,9 @@ bool AsynchronousReadBufferFromHDFS::nextImpl() if (size) { /// Adjust the working buffer so that it ignores `offset` bytes. - setWithBytesToIgnore(memory.data(), size, offset); + internal_buffer = Buffer(memory.data(), memory.data() + memory.size()); + working_buffer = Buffer(memory.data() + offset, memory.data() + size); + pos = working_buffer.begin(); } } diff --git a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h index 1fde8ab8bac..a65e74a8c73 100644 --- a/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h +++ b/src/Storages/HDFS/AsynchronousReadBufferFromHDFS.h @@ -34,7 +34,7 @@ public: void prefetch() override; - std::optional getFileSize() override; + size_t getFileSize() override; String getFileName() const override; diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index e6a8e6d081d..208c8018c64 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int CANNOT_SEEK_THROUGH_FILE; extern const int SEEK_POSITION_OUT_OF_BOUND; extern const int LOGICAL_ERROR; + extern const int UNKNOWN_FILE_SIZE; } @@ -59,11 +60,11 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory getFileSize() const + size_t getFileSize() const { auto * file_info = hdfsGetPathInfo(fs.get(), hdfs_file_path.c_str()); if (!file_info) - return std::nullopt; + throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for: {}", hdfs_file_path); return file_info->mSize; } @@ -131,7 +132,7 @@ ReadBufferFromHDFS::ReadBufferFromHDFS( { } -std::optional ReadBufferFromHDFS::getFileSize() +size_t ReadBufferFromHDFS::getFileSize() { return impl->getFileSize(); } diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.h b/src/Storages/HDFS/ReadBufferFromHDFS.h index bcb615eb69c..e2929d60464 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromHDFS.h @@ -38,7 +38,7 @@ public: off_t getPosition() override; - std::optional getFileSize() override; + size_t getFileSize() override; size_t getFileOffsetOfBufferEnd() const override; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 2edcbeb9a7e..708bfd5ef8b 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -197,8 +197,9 @@ ColumnsDescription StorageHDFS::getTableStructureFromData( if (it == paths.end()) return nullptr; auto compression = chooseCompressionMethod(*it, compression_method); + auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( - std::make_unique(uri_without_path, *it++, ctx->getGlobalContext()->getConfigRef()), compression); + std::make_unique(uri_without_path, *it++, 
ctx->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); }; return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths.size() > 1, ctx); } @@ -327,7 +328,8 @@ bool HDFSSource::initialize() const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(current_path); auto compression = chooseCompressionMethod(path_from_uri, storage->compression_method); - read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression); + const auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri_without_path, path_from_uri, getContext()->getGlobalContext()->getConfigRef()), compression, zstd_window_log_max); auto input_format = getContext()->getInputFormat(storage->format_name, *read_buf, block_for_format, max_block_size); diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 43b67657a87..cd6c49d6e3b 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -8,11 +8,13 @@ #include #include #include +#include #include #include #include #include #include +#include namespace DB @@ -246,14 +248,48 @@ bool IStorage::isStaticStorage() const return false; } -BackupEntries IStorage::backupData(ContextPtr, const ASTs &) +ASTPtr IStorage::getCreateQueryForBackup(const ContextPtr & context, DatabasePtr * database) const { - throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED); + auto table_id = getStorageID(); + auto db = DatabaseCatalog::instance().tryGetDatabase(table_id.getDatabaseName()); + if (!db) + throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {}.{} is dropped", table_id.database_name, table_id.table_name); + ASTPtr query = db->tryGetCreateTableQuery(table_id.getTableName(), context); + if (!query) + throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {}.{} is dropped", table_id.database_name, table_id.table_name); + + /// We don't want to see any UUIDs in backup (after RESTORE the table will have another UUID anyway). + auto & create = query->as(); + create.uuid = UUIDHelpers::Nil; + create.to_inner_uuid = UUIDHelpers::Nil; + + /// If this is a definition of a system table we'll remove columns and comment because they're excessive for backups. 
+ if (create.storage && create.storage->engine && create.storage->engine->name.starts_with("System")) + { + create.reset(create.columns_list); + create.reset(create.comment); + } + + if (database) + *database = db; + + return query; } -RestoreTaskPtr IStorage::restoreData(ContextMutablePtr, const ASTs &, const BackupPtr &, const String &, const StorageRestoreSettings &, const std::shared_ptr &) +ASTPtr IStorage::getCreateQueryForBackup(const BackupEntriesCollector & backup_entries_collector) const +{ + DatabasePtr database; + auto query = getCreateQueryForBackup(backup_entries_collector.getContext(), &database); + database->checkCreateTableQueryForBackup(query, backup_entries_collector); + return query; +} + +void IStorage::backupData(BackupEntriesCollector &, const String &, const std::optional &) +{ +} + +void IStorage::restoreDataFromBackup(RestorerFromBackup &, const String &, const std::optional &) { - throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED); } std::string PrewhereInfo::dump() const diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 519d4ad6517..a655da4473b 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -66,14 +66,8 @@ struct SelectQueryInfo; using NameDependencies = std::unordered_map>; using DatabaseAndTableName = std::pair; -class IBackup; -using BackupPtr = std::shared_ptr; -class IBackupEntry; -using BackupEntries = std::vector>>; -class IRestoreTask; -using RestoreTaskPtr = std::unique_ptr; -struct StorageRestoreSettings; -class IRestoreCoordination; +class BackupEntriesCollector; +class RestorerFromBackup; struct ColumnSize { @@ -225,19 +219,21 @@ public: NameDependencies getDependentViewsByColumn(ContextPtr context) const; - /// Returns true if the backup is hollow, which means it doesn't contain any data. - virtual bool hasDataToBackup() const { return false; } - - /// Prepares entries to backup data of the storage. - virtual BackupEntries backupData(ContextPtr context, const ASTs & partitions); - - /// Extract data from the backup and put it to the storage. - virtual RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination); - /// Returns whether the column is virtual - by default all columns are real. /// Initially reserved virtual column name may be shadowed by real column. bool isVirtualColumn(const String & column_name, const StorageMetadataPtr & metadata_snapshot) const; + /// Returns a slightly changed version of the CREATE TABLE query which must be written to a backup. + /// The function can throw `TABLE_IS_DROPPED` if this storage is not attached to a database. + virtual ASTPtr getCreateQueryForBackup(const ContextPtr & context, DatabasePtr * database) const; + virtual ASTPtr getCreateQueryForBackup(const BackupEntriesCollector & backup_entries_collector) const; + + /// Makes backup entries to backup the data of this storage. + virtual void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions); + + /// Extracts data from the backup and put it to the storage. 
+ virtual void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions); + private: StorageID storage_id; diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp new file mode 100644 index 00000000000..083cbc90cb1 --- /dev/null +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -0,0 +1,877 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int FILE_DOESNT_EXIST; + extern const int DIRECTORY_ALREADY_EXISTS; + extern const int NOT_ENOUGH_SPACE; + extern const int LOGICAL_ERROR; +} + +DataPartStorageOnDisk::DataPartStorageOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_) + : volume(std::move(volume_)), root_path(std::move(root_path_)), part_dir(std::move(part_dir_)) +{ +} + +std::string DataPartStorageOnDisk::getFullPath() const +{ + return fs::path(volume->getDisk()->getPath()) / root_path / part_dir / ""; +} + +std::string DataPartStorageOnDisk::getRelativePath() const +{ + return fs::path(root_path) / part_dir / ""; +} + +void DataPartStorageOnDisk::setRelativePath(const std::string & path) +{ + part_dir = path; +} + +std::string DataPartStorageOnDisk::getFullRootPath() const +{ + return fs::path(volume->getDisk()->getPath()) / root_path / ""; +} + +DataPartStoragePtr DataPartStorageOnDisk::getProjection(const std::string & name) const +{ + return std::make_shared(volume, std::string(fs::path(root_path) / part_dir), name); +} + +bool DataPartStorageOnDisk::exists() const +{ + return volume->getDisk()->exists(fs::path(root_path) / part_dir); +} + +bool DataPartStorageOnDisk::exists(const std::string & name) const +{ + return volume->getDisk()->exists(fs::path(root_path) / part_dir / name); +} + +bool DataPartStorageOnDisk::isDirectory(const std::string & name) const +{ + return volume->getDisk()->isDirectory(fs::path(root_path) / part_dir / name); +} + +Poco::Timestamp DataPartStorageOnDisk::getLastModified() const +{ + return volume->getDisk()->getLastModified(fs::path(root_path) / part_dir); +} + +class DataPartStorageIteratorOnDisk final : public IDataPartStorageIterator +{ +public: + DataPartStorageIteratorOnDisk(DiskPtr disk_, DirectoryIteratorPtr it_) + : disk(std::move(disk_)), it(std::move(it_)) + { + } + + void next() override { it->next(); } + bool isValid() const override { return it->isValid(); } + bool isFile() const override { return isValid() && disk->isFile(it->path()); } + std::string name() const override { return it->name(); } + +private: + DiskPtr disk; + DirectoryIteratorPtr it; +}; + +DataPartStorageIteratorPtr DataPartStorageOnDisk::iterate() const +{ + return std::make_unique( + volume->getDisk(), + volume->getDisk()->iterateDirectory(fs::path(root_path) / part_dir)); +} + +size_t DataPartStorageOnDisk::getFileSize(const String & file_name) const +{ + return volume->getDisk()->getFileSize(fs::path(root_path) / part_dir / file_name); +} + +UInt32 DataPartStorageOnDisk::getRefCount(const String & file_name) const +{ + return volume->getDisk()->getRefCount(fs::path(root_path) / part_dir / file_name); +} + +static UInt64 calculateTotalSizeOnDiskImpl(const DiskPtr & disk, const String & from) +{ + if (disk->isFile(from)) + return disk->getFileSize(from); + std::vector files; + disk->listFiles(from, files); + UInt64 res = 0; + for (const auto & file : files) + res += 
calculateTotalSizeOnDiskImpl(disk, fs::path(from) / file); + return res; +} + +UInt64 DataPartStorageOnDisk::calculateTotalSizeOnDisk() const +{ + return calculateTotalSizeOnDiskImpl(volume->getDisk(), fs::path(root_path) / part_dir); +} + +std::unique_ptr DataPartStorageOnDisk::readFile( + const std::string & name, + const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const +{ + return volume->getDisk()->readFile(fs::path(root_path) / part_dir / name, settings, read_hint, file_size); +} + +static std::unique_ptr openForReading(const DiskPtr & disk, const String & path) +{ + size_t file_size = disk->getFileSize(path); + return disk->readFile(path, ReadSettings().adjustBufferSize(file_size), file_size); +} + +void DataPartStorageOnDisk::loadVersionMetadata(VersionMetadata & version, Poco::Logger * log) const +{ + std::string version_file_name = fs::path(root_path) / part_dir / "txn_version.txt"; + String tmp_version_file_name = version_file_name + ".tmp"; + DiskPtr disk = volume->getDisk(); + + auto remove_tmp_file = [&]() + { + auto last_modified = disk->getLastModified(tmp_version_file_name); + auto buf = openForReading(disk, tmp_version_file_name); + String content; + readStringUntilEOF(content, *buf); + LOG_WARNING(log, "Found file {} that was last modified on {}, has size {} and the following content: {}", + tmp_version_file_name, last_modified.epochTime(), content.size(), content); + disk->removeFile(tmp_version_file_name); + }; + + if (disk->exists(version_file_name)) + { + auto buf = openForReading(disk, version_file_name); + version.read(*buf); + if (disk->exists(tmp_version_file_name)) + remove_tmp_file(); + return; + } + + /// Four (?) cases are possible: + /// 1. Part was created without transactions. + /// 2. Version metadata file was not renamed from *.tmp on part creation. + /// 3. Version metadata were written to *.tmp file, but hard restart happened before fsync. + /// 4. Fsyncs in storeVersionMetadata() work incorrectly. + + if (!disk->exists(tmp_version_file_name)) + { + /// Case 1. + /// We do not have version metadata and transactions history for old parts, + /// so let's consider that such parts were created by some ancient transaction + /// and were committed with some prehistoric CSN. + /// NOTE It might be Case 3, but version metadata file is written on part creation before other files, + /// so it's not Case 3 if part is not broken. + version.setCreationTID(Tx::PrehistoricTID, nullptr); + version.creation_csn = Tx::PrehistoricCSN; + return; + } + + /// Case 2. + /// Content of *.tmp file may be broken, just use fake TID. + /// Transaction was not committed if *.tmp file was not renamed, so we should complete rollback by removing part. + version.setCreationTID(Tx::DummyTID, nullptr); + version.creation_csn = Tx::RolledBackCSN; + remove_tmp_file(); +} + +void DataPartStorageOnDisk::checkConsistency(const MergeTreeDataPartChecksums & checksums) const +{ + checksums.checkSizes(volume->getDisk(), getRelativePath()); +} + +void DataPartStorageOnDisk::remove( + bool can_remove_shared_data, + const NameSet & names_not_to_remove, + const MergeTreeDataPartChecksums & checksums, + std::list projections, + Poco::Logger * log) const +{ + /// NOTE We rename part to delete_tmp_ instead of delete_tmp_ to avoid race condition + /// when we try to remove two parts with the same name, but different relative paths, + /// for example all_1_2_1 (in Deleting state) and tmp_merge_all_1_2_1 (in Temporary state). 
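Aside on the remove() implementation in the hunk that follows: before deletion the part directory is renamed to a sibling with a delete_tmp_ prefix, preserving any leading component such as moving/. A small standalone sketch of that path manipulation (hypothetical helper name, not part of the patch):

    #include <cassert>
    #include <filesystem>
    #include <string>

    namespace fs = std::filesystem;

    static std::string deleteTmpName(std::string part_dir)
    {
        // Cut a trailing "/" if present, otherwise fs::path::filename() would be empty.
        fs::path p = part_dir.ends_with("/") ? part_dir.substr(0, part_dir.size() - 1) : part_dir;

        if (p.has_parent_path())
            return (p.parent_path() / ("delete_tmp_" + p.filename().string())).string();

        return "delete_tmp_" + p.filename().string();
    }

    int main()
    {
        assert(deleteTmpName("all_1_2_1/") == "delete_tmp_all_1_2_1");
        assert(deleteTmpName("moving/all_1_1_1") == "moving/delete_tmp_all_1_1_1");
        return 0;
    }
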
+ fs::path from = fs::path(root_path) / part_dir; + // fs::path to = fs::path(root_path) / ("delete_tmp_" + part_dir); + // TODO directory delete_tmp_ is never removed if server crashes before returning from this function + + /// Cut last "/" if it exists (it shouldn't). Otherwise fs::path behave differently. + fs::path part_dir_without_slash = part_dir.ends_with("/") ? part_dir.substr(0, part_dir.size() - 1) : part_dir; + + /// NOTE relative_path can contain not only part name itself, but also some prefix like + /// "moving/all_1_1_1" or "detached/all_2_3_5". We should handle this case more properly. + if (part_dir_without_slash.has_parent_path()) + { + auto parent_path = part_dir_without_slash.parent_path(); + if (parent_path == "detached") + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to remove detached part {} with path {} in remove function. It shouldn't happen", part_dir, root_path); + + part_dir_without_slash = parent_path / ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); + } + else + { + part_dir_without_slash = ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); + } + + fs::path to = fs::path(root_path) / part_dir_without_slash; + + auto disk = volume->getDisk(); + if (disk->exists(to)) + { + LOG_WARNING(log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to)); + try + { + disk->removeSharedRecursive(fs::path(to) / "", !can_remove_shared_data, names_not_to_remove); + } + catch (...) + { + LOG_ERROR(log, "Cannot recursively remove directory {}. Exception: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); + throw; + } + } + + try + { + disk->moveDirectory(from, to); + } + catch (const fs::filesystem_error & e) + { + if (e.code() == std::errc::no_such_file_or_directory) + { + LOG_ERROR(log, "Directory {} (part to remove) doesn't exist or one of nested files has gone. Most likely this is due to manual removing. This should be discouraged. Ignoring.", fullPath(disk, to)); + return; + } + throw; + } + + // Record existing projection directories so we don't remove them twice + std::unordered_set projection_directories; + for (const auto & projection : projections) + { + std::string proj_dir_name = projection.name + ".proj"; + projection_directories.emplace(proj_dir_name); + + clearDirectory( + fs::path(to) / proj_dir_name, + can_remove_shared_data, names_not_to_remove, projection.checksums, {}, log, true); + } + + clearDirectory(to, can_remove_shared_data, names_not_to_remove, checksums, projection_directories, log, false); +} + +void DataPartStorageOnDisk::clearDirectory( + const std::string & dir, + bool can_remove_shared_data, + const NameSet & names_not_to_remove, + const MergeTreeDataPartChecksums & checksums, + const std::unordered_set & skip_directories, + Poco::Logger * log, + bool is_projection) const +{ + auto disk = volume->getDisk(); + + if (checksums.empty()) + { + if (is_projection) + { + LOG_ERROR( + log, + "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: checksums.txt is missing", + fullPath(disk, dir)); + } + + /// If the part is not completely written, we cannot use fast path by listing files. + disk->removeSharedRecursive(fs::path(dir) / "", !can_remove_shared_data, names_not_to_remove); + + return; + } + + try + { + /// Remove each expected file in directory, then remove directory itself. 
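Aside on clearDirectory() below: the fast path removes exactly the files listed in the part's checksums plus a few known metadata files and then the directory itself, falling back to full recursive removal only on surprises (recursive removal needs many extra stat syscalls). A standalone sketch of that pattern with std::filesystem (hypothetical helper, not part of the patch):

    #include <exception>
    #include <filesystem>
    #include <iostream>
    #include <string>
    #include <vector>

    namespace fs = std::filesystem;

    void clearDirectorySketch(const fs::path & dir, const std::vector<std::string> & expected_files)
    {
        try
        {
            for (const auto & name : expected_files)
                fs::remove(dir / name);   // a missing file is fine: remove() just returns false
            fs::remove(dir);              // throws if something unexpected is still inside
        }
        catch (const std::exception & e)
        {
            std::cerr << "Fast path failed (" << e.what() << "), falling back to recursive removal\n";
            fs::remove_all(dir);
        }
    }

    int main()
    {
        fs::create_directories("part_dir");
        clearDirectorySketch("part_dir", {"checksums.txt", "columns.txt"});
        return 0;
    }
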
+ RemoveBatchRequest request; + +#if !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-variable" +#endif + for (const auto & [file, _] : checksums.files) + { + if (skip_directories.find(file) == skip_directories.end()) + request.emplace_back(fs::path(dir) / file); + } +#if !defined(__clang__) +# pragma GCC diagnostic pop +#endif + + for (const auto & file : {"checksums.txt", "columns.txt"}) + request.emplace_back(fs::path(dir) / file); + + request.emplace_back(fs::path(dir) / "default_compression_codec.txt", true); + request.emplace_back(fs::path(dir) / "delete-on-destroy.txt", true); + + if (!is_projection) + request.emplace_back(fs::path(dir) / "txn_version.txt", true); + + disk->removeSharedFiles(request, !can_remove_shared_data, names_not_to_remove); + disk->removeDirectory(dir); + } + catch (...) + { + /// Recursive directory removal does many excessive "stat" syscalls under the hood. + + LOG_ERROR(log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, dir), getCurrentExceptionMessage(false)); + + disk->removeSharedRecursive(fs::path(dir) / "", !can_remove_shared_data, names_not_to_remove); + } +} + +std::string DataPartStorageOnDisk::getRelativePathForPrefix(Poco::Logger * log, const String & prefix, bool detached) const +{ + String res; + + auto full_relative_path = fs::path(root_path); + if (detached) + full_relative_path /= "detached"; + + for (int try_no = 0; try_no < 10; ++try_no) + { + res = (prefix.empty() ? "" : prefix + "_") + part_dir + (try_no ? "_try" + DB::toString(try_no) : ""); + + if (!volume->getDisk()->exists(full_relative_path / res)) + return res; + + LOG_WARNING(log, "Directory {} (to detach to) already exists. Will detach to directory with '_tryN' suffix.", res); + } + + return res; +} + +void DataPartStorageBuilderOnDisk::setRelativePath(const std::string & path) +{ + part_dir = path; +} + +std::string DataPartStorageOnDisk::getDiskName() const +{ + return volume->getDisk()->getName(); +} + +std::string DataPartStorageOnDisk::getDiskType() const +{ + return toString(volume->getDisk()->getType()); +} + +bool DataPartStorageOnDisk::isStoredOnRemoteDisk() const +{ + return volume->getDisk()->isRemote(); +} + +bool DataPartStorageOnDisk::supportZeroCopyReplication() const +{ + return volume->getDisk()->supportZeroCopyReplication(); +} + +bool DataPartStorageOnDisk::supportParallelWrite() const +{ + return volume->getDisk()->supportParallelWrite(); +} + +bool DataPartStorageOnDisk::isBroken() const +{ + return volume->getDisk()->isBroken(); +} + +void DataPartStorageOnDisk::syncRevision(UInt64 revision) +{ + volume->getDisk()->syncRevision(revision); +} + +UInt64 DataPartStorageOnDisk::getRevision() const +{ + return volume->getDisk()->getRevision(); +} + +std::unordered_map DataPartStorageOnDisk::getSerializedMetadata(const std::vector & paths) const +{ + return volume->getDisk()->getSerializedMetadata(paths); +} + +std::string DataPartStorageOnDisk::getDiskPath() const +{ + return volume->getDisk()->getPath(); +} + +DataPartStorageOnDisk::DisksSet::const_iterator DataPartStorageOnDisk::isStoredOnDisk(const DisksSet & disks) const +{ + return disks.find(volume->getDisk()); +} + +ReservationPtr DataPartStorageOnDisk::reserve(UInt64 bytes) const +{ + auto res = volume->reserve(bytes); + if (!res) + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Cannot reserve {}, not enough space", ReadableSize(bytes)); + + return res; +} + +ReservationPtr 
DataPartStorageOnDisk::tryReserve(UInt64 bytes) const +{ + return volume->reserve(bytes); +} + +size_t DataPartStorageOnDisk::getVolumeIndex(const IStoragePolicy & storage_policy) const +{ + return storage_policy.getVolumeIndexByDisk(volume->getDisk()); +} + +void DataPartStorageOnDisk::writeChecksums(const MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const +{ + std::string path = fs::path(root_path) / part_dir / "checksums.txt"; + + try + { + { + auto out = volume->getDisk()->writeFile(path + ".tmp", 4096, WriteMode::Rewrite, settings); + checksums.write(*out); + } + + volume->getDisk()->moveFile(path + ".tmp", path); + } + catch (...) + { + try + { + if (volume->getDisk()->exists(path + ".tmp")) + volume->getDisk()->removeFile(path + ".tmp"); + } + catch (...) + { + tryLogCurrentException("DataPartStorageOnDisk"); + } + + throw; + } +} + +void DataPartStorageOnDisk::writeColumns(const NamesAndTypesList & columns, const WriteSettings & settings) const +{ + std::string path = fs::path(root_path) / part_dir / "columns.txt"; + + try + { + auto buf = volume->getDisk()->writeFile(path + ".tmp", 4096, WriteMode::Rewrite, settings); + columns.writeText(*buf); + buf->finalize(); + + volume->getDisk()->moveFile(path + ".tmp", path); + } + catch (...) + { + try + { + if (volume->getDisk()->exists(path + ".tmp")) + volume->getDisk()->removeFile(path + ".tmp"); + } + catch (...) + { + tryLogCurrentException("DataPartStorageOnDisk"); + } + + throw; + } +} + +void DataPartStorageOnDisk::writeVersionMetadata(const VersionMetadata & version, bool fsync_part_dir) const +{ + std::string path = fs::path(root_path) / part_dir / "txn_version.txt"; + try + { + { + /// TODO IDisk interface does not allow to open file with O_EXCL flag (for DiskLocal), + /// so we create empty file at first (expecting that createFile throws if file already exists) + /// and then overwrite it. + volume->getDisk()->createFile(path + ".tmp"); + auto buf = volume->getDisk()->writeFile(path + ".tmp", 256); + version.write(*buf); + buf->finalize(); + buf->sync(); + } + + SyncGuardPtr sync_guard; + if (fsync_part_dir) + sync_guard = volume->getDisk()->getDirectorySyncGuard(getRelativePath()); + volume->getDisk()->replaceFile(path + ".tmp", path); + + } + catch (...) + { + try + { + if (volume->getDisk()->exists(path + ".tmp")) + volume->getDisk()->removeFile(path + ".tmp"); + } + catch (...) + { + tryLogCurrentException("DataPartStorageOnDisk"); + } + + throw; + } +} + +void DataPartStorageOnDisk::appendCSNToVersionMetadata(const VersionMetadata & version, VersionMetadata::WhichCSN which_csn) const +{ + /// Small enough appends to file are usually atomic, + /// so we append new metadata instead of rewriting file to reduce number of fsyncs. + /// We don't need to do fsync when writing CSN, because in case of hard restart + /// we will be able to restore CSN from transaction log in Keeper. 
+ + std::string version_file_name = fs::path(root_path) / part_dir / "txn_version.txt"; + DiskPtr disk = volume->getDisk(); + auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); + version.writeCSN(*out, which_csn); + out->finalize(); +} + +void DataPartStorageOnDisk::appendRemovalTIDToVersionMetadata(const VersionMetadata & version, bool clear) const +{ + String version_file_name = fs::path(root_path) / part_dir / "txn_version.txt"; + DiskPtr disk = volume->getDisk(); + auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); + version.writeRemovalTID(*out, clear); + out->finalize(); + + /// fsync is not required when we clearing removal TID, because after hard restart we will fix metadata + if (!clear) + out->sync(); +} + +void DataPartStorageOnDisk::writeDeleteOnDestroyMarker(Poco::Logger * log) const +{ + String marker_path = fs::path(root_path) / part_dir / "delete-on-destroy.txt"; + auto disk = volume->getDisk(); + try + { + volume->getDisk()->createFile(marker_path); + } + catch (Poco::Exception & e) + { + LOG_ERROR(log, "{} (while creating DeleteOnDestroy marker: {})", e.what(), backQuote(fullPath(disk, marker_path))); + } +} + +void DataPartStorageOnDisk::removeDeleteOnDestroyMarker() const +{ + std::string delete_on_destroy_file_name = fs::path(root_path) / part_dir / "delete-on-destroy.txt"; + volume->getDisk()->removeFileIfExists(delete_on_destroy_file_name); +} + +void DataPartStorageOnDisk::removeVersionMetadata() const +{ + std::string version_file_name = fs::path(root_path) / part_dir / "txn_version.txt"; + volume->getDisk()->removeFileIfExists(version_file_name); +} + +String DataPartStorageOnDisk::getUniqueId() const +{ + auto disk = volume->getDisk(); + if (!disk->supportZeroCopyReplication()) + throw Exception(fmt::format("Disk {} doesn't support zero-copy replication", disk->getName()), ErrorCodes::LOGICAL_ERROR); + + return disk->getUniqueId(fs::path(getRelativePath()) / "checksums.txt"); +} + +bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & storage_policy) const +{ + /// `IMergeTreeDataPart::volume` describes space where current part belongs, and holds + /// `SingleDiskVolume` object which does not contain up-to-date settings of corresponding volume. + /// Therefore we shall obtain volume from storage policy. 
+ auto volume_ptr = storage_policy.getVolume(storage_policy.getVolumeIndexByDisk(volume->getDisk())); + + return !volume_ptr->areMergesAvoided(); +} + +void DataPartStorageOnDisk::backup( + TemporaryFilesOnDisks & temp_dirs, + const MergeTreeDataPartChecksums & checksums, + const NameSet & files_without_checksums, + BackupEntries & backup_entries) const +{ + auto disk = volume->getDisk(); + + auto temp_dir_it = temp_dirs.find(disk); + if (temp_dir_it == temp_dirs.end()) + temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/backup_")).first; + auto temp_dir_owner = temp_dir_it->second; + fs::path temp_dir = temp_dir_owner->getPath(); + + fs::path temp_part_dir = temp_dir / part_dir; + disk->createDirectories(temp_part_dir); + + for (const auto & [filepath, checksum] : checksums.files) + { + String relative_filepath = fs::path(part_dir) / filepath; + String full_filepath = fs::path(root_path) / part_dir / filepath; + String hardlink_filepath = temp_part_dir / filepath; + disk->createHardLink(full_filepath, hardlink_filepath); + UInt128 file_hash{checksum.file_hash.first, checksum.file_hash.second}; + backup_entries.emplace_back( + relative_filepath, + std::make_unique(disk, hardlink_filepath, checksum.file_size, file_hash, temp_dir_owner)); + } + + for (const auto & filepath : files_without_checksums) + { + String relative_filepath = fs::path(part_dir) / filepath; + String full_filepath = fs::path(root_path) / part_dir / filepath; + backup_entries.emplace_back(relative_filepath, std::make_unique(disk, full_filepath)); + } +} + +DataPartStoragePtr DataPartStorageOnDisk::freeze( + const std::string & to, + const std::string & dir_path, + bool make_source_readonly, + std::function save_metadata_callback, + bool copy_instead_of_hardlink) const +{ + auto disk = volume->getDisk(); + disk->createDirectories(to); + + localBackup(disk, getRelativePath(), fs::path(to) / dir_path, make_source_readonly, {}, copy_instead_of_hardlink); + + if (save_metadata_callback) + save_metadata_callback(disk); + + disk->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); + disk->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); + + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + return std::make_shared(single_disk_volume, to, dir_path); +} + +DataPartStoragePtr DataPartStorageOnDisk::clone( + const std::string & to, + const std::string & dir_path, + const DiskPtr & disk, + Poco::Logger * log) const +{ + String path_to_clone = fs::path(to) / dir_path / ""; + + if (disk->exists(path_to_clone)) + { + LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone)); + disk->removeRecursive(path_to_clone); + } + disk->createDirectories(to); + volume->getDisk()->copy(getRelativePath(), disk, to); + volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / "delete-on-destroy.txt"); + + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + return std::make_shared(single_disk_volume, to, dir_path); +} + +void DataPartStorageOnDisk::rename(const std::string & new_root_path, const std::string & new_part_dir, Poco::Logger * log, bool remove_new_dir_if_exists, bool fsync_part_dir) +{ + if (!exists()) + throw Exception( + ErrorCodes::FILE_DOESNT_EXIST, + "Part directory {} doesn't exist. Most likely it is a logical error.", + std::string(fs::path(volume->getDisk()->getPath()) / root_path / part_dir)); + + /// Why "" ? 
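Aside on changeRootPath() further down in this file: it only rewrites the prefix of the stored root path, assuming the actual move was already done elsewhere. A standalone sketch of that prefix replacement (hypothetical helper, not part of the patch):

    #include <cassert>
    #include <cstddef>
    #include <stdexcept>
    #include <string>
    #include <string_view>

    static std::string changeRoot(const std::string & root_path, const std::string & from_root, const std::string & to_root)
    {
        size_t prefix_size = from_root.size();
        if (prefix_size > 0 && from_root.back() == '/')
            --prefix_size;                      // tolerate a trailing "/" on the old root

        if (prefix_size > root_path.size()
            || std::string_view(from_root).substr(0, prefix_size) != std::string_view(root_path).substr(0, prefix_size))
            throw std::logic_error("from_root is not a prefix of the current root");

        size_t dst_size = to_root.size();
        if (dst_size > 0 && to_root.back() == '/')
            --dst_size;                         // tolerate a trailing "/" on the new root

        return to_root.substr(0, dst_size) + root_path.substr(prefix_size);
    }

    int main()
    {
        assert(changeRoot("some/current/path/to/part", "some/current", "other/different")
               == "other/different/path/to/part");
        return 0;
    }
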
+ String to = fs::path(new_root_path) / new_part_dir / ""; + + if (volume->getDisk()->exists(to)) + { + if (remove_new_dir_if_exists) + { + Names files; + volume->getDisk()->listFiles(to, files); + + if (log) + LOG_WARNING(log, + "Part directory {} already exists and contains {} files. Removing it.", + fullPath(volume->getDisk(), to), files.size()); + + volume->getDisk()->removeRecursive(to); + } + else + { + throw Exception( + ErrorCodes::DIRECTORY_ALREADY_EXISTS, + "Part directory {} already exists", + fullPath(volume->getDisk(), to)); + } + } + + // metadata_manager->deleteAll(true); + // metadata_manager->assertAllDeleted(true); + + String from = getRelativePath(); + + /// Why? + volume->getDisk()->setLastModified(from, Poco::Timestamp::fromEpochTime(time(nullptr))); + volume->getDisk()->moveDirectory(from, to); + part_dir = new_part_dir; + root_path = new_root_path; + // metadata_manager->updateAll(true); + + SyncGuardPtr sync_guard; + if (fsync_part_dir) + sync_guard = volume->getDisk()->getDirectorySyncGuard(getRelativePath()); +} + +void DataPartStorageOnDisk::changeRootPath(const std::string & from_root, const std::string & to_root) +{ + /// This is a very dumb implementation, here for root path like + /// "some/current/path/to/part" and change like + /// "some/current" -> "other/different", we just replace prefix to make new root like + /// "other/different/path/to/part". + /// Here we expect that actual move was done by somebody else. + + size_t prefix_size = from_root.size(); + if (prefix_size > 0 && from_root.back() == '/') + --prefix_size; + + if (prefix_size > root_path.size() + || std::string_view(from_root).substr(0, prefix_size) != std::string_view(root_path).substr(0, prefix_size)) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot change part root to {} because it is not a prefix of current root {}", + from_root, root_path); + + size_t dst_size = to_root.size(); + if (dst_size > 0 && to_root.back() == '/') + --dst_size; + + root_path = to_root.substr(0, dst_size) + root_path.substr(prefix_size); +} + +DataPartStorageBuilderOnDisk::DataPartStorageBuilderOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_) + : volume(std::move(volume_)), root_path(std::move(root_path_)), part_dir(std::move(part_dir_)) +{ +} + +std::unique_ptr DataPartStorageBuilderOnDisk::readFile( + const std::string & name, + const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const +{ + return volume->getDisk()->readFile(fs::path(root_path) / part_dir / name, settings, read_hint, file_size); +} + +std::unique_ptr DataPartStorageBuilderOnDisk::writeFile( + const String & name, + size_t buf_size, + const WriteSettings & settings) +{ + return volume->getDisk()->writeFile(fs::path(root_path) / part_dir / name, buf_size, WriteMode::Rewrite, settings); +} + +void DataPartStorageBuilderOnDisk::removeFile(const String & name) +{ + return volume->getDisk()->removeFile(fs::path(root_path) / part_dir / name); +} + +void DataPartStorageBuilderOnDisk::removeRecursive() +{ + volume->getDisk()->removeRecursive(fs::path(root_path) / part_dir); +} + +void DataPartStorageBuilderOnDisk::removeSharedRecursive(bool keep_in_remote_fs) +{ + volume->getDisk()->removeSharedRecursive(fs::path(root_path) / part_dir, keep_in_remote_fs, {}); +} + +SyncGuardPtr DataPartStorageBuilderOnDisk::getDirectorySyncGuard() const +{ + return volume->getDisk()->getDirectorySyncGuard(fs::path(root_path) / part_dir); +} + +void DataPartStorageBuilderOnDisk::createHardLinkFrom(const 
IDataPartStorage & source, const std::string & from, const std::string & to) const +{ + const auto * source_on_disk = typeid_cast(&source); + if (!source_on_disk) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot create hardlink from different storage. Expected DataPartStorageOnDisk, got {}", + typeid(source).name()); + + volume->getDisk()->createHardLink( + fs::path(source_on_disk->getRelativePath()) / from, + fs::path(root_path) / part_dir / to); +} + +bool DataPartStorageBuilderOnDisk::exists() const +{ + return volume->getDisk()->exists(fs::path(root_path) / part_dir); +} + + +bool DataPartStorageBuilderOnDisk::exists(const std::string & name) const +{ + return volume->getDisk()->exists(fs::path(root_path) / part_dir / name); +} + +std::string DataPartStorageBuilderOnDisk::getFullPath() const +{ + return fs::path(volume->getDisk()->getPath()) / root_path / part_dir; +} + +std::string DataPartStorageBuilderOnDisk::getRelativePath() const +{ + return fs::path(root_path) / part_dir; +} + +void DataPartStorageBuilderOnDisk::createDirectories() +{ + return volume->getDisk()->createDirectories(fs::path(root_path) / part_dir); +} + +void DataPartStorageBuilderOnDisk::createProjection(const std::string & name) +{ + return volume->getDisk()->createDirectory(fs::path(root_path) / part_dir / name); +} + +ReservationPtr DataPartStorageBuilderOnDisk::reserve(UInt64 bytes) +{ + auto res = volume->reserve(bytes); + if (!res) + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Cannot reserve {}, not enough space", ReadableSize(bytes)); + + return res; +} + +DataPartStorageBuilderPtr DataPartStorageBuilderOnDisk::getProjection(const std::string & name) const +{ + return std::make_shared(volume, std::string(fs::path(root_path) / part_dir), name); +} + +DataPartStoragePtr DataPartStorageBuilderOnDisk::getStorage() const +{ + return std::make_shared(volume, root_path, part_dir); +} + +} diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h new file mode 100644 index 00000000000..d6fcb2f1442 --- /dev/null +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -0,0 +1,173 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class IVolume; +using VolumePtr = std::shared_ptr; + + +class DataPartStorageOnDisk final : public IDataPartStorage +{ +public: + DataPartStorageOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_); + + std::string getFullPath() const override; + std::string getRelativePath() const override; + std::string getPartDirectory() const override { return part_dir; } + std::string getFullRootPath() const override; + + DataPartStoragePtr getProjection(const std::string & name) const override; + + bool exists() const override; + bool exists(const std::string & name) const override; + bool isDirectory(const std::string & name) const override; + + Poco::Timestamp getLastModified() const override; + DataPartStorageIteratorPtr iterate() const override; + + size_t getFileSize(const std::string & file_name) const override; + UInt32 getRefCount(const std::string & file_name) const override; + + UInt64 calculateTotalSizeOnDisk() const override; + + std::unique_ptr readFile( + const std::string & name, + const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const override; + + void loadVersionMetadata(VersionMetadata & version, Poco::Logger * log) const override; + void checkConsistency(const MergeTreeDataPartChecksums & checksums) const override; + + void remove( + bool 
can_remove_shared_data, + const NameSet & names_not_to_remove, + const MergeTreeDataPartChecksums & checksums, + std::list projections, + Poco::Logger * log) const override; + + std::string getRelativePathForPrefix(Poco::Logger * log, const String & prefix, bool detached) const override; + + void setRelativePath(const std::string & path) override; + + std::string getDiskName() const override; + std::string getDiskType() const override; + bool isStoredOnRemoteDisk() const override; + bool supportZeroCopyReplication() const override; + bool supportParallelWrite() const override; + bool isBroken() const override; + void syncRevision(UInt64 revision) override; + UInt64 getRevision() const override; + std::unordered_map getSerializedMetadata(const std::vector & paths) const override; + std::string getDiskPath() const override; + + DisksSet::const_iterator isStoredOnDisk(const DisksSet & disks) const override; + + ReservationPtr reserve(UInt64 bytes) const override; + ReservationPtr tryReserve(UInt64 bytes) const override; + size_t getVolumeIndex(const IStoragePolicy &) const override; + + void writeChecksums(const MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const override; + void writeColumns(const NamesAndTypesList & columns, const WriteSettings & settings) const override; + void writeVersionMetadata(const VersionMetadata & version, bool fsync_part_dir) const override; + void appendCSNToVersionMetadata(const VersionMetadata & version, VersionMetadata::WhichCSN which_csn) const override; + void appendRemovalTIDToVersionMetadata(const VersionMetadata & version, bool clear) const override; + void writeDeleteOnDestroyMarker(Poco::Logger * log) const override; + void removeDeleteOnDestroyMarker() const override; + void removeVersionMetadata() const override; + + String getUniqueId() const override; + + bool shallParticipateInMerges(const IStoragePolicy &) const override; + + void backup( + TemporaryFilesOnDisks & temp_dirs, + const MergeTreeDataPartChecksums & checksums, + const NameSet & files_without_checksums, + BackupEntries & backup_entries) const override; + + DataPartStoragePtr freeze( + const std::string & to, + const std::string & dir_path, + bool make_source_readonly, + std::function save_metadata_callback, + bool copy_instead_of_hardlink) const override; + + DataPartStoragePtr clone( + const std::string & to, + const std::string & dir_path, + const DiskPtr & disk, + Poco::Logger * log) const override; + + void rename(const std::string & new_root_path, const std::string & new_part_dir, Poco::Logger * log, bool remove_new_dir_if_exists, bool fsync_part_dir) override; + + void changeRootPath(const std::string & from_root, const std::string & to_root) override; + +private: + VolumePtr volume; + std::string root_path; + std::string part_dir; + + void clearDirectory( + const std::string & dir, + bool can_remove_shared_data, + const NameSet & names_not_to_remove, + const MergeTreeDataPartChecksums & checksums, + const std::unordered_set & skip_directories, + Poco::Logger * log, + bool is_projection) const; +}; + +class DataPartStorageBuilderOnDisk final : public IDataPartStorageBuilder +{ +public: + DataPartStorageBuilderOnDisk(VolumePtr volume_, std::string root_path_, std::string part_dir_); + + void setRelativePath(const std::string & path) override; + + bool exists() const override; + bool exists(const std::string & name) const override; + + void createDirectories() override; + void createProjection(const std::string & name) override; + + std::string 
getPartDirectory() const override { return part_dir; } + std::string getFullPath() const override; + std::string getRelativePath() const override; + + std::unique_ptr readFile( + const std::string & name, + const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const override; + + std::unique_ptr writeFile( + const String & name, + size_t buf_size, + const WriteSettings & settings) override; + + void removeFile(const String & name) override; + void removeRecursive() override; + void removeSharedRecursive(bool keep_in_remote_fs) override; + + SyncGuardPtr getDirectorySyncGuard() const override; + + void createHardLinkFrom(const IDataPartStorage & source, const std::string & from, const std::string & to) const override; + + ReservationPtr reserve(UInt64 bytes) override; + + DataPartStorageBuilderPtr getProjection(const std::string & name) const override; + + DataPartStoragePtr getStorage() const override; + +private: + VolumePtr volume; + std::string root_path; + std::string part_dir; +}; + +} diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 0c834564ec4..313bde658cb 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -140,12 +141,12 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedSend}; + if (part->data_part_storage->isStoredOnRemoteDisk()) { - auto disk = part->volume->getDisk(); UInt64 revision = parse(params.get("disk_revision", "0")); if (revision) - disk->syncRevision(revision); - revision = disk->getRevision(); + part->data_part_storage->syncRevision(revision); + revision = part->data_part_storage->getRevision(); if (revision) response.addCookie({"disk_revision", toString(revision)}); } @@ -175,9 +176,8 @@ void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, Write if (data_settings->allow_remote_fs_zero_copy_replication && client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_ZERO_COPY) { - auto disk = part->volume->getDisk(); - auto disk_type = toString(disk->getType()); - if (disk->supportZeroCopyReplication() && std::find(capability.begin(), capability.end(), disk_type) != capability.end()) + auto disk_type = part->data_part_storage->getDiskType(); + if (part->data_part_storage->supportZeroCopyReplication() && std::find(capability.begin(), capability.end(), disk_type) != capability.end()) { /// Send metadata if the receiver's capability covers the source disk type. 
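+            /// (Illustration, not part of the original change: with an S3-backed disk the disk type string is "s3";
+            /// a fetcher that listed "s3" in its capability receives the cookie and is then served remote FS metadata
+            /// via sendPartFromDiskRemoteMeta instead of the data itself. Exact type strings depend on the configured disks.)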
response.addCookie({"remote_fs_metadata", disk_type}); @@ -268,7 +268,7 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk( checksums.files[file_name] = {}; } - auto disk = part->volume->getDisk(); + //auto disk = part->volume->getDisk(); MergeTreeData::DataPart::Checksums data_checksums; for (const auto & [name, projection] : part->getProjectionParts()) { @@ -294,14 +294,12 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk( { String file_name = it.first; - String path = fs::path(part->getFullRelativePath()) / file_name; - - UInt64 size = disk->getFileSize(path); + UInt64 size = part->data_part_storage->getFileSize(file_name); writeStringBinary(it.first, out); writeBinary(size, out); - auto file_in = disk->readFile(path); + auto file_in = part->data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt); HashingWriteBuffer hashing_out(out); copyDataWithThrottler(*file_in, hashing_out, blocker.getCounter(), data.getSendsThrottler()); @@ -309,7 +307,11 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk( throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED); if (hashing_out.count() != size) - throw Exception(ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, "Unexpected size of file {}, expected {} got {}", path, hashing_out.count(), size); + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Unexpected size of file {}, expected {} got {}", + std::string(fs::path(part->data_part_storage->getRelativePath()) / file_name), + hashing_out.count(), size); writePODBinary(hashing_out.getHash(), out); @@ -323,9 +325,12 @@ MergeTreeData::DataPart::Checksums Service::sendPartFromDisk( void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part, WriteBuffer & out) { - auto disk = part->volume->getDisk(); - if (!disk->supportZeroCopyReplication()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Disk '{}' doesn't support zero-copy replication", disk->getName()); + const auto * data_part_storage_on_disk = dynamic_cast(part->data_part_storage.get()); + if (!data_part_storage_on_disk) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Storage '{}' doesn't support zero-copy replication", part->data_part_storage->getDiskName()); + + if (!data_part_storage_on_disk->supportZeroCopyReplication()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Disk '{}' doesn't support zero-copy replication", data_part_storage_on_disk->getDiskName()); /// We'll take a list of files from the list of checksums. MergeTreeData::DataPart::Checksums checksums = part->checksums; @@ -337,10 +342,10 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part std::vector paths; paths.reserve(checksums.files.size()); for (const auto & it : checksums.files) - paths.push_back(fs::path(part->getFullRelativePath()) / it.first); + paths.push_back(fs::path(part->data_part_storage->getRelativePath()) / it.first); /// Serialized metadatadatas with zero ref counts. 
- auto metadatas = disk->getSerializedMetadata(paths); + auto metadatas = data_part_storage_on_disk->getSerializedMetadata(paths); String part_id = part->getUniqueId(); writeStringBinary(part_id, out); @@ -349,10 +354,10 @@ void Service::sendPartFromDiskRemoteMeta(const MergeTreeData::DataPartPtr & part for (const auto & it : checksums.files) { const String & file_name = it.first; - String file_path_prefix = fs::path(part->getFullRelativePath()) / file_name; + String file_path_prefix = fs::path(part->data_part_storage->getRelativePath()) / file_name; /// Just some additional checks - String metadata_file_path = fs::path(disk->getPath()) / file_path_prefix; + String metadata_file_path = fs::path(data_part_storage_on_disk->getDiskPath()) / file_path_prefix; fs::path metadata(metadata_file_path); if (!fs::exists(metadata)) throw Exception(ErrorCodes::CORRUPTED_DATA, "Remote metadata '{}' is not exists", file_name); @@ -603,8 +608,19 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( ThrottlerPtr throttler) { auto volume = std::make_shared("volume_" + part_name, disk, 0); + + auto data_part_storage = std::make_shared( + volume, + data.getRelativeDataPath(), + part_name); + + auto data_part_storage_builder = std::make_shared( + volume, + data.getRelativeDataPath(), + part_name); + MergeTreeData::MutableDataPartPtr new_data_part = - std::make_shared(data, part_name, volume); + std::make_shared(data, part_name, data_part_storage); new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); for (auto i = 0ul; i < projections; ++i) @@ -619,9 +635,12 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( auto block = block_in.read(); throttler->add(block.bytes()); + auto projection_part_storage = data_part_storage->getProjection(projection_name + ".proj"); + auto projection_part_storage_builder = data_part_storage_builder->getProjection(projection_name + ".proj"); + MergeTreePartInfo new_part_info("all", 0, 0, 0); MergeTreeData::MutableDataPartPtr new_projection_part = - std::make_shared(data, projection_name, new_part_info, volume, projection_name, new_data_part.get()); + std::make_shared(data, projection_name, new_part_info, projection_part_storage, new_data_part.get()); new_projection_part->is_temp = false; new_projection_part->setColumns(block.getNamesAndTypesList()); @@ -631,6 +650,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( MergedBlockOutputStream part_out( new_projection_part, + projection_part_storage_builder, metadata_snapshot->projections.get(projection_name).metadata, block.getNamesAndTypesList(), {}, @@ -658,7 +678,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( new_data_part->partition.create(metadata_snapshot, block, 0, context); MergedBlockOutputStream part_out( - new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, + new_data_part, data_part_storage_builder, metadata_snapshot, block.getNamesAndTypesList(), {}, CompressionCodecFactory::instance().get("NONE", {}), NO_TRANSACTION_PTR); part_out.write(block); @@ -670,9 +690,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( void Fetcher::downloadBaseOrProjectionPartToDisk( const String & replica_path, - const String & part_download_path, + DataPartStorageBuilderPtr & data_part_storage_builder, bool sync, - DiskPtr disk, PooledReadWriteBufferFromHTTP & in, MergeTreeData::DataPart::Checksums & checksums, ThrottlerPtr throttler) const @@ -690,14 +709,14 @@ void Fetcher::downloadBaseOrProjectionPartToDisk( /// File must be 
inside "absolute_part_path" directory. /// Otherwise malicious ClickHouse replica may force us to write to arbitrary path. - String absolute_file_path = fs::weakly_canonical(fs::path(part_download_path) / file_name); - if (!startsWith(absolute_file_path, fs::weakly_canonical(part_download_path).string())) + String absolute_file_path = fs::weakly_canonical(fs::path(data_part_storage_builder->getRelativePath()) / file_name); + if (!startsWith(absolute_file_path, fs::weakly_canonical(data_part_storage_builder->getRelativePath()).string())) throw Exception(ErrorCodes::INSECURE_PATH, "File path ({}) doesn't appear to be inside part path ({}). " "This may happen if we are trying to download part from malicious replica or logical error.", - absolute_file_path, part_download_path); + absolute_file_path, data_part_storage_builder->getRelativePath()); - auto file_out = disk->writeFile(fs::path(part_download_path) / file_name); + auto file_out = data_part_storage_builder->writeFile(file_name, file_size, {}); HashingWriteBuffer hashing_out(*file_out); copyDataWithThrottler(in, hashing_out, file_size, blocker.getCounter(), throttler); @@ -706,7 +725,7 @@ void Fetcher::downloadBaseOrProjectionPartToDisk( /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// performing a poll with a not very large timeout. /// And now we check it only between read chunks (in the `copyData` function). - disk->removeRecursive(part_download_path); + data_part_storage_builder->removeRecursive(); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); } @@ -716,7 +735,7 @@ void Fetcher::downloadBaseOrProjectionPartToDisk( if (expected_hash != hashing_out.getHash()) throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Checksum mismatch for file {} transferred from {}", - fullPath(disk, (fs::path(part_download_path) / file_name).string()), + (fs::path(data_part_storage_builder->getFullPath()) / file_name).string(), replica_path); if (file_name != "checksums.txt" && @@ -751,21 +770,33 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( || std::string::npos != part_name.find_first_of("/.")) throw Exception("Logical error: tmp_prefix and part_name cannot be empty or contain '.' or '/' characters.", ErrorCodes::LOGICAL_ERROR); - String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; - String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; + String part_dir = tmp_prefix + part_name; + String part_relative_path = data.getRelativeDataPath() + String(to_detached ? "detached/" : ""); - if (disk->exists(part_download_path)) + auto volume = std::make_shared("volume_" + part_name, disk, 0); + + auto data_part_storage = std::make_shared( + volume, + part_relative_path, + part_dir); + + DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared( + volume, + part_relative_path, + part_dir); + + if (data_part_storage_builder->exists()) { LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. 
Will remove it before fetching part.", - fullPath(disk, part_download_path)); - disk->removeRecursive(part_download_path); + data_part_storage_builder->getFullPath()); + data_part_storage_builder->removeRecursive(); } - disk->createDirectories(part_download_path); + data_part_storage_builder->createDirectories(); SyncGuardPtr sync_guard; if (data.getSettings()->fsync_part_directory) - sync_guard = disk->getDirectorySyncGuard(part_download_path); + sync_guard = disk->getDirectorySyncGuard(data_part_storage->getPartDirectory()); CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; @@ -774,19 +805,22 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( String projection_name; readStringBinary(projection_name, in); MergeTreeData::DataPart::Checksums projection_checksum; - disk->createDirectories(part_download_path + projection_name + ".proj/"); + + auto projection_part_storage = data_part_storage->getProjection(projection_name + ".proj"); + auto projection_part_storage_builder = data_part_storage_builder->getProjection(projection_name + ".proj"); + + projection_part_storage_builder->createDirectories(); downloadBaseOrProjectionPartToDisk( - replica_path, part_download_path + projection_name + ".proj/", sync, disk, in, projection_checksum, throttler); + replica_path, projection_part_storage_builder, sync, in, projection_checksum, throttler); checksums.addFile( projection_name + ".proj", projection_checksum.getTotalSizeOnDisk(), projection_checksum.getTotalChecksumUInt128()); } // Download the base part - downloadBaseOrProjectionPartToDisk(replica_path, part_download_path, sync, disk, in, checksums, throttler); + downloadBaseOrProjectionPartToDisk(replica_path, data_part_storage_builder, sync, in, checksums, throttler); assertEOF(in); - auto volume = std::make_shared("volume_" + part_name, disk, 0); - MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, data_part_storage); new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); new_data_part->is_temp = true; new_data_part->modification_time = time(nullptr); @@ -820,21 +854,31 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( static const String TMP_PREFIX = "tmp-fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; - String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; - String part_download_path = fs::path(data.getRelativeDataPath()) / part_relative_path / ""; + String part_dir = tmp_prefix + part_name; + String part_relative_path = data.getRelativeDataPath() + String(to_detached ? 
"detached/" : ""); - if (disk->exists(part_download_path)) - throw Exception(ErrorCodes::DIRECTORY_ALREADY_EXISTS, "Directory {} already exists.", fullPath(disk, part_download_path)); + auto volume = std::make_shared("volume_" + part_name, disk); + + auto data_part_storage = std::make_shared( + volume, + part_relative_path, + part_dir); + + DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared( + volume, + part_relative_path, + part_dir); + + if (data_part_storage_builder->exists()) + throw Exception(ErrorCodes::DIRECTORY_ALREADY_EXISTS, "Directory {} already exists.", data_part_storage_builder->getFullPath()); CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; - disk->createDirectories(part_download_path); + data_part_storage_builder->createDirectories(); size_t files; readBinary(files, in); - auto volume = std::make_shared("volume_" + part_name, disk); - for (size_t i = 0; i < files; ++i) { String file_name; @@ -843,8 +887,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( readStringBinary(file_name, in); readBinary(file_size, in); - String data_path = fs::path(part_download_path) / file_name; - String metadata_file = fullPath(disk, data_path); + String metadata_file = fs::path(data_part_storage_builder->getFullPath()) / file_name; { auto file_out = std::make_unique(metadata_file, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0); @@ -858,7 +901,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// performing a poll with a not very large timeout. /// And now we check it only between read chunks (in the `copyData` function). - disk->removeSharedRecursive(part_download_path, true, {}); + data_part_storage_builder->removeSharedRecursive(true); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); } @@ -876,7 +919,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( assertEOF(in); - MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, data_part_storage); new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); new_data_part->is_temp = true; new_data_part->modification_time = time(nullptr); diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 8dfcfef9a8b..0e19bf4cdcd 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -90,9 +90,8 @@ public: private: void downloadBaseOrProjectionPartToDisk( const String & replica_path, - const String & part_download_path, + DataPartStorageBuilderPtr & data_part_storage_builder, bool sync, - DiskPtr disk, PooledReadWriteBufferFromHTTP & in, MergeTreeData::DataPart::Checksums & checksums, ThrottlerPtr throttler) const; diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h new file mode 100644 index 00000000000..0e165e74ed0 --- /dev/null +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -0,0 +1,256 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace DB +{ + +class ReadBufferFromFileBase; +class WriteBufferFromFileBase; + + +class IDataPartStorageIterator +{ +public: + /// Iterate to the next file. + virtual void next() = 0; + + /// Return `true` if the iterator points to a valid element. 
+    virtual bool isValid() const = 0;
+
+    /// Return `true` if the iterator points to a file.
+    virtual bool isFile() const = 0;
+
+    /// Name of the file that the iterator currently points to.
+    virtual std::string name() const = 0;
+
+    virtual ~IDataPartStorageIterator() = default;
+};
+
+using DataPartStorageIteratorPtr = std::unique_ptr<IDataPartStorageIterator>;
+
+struct MergeTreeDataPartChecksums;
+
+class IReservation;
+using ReservationPtr = std::unique_ptr<IReservation>;
+
+class IStoragePolicy;
+
+class IDisk;
+using DiskPtr = std::shared_ptr<IDisk>;
+
+class ISyncGuard;
+using SyncGuardPtr = std::unique_ptr<ISyncGuard>;
+
+class IBackupEntry;
+using BackupEntryPtr = std::shared_ptr<const IBackupEntry>;
+using BackupEntries = std::vector<std::pair<String, BackupEntryPtr>>;
+
+struct WriteSettings;
+
+class TemporaryFileOnDisk;
+
+/// This is an abstraction of storage for data part files.
+/// Ideally, it is assumed to contain the read-only methods from IDisk.
+/// That is not fully the case yet, but let's try our best.
+class IDataPartStorage
+{
+public:
+    virtual ~IDataPartStorage() = default;
+
+    /// Methods to get path components of a data part.
+    virtual std::string getFullPath() const = 0;      /// '/var/lib/clickhouse/data/database/table/moving/all_1_5_1'
+    virtual std::string getRelativePath() const = 0;  /// 'database/table/moving/all_1_5_1'
+    virtual std::string getPartDirectory() const = 0; /// 'all_1_5_1'
+    virtual std::string getFullRootPath() const = 0;  /// '/var/lib/clickhouse/data/database/table/moving'
+    /// Can add it if needed                          /// 'database/table/moving'
+    /// virtual std::string getRelativeRootPath() const = 0;
+
+    /// Get a storage for projection.
+    virtual std::shared_ptr<IDataPartStorage> getProjection(const std::string & name) const = 0;
+
+    /// Part directory exists.
+    virtual bool exists() const = 0;
+    /// File inside part directory exists. Specified path is relative to the part path.
+    virtual bool exists(const std::string & name) const = 0;
+    virtual bool isDirectory(const std::string & name) const = 0;
+
+    /// Modification time for part directory.
+    virtual Poco::Timestamp getLastModified() const = 0;
+    /// Iterate part directory. Iteration in subdirectories is not needed yet.
+    virtual DataPartStorageIteratorPtr iterate() const = 0;
+
+    /// Get metadata for a file inside the part directory.
+    virtual size_t getFileSize(const std::string & file_name) const = 0;
+    virtual UInt32 getRefCount(const std::string & file_name) const = 0;
+
+    virtual UInt64 calculateTotalSizeOnDisk() const = 0;
+
+    /// Open the file for read and return ReadBufferFromFileBase object.
+    virtual std::unique_ptr<ReadBufferFromFileBase> readFile(
+        const std::string & name,
+        const ReadSettings & settings,
+        std::optional<size_t> read_hint,
+        std::optional<size_t> file_size) const = 0;
+
+    virtual void loadVersionMetadata(VersionMetadata & version, Poco::Logger * log) const = 0;
+    virtual void checkConsistency(const MergeTreeDataPartChecksums & checksums) const = 0;
+
+    struct ProjectionChecksums
+    {
+        const std::string & name;
+        const MergeTreeDataPartChecksums & checksums;
+    };
+
+    /// Remove data part.
+    /// can_remove_shared_data and names_not_to_remove are specific to DiskObjectStorage.
+    /// projections and checksums are needed to avoid recursive listing.
+    virtual void remove(
+        bool can_remove_shared_data,
+        const NameSet & names_not_to_remove,
+        const MergeTreeDataPartChecksums & checksums,
+        std::list<ProjectionChecksums> projections,
+        Poco::Logger * log) const = 0;
+
+    /// Get a name like 'prefix_partdir_tryN' which does not exist in a root dir.
+    /// TODO: remove it.
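+    /// For example, for a prefix "broken" and part directory "all_1_2_1" the candidates are tried in order:
+    /// "broken_all_1_2_1", "broken_all_1_2_1_try1", "broken_all_1_2_1_try2", ... (this mirrors the loop removed
+    /// from IMergeTreeDataPart::getRelativePathForPrefix below; "broken" is only an illustrative prefix).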
+    virtual std::string getRelativePathForPrefix(Poco::Logger * log, const String & prefix, bool detached) const = 0;
+
+    /// Reset part directory, used for in-memory parts.
+    /// TODO: remove it.
+    virtual void setRelativePath(const std::string & path) = 0;
+
+    /// Some methods from IDisk. Needed to avoid exposing the internal IDisk interface.
+    virtual std::string getDiskName() const = 0;
+    virtual std::string getDiskType() const = 0;
+    virtual bool isStoredOnRemoteDisk() const { return false; }
+    virtual bool supportZeroCopyReplication() const { return false; }
+    virtual bool supportParallelWrite() const = 0;
+    virtual bool isBroken() const = 0;
+    virtual void syncRevision(UInt64 revision) = 0;
+    virtual UInt64 getRevision() const = 0;
+    virtual std::unordered_map<String, String> getSerializedMetadata(const std::vector<String> & paths) const = 0;
+    /// Get a path for internal disk if relevant. It is used mainly for logging.
+    virtual std::string getDiskPath() const = 0;
+
+    /// Check if data part is stored on one of the specified disks in the set.
+    using DisksSet = std::unordered_set<DiskPtr>;
+    virtual DisksSet::const_iterator isStoredOnDisk(const DisksSet & disks) const { return disks.end(); }
+
+    /// Reserve space on the same disk.
+    /// Probably we should try to remove it later.
+    virtual ReservationPtr reserve(UInt64 /*bytes*/) const { return nullptr; }
+    virtual ReservationPtr tryReserve(UInt64 /*bytes*/) const { return nullptr; }
+    virtual size_t getVolumeIndex(const IStoragePolicy &) const { return 0; }
+
+    /// Some methods which change data part internals, possibly after creation.
+    /// Probably we should try to remove them later.
+    virtual void writeChecksums(const MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const = 0;
+    virtual void writeColumns(const NamesAndTypesList & columns, const WriteSettings & settings) const = 0;
+    virtual void writeVersionMetadata(const VersionMetadata & version, bool fsync_part_dir) const = 0;
+    virtual void appendCSNToVersionMetadata(const VersionMetadata & version, VersionMetadata::WhichCSN which_csn) const = 0;
+    virtual void appendRemovalTIDToVersionMetadata(const VersionMetadata & version, bool clear) const = 0;
+    virtual void writeDeleteOnDestroyMarker(Poco::Logger * log) const = 0;
+    virtual void removeDeleteOnDestroyMarker() const = 0;
+    virtual void removeVersionMetadata() const = 0;
+
+    /// A leak of abstraction.
+    /// Return some unique string for a file.
+    /// Required to distinguish different copies of the same part on remote FS.
+    virtual String getUniqueId() const = 0;
+
+    /// A leak of abstraction.
+    virtual bool shallParticipateInMerges(const IStoragePolicy &) const { return true; }
+
+    /// Create a backup of a data part.
+    /// This method adds a new entry to backup_entries.
+    /// Also creates a new tmp dir for the internal disk (if the disk is mentioned for the first time).
+    using TemporaryFilesOnDisks = std::map<DiskPtr, std::shared_ptr<TemporaryFileOnDisk>>;
+    virtual void backup(
+        TemporaryFilesOnDisks & temp_dirs,
+        const MergeTreeDataPartChecksums & checksums,
+        const NameSet & files_without_checksums,
+        BackupEntries & backup_entries) const = 0;
+
+    /// Creates hardlinks into 'to/dir_path' for every file in the data part.
+    /// Callback is called after hardlinks are created, but before the 'delete-on-destroy.txt' marker is removed.
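+    /// Usage sketch, mirroring IMergeTreeDataPart::makeCloneInDetached further down in this diff
+    /// ("detached/all_1_2_1" stands in for getRelativePathForDetachedPart(prefix)):
+    ///     data_part_storage->freeze(storage.relative_data_path, "detached/all_1_2_1",
+    ///                               /*make_source_readonly*/ true, {}, /*copy_instead_of_hardlink*/ false);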
+ virtual std::shared_ptr freeze( + const std::string & to, + const std::string & dir_path, + bool make_source_readonly, + std::function save_metadata_callback, + bool copy_instead_of_hardlink) const = 0; + + /// Make a full copy of a data part into 'to/dir_path' (possibly to a different disk). + virtual std::shared_ptr clone( + const std::string & to, + const std::string & dir_path, + const DiskPtr & disk, + Poco::Logger * log) const = 0; + + /// Rename part. + /// Ideally, new_root_path should be the same as current root (but it is not true). + /// Examples are: 'all_1_2_1' -> 'detached/all_1_2_1' + /// 'moving/tmp_all_1_2_1' -> 'all_1_2_1' + virtual void rename( + const std::string & new_root_path, + const std::string & new_part_dir, + Poco::Logger * log, + bool remove_new_dir_if_exists, + bool fsync_part_dir) = 0; + + /// Change part's root. from_root should be a prefix path of current root path. + /// Right now, this is needed for rename table query. + virtual void changeRootPath(const std::string & from_root, const std::string & to_root) = 0; +}; + +using DataPartStoragePtr = std::shared_ptr; + +/// This interface is needed to write data part. +class IDataPartStorageBuilder +{ +public: + virtual ~IDataPartStorageBuilder() = default; + + /// Reset part directory, used for im-memory parts + virtual void setRelativePath(const std::string & path) = 0; + + virtual std::string getPartDirectory() const = 0; + virtual std::string getFullPath() const = 0; + virtual std::string getRelativePath() const = 0; + + virtual bool exists() const = 0; + virtual bool exists(const std::string & name) const = 0; + + virtual void createDirectories() = 0; + virtual void createProjection(const std::string & name) = 0; + + virtual std::unique_ptr readFile( + const std::string & name, + const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const = 0; + + virtual std::unique_ptr writeFile(const String & name, size_t buf_size, const WriteSettings & settings) = 0; + + virtual void removeFile(const String & name) = 0; + virtual void removeRecursive() = 0; + virtual void removeSharedRecursive(bool keep_in_remote_fs) = 0; + + virtual SyncGuardPtr getDirectorySyncGuard() const { return nullptr; } + + virtual void createHardLinkFrom(const IDataPartStorage & source, const std::string & from, const std::string & to) const = 0; + + virtual ReservationPtr reserve(UInt64 /*bytes*/) { return nullptr; } + + virtual std::shared_ptr getProjection(const std::string & name) const = 0; + + virtual DataPartStoragePtr getStorage() const = 0; +}; + +using DataPartStorageBuilderPtr = std::shared_ptr; + +} diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 12ea2943c8a..3cb3d052b0c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -53,10 +53,8 @@ namespace DB namespace ErrorCodes { - extern const int DIRECTORY_ALREADY_EXISTS; extern const int CANNOT_READ_ALL_DATA; extern const int LOGICAL_ERROR; - extern const int FILE_DOESNT_EXIST; extern const int NO_FILE_IN_DATA_PART; extern const int EXPECTED_END_OF_FILE; extern const int CORRUPTED_DATA; @@ -66,12 +64,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -static std::unique_ptr openForReading(const DiskPtr & disk, const String & path) -{ - size_t file_size = disk->getFileSize(path); - return disk->readFile(path, ReadSettings().adjustBufferSize(file_size), file_size); -} - void IMergeTreeDataPart::MinMaxIndex::load(const 
MergeTreeData & data, const PartMetadataManagerPtr & manager) { auto metadata_snapshot = data.getInMemoryMetadataPtr(); @@ -105,7 +97,7 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Par } IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store( - const MergeTreeData & data, const DiskPtr & disk_, const String & part_path, Checksums & out_checksums) const + const MergeTreeData & data, const DataPartStorageBuilderPtr & data_part_storage_builder, Checksums & out_checksums) const { auto metadata_snapshot = data.getInMemoryMetadataPtr(); const auto & partition_key = metadata_snapshot->getPartitionKey(); @@ -113,19 +105,20 @@ IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::s auto minmax_column_names = data.getMinMaxColumnsNames(partition_key); auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); - return store(minmax_column_names, minmax_column_types, disk_, part_path, out_checksums); + return store(minmax_column_names, minmax_column_types, data_part_storage_builder, out_checksums); } IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store( const Names & column_names, const DataTypes & data_types, - const DiskPtr & disk_, - const String & part_path, + const DataPartStorageBuilderPtr & data_part_storage_builder, Checksums & out_checksums) const { if (!initialized) - throw Exception("Attempt to store uninitialized MinMax index for part " + part_path + ". This is a bug.", - ErrorCodes::LOGICAL_ERROR); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Attempt to store uninitialized MinMax index for part {}. This is a bug", + data_part_storage_builder->getFullPath()); WrittenFiles written_files; @@ -134,7 +127,7 @@ IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::s String file_name = "minmax_" + escapeForFileName(column_names[i]) + ".idx"; auto serialization = data_types.at(i)->getDefaultSerialization(); - auto out = disk_->writeFile(fs::path(part_path) / file_name); + auto out = data_part_storage_builder->writeFile(file_name, DBMS_DEFAULT_BUFFER_SIZE, {}); HashingWriteBuffer out_hashing(*out); serialization->serializeBinary(hyperrectangle[i].left, out_hashing); serialization->serializeBinary(hyperrectangle[i].right, out_hashing); @@ -305,15 +298,13 @@ static void decrementTypeMetric(MergeTreeDataPartType type) IMergeTreeDataPart::IMergeTreeDataPart( const MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, Type part_type_, const IMergeTreeDataPart * parent_part_) : storage(storage_) , name(name_) , info(MergeTreePartInfo::fromPartName(name_, storage.format_version)) - , volume(parent_part_ ? parent_part_->volume : volume_) - , relative_path(relative_path_.value_or(name_)) + , data_part_storage(parent_part_ ? parent_part_->data_part_storage : data_part_storage_) , index_granularity_info(storage_, part_type_) , part_type(part_type_) , parent_part(parent_part_) @@ -333,15 +324,13 @@ IMergeTreeDataPart::IMergeTreeDataPart( const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, Type part_type_, const IMergeTreeDataPart * parent_part_) : storage(storage_) , name(name_) , info(info_) - , volume(parent_part_ ? 
parent_part_->volume : volume_) - , relative_path(relative_path_.value_or(name_)) + , data_part_storage(data_part_storage_) , index_granularity_info(storage_, part_type_) , part_type(part_type_) , parent_part(parent_part_) @@ -479,17 +468,17 @@ void IMergeTreeDataPart::removeIfNeeded() try { - auto path = getFullRelativePath(); + auto path = data_part_storage->getRelativePath(); - if (!volume->getDisk()->exists(path)) + if (!data_part_storage->exists()) // path return; if (is_temp) { - String file_name = fileName(relative_path); + String file_name = fileName(data_part_storage->getPartDirectory()); if (file_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "relative_path {} of part {} is invalid or not set", relative_path, name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "relative_path {} of part {} is invalid or not set", data_part_storage->getPartDirectory(), name); if (!startsWith(file_name, "tmp") && !endsWith(file_name, ".tmp_proj")) { @@ -502,13 +491,7 @@ void IMergeTreeDataPart::removeIfNeeded() } } - if (parent_part) - { - auto [can_remove, _] = canRemovePart(); - projectionRemove(parent_part->getFullRelativePath(), !can_remove); - } - else - remove(); + remove(); if (state == State::DeleteOnDestroy) { @@ -615,26 +598,26 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize( } if (!minimum_size_column) - throw Exception("Could not find a column of minimum size in MergeTree, part " + getFullPath(), ErrorCodes::LOGICAL_ERROR); + throw Exception("Could not find a column of minimum size in MergeTree, part " + data_part_storage->getFullPath(), ErrorCodes::LOGICAL_ERROR); return *minimum_size_column; } -String IMergeTreeDataPart::getFullPath() const -{ - if (relative_path.empty()) - throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); +// String IMergeTreeDataPart::getFullPath() const +// { +// if (relative_path.empty()) +// throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); - return fs::path(storage.getFullPathOnDisk(volume->getDisk())) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; -} +// return fs::path(storage.getFullPathOnDisk(volume->getDisk())) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; +// } -String IMergeTreeDataPart::getFullRelativePath() const -{ - if (relative_path.empty()) - throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); +// String IMergeTreeDataPart::getRelativePath() const +// { +// if (relative_path.empty()) +// throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); - return fs::path(storage.relative_data_path) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; -} +// return fs::path(storage.relative_data_path) / (parent_part ? 
parent_part->relative_path : "") / relative_path / ""; +// } void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency) { @@ -698,7 +681,7 @@ void IMergeTreeDataPart::appendFilesOfColumnsChecksumsIndexes(Strings & files, b Strings projection_files; projection_part->appendFilesOfColumnsChecksumsIndexes(projection_files, true); for (const auto & projection_file : projection_files) - files.push_back(fs::path(projection_part->relative_path) / projection_file); + files.push_back(fs::path(projection_part->name + ".proj") / projection_file); } } } @@ -708,10 +691,11 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch auto metadata_snapshot = storage.getInMemoryMetadataPtr(); for (const auto & projection : metadata_snapshot->projections) { - String path = getFullRelativePath() + projection.name + ".proj"; - if (volume->getDisk()->exists(path)) + String path = /*getRelativePath() + */ projection.name + ".proj"; + if (data_part_storage->exists(path)) { - auto part = storage.createPart(projection.name, {"all", 0, 0, 0}, volume, projection.name + ".proj", this); + auto projection_part_storage = data_part_storage->getProjection(projection.name + ".proj"); + auto part = storage.createPart(projection.name, {"all", 0, 0, 0}, projection_part_storage, this); part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); projection_parts.emplace(projection.name, std::move(part)); } @@ -752,7 +736,7 @@ void IMergeTreeDataPart::loadIndex() } String index_name = "primary.idx"; - String index_path = fs::path(getFullRelativePath()) / index_name; + String index_path = fs::path(data_part_storage->getRelativePath()) / index_name; auto index_file = metadata_manager->read(index_name); size_t marks_count = index_granularity.getMarksCount(); @@ -774,7 +758,7 @@ void IMergeTreeDataPart::loadIndex() } if (!index_file->eof()) - throw Exception("Index file " + fullPath(volume->getDisk(), index_path) + " is unexpectedly long", ErrorCodes::EXPECTED_END_OF_FILE); + throw Exception("Index file " + index_path + " is unexpectedly long", ErrorCodes::EXPECTED_END_OF_FILE); index.assign(std::make_move_iterator(loaded_index.begin()), std::make_move_iterator(loaded_index.end())); } @@ -799,13 +783,11 @@ NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const return {}; NameSet result = {"checksums.txt", "columns.txt"}; - String default_codec_path = fs::path(getFullRelativePath()) / DEFAULT_COMPRESSION_CODEC_FILE_NAME; - String txn_version_path = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; - if (volume->getDisk()->exists(default_codec_path)) + if (data_part_storage->exists(DEFAULT_COMPRESSION_CODEC_FILE_NAME)) result.emplace(DEFAULT_COMPRESSION_CODEC_FILE_NAME); - if (volume->getDisk()->exists(txn_version_path)) + if (data_part_storage->exists(TXN_VERSION_METADATA_FILE_NAME)) result.emplace(TXN_VERSION_METADATA_FILE_NAME); return result; @@ -820,7 +802,7 @@ void IMergeTreeDataPart::loadDefaultCompressionCodec() return; } - String path = fs::path(getFullRelativePath()) / DEFAULT_COMPRESSION_CODEC_FILE_NAME; + String path = fs::path(data_part_storage->getRelativePath()) / DEFAULT_COMPRESSION_CODEC_FILE_NAME; bool exists = metadata_manager->exists(DEFAULT_COMPRESSION_CODEC_FILE_NAME); if (!exists) { @@ -886,10 +868,10 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const { if (path_to_data_file.empty()) { - String candidate_path = fs::path(getFullRelativePath()) / 
(ISerialization::getFileNameForStream(part_column, substream_path) + ".bin"); + String candidate_path = /*fs::path(getRelativePath()) */ (ISerialization::getFileNameForStream(part_column, substream_path) + ".bin"); /// We can have existing, but empty .bin files. Example: LowCardinality(Nullable(...)) columns and column_name.dict.null.bin file. - if (volume->getDisk()->exists(candidate_path) && volume->getDisk()->getFileSize(candidate_path) != 0) + if (data_part_storage->exists(candidate_path) && data_part_storage->getFileSize(candidate_path) != 0) path_to_data_file = candidate_path; } }); @@ -900,7 +882,7 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const continue; } - result = getCompressionCodecForFile(volume->getDisk(), path_to_data_file); + result = getCompressionCodecForFile(data_part_storage, path_to_data_file); break; } } @@ -925,7 +907,7 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex() } else { - String path = getFullRelativePath(); + //String path = getRelativePath(); if (!parent_part) partition.load(storage, metadata_manager); @@ -945,7 +927,7 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex() String calculated_partition_id = partition.getID(metadata_snapshot->getPartitionKey().sample_block); if (calculated_partition_id != info.partition_id) throw Exception( - "While loading part " + getFullPath() + ": calculated partition ID: " + calculated_partition_id + "While loading part " + data_part_storage->getFullPath() + ": calculated partition ID: " + calculated_partition_id + " differs from partition ID in part name: " + info.partition_id, ErrorCodes::CORRUPTED_DATA); } @@ -964,7 +946,7 @@ void IMergeTreeDataPart::appendFilesOfPartitionAndMinMaxIndex(Strings & files) c void IMergeTreeDataPart::loadChecksums(bool require) { - const String path = fs::path(getFullRelativePath()) / "checksums.txt"; + //const String path = fs::path(getRelativePath()) / "checksums.txt"; bool exists = metadata_manager->exists("checksums.txt"); if (exists) { @@ -975,7 +957,7 @@ void IMergeTreeDataPart::loadChecksums(bool require) bytes_on_disk = checksums.getTotalSizeOnDisk(); } else - bytes_on_disk = calculateTotalSizeOnDisk(volume->getDisk(), getFullRelativePath()); + bytes_on_disk = data_part_storage->calculateTotalSizeOnDisk(); //calculateTotalSizeOnDisk(volume->getDisk(), getRelativePath()); } else { @@ -987,13 +969,7 @@ void IMergeTreeDataPart::loadChecksums(bool require) LOG_WARNING(storage.log, "Checksums for part {} not found. 
Will calculate them from data on disk.", name); checksums = checkDataPart(shared_from_this(), false); - - { - auto out = volume->getDisk()->writeFile(fs::path(getFullRelativePath()) / "checksums.txt.tmp", 4096); - checksums.write(*out); - } - - volume->getDisk()->moveFile(fs::path(getFullRelativePath()) / "checksums.txt.tmp", fs::path(getFullRelativePath()) / "checksums.txt"); + data_part_storage->writeChecksums(checksums, {}); bytes_on_disk = checksums.getTotalSizeOnDisk(); } @@ -1006,7 +982,7 @@ void IMergeTreeDataPart::appendFilesOfChecksums(Strings & files) void IMergeTreeDataPart::loadRowsCount() { - String path = fs::path(getFullRelativePath()) / "count.txt"; + //String path = fs::path(getRelativePath()) / "count.txt"; auto read_rows_count = [&]() { @@ -1078,7 +1054,7 @@ void IMergeTreeDataPart::loadRowsCount() } else { - if (volume->getDisk()->exists(path)) + if (data_part_storage->exists("count.txt")) { read_rows_count(); return; @@ -1177,7 +1153,7 @@ void IMergeTreeDataPart::appendFilesOfUUID(Strings & files) void IMergeTreeDataPart::loadColumns(bool require) { - String path = fs::path(getFullRelativePath()) / "columns.txt"; + String path = fs::path(data_part_storage->getRelativePath()) / "columns.txt"; auto metadata_snapshot = storage.getInMemoryMetadataPtr(); if (parent_part) metadata_snapshot = metadata_snapshot->projections.get(name).metadata; @@ -1188,22 +1164,18 @@ void IMergeTreeDataPart::loadColumns(bool require) { /// We can get list of columns only from columns.txt in compact parts. if (require || part_type == Type::Compact) - throw Exception("No columns.txt in part " + name + ", expected path " + path + " on drive " + volume->getDisk()->getName(), + throw Exception("No columns.txt in part " + name + ", expected path " + path + " on drive " + data_part_storage->getDiskName(), ErrorCodes::NO_FILE_IN_DATA_PART); /// If there is no file with a list of columns, write it down. for (const NameAndTypePair & column : metadata_snapshot->getColumns().getAllPhysical()) - if (volume->getDisk()->exists(fs::path(getFullRelativePath()) / (getFileNameForColumn(column) + ".bin"))) + if (data_part_storage->exists(getFileNameForColumn(column) + ".bin")) loaded_columns.push_back(column); if (columns.empty()) throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); - { - auto buf = volume->getDisk()->writeFile(path + ".tmp", 4096); - loaded_columns.writeText(*buf); - } - volume->getDisk()->moveFile(path + ".tmp", path); + data_part_storage->writeColumns(loaded_columns, {}); } else { @@ -1245,7 +1217,7 @@ void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) co name, storage.getStorageID().getNameForLogs(), version.creation_tid, txn ? 
txn->dumpDescription() : ""); assert(!txn || storage.supportsTransactions()); - assert(!txn || volume->getDisk()->exists(fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME)); + assert(!txn || data_part_storage->exists(TXN_VERSION_METADATA_FILE_NAME)); } void IMergeTreeDataPart::storeVersionMetadata() const @@ -1260,24 +1232,7 @@ void IMergeTreeDataPart::storeVersionMetadata() const throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for in-memory parts (table: {}, part: {})", storage.getStorageID().getNameForLogs(), name); - String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; - String tmp_version_file_name = version_file_name + ".tmp"; - DiskPtr disk = volume->getDisk(); - { - /// TODO IDisk interface does not allow to open file with O_EXCL flag (for DiskLocal), - /// so we create empty file at first (expecting that createFile throws if file already exists) - /// and then overwrite it. - disk->createFile(tmp_version_file_name); - auto out = disk->writeFile(tmp_version_file_name, 256, WriteMode::Rewrite); - version.write(*out); - out->finalize(); - out->sync(); - } - - SyncGuardPtr sync_guard; - if (storage.getSettings()->fsync_part_directory) - sync_guard = disk->getDirectorySyncGuard(getFullRelativePath()); - disk->replaceFile(tmp_version_file_name, version_file_name); + data_part_storage->writeVersionMetadata(version, storage.getSettings()->fsync_part_directory); } void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN which_csn) const @@ -1289,16 +1244,7 @@ void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN wh chassert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && version.removal_csn == 0)); chassert(isStoredOnDisk()); - /// Small enough appends to file are usually atomic, - /// so we append new metadata instead of rewriting file to reduce number of fsyncs. - /// We don't need to do fsync when writing CSN, because in case of hard restart - /// we will be able to restore CSN from transaction log in Keeper. 
- - String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; - DiskPtr disk = volume->getDisk(); - auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); - version.writeCSN(*out, which_csn); - out->finalize(); + data_part_storage->appendCSNToVersionMetadata(version, which_csn); } void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const @@ -1321,69 +1267,15 @@ void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const else LOG_TEST(storage.log, "Appending removal TID for {} (creation: {}, removal {})", name, version.creation_tid, version.removal_tid); - String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; - DiskPtr disk = volume->getDisk(); - auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); - version.writeRemovalTID(*out, clear); - out->finalize(); - - /// fsync is not required when we clearing removal TID, because after hard restart we will fix metadata - if (!clear) - out->sync(); + data_part_storage->appendRemovalTIDToVersionMetadata(version, clear); } void IMergeTreeDataPart::loadVersionMetadata() const try { - String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; - String tmp_version_file_name = version_file_name + ".tmp"; - DiskPtr disk = volume->getDisk(); + data_part_storage->loadVersionMetadata(version, storage.log); - auto remove_tmp_file = [&]() - { - auto last_modified = disk->getLastModified(tmp_version_file_name); - auto buf = openForReading(disk, tmp_version_file_name); - String content; - readStringUntilEOF(content, *buf); - LOG_WARNING(storage.log, "Found file {} that was last modified on {}, has size {} and the following content: {}", - tmp_version_file_name, last_modified.epochTime(), content.size(), content); - disk->removeFile(tmp_version_file_name); - }; - if (disk->exists(version_file_name)) - { - auto buf = openForReading(disk, version_file_name); - version.read(*buf); - if (disk->exists(tmp_version_file_name)) - remove_tmp_file(); - return; - } - - /// Four (?) cases are possible: - /// 1. Part was created without transactions. - /// 2. Version metadata file was not renamed from *.tmp on part creation. - /// 3. Version metadata were written to *.tmp file, but hard restart happened before fsync. - /// 4. Fsyncs in storeVersionMetadata() work incorrectly. - - if (!disk->exists(tmp_version_file_name)) - { - /// Case 1. - /// We do not have version metadata and transactions history for old parts, - /// so let's consider that such parts were created by some ancient transaction - /// and were committed with some prehistoric CSN. - /// NOTE It might be Case 3, but version metadata file is written on part creation before other files, - /// so it's not Case 3 if part is not broken. - version.setCreationTID(Tx::PrehistoricTID, nullptr); - version.creation_csn = Tx::PrehistoricCSN; - return; - } - - /// Case 2. - /// Content of *.tmp file may be broken, just use fake TID. - /// Transaction was not committed if *.tmp file was not renamed, so we should complete rollback by removing part. 
- version.setCreationTID(Tx::DummyTID, nullptr); - version.creation_csn = Tx::RolledBackCSN; - remove_tmp_file(); } catch (Exception & e) { @@ -1420,15 +1312,16 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const if (state == State::Temporary) return true; - DiskPtr disk = volume->getDisk(); - if (!disk->exists(getFullRelativePath())) + if (!data_part_storage->exists()) return true; String content; - String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + String version_file_name = TXN_VERSION_METADATA_FILE_NAME; try { - auto buf = openForReading(disk, version_file_name); + size_t file_size = data_part_storage->getFileSize(TXN_VERSION_METADATA_FILE_NAME); + auto buf = data_part_storage->readFile(TXN_VERSION_METADATA_FILE_NAME, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); + readStringUntilEOF(content, *buf); ReadBufferFromString str_buf{content}; VersionMetadata file; @@ -1462,25 +1355,20 @@ void IMergeTreeDataPart::appendFilesOfColumns(Strings & files) bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const { - /// `IMergeTreeDataPart::volume` describes space where current part belongs, and holds - /// `SingleDiskVolume` object which does not contain up-to-date settings of corresponding volume. - /// Therefore we shall obtain volume from storage policy. - auto volume_ptr = storage_policy->getVolume(storage_policy->getVolumeIndexByDisk(volume->getDisk())); - - return !volume_ptr->areMergesAvoided(); + return data_part_storage->shallParticipateInMerges(*storage_policy); } -UInt64 IMergeTreeDataPart::calculateTotalSizeOnDisk(const DiskPtr & disk_, const String & from) -{ - if (disk_->isFile(from)) - return disk_->getFileSize(from); - std::vector files; - disk_->listFiles(from, files); - UInt64 res = 0; - for (const auto & file : files) - res += calculateTotalSizeOnDisk(disk_, fs::path(from) / file); - return res; -} +// UInt64 IMergeTreeDataPart::calculateTotalSizeOnDisk(const DataPartStoragePtr & data_part_storage_, const String & from) +// { +// if (data_part_storage_->isFile(from)) +// return data_part_storage_->getFileSize(from); +// std::vector files; +// disk_->listFiles(from, files); +// UInt64 res = 0; +// for (const auto & file : files) +// res += calculateTotalSizeOnDisk(data_part_storage_, fs::path(from) / file); +// return res; +// } void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const @@ -1488,39 +1376,27 @@ try { assertOnDisk(); - String from = getFullRelativePath(); - String to = fs::path(storage.relative_data_path) / (parent_part ? parent_part->relative_path : "") / new_relative_path / ""; + std::string relative_path = storage.relative_data_path; + bool fsync_dir = storage.getSettings()->fsync_part_directory; - if (!volume->getDisk()->exists(from)) - throw Exception("Part directory " + fullPath(volume->getDisk(), from) + " doesn't exist. Most likely it is a logical error.", ErrorCodes::FILE_DOESNT_EXIST); - - if (volume->getDisk()->exists(to)) + if (parent_part) { - if (remove_new_dir_if_exists) - { - Names files; - volume->getDisk()->listFiles(to, files); - - LOG_WARNING(storage.log, "Part directory {} already exists and contains {} files. 
Removing it.", fullPath(volume->getDisk(), to), files.size()); - - volume->getDisk()->removeRecursive(to); - } - else - { - throw Exception("Part directory " + fullPath(volume->getDisk(), to) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); - } + /// For projections, move is only possible inside parent part dir. + relative_path = parent_part->data_part_storage->getRelativePath(); } + String from = data_part_storage->getRelativePath(); + auto to = fs::path(relative_path) / new_relative_path; + metadata_manager->deleteAll(true); metadata_manager->assertAllDeleted(true); - volume->getDisk()->setLastModified(from, Poco::Timestamp::fromEpochTime(time(nullptr))); - volume->getDisk()->moveDirectory(from, to); - relative_path = new_relative_path; + data_part_storage->rename(to.parent_path(), to.filename(), storage.log, remove_new_dir_if_exists, fsync_dir); metadata_manager->updateAll(true); - SyncGuardPtr sync_guard; - if (storage.getSettings()->fsync_part_directory) - sync_guard = volume->getDisk()->getDirectorySyncGuard(to); + for (const auto & [p_name, part] : projection_parts) + { + part->data_part_storage = data_part_storage->getProjection(p_name + ".proj"); + } } catch (...) { @@ -1566,190 +1442,24 @@ void IMergeTreeDataPart::remove() const if (!isStoredOnDisk()) return; - if (relative_path.empty()) - throw Exception("Part relative_path cannot be empty. This is bug.", ErrorCodes::LOGICAL_ERROR); - if (isProjectionPart()) - { LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name); - projectionRemove(parent_part->getFullRelativePath(), !can_remove); - return; - } metadata_manager->deleteAll(false); metadata_manager->assertAllDeleted(false); - /** Atomic directory removal: - * - rename directory to temporary name; - * - remove it recursive. - * - * For temporary name we use "delete_tmp_" prefix. - * - * NOTE: We cannot use "tmp_delete_" prefix, because there is a second thread, - * that calls "clearOldTemporaryDirectories" and removes all directories, that begin with "tmp_" and are old enough. - * But when we removing data part, it can be old enough. And rename doesn't change mtime. - * And a race condition can happen that will lead to "File not found" error here. - */ + std::list projection_checksums; - - /// NOTE We rename part to delete_tmp_ instead of delete_tmp_ to avoid race condition - /// when we try to remove two parts with the same name, but different relative paths, - /// for example all_1_2_1 (in Deleting state) and tmp_merge_all_1_2_1 (in Temporary state). - fs::path from = fs::path(storage.relative_data_path) / relative_path; - - /// Cut last "/" if it exists (it shouldn't). Otherwise fs::path behave differently. - fs::path relative_path_without_slash = relative_path.ends_with("/") ? relative_path.substr(0, relative_path.size() - 1) : relative_path; - - /// NOTE relative_path can contain not only part name itself, but also some prefix like - /// "moving/all_1_1_1" or "detached/all_2_3_5". We should handle this case more properly. - fs::path to = fs::path(storage.relative_data_path); - if (relative_path_without_slash.has_parent_path()) - { - auto parent_path = relative_path_without_slash.parent_path(); - if (parent_path == "detached") - throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to remove detached part {} with path {} in remove function. 
It shouldn't happen", name, relative_path); - - to /= parent_path / ("delete_tmp_" + std::string{relative_path_without_slash.filename()}); - } - else - { - to /= ("delete_tmp_" + std::string{relative_path_without_slash}); - } - - // TODO directory delete_tmp_ is never removed if server crashes before returning from this function - - auto disk = volume->getDisk(); - if (disk->exists(to)) - { - LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to)); - try - { - disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); - } - catch (...) - { - LOG_ERROR(storage.log, "Cannot recursively remove directory {}. Exception: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - throw; - } - } - - try - { - disk->moveDirectory(from, to); - } - catch (const fs::filesystem_error & e) - { - if (e.code() == std::errc::no_such_file_or_directory) - { - LOG_ERROR(storage.log, "Directory {} (part to remove) doesn't exist or one of nested files has gone. Most likely this is due to manual removing. This should be discouraged. Ignoring.", fullPath(disk, from)); - return; - } - throw; - } - - // Record existing projection directories so we don't remove them twice - std::unordered_set projection_directories; for (const auto & [p_name, projection_part] : projection_parts) { - /// NOTE: projections currently unsupported with zero copy replication. - /// TODO: fix it. - projection_part->projectionRemove(to, !can_remove); - projection_directories.emplace(p_name + ".proj"); + projection_part->metadata_manager->deleteAll(false); + projection_part->metadata_manager->assertAllDeleted(false); + projection_checksums.emplace_back(IDataPartStorage::ProjectionChecksums{.name = p_name, .checksums = projection_part->checksums}); } - - if (checksums.empty()) - { - /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); - } - else - { - try - { - /// Remove each expected file in directory, then remove directory itself. - IDisk::RemoveBatchRequest request; - - for (const auto & [file, _] : checksums.files) - { - if (projection_directories.find(file) == projection_directories.end()) - request.emplace_back(fs::path(to) / file); - } - - for (const auto & file : {"checksums.txt", "columns.txt"}) - request.emplace_back(fs::path(to) / file); - - request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true); - request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); - request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true); - - disk->removeSharedFiles(request, !can_remove, files_not_to_remove); - disk->removeDirectory(to); - } - catch (...) - { - /// Recursive directory removal does many excessive "stat" syscalls under the hood. - LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. 
Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - - disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); - } - } + data_part_storage->remove(can_remove, files_not_to_remove, checksums, projection_checksums, storage.log); } - -void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_shared_data) const -{ - metadata_manager->deleteAll(false); - metadata_manager->assertAllDeleted(false); - - String to = fs::path(parent_to) / relative_path; - auto disk = volume->getDisk(); - if (checksums.empty()) - { - - LOG_ERROR( - storage.log, - "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: checksums.txt is missing", - fullPath(disk, to)); - /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {}); - } - else - { - try - { - /// Remove each expected file in directory, then remove directory itself. - IDisk::RemoveBatchRequest request; - - #if !defined(__clang__) - # pragma GCC diagnostic push - # pragma GCC diagnostic ignored "-Wunused-variable" - #endif - for (const auto & [file, _] : checksums.files) - request.emplace_back(fs::path(to) / file); - #if !defined(__clang__) - # pragma GCC diagnostic pop - #endif - - for (const auto & file : {"checksums.txt", "columns.txt"}) - request.emplace_back(fs::path(to) / file); - request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true); - request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); - - disk->removeSharedFiles(request, keep_shared_data, {}); - disk->removeSharedRecursive(to, keep_shared_data, {}); - } - catch (...) - { - /// Recursive directory removal does many excessive "stat" syscalls under the hood. - - LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {}); - } - } - } - String IMergeTreeDataPart::getRelativePathForPrefix(const String & prefix, bool detached) const { String res; @@ -1760,25 +1470,10 @@ String IMergeTreeDataPart::getRelativePathForPrefix(const String & prefix, bool * No more than 10 attempts are made so that there are not too many junk directories left. */ - auto full_relative_path = fs::path(storage.relative_data_path); - if (detached) - full_relative_path /= "detached"; if (detached && parent_part) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot detach projection"); - else if (parent_part) - full_relative_path /= parent_part->relative_path; - for (int try_no = 0; try_no < 10; ++try_no) - { - res = (prefix.empty() ? "" : prefix + "_") + name + (try_no ? "_try" + DB::toString(try_no) : ""); - - if (!volume->getDisk()->exists(full_relative_path / res)) - return res; - - LOG_WARNING(storage.log, "Directory {} (to detach to) already exists. 
Will detach to directory with '_tryN' suffix.", res); - } - - return res; + return data_part_storage->getRelativePathForPrefix(storage.log, prefix, detached); } String IMergeTreeDataPart::getRelativePathForDetachedPart(const String & prefix) const @@ -1799,36 +1494,29 @@ void IMergeTreeDataPart::renameToDetached(const String & prefix) const void IMergeTreeDataPart::makeCloneInDetached(const String & prefix, const StorageMetadataPtr & /*metadata_snapshot*/) const { - String destination_path = fs::path(storage.relative_data_path) / getRelativePathForDetachedPart(prefix); - localBackup(volume->getDisk(), getFullRelativePath(), destination_path); - volume->getDisk()->removeFileIfExists(fs::path(destination_path) / DELETE_ON_DESTROY_MARKER_FILE_NAME); + data_part_storage->freeze( + storage.relative_data_path, + getRelativePathForDetachedPart(prefix), + /*make_source_readonly*/ true, + {}, + /*copy_instead_of_hardlink*/ false); } -void IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const +DataPartStoragePtr IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const { assertOnDisk(); - if (disk->getName() == volume->getDisk()->getName()) - throw Exception("Can not clone data part " + name + " to same disk " + volume->getDisk()->getName(), ErrorCodes::LOGICAL_ERROR); + if (disk->getName() == data_part_storage->getDiskName()) + throw Exception("Can not clone data part " + name + " to same disk " + data_part_storage->getDiskName(), ErrorCodes::LOGICAL_ERROR); if (directory_name.empty()) throw Exception("Can not clone data part " + name + " to empty directory.", ErrorCodes::LOGICAL_ERROR); String path_to_clone = fs::path(storage.relative_data_path) / directory_name / ""; - - if (disk->exists(fs::path(path_to_clone) / relative_path)) - { - LOG_WARNING(storage.log, "Path {} already exists. 
Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path)); - disk->removeRecursive(fs::path(path_to_clone) / relative_path / ""); - } - disk->createDirectories(path_to_clone); - volume->getDisk()->copy(getFullRelativePath(), disk, path_to_clone); - volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / DELETE_ON_DESTROY_MARKER_FILE_NAME); + return data_part_storage->clone(path_to_clone, data_part_storage->getPartDirectory(), disk, storage.log); } void IMergeTreeDataPart::checkConsistencyBase() const { - String path = getFullRelativePath(); - auto metadata_snapshot = storage.getInMemoryMetadataPtr(); if (parent_part) metadata_snapshot = metadata_snapshot->projections.get(name).metadata; @@ -1862,33 +1550,37 @@ void IMergeTreeDataPart::checkConsistencyBase() const } } - checksums.checkSizes(volume->getDisk(), path); + data_part_storage->checkConsistency(checksums); } else { - auto check_file_not_empty = [&path](const DiskPtr & disk_, const String & file_path) + auto check_file_not_empty = [this](const String & file_path) { UInt64 file_size; - if (!disk_->exists(file_path) || (file_size = disk_->getFileSize(file_path)) == 0) - throw Exception("Part " + fullPath(disk_, path) + " is broken: " + fullPath(disk_, file_path) + " is empty", ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + if (!data_part_storage->exists(file_path) || (file_size = data_part_storage->getFileSize(file_path)) == 0) + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: {} is empty", + data_part_storage->getFullPath(), + std::string(fs::path(data_part_storage->getFullPath()) / file_path)); return file_size; }; /// Check that the primary key index is not empty. if (!pk.column_names.empty()) - check_file_not_empty(volume->getDisk(), fs::path(path) / "primary.idx"); + check_file_not_empty("primary.idx"); if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { - check_file_not_empty(volume->getDisk(), fs::path(path) / "count.txt"); + check_file_not_empty("count.txt"); if (metadata_snapshot->hasPartitionKey()) - check_file_not_empty(volume->getDisk(), fs::path(path) / "partition.dat"); + check_file_not_empty("partition.dat"); if (!parent_part) { for (const String & col_name : storage.getMinMaxColumnsNames(partition_key)) - check_file_not_empty(volume->getDisk(), fs::path(path) / ("minmax_" + escapeForFileName(col_name) + ".idx")); + check_file_not_empty("minmax_" + escapeForFileName(col_name) + ".idx"); } } } @@ -2018,11 +1710,7 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada String IMergeTreeDataPart::getUniqueId() const { - auto disk = volume->getDisk(); - if (!disk->supportZeroCopyReplication()) - throw Exception(fmt::format("Disk {} doesn't support zero-copy replication", disk->getName()), ErrorCodes::LOGICAL_ERROR); - - return disk->getUniqueId(fs::path(getFullRelativePath()) / FILE_FOR_REFERENCES_CHECK); + return data_part_storage->getUniqueId(); } String IMergeTreeDataPart::getZeroLevelPartBlockID(std::string_view token) const @@ -2050,11 +1738,10 @@ String IMergeTreeDataPart::getZeroLevelPartBlockID(std::string_view token) const return info.partition_id + "_" + toString(hash_value.words[0]) + "_" + toString(hash_value.words[1]); } -IMergeTreeDataPart::uint128 IMergeTreeDataPart::getActualChecksumByFile(const String & file_path) const +IMergeTreeDataPart::uint128 IMergeTreeDataPart::getActualChecksumByFile(const String & file_name) const { assert(use_metadata_cache); - String file_name = 
std::filesystem::path(file_path).filename(); const auto filenames_without_checksums = getFileNamesWithoutChecksums(); auto it = checksums.files.find(file_name); if (!filenames_without_checksums.contains(file_name) && it != checksums.files.end()) @@ -2062,11 +1749,11 @@ IMergeTreeDataPart::uint128 IMergeTreeDataPart::getActualChecksumByFile(const St return it->second.file_hash; } - if (!volume->getDisk()->exists(file_path)) + if (!data_part_storage->exists(file_name)) { return {}; } - std::unique_ptr in_file = volume->getDisk()->readFile(file_path); + std::unique_ptr in_file = data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt); HashingReadBuffer in_hash(*in_file); String value; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 8dacc6833d3..66a6fe065f7 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -70,16 +71,14 @@ public: const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume, - const std::optional & relative_path, + const DataPartStoragePtr & data_part_storage_, Type part_type_, const IMergeTreeDataPart * parent_part_); IMergeTreeDataPart( const MergeTreeData & storage_, const String & name_, - const VolumePtr & volume, - const std::optional & relative_path, + const DataPartStoragePtr & data_part_storage_, Type part_type_, const IMergeTreeDataPart * parent_part_); @@ -90,16 +89,17 @@ public: UncompressedCache * uncompressed_cache, MarkCache * mark_cache, const MergeTreeReaderSettings & reader_settings_, - const ValueSizeMap & avg_value_size_hints_ = ValueSizeMap{}, - const ReadBufferFromFileBase::ProfileCallback & profile_callback_ = ReadBufferFromFileBase::ProfileCallback{}) const = 0; + const ValueSizeMap & avg_value_size_hints_, + const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0; virtual MergeTreeWriterPtr getWriter( + DataPartStorageBuilderPtr data_part_storage_builder, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, - const MergeTreeIndexGranularity & computed_index_granularity = {}) const = 0; + const MergeTreeIndexGranularity & computed_index_granularity) const = 0; virtual bool isStoredOnDisk() const = 0; @@ -150,8 +150,6 @@ public: void remove() const; - void projectionRemove(const String & parent_to, bool keep_shared_data = false) const; - /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load checksums from checksums.txt if exists. Load index if required. void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency); @@ -199,12 +197,10 @@ public: /// processed by multiple shards. UUID uuid = UUIDHelpers::Nil; - VolumePtr volume; + /// This is an object which encapsulates all the operations with disk. + /// Contains a path to stored data. + DataPartStoragePtr data_part_storage; - /// A directory path (relative to storage's path) where part data is actually stored - /// Examples: 'detached/tmp_fetch_', 'tmp_', '' - /// NOTE: Cannot have trailing slash. 
- mutable String relative_path; MergeTreeIndexGranularityInfo index_granularity_info; size_t rows_count = 0; @@ -313,8 +309,8 @@ public: using WrittenFiles = std::vector>; - [[nodiscard]] WrittenFiles store(const MergeTreeData & data, const DiskPtr & disk_, const String & part_path, Checksums & checksums) const; - [[nodiscard]] WrittenFiles store(const Names & column_names, const DataTypes & data_types, const DiskPtr & disk_, const String & part_path, Checksums & checksums) const; + [[nodiscard]] WrittenFiles store(const MergeTreeData & data, const DataPartStorageBuilderPtr & data_part_storage_builder, Checksums & checksums) const; + [[nodiscard]] WrittenFiles store(const Names & column_names, const DataTypes & data_types, const DataPartStorageBuilderPtr & data_part_storage_builder, Checksums & checksums) const; void update(const Block & block, const Names & column_names); void merge(const MinMaxIndex & other); @@ -344,12 +340,6 @@ public: size_t getFileSizeOrZero(const String & file_name) const; - /// Returns path to part dir relatively to disk mount point - String getFullRelativePath() const; - - /// Returns full path to part dir - String getFullPath() const; - /// Moves a part to detached/ directory and adds prefix to its name void renameToDetached(const String & prefix) const; @@ -361,7 +351,7 @@ public: virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const; /// Makes full clone of part in specified subdirectory (relative to storage data directory, e.g. "detached") on another disk - void makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const; + DataPartStoragePtr makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const; /// Checks that .bin and .mrk files exist. /// @@ -374,9 +364,6 @@ public: /// settings of given storage policy. bool shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const; - /// Calculate the total size of the entire directory with all the files - static UInt64 calculateTotalSizeOnDisk(const DiskPtr & disk_, const String & from); - /// Calculate column and secondary indices sizes on disk. 
void calculateColumnsAndSecondaryIndicesSizesOnDisk(); @@ -467,7 +454,7 @@ public: UInt32 getNumberOfRefereneces() const; /// Get checksums of metadata file in part directory - IMergeTreeDataPart::uint128 getActualChecksumByFile(const String & file_path) const; + IMergeTreeDataPart::uint128 getActualChecksumByFile(const String & file_name) const; /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index 3d6b12efdf1..84d0b50ae2f 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -39,11 +39,13 @@ Block permuteBlockIfNeeded(const Block & block, const IColumn::Permutation * per IMergeTreeDataPartWriter::IMergeTreeDataPartWriter( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) : data_part(data_part_) + , data_part_storage_builder(std::move(data_part_storage_builder_)) , storage(data_part_->storage) , metadata_snapshot(metadata_snapshot_) , columns_list(columns_list_) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index 34c53eda846..417e2713180 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -23,6 +23,7 @@ class IMergeTreeDataPartWriter : private boost::noncopyable public: IMergeTreeDataPartWriter( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_, @@ -42,6 +43,7 @@ public: protected: const MergeTreeData::DataPartPtr data_part; + DataPartStorageBuilderPtr data_part_storage_builder; const MergeTreeData & storage; const StorageMetadataPtr metadata_snapshot; const NamesAndTypesList columns_list; diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 3a823345dda..b8aeb8e6a5a 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -73,7 +73,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); throw; } } @@ -119,7 +119,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); throw; } } @@ -198,7 +198,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const catch (Exception & e) { /// Better diagnostics. 
- e.addMessage("(while reading from part " + data_part->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); throw; } } diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp index 2f37c4278b6..48cf720ad67 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.cpp @@ -6,14 +6,14 @@ namespace DB { IMergedBlockOutputStream::IMergedBlockOutputStream( + DataPartStorageBuilderPtr data_part_storage_builder_, const MergeTreeDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list, bool reset_columns_) : storage(data_part->storage) , metadata_snapshot(metadata_snapshot_) - , volume(data_part->volume) - , part_path(data_part->isStoredOnDisk() ? data_part->getFullRelativePath() : "") + , data_part_storage_builder(std::move(data_part_storage_builder_)) , reset_columns(reset_columns_) { if (reset_columns) diff --git a/src/Storages/MergeTree/IMergedBlockOutputStream.h b/src/Storages/MergeTree/IMergedBlockOutputStream.h index 5706596af0f..3b94b85607a 100644 --- a/src/Storages/MergeTree/IMergedBlockOutputStream.h +++ b/src/Storages/MergeTree/IMergedBlockOutputStream.h @@ -12,6 +12,7 @@ class IMergedBlockOutputStream { public: IMergedBlockOutputStream( + DataPartStorageBuilderPtr data_part_storage_builder_, const MergeTreeDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list, @@ -44,9 +45,7 @@ protected: const MergeTreeData & storage; StorageMetadataPtr metadata_snapshot; - VolumePtr volume; - String part_path; - + DataPartStorageBuilderPtr data_part_storage_builder; IMergeTreeDataPart::MergeTreeWriterPtr writer; bool reset_columns = false; diff --git a/src/Storages/MergeTree/IPartMetadataManager.cpp b/src/Storages/MergeTree/IPartMetadataManager.cpp index 5e24ac2c0e1..d09fc9d4244 100644 --- a/src/Storages/MergeTree/IPartMetadataManager.cpp +++ b/src/Storages/MergeTree/IPartMetadataManager.cpp @@ -5,7 +5,7 @@ namespace DB { -IPartMetadataManager::IPartMetadataManager(const IMergeTreeDataPart * part_) : part(part_), disk(part->volume->getDisk()) +IPartMetadataManager::IPartMetadataManager(const IMergeTreeDataPart * part_) : part(part_) { } } diff --git a/src/Storages/MergeTree/IPartMetadataManager.h b/src/Storages/MergeTree/IPartMetadataManager.h index 876000de412..c1bf3b15805 100644 --- a/src/Storages/MergeTree/IPartMetadataManager.h +++ b/src/Storages/MergeTree/IPartMetadataManager.h @@ -52,7 +52,6 @@ public: protected: const IMergeTreeDataPart * part; - const DiskPtr disk; }; using PartMetadataManagerPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index 67a9e108bc6..048c460c549 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -147,7 +147,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() for (auto & part_ptr : parts) { ttl_infos.update(part_ptr->ttl_infos); - max_volume_index = std::max(max_volume_index, storage.getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk())); + max_volume_index = std::max(max_volume_index, part_ptr->data_part_storage->getVolumeIndex(*storage.getStoragePolicy())); } /// It will live until the whole task is being destroyed diff --git a/src/Storages/MergeTree/MergeList.cpp 
b/src/Storages/MergeTree/MergeList.cpp index 8722ddc5a82..d866345defe 100644 --- a/src/Storages/MergeTree/MergeList.cpp +++ b/src/Storages/MergeTree/MergeList.cpp @@ -64,7 +64,7 @@ MergeListElement::MergeListElement( for (const auto & source_part : future_part->parts) { source_part_names.emplace_back(source_part->name); - source_part_paths.emplace_back(source_part->getFullPath()); + source_part_paths.emplace_back(source_part->data_part_storage->getFullPath()); total_size_bytes_compressed += source_part->getBytesOnDisk(); total_size_marks += source_part->getMarksCount(); diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 892d15fe4a0..f16d22f553a 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -120,12 +121,24 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() ctx->disk = global_ctx->space_reservation->getDisk(); - String local_part_path = global_ctx->data->relative_data_path; String local_tmp_part_basename = local_tmp_prefix + global_ctx->future_part->name + local_tmp_suffix; - String local_new_part_tmp_path = local_part_path + local_tmp_part_basename + "/"; - if (ctx->disk->exists(local_new_part_tmp_path)) - throw Exception("Directory " + fullPath(ctx->disk, local_new_part_tmp_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + if (global_ctx->parent_path_storage_builder) + { + global_ctx->data_part_storage_builder = global_ctx->parent_path_storage_builder->getProjection(local_tmp_part_basename); + } + else + { + auto local_single_disk_volume = std::make_shared("volume_" + global_ctx->future_part->name, ctx->disk, 0); + + global_ctx->data_part_storage_builder = std::make_shared( + local_single_disk_volume, + global_ctx->data->relative_data_path, + local_tmp_part_basename); + } + + if (global_ctx->data_part_storage_builder->exists()) + throw Exception("Directory " + global_ctx->data_part_storage_builder->getFullPath() + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); global_ctx->data->temporary_parts.add(local_tmp_part_basename); SCOPE_EXIT( @@ -149,13 +162,13 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->merging_columns, global_ctx->merging_column_names); - auto local_single_disk_volume = std::make_shared("volume_" + global_ctx->future_part->name, ctx->disk, 0); + auto data_part_storage = global_ctx->data_part_storage_builder->getStorage(); + global_ctx->new_data_part = global_ctx->data->createPart( global_ctx->future_part->name, global_ctx->future_part->type, global_ctx->future_part->part_info, - local_single_disk_volume, - local_tmp_part_basename, + data_part_storage, global_ctx->parent_part); global_ctx->new_data_part->uuid = global_ctx->future_part->uuid; @@ -289,6 +302,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->to = std::make_shared( global_ctx->new_data_part, + global_ctx->data_part_storage_builder, global_ctx->metadata_snapshot, global_ctx->merging_columns, MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), @@ -479,6 +493,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const ctx->executor = std::make_unique(ctx->column_parts_pipeline); ctx->column_to = std::make_unique( + global_ctx->data_part_storage_builder, global_ctx->new_data_part, global_ctx->metadata_snapshot, ctx->executor->getHeader(), @@ -632,6 +647,7 @@ bool 
MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c global_ctx->deduplicate_by_columns, projection_merging_params, global_ctx->new_data_part.get(), + global_ctx->data_part_storage_builder.get(), ".proj", NO_TRANSACTION_PTR, global_ctx->data, diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index 0a7675b2268..bb86a5072e0 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -60,6 +60,7 @@ public: Names deduplicate_by_columns_, MergeTreeData::MergingParams merging_params_, const IMergeTreeDataPart * parent_part_, + const IDataPartStorageBuilder * parent_path_storage_builder_, String suffix_, MergeTreeTransactionPtr txn, MergeTreeData * data_, @@ -81,6 +82,7 @@ public: global_ctx->deduplicate = std::move(deduplicate_); global_ctx->deduplicate_by_columns = std::move(deduplicate_by_columns_); global_ctx->parent_part = std::move(parent_part_); + global_ctx->parent_path_storage_builder = std::move(parent_path_storage_builder_); global_ctx->data = std::move(data_); global_ctx->mutator = std::move(mutator_); global_ctx->merges_blocker = std::move(merges_blocker_); @@ -135,6 +137,7 @@ private: FutureMergedMutatedPartPtr future_part{nullptr}; /// This will be either nullptr or new_data_part, so raw pointer is ok. const IMergeTreeDataPart * parent_part{nullptr}; + const IDataPartStorageBuilder * parent_path_storage_builder{nullptr}; ContextPtr context{nullptr}; time_t time_of_merge{0}; ReservationSharedPtr space_reservation{nullptr}; @@ -160,6 +163,7 @@ private: std::unique_ptr merging_executor; MergeTreeData::MutableDataPartPtr new_data_part{nullptr}; + DataPartStorageBuilderPtr data_part_storage_builder; size_t rows_written{0}; UInt64 watch_prev_elapsed{0}; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index dea6c8b0066..df864a2725d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1,9 +1,10 @@ #include +#include #include #include #include -#include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -1015,7 +1018,8 @@ void MergeTreeData::loadDataPartsFromDisk( return; const auto & part_info = *part_opt; auto single_disk_volume = std::make_shared("volume_" + part_name, part_disk_ptr, 0); - auto part = createPart(part_name, part_info, single_disk_volume, part_name); + auto data_part_storage = std::make_shared(single_disk_volume, relative_data_path, part_name); + auto part = createPart(part_name, part_info, data_part_storage); bool broken = false; String part_path = fs::path(relative_data_path) / part_name; @@ -1023,7 +1027,7 @@ void MergeTreeData::loadDataPartsFromDisk( if (part_disk_ptr->exists(marker_path)) { /// NOTE: getBytesOnDisk() cannot be used here, since it maybe zero of checksums.txt will not exist - size_t size_of_part = IMergeTreeDataPart::calculateTotalSizeOnDisk(part->volume->getDisk(), part->getFullRelativePath()); + size_t size_of_part = data_part_storage->calculateTotalSizeOnDisk(); LOG_WARNING(log, "Detaching stale part {}{} (size: {}), which should have been deleted after a move. 
" "That can only happen after unclean restart of ClickHouse after move of a part having an operation blocking that stale copy of part.", @@ -1060,7 +1064,7 @@ void MergeTreeData::loadDataPartsFromDisk( if (broken) { /// NOTE: getBytesOnDisk() cannot be used here, since it maybe zero of checksums.txt will not exist - size_t size_of_part = IMergeTreeDataPart::calculateTotalSizeOnDisk(part->volume->getDisk(), part->getFullRelativePath()); + size_t size_of_part = data_part_storage->calculateTotalSizeOnDisk(); LOG_ERROR(log, "Detaching broken part {}{} (size: {}). " @@ -1089,7 +1093,7 @@ void MergeTreeData::loadDataPartsFromDisk( { if ((*it)->checksums.getTotalChecksumHex() == part->checksums.getTotalChecksumHex()) { - LOG_ERROR(log, "Remove duplicate part {}", part->getFullPath()); + LOG_ERROR(log, "Remove duplicate part {}", data_part_storage->getFullPath()); duplicate_parts_to_remove.push_back(part); } else @@ -1189,7 +1193,7 @@ void MergeTreeData::loadDataPartsFromWAL( { if ((*it)->checksums.getTotalChecksumHex() == part->checksums.getTotalChecksumHex()) { - LOG_ERROR(log, "Remove duplicate part {}", part->getFullPath()); + LOG_ERROR(log, "Remove duplicate part {}", part->data_part_storage->getFullPath()); duplicate_parts_to_remove.push_back(part); } else @@ -1265,7 +1269,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) disk_parts.emplace_back(std::make_pair(it->name(), disk_ptr)); else if (it->name() == MergeTreeWriteAheadLog::DEFAULT_WAL_FILE_NAME && settings->in_memory_parts_enable_wal) { - std::unique_lock lock(wal_init_lock); + std::lock_guard lock(wal_init_lock); if (write_ahead_log != nullptr) throw Exception( "There are multiple WAL files appeared in current storage policy. You need to resolve this manually", @@ -1750,11 +1754,9 @@ void MergeTreeData::flushAllInMemoryPartsIfNeeded() { if (auto part_in_memory = asInMemoryPart(part)) { - const auto & storage_relative_path = part_in_memory->storage.relative_data_path; - part_in_memory->flushToDisk(storage_relative_path, part_in_memory->relative_path, metadata_snapshot); + part_in_memory->flushToDisk(part_in_memory->data_part_storage->getPartDirectory(), metadata_snapshot); } } - } size_t MergeTreeData::clearOldPartsFromFilesystem(bool force) @@ -2019,6 +2021,9 @@ void MergeTreeData::rename(const String & new_table_path, const StorageID & new_ if (!getStorageID().hasUUID()) getContext()->dropCaches(); + for (const auto & part : data_parts_by_info) + part->data_part_storage->changeRootPath(relative_data_path, new_table_path); + relative_data_path = new_table_path; renameInMemory(new_table_id); @@ -2563,16 +2568,16 @@ MergeTreeDataPartType MergeTreeData::choosePartTypeOnDisk(size_t bytes_uncompres MergeTreeData::MutableDataPartPtr MergeTreeData::createPart(const String & name, MergeTreeDataPartType type, const MergeTreePartInfo & part_info, - const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part) const + const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part) const { if (type == MergeTreeDataPartType::Compact) - return std::make_shared(*this, name, part_info, volume, relative_path, parent_part); + return std::make_shared(*this, name, part_info, data_part_storage, parent_part); else if (type == MergeTreeDataPartType::Wide) - return std::make_shared(*this, name, part_info, volume, relative_path, parent_part); + return std::make_shared(*this, name, part_info, data_part_storage, parent_part); else if (type == MergeTreeDataPartType::InMemory) - return 
std::make_shared(*this, name, part_info, volume, relative_path, parent_part); + return std::make_shared(*this, name, part_info, data_part_storage, parent_part); else - throw Exception("Unknown type of part " + relative_path, ErrorCodes::UNKNOWN_PART_TYPE); + throw Exception("Unknown type of part " + data_part_storage->getRelativePath(), ErrorCodes::UNKNOWN_PART_TYPE); } static MergeTreeDataPartType getPartTypeFromMarkExtension(const String & mrk_ext) @@ -2588,18 +2593,17 @@ static MergeTreeDataPartType getPartTypeFromMarkExtension(const String & mrk_ext } MergeTreeData::MutableDataPartPtr MergeTreeData::createPart( - const String & name, const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part) const + const String & name, const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part) const { - return createPart(name, MergeTreePartInfo::fromPartName(name, format_version), volume, relative_path, parent_part); + return createPart(name, MergeTreePartInfo::fromPartName(name, format_version), data_part_storage, parent_part); } MergeTreeData::MutableDataPartPtr MergeTreeData::createPart( const String & name, const MergeTreePartInfo & part_info, - const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part) const + const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part) const { MergeTreeDataPartType type; - auto full_path = fs::path(relative_data_path) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; - auto mrk_ext = MergeTreeIndexGranularityInfo::getMarksExtensionFromFilesystem(volume->getDisk(), full_path); + auto mrk_ext = MergeTreeIndexGranularityInfo::getMarksExtensionFromFilesystem(data_part_storage); if (mrk_ext) type = getPartTypeFromMarkExtension(*mrk_ext); @@ -2609,7 +2613,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::createPart( type = choosePartTypeOnDisk(0, 0); } - return createPart(name, type, part_info, volume, relative_path, parent_part); + return createPart(name, type, part_info, data_part_storage, parent_part); } void MergeTreeData::changeSettings( @@ -2857,7 +2861,7 @@ bool MergeTreeData::renameTempPartAndReplace( else /// Parts from ReplicatedMergeTree already have names part_name = part->name; - LOG_TRACE(log, "Renaming temporary part {} to {}.", part->relative_path, part_name); + LOG_TRACE(log, "Renaming temporary part {} to {}.", part->data_part_storage->getPartDirectory(), part_name); if (auto it_duplicate = data_parts_by_info.find(part_info); it_duplicate != data_parts_by_info.end()) { @@ -2906,16 +2910,18 @@ bool MergeTreeData::renameTempPartAndReplace( part->renameTo(part_name, true); auto part_it = data_parts_indexes.insert(part).first; - /// FIXME Transactions: it's not the best place for checking and setting removal_tid, - /// because it's too optimistic. We should lock removal_tid of covered parts at the beginning of operation. - MergeTreeTransaction::addNewPartAndRemoveCovered(shared_from_this(), part, covered_parts, txn); if (out_transaction) { + chassert(out_transaction->txn == txn); out_transaction->precommitted_parts.insert(part); } else { + /// FIXME Transactions: it's not the best place for checking and setting removal_tid, + /// because it's too optimistic. We should lock removal_tid of covered parts at the beginning of operation. 
+ MergeTreeTransaction::addNewPartAndRemoveCovered(shared_from_this(), part, covered_parts, txn); + size_t reduce_bytes = 0; size_t reduce_rows = 0; size_t reduce_parts = 0; @@ -3153,9 +3159,9 @@ void MergeTreeData::restoreAndActivatePart(const DataPartPtr & part, DataPartsLo void MergeTreeData::forgetPartAndMoveToDetached(const MergeTreeData::DataPartPtr & part_to_detach, const String & prefix, bool restore_covered) { if (prefix.empty()) - LOG_INFO(log, "Renaming {} to {} and forgetting it.", part_to_detach->relative_path, part_to_detach->name); + LOG_INFO(log, "Renaming {} to {} and forgetting it.", part_to_detach->data_part_storage->getPartDirectory(), part_to_detach->name); else - LOG_INFO(log, "Renaming {} to {}_{} and forgetting it.", part_to_detach->relative_path, prefix, part_to_detach->name); + LOG_INFO(log, "Renaming {} to {}_{} and forgetting it.", part_to_detach->data_part_storage->getPartDirectory(), prefix, part_to_detach->name); auto lock = lockParts(); bool removed_active_part = false; @@ -3522,8 +3528,8 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) /// when allow_remote_fs_zero_copy_replication turned on and off again original_active_part->force_keep_shared_data = false; - if (original_active_part->volume->getDisk()->supportZeroCopyReplication() && - part_copy->isStoredOnRemoteDiskWithZeroCopySupport() && + if (original_active_part->data_part_storage->supportZeroCopyReplication() && + part_copy->data_part_storage->supportZeroCopyReplication() && original_active_part->getUniqueId() == part_copy->getUniqueId()) { /// May be when several volumes use the same S3/HDFS storage @@ -3544,16 +3550,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) /// All other locks are taken in StorageReplicatedMergeTree lockSharedData(*part_copy); - auto disk = original_active_part->volume->getDisk(); - String marker_path = fs::path(original_active_part->getFullRelativePath()) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME; - try - { - disk->createFile(marker_path); - } - catch (Poco::Exception & e) - { - LOG_ERROR(log, "{} (while creating DeleteOnDestroy marker: {})", e.what(), backQuote(fullPath(disk, marker_path))); - } + original_active_part->data_part_storage->writeDeleteOnDestroyMarker(log); return; } } @@ -3578,6 +3575,13 @@ MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartiti return getVisibleDataPartsVectorInPartition(local_context->getCurrentTransaction().get(), partition_id); } + +MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartition( + ContextPtr local_context, const String & partition_id, DataPartsLock & lock) const +{ + return getVisibleDataPartsVectorInPartition(local_context->getCurrentTransaction().get(), partition_id, &lock); +} + MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartition( MergeTreeTransaction * txn, const String & partition_id, DataPartsLock * acquired_lock) const { @@ -3679,13 +3683,10 @@ MergeTreeData::DataPartPtr MergeTreeData::getPartIfExists(const String & part_na static void loadPartAndFixMetadataImpl(MergeTreeData::MutableDataPartPtr part) { - auto disk = part->volume->getDisk(); - String full_part_path = part->getFullRelativePath(); - part->loadColumnsChecksumsIndexes(false, true); - part->modification_time = disk->getLastModified(full_part_path).epochTime(); - disk->removeFileIfExists(fs::path(full_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); - 
disk->removeFileIfExists(fs::path(full_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); + part->modification_time = part->data_part_storage->getLastModified().epochTime(); + part->data_part_storage->removeDeleteOnDestroyMarker(); + part->data_part_storage->removeVersionMetadata(); } void MergeTreeData::calculateColumnAndSecondaryIndexSizesImpl() @@ -3845,7 +3846,7 @@ void MergeTreeData::movePartitionToDisk(const ASTPtr & partition, const String & auto disk = getStoragePolicy()->getDiskByName(name); std::erase_if(parts, [&](auto part_ptr) { - return part_ptr->volume->getDisk()->getName() == disk->getName(); + return part_ptr->data_part_storage->getDiskName() == disk->getName(); }); if (parts.empty()) @@ -3895,7 +3896,7 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String { for (const auto & disk : volume->getDisks()) { - if (part_ptr->volume->getDisk()->getName() == disk->getName()) + if (part_ptr->data_part_storage->getDiskName() == disk->getName()) { return true; } @@ -4058,178 +4059,173 @@ Pipe MergeTreeData::alterPartition( } -BackupEntries MergeTreeData::backupData(ContextPtr local_context, const ASTs & partitions) +void MergeTreeData::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { - DataPartsVector data_parts; - if (partitions.empty()) - data_parts = getVisibleDataPartsVector(local_context); - else - data_parts = getVisibleDataPartsVectorInPartitions(local_context, getPartitionIDsFromQuery(partitions, local_context)); - return backupDataParts(data_parts); + backup_entries_collector.addBackupEntries(backupParts(backup_entries_collector.getContext(), data_path_in_backup, partitions)); } - -BackupEntries MergeTreeData::backupDataParts(const DataPartsVector & data_parts) +BackupEntries MergeTreeData::backupParts(const ContextPtr & local_context, const String & data_path_in_backup, const std::optional & partitions) const { + DataPartsVector data_parts; + if (partitions) + data_parts = getVisibleDataPartsVectorInPartitions(local_context, getPartitionIDsFromQuery(*partitions, local_context)); + else + data_parts = getVisibleDataPartsVector(local_context); + BackupEntries backup_entries; std::map> temp_dirs; + fs::path data_path_in_backup_fs = data_path_in_backup; for (const auto & part : data_parts) - { - auto disk = part->volume->getDisk(); + part->data_part_storage->backup(temp_dirs, part->checksums, part->getFileNamesWithoutChecksums(), backup_entries); - auto temp_dir_it = temp_dirs.find(disk); - if (temp_dir_it == temp_dirs.end()) - temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/backup_")).first; - auto temp_dir_owner = temp_dir_it->second; - fs::path temp_dir = temp_dir_owner->getPath(); - - fs::path part_dir = part->getFullRelativePath(); - fs::path temp_part_dir = temp_dir / part->relative_path; - disk->createDirectories(temp_part_dir); - - for (const auto & [filepath, checksum] : part->checksums.files) - { - String relative_filepath = fs::path(part->relative_path) / filepath; - String hardlink_filepath = temp_part_dir / filepath; - disk->createHardLink(part_dir / filepath, hardlink_filepath); - UInt128 file_hash{checksum.file_hash.first, checksum.file_hash.second}; - backup_entries.emplace_back( - relative_filepath, - std::make_unique(disk, hardlink_filepath, checksum.file_size, file_hash, temp_dir_owner)); - } - - for (const auto & filepath : part->getFileNamesWithoutChecksums()) - { - String relative_filepath = 
fs::path(part->relative_path) / filepath; - backup_entries.emplace_back(relative_filepath, std::make_unique(disk, part_dir / filepath)); - } - } + /// TODO: try to write better code later. + for (auto & entry : backup_entries) + entry.first = data_path_in_backup_fs / entry.first; return backup_entries; } +void MergeTreeData::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + auto backup = restorer.getBackup(); + if (!restorer.isNonEmptyTableAllowed() && getTotalActiveSizeInBytes() && backup->hasFiles(data_path_in_backup)) + restorer.throwTableIsNotEmpty(getStorageID()); -class MergeTreeDataRestoreTask : public IRestoreTask + restorePartsFromBackup(restorer, data_path_in_backup, partitions); +} + +class MergeTreeData::RestoredPartsHolder { public: - MergeTreeDataRestoreTask( - const std::shared_ptr & storage_, - const BackupPtr & backup_, - const String & data_path_in_backup_, - const std::unordered_set & partition_ids_, - SimpleIncrement * increment_) - : storage(storage_) - , backup(backup_) - , data_path_in_backup(data_path_in_backup_) - , partition_ids(partition_ids_) - , increment(increment_) + RestoredPartsHolder(const std::shared_ptr & storage_, const BackupPtr & backup_, size_t num_parts_) + : storage(storage_), backup(backup_), num_parts(num_parts_) { } - RestoreTasks run() override + BackupPtr getBackup() const { return backup; } + + void setNumParts(size_t num_parts_) { - RestoreTasks restore_part_tasks; - Strings part_names = backup->listFiles(data_path_in_backup); - for (const String & part_name : part_names) - { - const auto part_info = MergeTreePartInfo::tryParsePartName(part_name, storage->format_version); - if (!part_info) - continue; + std::lock_guard lock{mutex}; + num_parts = num_parts_; + attachIfAllPartsRestored(); + } - if (!partition_ids.empty() && !partition_ids.contains(part_info->partition_id)) - continue; - - restore_part_tasks.push_back( - std::make_unique(storage, backup, data_path_in_backup, part_name, *part_info, increment)); - } - return restore_part_tasks; + void addPart(MutableDataPartPtr part, std::shared_ptr temp_part_dir_owner) + { + std::lock_guard lock{mutex}; + parts.emplace_back(part); + temp_part_dir_owners.emplace_back(temp_part_dir_owner); + attachIfAllPartsRestored(); } private: + void attachIfAllPartsRestored() + { + if (!num_parts || (parts.size() < num_parts)) + return; + + /// Sort parts by min_block (because we need to preserve the order of parts). 
+ std::sort( + parts.begin(), + parts.end(), + [](const MutableDataPartPtr & lhs, const MutableDataPartPtr & rhs) { return lhs->info.min_block < rhs->info.min_block; }); + + storage->attachRestoredParts(std::move(parts)); + parts.clear(); + temp_part_dir_owners.clear(); + num_parts = 0; + } + std::shared_ptr storage; BackupPtr backup; - String data_path_in_backup; - std::unordered_set partition_ids; - SimpleIncrement * increment; - - class RestorePartTask : public IRestoreTask - { - public: - RestorePartTask( - const std::shared_ptr & storage_, - const BackupPtr & backup_, - const String & data_path_in_backup_, - const String & part_name_, - const MergeTreePartInfo & part_info_, - SimpleIncrement * increment_) - : storage(storage_) - , backup(backup_) - , data_path_in_backup(data_path_in_backup_) - , part_name(part_name_) - , part_info(part_info_) - , increment(increment_) - { - } - - RestoreTasks run() override - { - UInt64 total_size_of_part = 0; - Strings filenames = backup->listFiles(data_path_in_backup + part_name + "/", ""); - for (const String & filename : filenames) - total_size_of_part += backup->getFileSize(data_path_in_backup + part_name + "/" + filename); - - std::shared_ptr reservation = storage->getStoragePolicy()->reserveAndCheck(total_size_of_part); - auto disk = reservation->getDisk(); - String relative_data_path = storage->getRelativeDataPath(); - - auto temp_part_dir_owner = std::make_shared(disk, relative_data_path + "restoring_" + part_name + "_"); - String temp_part_dir = temp_part_dir_owner->getPath(); - disk->createDirectories(temp_part_dir); - - assert(temp_part_dir.starts_with(relative_data_path)); - String relative_temp_part_dir = temp_part_dir.substr(relative_data_path.size()); - - for (const String & filename : filenames) - { - auto backup_entry = backup->readFile(fs::path(data_path_in_backup) / part_name / filename); - auto read_buffer = backup_entry->getReadBuffer(); - auto write_buffer = disk->writeFile(fs::path(temp_part_dir) / filename); - copyData(*read_buffer, *write_buffer); - reservation->update(reservation->getSize() - backup_entry->getSize()); - } - - auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); - auto part = storage->createPart(part_name, part_info, single_disk_volume, relative_temp_part_dir); - /// TODO Transactions: Decide what to do with version metadata (if any). Let's just remove it for now. 
- disk->removeFileIfExists(fs::path(temp_part_dir) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); - part->version.setCreationTID(Tx::PrehistoricTID, nullptr); - part->loadColumnsChecksumsIndexes(false, true); - storage->renameTempPartAndAdd(part, NO_TRANSACTION_RAW, increment); - return {}; - } - - private: - std::shared_ptr storage; - BackupPtr backup; - String data_path_in_backup; - String part_name; - MergeTreePartInfo part_info; - SimpleIncrement * increment; - }; + size_t num_parts = 0; + MutableDataPartsVector parts; + std::vector> temp_part_dir_owners; + mutable std::mutex mutex; }; - -RestoreTaskPtr MergeTreeData::restoreDataParts(const std::unordered_set & partition_ids, - const BackupPtr & backup, const String & data_path_in_backup, - SimpleIncrement * increment) +void MergeTreeData::restorePartsFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) { - return std::make_unique( - std::static_pointer_cast(shared_from_this()), backup, data_path_in_backup, partition_ids, increment); + std::optional> partition_ids; + if (partitions) + partition_ids = getPartitionIDsFromQuery(*partitions, restorer.getContext()); + + auto backup = restorer.getBackup(); + Strings part_names = backup->listFiles(data_path_in_backup); + auto restored_parts_holder + = std::make_shared(std::static_pointer_cast(shared_from_this()), backup, part_names.size()); + + fs::path data_path_in_backup_fs = data_path_in_backup; + size_t num_parts = 0; + + for (const String & part_name : part_names) + { + const auto part_info = MergeTreePartInfo::tryParsePartName(part_name, format_version); + if (!part_info) + continue; + + if (partition_ids && !partition_ids->contains(part_info->partition_id)) + continue; + + restorer.addDataRestoreTask( + [storage = std::static_pointer_cast(shared_from_this()), + backup, + part_path_in_backup = data_path_in_backup_fs / part_name, + part_info=*part_info, + restored_parts_holder] + { storage->restorePartFromBackup(restored_parts_holder, part_info, part_path_in_backup); }); + + ++num_parts; + } + + restored_parts_holder->setNumParts(num_parts); +} + +void MergeTreeData::restorePartFromBackup(std::shared_ptr restored_parts_holder, const MergeTreePartInfo & part_info, const String & part_path_in_backup) +{ + auto backup = restored_parts_holder->getBackup(); + + UInt64 total_size_of_part = 0; + Strings filenames = backup->listFiles(part_path_in_backup, /* recursive= */ true); + fs::path part_path_in_backup_fs = part_path_in_backup; + for (const String & filename : filenames) + total_size_of_part += backup->getFileSize(part_path_in_backup_fs / filename); + + std::shared_ptr reservation = getStoragePolicy()->reserveAndCheck(total_size_of_part); + auto disk = reservation->getDisk(); + + String part_name = part_info.getPartName(); + auto temp_part_dir_owner = std::make_shared(disk, relative_data_path + "restoring_" + part_name + "_"); + String temp_part_dir = temp_part_dir_owner->getPath(); + disk->createDirectories(temp_part_dir); + + assert(temp_part_dir.starts_with(relative_data_path)); + String relative_temp_part_dir = temp_part_dir.substr(relative_data_path.size()); + + for (const String & filename : filenames) + { + auto backup_entry = backup->readFile(part_path_in_backup_fs / filename); + auto read_buffer = backup_entry->getReadBuffer(); + auto write_buffer = disk->writeFile(fs::path(temp_part_dir) / filename); + copyData(*read_buffer, *write_buffer); + reservation->update(reservation->getSize() - backup_entry->getSize()); + 
} + + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + auto data_part_storage = std::make_shared(single_disk_volume, relative_data_path, relative_temp_part_dir); + auto part = createPart(part_name, part_info, data_part_storage); + /// TODO Transactions: Decide what to do with version metadata (if any). Let's just remove it for now. + disk->removeFileIfExists(fs::path(temp_part_dir) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); + part->version.setCreationTID(Tx::PrehistoricTID, nullptr); + part->loadColumnsChecksumsIndexes(false, true); + + restored_parts_holder->addPart(part, temp_part_dir_owner); } -String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr local_context) const +String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr local_context, DataPartsLock * acquired_lock) const { const auto & partition_ast = ast->as(); @@ -4313,7 +4309,7 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc String partition_id = partition.getID(*this); { - auto data_parts_lock = lockParts(); + auto data_parts_lock = (acquired_lock) ? DataPartsLock() : lockParts(); DataPartPtr existing_part_in_partition = getAnyPartInPartition(partition_id, data_parts_lock); if (existing_part_in_partition && existing_part_in_partition->partition.value != partition.value) { @@ -4660,7 +4656,8 @@ MergeTreeData::MutableDataPartsVector MergeTreeData::tryLoadPartsToAttach(const LOG_DEBUG(log, "Checking part {}", new_name); auto single_disk_volume = std::make_shared("volume_" + old_name, disk); - MutableDataPartPtr part = createPart(old_name, single_disk_volume, source_dir + new_name); + auto data_part_storage = std::make_shared(single_disk_volume, relative_data_path, source_dir + new_name); + MutableDataPartPtr part = createPart(old_name, data_part_storage); loadPartAndFixMetadataImpl(part); loaded_parts.push_back(part); @@ -4695,6 +4692,24 @@ ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size, SpacePtr space) return checkAndReturnReservation(expected_size, std::move(reservation)); } +ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size, const DataPartStoragePtr & data_part_storage) +{ + expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); + return data_part_storage->reserve(expected_size); +} + +ReservationPtr MergeTreeData::reserveSpace(UInt64 expected_size, const DataPartStorageBuilderPtr & data_part_storage_builder) +{ + expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); + return data_part_storage_builder->reserve(expected_size); +} + +ReservationPtr MergeTreeData::tryReserveSpace(UInt64 expected_size, const DataPartStoragePtr & data_part_storage) +{ + expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); + return data_part_storage->tryReserve(expected_size); +} + ReservationPtr MergeTreeData::tryReserveSpace(UInt64 expected_size, SpacePtr space) { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); @@ -4811,11 +4826,11 @@ bool MergeTreeData::isPartInTTLDestination(const TTLDescription & ttl, const IMe if (ttl.destination_type == DataDestinationType::VOLUME) { for (const auto & disk : policy->getVolumeByName(ttl.destination_name)->getDisks()) - if (disk->getName() == part.volume->getDisk()->getName()) + if (disk->getName() == part.data_part_storage->getDiskName()) return true; } else if (ttl.destination_type == DataDestinationType::DISK) - return policy->getDiskByName(ttl.destination_name)->getName() == 
part.volume->getDisk()->getName(); + return policy->getDiskByName(ttl.destination_name)->getName() == part.data_part_storage->getDiskName(); return false; } @@ -4879,7 +4894,7 @@ void MergeTreeData::Transaction::rollbackPartsToTemporaryState() WriteBufferFromOwnString buf; buf << " Rollbacking parts state to temporary and removing from working set:"; for (const auto & part : precommitted_parts) - buf << " " << part->relative_path; + buf << " " << part->data_part_storage->getPartDirectory(); buf << "."; LOG_DEBUG(data.log, "Undoing transaction.{}", buf.str()); @@ -4897,22 +4912,10 @@ void MergeTreeData::Transaction::rollback() WriteBufferFromOwnString buf; buf << " Removing parts:"; for (const auto & part : precommitted_parts) - buf << " " << part->relative_path; + buf << " " << part->data_part_storage->getPartDirectory(); buf << "."; LOG_DEBUG(data.log, "Undoing transaction.{}", buf.str()); - if (!txn) - { - auto lock = data.lockParts(); - for (const auto & part : precommitted_parts) - { - DataPartPtr covering_part; - DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, lock); - for (auto & covered : covered_parts) - covered->version.unlockRemovalTID(Tx::PrehistoricTID, TransactionInfoContext{data.getStorageID(), covered->name}); - } - } - data.removePartsFromWorkingSet(txn, DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()), /* clear_without_timeout = */ true); @@ -4930,6 +4933,18 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: auto parts_lock = acquired_parts_lock ? MergeTreeData::DataPartsLock() : data.lockParts(); auto * owing_parts_lock = acquired_parts_lock ? acquired_parts_lock : &parts_lock; + if (txn) + { + for (const DataPartPtr & part : precommitted_parts) + { + DataPartPtr covering_part; + DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock); + MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn); + } + } + + NOEXCEPT_SCOPE; + auto current_time = time(nullptr); size_t add_bytes = 0; @@ -4953,6 +4968,9 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: } else { + if (!txn) + MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, NO_TRANSACTION_RAW); + total_covered_parts.insert(total_covered_parts.end(), covered_parts.begin(), covered_parts.end()); for (const auto & covered_part : covered_parts) { @@ -5559,8 +5577,9 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg // If optimize_aggregation_in_order = true, we need additional information to transform the projection's pipeline. 
auto attach_aggregation_in_order_info = [&]() { - for (const auto & key : keys) + for (const auto & desc : select.getQueryAnalyzer()->aggregationKeys()) { + const String & key = desc.name; auto actions_dag = analysis_result.before_aggregation->clone(); actions_dag->foldActionsByProjection({key}, sample_block_for_keys); candidate.group_by_elements_actions.emplace_back(std::make_shared(actions_dag, actions_settings)); @@ -5901,7 +5920,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( bool does_storage_policy_allow_same_disk = false; for (const DiskPtr & disk : getStoragePolicy()->getDisks()) { - if (disk->getName() == src_part->volume->getDisk()->getName()) + if (disk->getName() == src_part->data_part_storage->getDiskName()) { does_storage_policy_allow_same_disk = true; break; @@ -5909,50 +5928,45 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( } if (!does_storage_policy_allow_same_disk) throw Exception( - "Could not clone and load part " + quoteString(src_part->getFullPath()) + " because disk does not belong to storage policy", - ErrorCodes::BAD_ARGUMENTS); + ErrorCodes::BAD_ARGUMENTS, + "Could not clone and load part {} because disk does not belong to storage policy", + quoteString(src_part->data_part_storage->getFullPath())); String dst_part_name = src_part->getNewName(dst_part_info); assert(!tmp_part_prefix.empty()); String tmp_dst_part_name = tmp_part_prefix + dst_part_name; - auto reservation = reserveSpace(src_part->getBytesOnDisk(), src_part->volume->getDisk()); - auto disk = reservation->getDisk(); - String src_part_path = src_part->getFullRelativePath(); - String dst_part_path = relative_data_path + tmp_dst_part_name; + /// Why it is needed if we only hardlink files? + auto reservation = src_part->data_part_storage->reserve(src_part->getBytesOnDisk()); - if (disk->exists(dst_part_path)) - throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + auto src_part_storage = src_part->data_part_storage; /// If source part is in memory, flush it to disk and clone it already in on-disk format if (auto src_part_in_memory = asInMemoryPart(src_part)) { - const auto & src_relative_data_path = src_part_in_memory->storage.relative_data_path; auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); - src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot); - src_part_path = fs::path(src_relative_data_path) / flushed_part_path / ""; + src_part_storage = src_part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot); } String with_copy; if (copy_instead_of_hardlink) with_copy = " (copying data)"; - LOG_DEBUG(log, "Cloning part {} to {}{}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path), with_copy); + LOG_DEBUG(log, "Cloning part {} to {}{}", + src_part_storage->getFullPath(), + std::string(fs::path(src_part_storage->getFullRootPath()) / tmp_dst_part_name), + with_copy); - localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false, {}, /* copy_instead_of_hardlinks */ copy_instead_of_hardlink); + auto dst_part_storage = src_part_storage->freeze(relative_data_path, tmp_dst_part_name, /* make_source_readonly */ false, {}, /* copy_instead_of_hardlinks */ copy_instead_of_hardlink); - disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); - disk->removeFileIfExists(fs::path(dst_part_path) / 
IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); - - auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); - auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); + auto dst_data_part = createPart(dst_part_name, dst_part_info, dst_part_storage); if (!copy_instead_of_hardlink && hardlinked_files) { hardlinked_files->source_part_name = src_part->name; hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); - for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next()) + for (auto it = src_part->data_part_storage->iterate(); it->isValid(); it->next()) { if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) hardlinked_files->hardlinks_from_source_part.insert(it->name()); @@ -5967,7 +5981,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( dst_data_part->is_temp = true; dst_data_part->loadColumnsChecksumsIndexes(require_part_metadata, true); - dst_data_part->modification_time = disk->getLastModified(dst_part_path).epochTime(); + dst_data_part->modification_time = dst_part_storage->getLastModified().epochTime(); return dst_data_part; } @@ -6009,14 +6023,14 @@ Strings MergeTreeData::getDataPaths() const void MergeTreeData::reportBrokenPart(MergeTreeData::DataPartPtr & data_part) const { - if (data_part->volume && data_part->volume->getDisk()->isBroken()) + if (data_part->data_part_storage && data_part->data_part_storage->isBroken()) { - auto disk = data_part->volume->getDisk(); auto parts = getDataPartsForInternalUsage(); - LOG_WARNING(log, "Scanning parts to recover on broken disk {}.", disk->getName() + "@" + disk->getPath()); + LOG_WARNING(log, "Scanning parts to recover on broken disk {}@{}.", data_part->data_part_storage->getDiskName(), data_part->data_part_storage->getDiskPath()); + for (const auto & part : parts) { - if (part->volume && part->volume->getDisk()->getName() == disk->getName()) + if (part->data_part_storage && part->data_part_storage->getDiskName() == data_part->data_part_storage->getDiskName()) broken_part_callback(part->name); } } @@ -6105,33 +6119,36 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path); - auto disk = part->volume->getDisk(); - - disk->createDirectories(backup_path); - - String src_part_path = part->getFullRelativePath(); - String backup_part_path = fs::path(backup_path) / relative_data_path / part->relative_path; + auto data_part_storage = part->data_part_storage; + String src_part_path = data_part_storage->getRelativePath(); + String backup_part_path = fs::path(backup_path) / relative_data_path; if (auto part_in_memory = asInMemoryPart(part)) { auto flushed_part_path = part_in_memory->getRelativePathForPrefix("tmp_freeze"); - part_in_memory->flushToDisk(relative_data_path, flushed_part_path, metadata_snapshot); - src_part_path = fs::path(relative_data_path) / flushed_part_path / ""; + data_part_storage = part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot); } - localBackup(disk, src_part_path, backup_part_path); + auto callback = [this, &part, &backup_part_path](const DiskPtr & disk) + { - // Store metadata for replicated table. - // Do nothing for non-replocated. - createAndStoreFreezeMetadata(disk, part, backup_part_path); + // Store metadata for replicated table. + // Do nothing for non-replocated. 
+ createAndStoreFreezeMetadata(disk, part, fs::path(backup_part_path) / part->data_part_storage->getPartDirectory()); + }; - disk->removeFileIfExists(fs::path(backup_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + auto new_storage = data_part_storage->freeze( + backup_part_path, + part->data_part_storage->getPartDirectory(), + /*make_source_readonly*/ true, + callback, + /*copy_instead_of_hardlink*/ false); part->is_frozen.store(true, std::memory_order_relaxed); result.push_back(PartitionCommandResultInfo{ .partition_id = part->info.partition_id, .part_name = part->name, - .backup_path = fs::path(disk->getPath()) / backup_path, - .part_backup_path = fs::path(disk->getPath()) / backup_part_path, + .backup_path = new_storage->getFullRootPath(), + .part_backup_path = new_storage->getFullPath(), .backup_name = backup_name, }); ++parts_processed; @@ -6237,8 +6254,8 @@ try if (result_part) { - part_log_elem.disk_name = result_part->volume->getDisk()->getName(); - part_log_elem.path_on_disk = result_part->getFullPath(); + part_log_elem.disk_name = result_part->data_part_storage->getDiskName(); + part_log_elem.path_on_disk = result_part->data_part_storage->getFullPath(); part_log_elem.bytes_compressed_on_disk = result_part->getBytesOnDisk(); part_log_elem.rows = result_part->rows_count; part_log_elem.part_type = result_part->getType(); @@ -6567,10 +6584,10 @@ void MergeTreeData::setDataVolume(size_t bytes, size_t rows, size_t parts) bool MergeTreeData::insertQueryIdOrThrow(const String & query_id, size_t max_queries) const { std::lock_guard lock(query_id_set_mutex); - return insertQueryIdOrThrowNoLock(query_id, max_queries, lock); + return insertQueryIdOrThrowNoLock(query_id, max_queries); } -bool MergeTreeData::insertQueryIdOrThrowNoLock(const String & query_id, size_t max_queries, const std::lock_guard &) const +bool MergeTreeData::insertQueryIdOrThrowNoLock(const String & query_id, size_t max_queries) const { if (query_id_set.find(query_id) != query_id_set.end()) return false; @@ -6584,10 +6601,10 @@ bool MergeTreeData::insertQueryIdOrThrowNoLock(const String & query_id, size_t m void MergeTreeData::removeQueryId(const String & query_id) const { std::lock_guard lock(query_id_set_mutex); - removeQueryIdNoLock(query_id, lock); + removeQueryIdNoLock(query_id); } -void MergeTreeData::removeQueryIdNoLock(const String & query_id, const std::lock_guard &) const +void MergeTreeData::removeQueryIdNoLock(const String & query_id) const { if (query_id_set.find(query_id) == query_id_set.end()) LOG_WARNING(log, "We have query_id removed but it's not recorded. This is a bug"); @@ -6653,7 +6670,7 @@ ReservationPtr MergeTreeData::balancedReservation( if (part->isStoredOnDisk() && part->getBytesOnDisk() >= min_bytes_to_rebalance_partition_over_jbod && part_info.partition_id == part->info.partition_id) { - auto name = part->volume->getDisk()->getName(); + auto name = part->data_part_storage->getDiskName(); auto it = disk_occupation.find(name); if (it != disk_occupation.end()) { diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 9ebdeca416b..00a56de9142 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -52,6 +53,9 @@ struct JobAndPool; class MergeTreeTransaction; struct ZeroCopyLock; +class IBackupEntry; +using BackupEntries = std::vector>>; + /// Auxiliary struct holding information about the future merged or mutated part. 
struct EmergingPartInfo { @@ -230,15 +234,15 @@ public: /// After this method setColumns must be called MutableDataPartPtr createPart(const String & name, MergeTreeDataPartType type, const MergeTreePartInfo & part_info, - const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part = nullptr) const; + const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part = nullptr) const; /// Create part, that already exists on filesystem. /// After this methods 'loadColumnsChecksumsIndexes' must be called. MutableDataPartPtr createPart(const String & name, - const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part = nullptr) const; + const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part = nullptr) const; MutableDataPartPtr createPart(const String & name, const MergeTreePartInfo & part_info, - const VolumePtr & volume, const String & relative_path, const IMergeTreeDataPart * parent_part = nullptr) const; + const DataPartStoragePtr & data_part_storage, const IMergeTreeDataPart * parent_part = nullptr) const; /// Auxiliary object to add a set of parts into the working set in two steps: /// * First, as PreActive parts (the parts are ready, but not yet in the active set). @@ -279,6 +283,7 @@ public: MergeTreeData & data; MergeTreeTransaction * txn; DataParts precommitted_parts; + DataParts locked_parts; void clear() { precommitted_parts.clear(); } }; @@ -414,6 +419,9 @@ public: SelectQueryInfo & info) const override; ReservationPtr reserveSpace(UInt64 expected_size, VolumePtr & volume) const; + static ReservationPtr tryReserveSpace(UInt64 expected_size, const DataPartStoragePtr & data_part_storage); + static ReservationPtr reserveSpace(UInt64 expected_size, const DataPartStoragePtr & data_part_storage); + static ReservationPtr reserveSpace(UInt64 expected_size, const DataPartStorageBuilderPtr & data_part_storage_builder); static bool partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right); @@ -498,6 +506,7 @@ public: /// Returns all parts in specified partition DataPartsVector getVisibleDataPartsVectorInPartition(MergeTreeTransaction * txn, const String & partition_id, DataPartsLock * acquired_lock = nullptr) const; + DataPartsVector getVisibleDataPartsVectorInPartition(ContextPtr local_context, const String & partition_id, DataPartsLock & lock) const; DataPartsVector getVisibleDataPartsVectorInPartition(ContextPtr local_context, const String & partition_id) const; DataPartsVector getVisibleDataPartsVectorInPartitions(ContextPtr local_context, const std::unordered_set & partition_ids) const; @@ -714,19 +723,11 @@ public: ContextPtr context, TableLockHolder & table_lock_holder); - /// Storage has data to backup. - bool hasDataToBackup() const override { return true; } - - /// Prepares entries to backup data of the storage. - BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - static BackupEntries backupDataParts(const DataPartsVector & data_parts); + /// Makes backup entries to backup the data of the storage. + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; /// Extract data from the backup and put it to the storage. 
- RestoreTaskPtr restoreDataParts( - const std::unordered_set & partition_ids, - const BackupPtr & backup, - const String & data_path_in_backup, - SimpleIncrement * increment); + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; /// Moves partition to specified Disk void movePartitionToDisk(const ASTPtr & partition, const String & name, bool moving_part, ContextPtr context); @@ -772,7 +773,7 @@ public: } /// For ATTACH/DETACH/DROP PARTITION. - String getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr context) const; + String getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr context, DataPartsLock * acquired_lock = nullptr) const; std::unordered_set getPartitionIDsFromQuery(const ASTs & asts, ContextPtr context) const; std::set getPartitionIdsAffectedByCommands(const MutationCommands & commands, ContextPtr query_context) const; @@ -924,11 +925,11 @@ public: /// Record current query id where querying the table. Throw if there are already `max_queries` queries accessing the same table. /// Returns false if the `query_id` already exists in the running set, otherwise return true. bool insertQueryIdOrThrow(const String & query_id, size_t max_queries) const; - bool insertQueryIdOrThrowNoLock(const String & query_id, size_t max_queries, const std::lock_guard &) const; + bool insertQueryIdOrThrowNoLock(const String & query_id, size_t max_queries) const TSA_REQUIRES(query_id_set_mutex); /// Remove current query id after query finished. void removeQueryId(const String & query_id) const; - void removeQueryIdNoLock(const String & query_id, const std::lock_guard &) const; + void removeQueryIdNoLock(const String & query_id) const TSA_REQUIRES(query_id_set_mutex); /// Return the partition expression types as a Tuple type. Return DataTypeUInt8 if partition expression is empty. DataTypePtr getPartitionValueType() const; @@ -975,7 +976,7 @@ public: /// Fetch part only if some replica has it on shared storage like S3 /// Overridden in StorageReplicatedMergeTree - virtual bool tryToFetchIfShared(const IMergeTreeDataPart &, const DiskPtr &, const String &) { return false; } + virtual DataPartStoragePtr tryToFetchIfShared(const IMergeTreeDataPart &, const DiskPtr &, const String &) { return nullptr; } /// Check shared data usage on other replicas for detached/freezed part /// Remove local files and remote files if needed @@ -1234,6 +1235,18 @@ protected: /// Moves part to specified space, used in ALTER ... MOVE ... queries bool movePartsToSpace(const DataPartsVector & parts, SpacePtr space); + /// Makes backup entries to backup the parts of this table. + BackupEntries backupParts(const ContextPtr & local_context, const String & data_path_in_backup, const std::optional & partitions) const; + + class RestoredPartsHolder; + + /// Restores the parts of this table from backup. + void restorePartsFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions); + void restorePartFromBackup(std::shared_ptr restored_parts_holder, const MergeTreePartInfo & part_info, const String & part_path_in_backup); + + /// Attaches restored parts to the storage. 
+    virtual void attachRestoredParts(MutableDataPartsVector && parts) = 0;
+
     static void incrementInsertedPartsProfileEvent(MergeTreeDataPartType type);
     static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type);
@@ -1281,7 +1294,7 @@ private:
     std::atomic total_active_size_parts = 0;

     // Record all query ids which access the table. It's guarded by `query_id_set_mutex` and is always mutable.
-    mutable std::set query_id_set;
+    mutable std::set query_id_set TSA_GUARDED_BY(query_id_set_mutex);
     mutable std::mutex query_id_set_mutex;

     // Get partition matcher for FREEZE / UNFREEZE queries.
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
index f596828ed05..77e3b574804 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
@@ -484,6 +484,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
     const MergeTreeData::MergingParams & merging_params,
     const MergeTreeTransactionPtr & txn,
     const IMergeTreeDataPart * parent_part,
+    const IDataPartStorageBuilder * parent_path_storage_builder,
     const String & suffix)
 {
     return std::make_shared(
@@ -498,6 +499,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart(
         deduplicate_by_columns,
         merging_params,
         parent_part,
+        parent_path_storage_builder,
         suffix,
         txn,
         &data,
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
index a5f99c63f11..e3d59a3522f 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
@@ -114,6 +114,7 @@ public:
         const MergeTreeData::MergingParams & merging_params,
         const MergeTreeTransactionPtr & txn,
         const IMergeTreeDataPart * parent_part = nullptr,
+        const IDataPartStorageBuilder * parent_path_storage_builder = nullptr,
         const String & suffix = "");

     /// Mutate a single data part with the specified commands. Will create and return a temporary part.
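The query-id bookkeeping above switches from passing a `std::lock_guard` reference as a lock witness to Clang Thread Safety Analysis annotations: the `*NoLock` methods are marked `TSA_REQUIRES(query_id_set_mutex)` and `query_id_set` becomes `TSA_GUARDED_BY(query_id_set_mutex)`, so the compiler rather than the function signature checks that the mutex is held at each call site. Below is a minimal standalone sketch of that pattern, using the raw Clang attributes instead of the project's wrapper macros; the class and member names are illustrative only, and analysis of `std::mutex` members additionally needs a standard library whose mutex types carry capability annotations.

```cpp
#include <mutex>
#include <set>
#include <string>

#if defined(__clang__)
#    define GUARDED_BY(x) __attribute__((guarded_by(x)))
#    define REQUIRES(x) __attribute__((requires_capability(x)))
#else
#    define GUARDED_BY(x)
#    define REQUIRES(x)
#endif

/// Illustrative registry with the same shape as the query_id_set handling above.
class QueryIdRegistry
{
public:
    bool insert(const std::string & query_id) const
    {
        std::lock_guard lock(mutex);   /// the lock is taken here,
        return insertNoLock(query_id); /// so -Wthread-safety accepts this call
    }

private:
    /// Only callable while `mutex` is held; calling it without the lock is a compile-time warning.
    bool insertNoLock(const std::string & query_id) const REQUIRES(mutex)
    {
        return ids.insert(query_id).second;
    }

    mutable std::set<std::string> ids GUARDED_BY(mutex);
    mutable std::mutex mutex;
};
```

Compared with the removed `const std::lock_guard &` parameters, nothing extra is threaded through the call chain at runtime, and a caller that forgets to lock is flagged when building with `-Wthread-safety`.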
diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 53b779bc2ec..046a7d274c0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -19,10 +19,9 @@ namespace ErrorCodes MergeTreeDataPartCompact::MergeTreeDataPartCompact( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : IMergeTreeDataPart(storage_, name_, volume_, relative_path_, Type::Compact, parent_part_) + : IMergeTreeDataPart(storage_, name_, data_part_storage_, Type::Compact, parent_part_) { } @@ -30,10 +29,9 @@ MergeTreeDataPartCompact::MergeTreeDataPartCompact( const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : IMergeTreeDataPart(storage_, name_, info_, volume_, relative_path_, Type::Compact, parent_part_) + : IMergeTreeDataPart(storage_, name_, info_, data_part_storage_, Type::Compact, parent_part_) { } @@ -55,6 +53,7 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader( } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter( + DataPartStorageBuilderPtr data_part_storage_builder, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -71,7 +70,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter( { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); }); return std::make_unique( - shared_from_this(), ordered_columns_list, metadata_snapshot, + shared_from_this(), std::move(data_part_storage_builder), ordered_columns_list, metadata_snapshot, indices_to_recalc, index_granularity_info.marks_file_extension, default_codec_, writer_settings, computed_index_granularity); } @@ -93,7 +92,7 @@ void MergeTreeDataPartCompact::calculateEachColumnSizes(ColumnSizeByName & /*eac void MergeTreeDataPartCompact::loadIndexGranularity() { - String full_path = getFullRelativePath(); + //String full_path = getRelativePath(); if (columns.empty()) throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); @@ -101,13 +100,16 @@ void MergeTreeDataPartCompact::loadIndexGranularity() if (!index_granularity_info.is_adaptive) throw Exception("MergeTreeDataPartCompact cannot be created with non-adaptive granulary.", ErrorCodes::NOT_IMPLEMENTED); - auto marks_file_path = index_granularity_info.getMarksFilePath(full_path + "data"); - if (!volume->getDisk()->exists(marks_file_path)) - throw Exception("Marks file '" + fullPath(volume->getDisk(), marks_file_path) + "' doesn't exist", ErrorCodes::NO_FILE_IN_DATA_PART); + auto marks_file_path = index_granularity_info.getMarksFilePath("data"); + if (!data_part_storage->exists(marks_file_path)) + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, + "Marks file '{}' doesn't exist", + std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); - size_t marks_file_size = volume->getDisk()->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); - auto buffer = volume->getDisk()->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size); + auto 
buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { /// Skip offsets for columns @@ -137,7 +139,6 @@ bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) co void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata) const { checkConsistencyBase(); - String path = getFullRelativePath(); String mrk_file_name = DATA_FILE_NAME + index_granularity_info.marks_file_extension; if (!checksums.empty()) @@ -149,47 +150,62 @@ void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata) cons if (require_part_metadata) { if (!checksums.files.contains(mrk_file_name)) - throw Exception("No marks file checksum for column in part " + fullPath(volume->getDisk(), path), ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, + "No marks file checksum for column in part {}", + data_part_storage->getFullPath()); if (!checksums.files.contains(DATA_FILE_NAME_WITH_EXTENSION)) - throw Exception("No data file checksum for in part " + fullPath(volume->getDisk(), path), ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, + "No data file checksum for in part {}", + data_part_storage->getFullPath()); } } else { { /// count.txt should be present even in non custom-partitioned parts - auto file_path = path + "count.txt"; - if (!volume->getDisk()->exists(file_path) || volume->getDisk()->getFileSize(file_path) == 0) - throw Exception("Part " + path + " is broken: " + fullPath(volume->getDisk(), file_path) + " is empty", ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + std::string file_path = "count.txt"; + if (!data_part_storage->exists(file_path) || data_part_storage->getFileSize(file_path) == 0) + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: {} is empty", + data_part_storage->getRelativePath(), + std::string(fs::path(data_part_storage->getFullPath()) / file_path)); } /// Check that marks are nonempty and have the consistent size with columns number. 
- auto mrk_file_path = path + mrk_file_name; - if (volume->getDisk()->exists(mrk_file_name)) + if (data_part_storage->exists(mrk_file_name)) { - UInt64 file_size = volume->getDisk()->getFileSize(mrk_file_name); + UInt64 file_size = data_part_storage->getFileSize(mrk_file_name); if (!file_size) - throw Exception("Part " + path + " is broken: " + fullPath(volume->getDisk(), mrk_file_name) + " is empty.", - ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: {} is empty.", + data_part_storage->getRelativePath(), + std::string(fs::path(data_part_storage->getFullPath()) / mrk_file_name)); UInt64 expected_file_size = index_granularity_info.getMarkSizeInBytes(columns.size()) * index_granularity.getMarksCount(); if (expected_file_size != file_size) throw Exception( - "Part " + path + " is broken: bad size of marks file '" + fullPath(volume->getDisk(), mrk_file_name) + "': " + std::to_string(file_size) + ", must be: " + std::to_string(expected_file_size), - ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: bad size of marks file '{}': {}, must be: {}", + data_part_storage->getRelativePath(), + std::string(fs::path(data_part_storage->getFullPath()) / mrk_file_name), + std::to_string(file_size), std::to_string(expected_file_size)); } } } bool MergeTreeDataPartCompact::isStoredOnRemoteDisk() const { - return volume->getDisk()->isRemote(); + return data_part_storage->isStoredOnRemoteDisk(); } bool MergeTreeDataPartCompact::isStoredOnRemoteDiskWithZeroCopySupport() const { - return volume->getDisk()->supportZeroCopyReplication(); + return data_part_storage->supportZeroCopyReplication(); } MergeTreeDataPartCompact::~MergeTreeDataPartCompact() diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 79d1af06db5..b1c0851afde 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -25,15 +25,13 @@ public: const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeDataPartCompact( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeReaderPtr getReader( @@ -47,6 +45,7 @@ public: const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; MergeTreeWriterPtr getWriter( + DataPartStorageBuilderPtr data_part_storage_builder, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index 8e803f0d068..4c87daa1e13 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -20,10 +20,9 @@ namespace ErrorCodes MergeTreeDataPartInMemory::MergeTreeDataPartInMemory( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : 
IMergeTreeDataPart(storage_, name_, volume_, relative_path_, Type::InMemory, parent_part_) + : IMergeTreeDataPart(storage_, name_, data_part_storage_, Type::InMemory, parent_part_) { default_codec = CompressionCodecFactory::instance().get("NONE", {}); } @@ -32,10 +31,9 @@ MergeTreeDataPartInMemory::MergeTreeDataPartInMemory( const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : IMergeTreeDataPart(storage_, name_, info_, volume_, relative_path_, Type::InMemory, parent_part_) + : IMergeTreeDataPart(storage_, name_, info_, data_part_storage_, Type::InMemory, parent_part_) { default_codec = CompressionCodecFactory::instance().get("NONE", {}); } @@ -56,6 +54,7 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader( } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & /* indices_to_recalc */, @@ -63,65 +62,70 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & /* computed_index_granularity */) const { + data_part_storage_builder = data_part_storage_builder_; auto ptr = std::static_pointer_cast(shared_from_this()); return std::make_unique( ptr, columns_list, metadata_snapshot, writer_settings); } -void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const +DataPartStoragePtr MergeTreeDataPartInMemory::flushToDisk(const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const { - const auto & disk = volume->getDisk(); - String destination_path = base_path + new_relative_path; + auto current_full_path = data_part_storage_builder->getFullPath(); + data_part_storage_builder->setRelativePath(new_relative_path); auto new_type = storage.choosePartTypeOnDisk(block.bytes(), rows_count); - auto new_data_part = storage.createPart(name, new_type, info, volume, new_relative_path); + auto new_data_part_storage = data_part_storage_builder->getStorage(); + auto new_data_part = storage.createPart(name, new_type, info, new_data_part_storage); new_data_part->uuid = uuid; new_data_part->setColumns(columns); new_data_part->partition.value = partition.value; new_data_part->minmax_idx = minmax_idx; - if (disk->exists(destination_path)) + if (data_part_storage_builder->exists()) { - throw Exception("Could not flush part " + quoteString(getFullPath()) - + ". Part in " + fullPath(disk, destination_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + throw Exception( + ErrorCodes::DIRECTORY_ALREADY_EXISTS, + "Could not flush part {}. 
Part in {} already exists", + quoteString(current_full_path), + data_part_storage_builder->getFullPath()); } - disk->createDirectories(destination_path); + data_part_storage_builder->createDirectories(); auto compression_codec = storage.getContext()->chooseCompressionCodec(0, 0); auto indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices()); - MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec, NO_TRANSACTION_PTR); + MergedBlockOutputStream out(new_data_part, data_part_storage_builder, metadata_snapshot, columns, indices, compression_codec, NO_TRANSACTION_PTR); out.write(block); const auto & projections = metadata_snapshot->getProjections(); for (const auto & [projection_name, projection] : projection_parts) { if (projections.has(projection_name)) { - String projection_destination_path = fs::path(destination_path) / projection_name / ".proj"; - if (disk->exists(projection_destination_path)) + auto projection_part_storage_builder = data_part_storage_builder->getProjection(projection_name + ".proj"); + if (projection_part_storage_builder->exists()) { throw Exception( ErrorCodes::DIRECTORY_ALREADY_EXISTS, "Could not flush projection part {}. Projection part in {} already exists", projection_name, - fullPath(disk, projection_destination_path)); + projection_part_storage_builder->getFullPath()); } auto projection_part = asInMemoryPart(projection); auto projection_type = storage.choosePartTypeOnDisk(projection_part->block.bytes(), rows_count); MergeTreePartInfo projection_info("all", 0, 0, 0); auto projection_data_part - = storage.createPart(projection_name, projection_type, projection_info, volume, projection_name + ".proj", parent_part); + = storage.createPart(projection_name, projection_type, projection_info, projection_part_storage_builder->getStorage(), parent_part); projection_data_part->is_temp = false; // clean up will be done on parent part projection_data_part->setColumns(projection->getColumns()); - disk->createDirectories(projection_destination_path); + projection_part_storage_builder->createDirectories(); const auto & desc = projections.get(name); auto projection_compression_codec = storage.getContext()->chooseCompressionCodec(0, 0); auto projection_indices = MergeTreeIndexFactory::instance().getMany(desc.metadata->getSecondaryIndices()); MergedBlockOutputStream projection_out( - projection_data_part, desc.metadata, projection_part->columns, projection_indices, + projection_data_part, projection_part_storage_builder, desc.metadata, projection_part->columns, projection_indices, projection_compression_codec, NO_TRANSACTION_PTR); projection_out.write(projection_part->block); @@ -131,17 +135,21 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri } out.finalizePart(new_data_part, false); + return new_data_part_storage; } void MergeTreeDataPartInMemory::makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const { String detached_path = getRelativePathForDetachedPart(prefix); - flushToDisk(storage.getRelativeDataPath(), detached_path, metadata_snapshot); + flushToDisk(detached_path, metadata_snapshot); } void MergeTreeDataPartInMemory::renameTo(const String & new_relative_path, bool /* remove_new_dir_if_exists */) const { - relative_path = new_relative_path; + data_part_storage->setRelativePath(new_relative_path); + + if (data_part_storage_builder) + data_part_storage_builder->setRelativePath(new_relative_path); } void 
MergeTreeDataPartInMemory::calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h index d64245ca616..c9caf043b7b 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h @@ -14,15 +14,13 @@ public: const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeDataPartInMemory( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeReaderPtr getReader( @@ -36,6 +34,7 @@ public: const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; MergeTreeWriterPtr getWriter( + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, @@ -51,12 +50,13 @@ public: void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override; void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const override; - void flushToDisk(const String & base_path, const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const; + DataPartStoragePtr flushToDisk(const String & new_relative_path, const StorageMetadataPtr & metadata_snapshot) const; /// Returns hash of parts's block Checksum calculateBlockChecksum() const; mutable Block block; + mutable DataPartStorageBuilderPtr data_part_storage_builder; private: mutable std::condition_variable is_merged; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 7ba3b7ecea0..7fe68420310 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -21,10 +21,9 @@ namespace ErrorCodes MergeTreeDataPartWide::MergeTreeDataPartWide( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : IMergeTreeDataPart(storage_, name_, volume_, relative_path_, Type::Wide, parent_part_) + : IMergeTreeDataPart(storage_, name_, data_part_storage_, Type::Wide, parent_part_) { } @@ -32,10 +31,9 @@ MergeTreeDataPartWide::MergeTreeDataPartWide( const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume_, - const std::optional & relative_path_, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_) - : IMergeTreeDataPart(storage_, name_, info_, volume_, relative_path_, Type::Wide, parent_part_) + : IMergeTreeDataPart(storage_, name_, info_, data_part_storage_, Type::Wide, parent_part_) { } @@ -57,6 +55,7 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter( + DataPartStorageBuilderPtr data_part_storage_builder, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & 
indices_to_recalc, @@ -65,7 +64,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter( const MergeTreeIndexGranularity & computed_index_granularity) const { return std::make_unique( - shared_from_this(), columns_list, metadata_snapshot, indices_to_recalc, + shared_from_this(), data_part_storage_builder, columns_list, metadata_snapshot, indices_to_recalc, index_granularity_info.marks_file_extension, default_codec_, writer_settings, computed_index_granularity); } @@ -104,19 +103,20 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( void MergeTreeDataPartWide::loadIndexGranularity() { - String full_path = getFullRelativePath(); - index_granularity_info.changeGranularityIfRequired(volume->getDisk(), full_path); + index_granularity_info.changeGranularityIfRequired(data_part_storage); if (columns.empty()) throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); /// We can use any column, it doesn't matter - std::string marks_file_path = index_granularity_info.getMarksFilePath(full_path + getFileNameForColumn(columns.front())); - if (!volume->getDisk()->exists(marks_file_path)) - throw Exception("Marks file '" + fullPath(volume->getDisk(), marks_file_path) + "' doesn't exist", ErrorCodes::NO_FILE_IN_DATA_PART); + std::string marks_file_path = index_granularity_info.getMarksFilePath(getFileNameForColumn(columns.front())); + if (!data_part_storage->exists(marks_file_path)) + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist", + std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); - size_t marks_file_size = volume->getDisk()->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); if (!index_granularity_info.is_adaptive) { @@ -125,7 +125,7 @@ void MergeTreeDataPartWide::loadIndexGranularity() } else { - auto buffer = volume->getDisk()->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size); + auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { buffer->seek(sizeof(size_t) * 2, SEEK_CUR); /// skip offset_in_compressed file and offset_in_decompressed_block @@ -135,7 +135,9 @@ void MergeTreeDataPartWide::loadIndexGranularity() } if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes() != marks_file_size) - throw Exception("Cannot read all marks from file " + fullPath(volume->getDisk(), marks_file_path), ErrorCodes::CANNOT_READ_ALL_DATA); + throw Exception( + ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all marks from file {}", + std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); } index_granularity.setInitialized(); @@ -143,12 +145,12 @@ void MergeTreeDataPartWide::loadIndexGranularity() bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const { - return volume->getDisk()->isRemote(); + return data_part_storage->isStoredOnRemoteDisk(); } bool MergeTreeDataPartWide::isStoredOnRemoteDiskWithZeroCopySupport() const { - return volume->getDisk()->supportZeroCopyReplication(); + return data_part_storage->supportZeroCopyReplication(); } MergeTreeDataPartWide::~MergeTreeDataPartWide() @@ -159,7 +161,7 @@ MergeTreeDataPartWide::~MergeTreeDataPartWide() void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const { checkConsistencyBase(); - String path = getFullRelativePath(); + //String path = getRelativePath(); if (!checksums.empty()) 
{ @@ -172,12 +174,18 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const String file_name = ISerialization::getFileNameForStream(name_type, substream_path); String mrk_file_name = file_name + index_granularity_info.marks_file_extension; String bin_file_name = file_name + DATA_FILE_EXTENSION; + if (!checksums.files.contains(mrk_file_name)) - throw Exception("No " + mrk_file_name + " file checksum for column " + name_type.name + " in part " + fullPath(volume->getDisk(), path), - ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, + "No {} file checksum for column {} in part {} ", + mrk_file_name, name_type.name, data_part_storage->getFullPath()); + if (!checksums.files.contains(bin_file_name)) - throw Exception("No " + bin_file_name + " file checksum for column " + name_type.name + " in part " + fullPath(volume->getDisk(), path), - ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::NO_FILE_IN_DATA_PART, + "No {} file checksum for column {} in part ", + bin_file_name, name_type.name, data_part_storage->getFullPath()); }); } } @@ -190,22 +198,26 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const { getSerialization(name_type)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { - auto file_path = path + ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension; + auto file_path = ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension; /// Missing file is Ok for case when new column was added. - if (volume->getDisk()->exists(file_path)) + if (data_part_storage->exists(file_path)) { - UInt64 file_size = volume->getDisk()->getFileSize(file_path); + UInt64 file_size = data_part_storage->getFileSize(file_path); if (!file_size) - throw Exception("Part " + path + " is broken: " + fullPath(volume->getDisk(), file_path) + " is empty.", - ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: {} is empty.", + data_part_storage->getFullPath(), + std::string(fs::path(data_part_storage->getFullPath()) / file_path)); if (!marks_size) marks_size = file_size; else if (file_size != *marks_size) - throw Exception("Part " + path + " is broken: marks have different sizes.", - ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + throw Exception( + ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART, + "Part {} is broken: marks have different sizes.", data_part_storage->getFullPath()); } }); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index bc2c399c100..325193557b3 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -19,15 +19,13 @@ public: const MergeTreeData & storage_, const String & name_, const MergeTreePartInfo & info_, - const VolumePtr & volume, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeDataPartWide( MergeTreeData & storage_, const String & name_, - const VolumePtr & volume, - const std::optional & relative_path_ = {}, + const DataPartStoragePtr & data_part_storage_, const IMergeTreeDataPart * parent_part_ = nullptr); MergeTreeReaderPtr getReader( @@ -41,6 +39,7 @@ public: const ReadBufferFromFileBase::ProfileCallback & profile_callback) const override; 
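A recurring shape in these part-format hunks (Compact, InMemory, Wide and, below, the writers): read-side code stops resolving `volume->getDisk()` plus hand-built relative paths and instead asks the part's `data_part_storage` (`exists`, `getFileSize`, `readFile`, `getFullPath`, `getDiskName`), while write-side code receives a `data_part_storage_builder` (`createDirectories`, `writeFile`), which is why `getWriter` gains the extra `DataPartStorageBuilderPtr` parameter just below. The real `IDataPartStorage`/`IDataPartStorageBuilder` interfaces are defined outside this excerpt; the sketch below only mirrors the calls visible in these hunks, with invented type names, to illustrate the read/write split.

```cpp
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>

struct ReadBuffer;  /// stand-ins for the real buffer types
struct WriteBuffer;

/// Read side: what a loaded part needs (cf. loadIndexGranularity / checkConsistency above).
class PartStorageSketch
{
public:
    virtual ~PartStorageSketch() = default;
    virtual bool exists(const std::string & file_name) const = 0;
    virtual std::size_t getFileSize(const std::string & file_name) const = 0;
    virtual std::unique_ptr<ReadBuffer> readFile(const std::string & file_name) const = 0;
    virtual std::string getFullPath() const = 0;
    virtual std::string getDiskName() const = 0;
};

/// Write side: what a writer needs to create files (cf. the getWriter overloads taking a builder).
class PartStorageBuilderSketch
{
public:
    virtual ~PartStorageBuilderSketch() = default;
    virtual void createDirectories() = 0;
    virtual std::unique_ptr<WriteBuffer> writeFile(const std::string & file_name, std::size_t buf_size) = 0;
    virtual std::string getFullPath() const = 0;
};

/// Example of the read-side style the checkConsistency hunks converge on:
/// no DiskPtr and no manual path concatenation at the call site.
inline void checkMarksNotEmpty(const PartStorageSketch & storage, const std::string & mrk_file_name)
{
    if (storage.exists(mrk_file_name) && storage.getFileSize(mrk_file_name) == 0)
        throw std::runtime_error("Part " + storage.getFullPath() + " is broken: " + mrk_file_name + " is empty");
}
```

Keeping path construction behind these objects is also what lets `flushToDisk` and `freeze` in the earlier hunks return a new storage object instead of a path string.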
MergeTreeWriterPtr getWriter( + DataPartStorageBuilderPtr data_part_storage_builder, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index a4786570fcb..d181a15d08f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -11,6 +11,7 @@ namespace ErrorCodes MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -18,20 +19,18 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_, + : MergeTreeDataPartWriterOnDisk(data_part_, std::move(data_part_storage_builder_), columns_list_, metadata_snapshot_, indices_to_recalc_, marks_file_extension_, default_codec_, settings_, index_granularity_) - , plain_file(data_part->volume->getDisk()->writeFile( - part_path + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION, + , plain_file(data_part_storage_builder->writeFile( + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION, settings.max_compress_block_size, - WriteMode::Rewrite, settings_.query_write_settings)) , plain_hashing(*plain_file) - , marks_file(data_part->volume->getDisk()->writeFile( - part_path + MergeTreeDataPartCompact::DATA_FILE_NAME + marks_file_extension_, - 4096, - WriteMode::Rewrite, - settings_.query_write_settings)) + , marks_file(data_part_storage_builder->writeFile( + MergeTreeDataPartCompact::DATA_FILE_NAME + marks_file_extension_, + 4096, + settings_.query_write_settings)) , marks(*marks_file) { const auto & storage_columns = metadata_snapshot->getColumns(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index cc33d8404c2..dd098b125cd 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -10,6 +10,7 @@ class MergeTreeDataPartWriterCompact : public MergeTreeDataPartWriterOnDisk public: MergeTreeDataPartWriterCompact( const MergeTreeData::DataPartPtr & data_part, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterInMemory.cpp index 0c715a7c27f..e1145868ce2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterInMemory.cpp @@ -15,7 +15,7 @@ MergeTreeDataPartWriterInMemory::MergeTreeDataPartWriterInMemory( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeWriterSettings & settings_) - : IMergeTreeDataPartWriter(part_, columns_list_, metadata_snapshot_, settings_) + : IMergeTreeDataPartWriter(part_, nullptr, columns_list_, metadata_snapshot_, settings_) , part_in_memory(part_) {} void 
MergeTreeDataPartWriterInMemory::write( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 6cba4db19e3..56ebadc082c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -41,7 +41,7 @@ void MergeTreeDataPartWriterOnDisk::Stream::sync() const MergeTreeDataPartWriterOnDisk::Stream::Stream( const String & escaped_column_name_, - DiskPtr disk_, + const DataPartStorageBuilderPtr & data_part_storage_builder, const String & data_path_, const std::string & data_file_extension_, const std::string & marks_path_, @@ -52,11 +52,11 @@ MergeTreeDataPartWriterOnDisk::Stream::Stream( escaped_column_name(escaped_column_name_), data_file_extension{data_file_extension_}, marks_file_extension{marks_file_extension_}, - plain_file(disk_->writeFile(data_path_ + data_file_extension, max_compress_block_size_, WriteMode::Rewrite, query_write_settings)), + plain_file(data_part_storage_builder->writeFile(data_path_ + data_file_extension, max_compress_block_size_, query_write_settings)), plain_hashing(*plain_file), compressed_buf(plain_hashing, compression_codec_, max_compress_block_size_), compressed(compressed_buf), - marks_file(disk_->writeFile(marks_path_ + marks_file_extension, 4096, WriteMode::Rewrite, query_write_settings)), marks(*marks_file) + marks_file(data_part_storage_builder->writeFile(marks_path_ + marks_file_extension, 4096, query_write_settings)), marks(*marks_file) { } @@ -77,6 +77,7 @@ void MergeTreeDataPartWriterOnDisk::Stream::addToChecksums(MergeTreeData::DataPa MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeIndices & indices_to_recalc_, @@ -84,10 +85,9 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : IMergeTreeDataPartWriter(data_part_, + : IMergeTreeDataPartWriter(data_part_, std::move(data_part_storage_builder_), columns_list_, metadata_snapshot_, settings_, index_granularity_) , skip_indices(indices_to_recalc_) - , part_path(data_part_->getFullRelativePath()) , marks_file_extension(marks_file_extension_) , default_codec(default_codec_) , compute_granularity(index_granularity.empty()) @@ -95,9 +95,8 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( if (settings.blocks_are_granules_size && !index_granularity.empty()) throw Exception("Can't take information about index granularity from blocks, when non empty index_granularity array specified", ErrorCodes::LOGICAL_ERROR); - auto disk = data_part->volume->getDisk(); - if (!disk->exists(part_path)) - disk->createDirectories(part_path); + if (!data_part_storage_builder->exists()) + data_part_storage_builder->createDirectories(); if (settings.rewrite_primary_key) initPrimaryIndex(); @@ -157,7 +156,7 @@ void MergeTreeDataPartWriterOnDisk::initPrimaryIndex() { if (metadata_snapshot->hasPrimaryKey()) { - index_file_stream = data_part->volume->getDisk()->writeFile(part_path + "primary.idx", DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, settings.query_write_settings); + index_file_stream = data_part_storage_builder->writeFile("primary.idx", DBMS_DEFAULT_BUFFER_SIZE, settings.query_write_settings); 
index_stream = std::make_unique(*index_file_stream); } } @@ -170,9 +169,9 @@ void MergeTreeDataPartWriterOnDisk::initSkipIndices() skip_indices_streams.emplace_back( std::make_unique( stream_name, - data_part->volume->getDisk(), - part_path + stream_name, index_helper->getSerializedFileExtension(), - part_path + stream_name, marks_file_extension, + data_part_storage_builder, + stream_name, index_helper->getSerializedFileExtension(), + stream_name, marks_file_extension, default_codec, settings.max_compress_block_size, settings.query_write_settings)); skip_indices_aggregators.push_back(index_helper->createIndexAggregator()); skip_index_accumulated_marks.push_back(0); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 67b51df7d56..7cc53db8066 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -49,7 +49,7 @@ public: { Stream( const String & escaped_column_name_, - DiskPtr disk_, + const DataPartStorageBuilderPtr & data_part_storage_builder, const String & data_path_, const std::string & data_file_extension_, const std::string & marks_path_, @@ -87,6 +87,7 @@ public: MergeTreeDataPartWriterOnDisk( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, @@ -128,7 +129,6 @@ protected: const MergeTreeIndices skip_indices; - const String part_path; const String marks_file_extension; const CompressionCodecPtr default_codec; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 6610b8fc06b..db3580e1f86 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -71,6 +71,7 @@ Granules getGranulesToWrite(const MergeTreeIndexGranularity & index_granularity, MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const MergeTreeData::DataPartPtr & data_part_, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, @@ -78,7 +79,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, const MergeTreeIndexGranularity & index_granularity_) - : MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_, + : MergeTreeDataPartWriterOnDisk(data_part_, std::move(data_part_storage_builder_), columns_list_, metadata_snapshot_, indices_to_recalc_, marks_file_extension_, default_codec_, settings_, index_granularity_) { @@ -111,9 +112,9 @@ void MergeTreeDataPartWriterWide::addStreams( column_streams[stream_name] = std::make_unique( stream_name, - data_part->volume->getDisk(), - part_path + stream_name, DATA_FILE_EXTENSION, - part_path + stream_name, marks_file_extension, + data_part_storage_builder, + stream_name, DATA_FILE_EXTENSION, + stream_name, marks_file_extension, compression_codec, settings.max_compress_block_size, settings.query_write_settings); @@ -409,17 +410,16 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai if (!type->isValueRepresentedByNumber() || type->haveSubtypes() || serialization->getKind() != ISerialization::Kind::DEFAULT) throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type->getName()); - auto disk = data_part->volume->getDisk(); String escaped_name = escapeForFileName(name); - String mrk_path = part_path + escaped_name + marks_file_extension; - String bin_path = part_path + escaped_name + DATA_FILE_EXTENSION; + String mrk_path = escaped_name + marks_file_extension; + String bin_path = escaped_name + DATA_FILE_EXTENSION; /// Some columns may be removed because of ttl. Skip them. - if (!disk->exists(mrk_path)) + if (!data_part_storage_builder->exists(mrk_path)) return; - auto mrk_in = disk->readFile(mrk_path); - DB::CompressedReadBufferFromFile bin_in(disk->readFile(bin_path)); + auto mrk_in = data_part_storage_builder->readFile(mrk_path, {}, std::nullopt, std::nullopt); + DB::CompressedReadBufferFromFile bin_in(data_part_storage_builder->readFile(bin_path, {}, std::nullopt, std::nullopt)); bool must_be_last = false; UInt64 offset_in_compressed_file = 0; UInt64 offset_in_decompressed_block = 0; @@ -470,7 +470,7 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const NameAndTypePai if (index_granularity_rows != index_granularity.getMarkRows(mark_num)) throw Exception( ErrorCodes::LOGICAL_ERROR, "Incorrect mark rows for part {} for mark #{} (compressed offset {}, decompressed offset {}), in-memory {}, on disk {}, total marks {}", - data_part->getFullPath(), mark_num, offset_in_compressed_file, offset_in_decompressed_block, index_granularity.getMarkRows(mark_num), index_granularity_rows, index_granularity.getMarksCount()); + data_part_storage_builder->getFullPath(), mark_num, offset_in_compressed_file, offset_in_decompressed_block, index_granularity.getMarkRows(mark_num), index_granularity_rows, index_granularity.getMarksCount()); auto column = type->createColumn(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index b82fcd652ae..a3517f3aa88 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -19,6 +19,7 @@ class MergeTreeDataPartWriterWide : public MergeTreeDataPartWriterOnDisk public: MergeTreeDataPartWriterWide( const MergeTreeData::DataPartPtr & data_part, + DataPartStorageBuilderPtr data_part_storage_builder_, const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index f23a6e7834e..30fb3efcf0e 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -115,6 +116,21 @@ static RelativeSize convertAbsoluteSampleSizeToRelative(const ASTPtr & node, siz return std::min(RelativeSize(1), RelativeSize(absolute_sample_size) / RelativeSize(approx_total_rows)); } +static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query) +{ + SortDescription order_descr; + order_descr.reserve(query.groupBy()->children.size()); + + for (const auto & elem : query.groupBy()->children) + { + /// Note, here aliases should not be used, since there will be no such column in a block. 
+ String name = elem->getColumnNameWithoutAlias(); + order_descr.emplace_back(name, 1, 1); + } + + return order_descr; +} + QueryPlanPtr MergeTreeDataSelectExecutor::read( const Names & column_names_to_return, @@ -168,9 +184,6 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( query_info.projection->desc->type, query_info.projection->desc->name); - Pipes pipes; - Pipe projection_pipe; - Pipe ordinary_pipe; QueryPlanResourceHolder resources; auto projection_plan = std::make_unique(); @@ -217,12 +230,9 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( expression_before_aggregation->setStepDescription("Before GROUP BY"); projection_plan->addStep(std::move(expression_before_aggregation)); } - - auto builder = projection_plan->buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - projection_pipe = QueryPipelineBuilder::getPipe(std::move(*builder), resources); } + auto ordinary_query_plan = std::make_unique(); if (query_info.projection->merge_tree_normal_select_result_ptr) { auto storage_from_base_parts_of_projection @@ -234,49 +244,27 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( nullptr, SelectQueryOptions{processed_stage}.projectionQuery()); - QueryPlan ordinary_query_plan; - interpreter.buildQueryPlan(ordinary_query_plan); + interpreter.buildQueryPlan(*ordinary_query_plan); const auto & expressions = interpreter.getAnalysisResult(); if (processed_stage == QueryProcessingStage::Enum::FetchColumns && expressions.before_where) { auto where_step = std::make_unique( - ordinary_query_plan.getCurrentDataStream(), + ordinary_query_plan->getCurrentDataStream(), expressions.before_where, expressions.where_column_name, expressions.remove_where_filter); where_step->setStepDescription("WHERE"); - ordinary_query_plan.addStep(std::move(where_step)); + ordinary_query_plan->addStep(std::move(where_step)); } - - auto builder = ordinary_query_plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); - ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*builder), resources); } + Pipe projection_pipe; + Pipe ordinary_pipe; if (query_info.projection->desc->type == ProjectionDescription::Type::Aggregate) { - /// Here we create shared ManyAggregatedData for both projection and ordinary data. - /// For ordinary data, AggregatedData is filled in a usual way. - /// For projection data, AggregatedData is filled by merging aggregation states. - /// When all AggregatedData is filled, we merge aggregation states together in a usual way. - /// Pipeline will look like: - /// ReadFromProjection -> Aggregating (only merge states) -> - /// ReadFromProjection -> Aggregating (only merge states) -> - /// ... -> Resize -> ConvertingAggregatedToChunks - /// ReadFromOrdinaryPart -> Aggregating (usual) -> (added by last Aggregating) - /// ReadFromOrdinaryPart -> Aggregating (usual) -> - /// ... 
- auto many_data = std::make_shared(projection_pipe.numOutputPorts() + ordinary_pipe.numOutputPorts()); - size_t counter = 0; - - AggregatorListPtr aggregator_list_ptr = std::make_shared(); - - // TODO apply in_order_optimization here - auto build_aggregate_pipe = [&](Pipe & pipe, bool projection) + auto make_aggregator_params = [&](const Block & header_before_aggregation, bool projection) { - const auto & header_before_aggregation = pipe.getHeader(); - ColumnNumbers keys; for (const auto & key : query_info.projection->aggregation_keys) keys.push_back(header_before_aggregation.getPositionByName(key.name)); @@ -290,29 +278,28 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( descr.arguments.push_back(header_before_aggregation.getPositionByName(name)); } - AggregatingTransformParamsPtr transform_params; + Aggregator::Params params( + header_before_aggregation, + keys, + aggregates, + query_info.projection->aggregate_overflow_row, + settings.max_rows_to_group_by, + settings.group_by_overflow_mode, + settings.group_by_two_level_threshold, + settings.group_by_two_level_threshold_bytes, + settings.max_bytes_before_external_group_by, + settings.empty_result_for_aggregation_by_empty_set, + context->getTemporaryVolume(), + settings.max_threads, + settings.min_free_disk_space_for_temporary_data, + settings.compile_aggregate_expressions, + settings.min_count_to_compile_aggregate_expression); + + bool only_merge = false; if (projection) { - Aggregator::Params params( - header_before_aggregation, - keys, - aggregates, - query_info.projection->aggregate_overflow_row, - settings.max_rows_to_group_by, - settings.group_by_overflow_mode, - settings.group_by_two_level_threshold, - settings.group_by_two_level_threshold_bytes, - settings.max_bytes_before_external_group_by, - settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryVolume(), - settings.max_threads, - settings.min_free_disk_space_for_temporary_data, - settings.compile_expressions, - settings.min_count_to_compile_aggregate_expression, - header_before_aggregation); // The source header is also an intermediate header - - transform_params = std::make_shared( - std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final); + /// The source header is also an intermediate header + params.intermediate_header = header_before_aggregation; /// This part is hacky. /// We want AggregatingTransform to work with aggregate states instead of normal columns. 
@@ -321,51 +308,135 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( /// It is needed because data in projection: /// * is not merged completely (we may have states with the same key in different parts) /// * is not split into buckets (so if we just use MergingAggregated, it will use single thread) - transform_params->only_merge = true; - } - else - { - Aggregator::Params params( - header_before_aggregation, - keys, - aggregates, - query_info.projection->aggregate_overflow_row, - settings.max_rows_to_group_by, - settings.group_by_overflow_mode, - settings.group_by_two_level_threshold, - settings.group_by_two_level_threshold_bytes, - settings.max_bytes_before_external_group_by, - settings.empty_result_for_aggregation_by_empty_set, - context->getTemporaryVolume(), - settings.max_threads, - settings.min_free_disk_space_for_temporary_data, - settings.compile_aggregate_expressions, - settings.min_count_to_compile_aggregate_expression); - - transform_params = std::make_shared( - std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final); + only_merge = true; } - pipe.resize(pipe.numOutputPorts(), true, true); - - auto merge_threads = num_streams; - auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads - ? static_cast(settings.aggregation_memory_efficient_merge_threads) - : static_cast(settings.max_threads); - - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, transform_params, many_data, counter++, merge_threads, temporary_data_merge_threads); - }); + return std::make_pair(params, only_merge); }; - if (!projection_pipe.empty()) - build_aggregate_pipe(projection_pipe, true); - if (!ordinary_pipe.empty()) - build_aggregate_pipe(ordinary_pipe, false); + if (ordinary_query_plan->isInitialized() && projection_plan->isInitialized()) + { + auto projection_builder = projection_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); + + auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); + + /// Here we create shared ManyAggregatedData for both projection and ordinary data. + /// For ordinary data, AggregatedData is filled in a usual way. + /// For projection data, AggregatedData is filled by merging aggregation states. + /// When all AggregatedData is filled, we merge aggregation states together in a usual way. + /// Pipeline will look like: + /// ReadFromProjection -> Aggregating (only merge states) -> + /// ReadFromProjection -> Aggregating (only merge states) -> + /// ... -> Resize -> ConvertingAggregatedToChunks + /// ReadFromOrdinaryPart -> Aggregating (usual) -> (added by last Aggregating) + /// ReadFromOrdinaryPart -> Aggregating (usual) -> + /// ... 
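To make the split described above concrete, a rough sketch of what the two aggregation modes boil down to (simplified; the actual wiring follows in the code below, and the variable names here are placeholders):

// Illustrative sketch only, based on make_aggregator_params above:
// - ordinary parts:   rows are raw column values, so only_merge = false and the transform aggregates as usual;
// - projection parts: rows already hold aggregate states, so only_merge = true and
//   params.intermediate_header is set to the source header, because a projection part
//   already stores data in the "intermediate" (state) representation.
auto [proj_params, proj_only_merge] = make_aggregator_params(projection_pipe.getHeader(), /* projection = */ true);
// Expected here: proj_only_merge == true and proj_params.intermediate_header == projection_pipe.getHeader().
// Both groups of AggregatingTransforms then share one ManyAggregatedData, so the final merge
// sees every partial result regardless of which kind of part it came from.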
+ auto many_data = std::make_shared(projection_pipe.numOutputPorts() + ordinary_pipe.numOutputPorts()); + size_t counter = 0; + + AggregatorListPtr aggregator_list_ptr = std::make_shared(); + + /// TODO apply optimize_aggregation_in_order here too (like below) + auto build_aggregate_pipe = [&](Pipe & pipe, bool projection) + { + auto [params, only_merge] = make_aggregator_params(pipe.getHeader(), projection); + + AggregatingTransformParamsPtr transform_params = std::make_shared( + std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final, only_merge); + + pipe.resize(pipe.numOutputPorts(), true, true); + + auto merge_threads = num_streams; + auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads + ? static_cast(settings.aggregation_memory_efficient_merge_threads) + : static_cast(settings.max_threads); + + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, transform_params, many_data, counter++, merge_threads, temporary_data_merge_threads); + }); + }; + + if (!projection_pipe.empty()) + build_aggregate_pipe(projection_pipe, true); + if (!ordinary_pipe.empty()) + build_aggregate_pipe(ordinary_pipe, false); + } + else + { + auto add_aggregating_step = [&](QueryPlanPtr & query_plan, bool projection) + { + auto [params, only_merge] = make_aggregator_params(query_plan->getCurrentDataStream().header, projection); + + auto merge_threads = num_streams; + auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads + ? static_cast(settings.aggregation_memory_efficient_merge_threads) + : static_cast(settings.max_threads); + + InputOrderInfoPtr group_by_info = query_info.projection->input_order_info; + SortDescription group_by_sort_description; + if (group_by_info && settings.optimize_aggregation_in_order) + group_by_sort_description = getSortDescriptionFromGroupBy(query_info.query->as()); + else + group_by_info = nullptr; + + auto aggregating_step = std::make_unique( + query_plan->getCurrentDataStream(), + std::move(params), + /* grouping_sets_params_= */ GroupingSetsParamsList{}, + query_info.projection->aggregate_final, + only_merge, + settings.max_block_size, + settings.aggregation_in_order_max_block_bytes, + merge_threads, + temporary_data_merge_threads, + /* storage_has_evenly_distributed_read_= */ false, + std::move(group_by_info), + std::move(group_by_sort_description)); + query_plan->addStep(std::move(aggregating_step)); + }; + + if (projection_plan->isInitialized()) + { + add_aggregating_step(projection_plan, true); + + auto projection_builder = projection_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); + } + if (ordinary_query_plan->isInitialized()) + { + add_aggregating_step(ordinary_query_plan, false); + + auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); + } + } + } + else + { + if (projection_plan->isInitialized()) + { + auto projection_builder = projection_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + projection_pipe = QueryPipelineBuilder::getPipe(std::move(*projection_builder), resources); + } + + 
if (ordinary_query_plan->isInitialized()) + { + auto ordinary_builder = ordinary_query_plan->buildQueryPipeline( + QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + ordinary_pipe = QueryPipelineBuilder::getPipe(std::move(*ordinary_builder), resources); + } } + Pipes pipes; pipes.emplace_back(std::move(projection_pipe)); pipes.emplace_back(std::move(ordinary_pipe)); auto pipe = Pipe::unitePipes(std::move(pipes)); @@ -1084,6 +1155,7 @@ std::shared_ptr MergeTreeDataSelectExecutor::checkLimits( const MergeTreeData & data, const ReadFromMergeTree::AnalysisResult & result, const ContextPtr & context) + TSA_NO_THREAD_SAFETY_ANALYSIS // disabled because TSA is confused by guaranteed copy elision in data.getQueryIdSetLock() { const auto & settings = context->getSettingsRef(); const auto data_settings = data.getSettings(); @@ -1109,7 +1181,7 @@ std::shared_ptr MergeTreeDataSelectExecutor::checkLimits( if (!query_id.empty()) { auto lock = data.getQueryIdSetLock(); - if (data.insertQueryIdOrThrowNoLock(query_id, data_settings->max_concurrent_queries, lock)) + if (data.insertQueryIdOrThrowNoLock(query_id, data_settings->max_concurrent_queries)) { try { @@ -1118,7 +1190,7 @@ std::shared_ptr MergeTreeDataSelectExecutor::checkLimits( catch (...) { /// If we fail to construct the holder, remove query_id explicitly to avoid leak. - data.removeQueryIdNoLock(query_id, lock); + data.removeQueryIdNoLock(query_id); throw; } } @@ -1523,10 +1595,10 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( UncompressedCache * uncompressed_cache, Poco::Logger * log) { - const std::string & path_prefix = part->getFullRelativePath() + index_helper->getFileName(); - if (!index_helper->getDeserializedFormat(part->volume->getDisk(), path_prefix)) + if (!index_helper->getDeserializedFormat(part->data_part_storage, index_helper->getFileName())) { - LOG_DEBUG(log, "File for index {} does not exist ({}.*). Skipping it.", backQuote(index_helper->index.name), path_prefix); + LOG_DEBUG(log, "File for index {} does not exist ({}.*). Skipping it.", backQuote(index_helper->index.name), + (fs::path(part->data_part_storage->getFullPath()) / index_helper->getFileName()).string()); return ranges; } @@ -1616,7 +1688,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingMergedIndex( { for (const auto & index_helper : indices) { - if (!part->volume->getDisk()->exists(part->getFullRelativePath() + index_helper->getFileName() + ".idx")) + if (!part->data_part_storage->exists(index_helper->getFileName() + ".idx")) { LOG_DEBUG(log, "File for index {} does not exist. 
Skipping it.", backQuote(index_helper->index.name)); return ranges; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 7e08fb0ccfc..f7c544132bb 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -367,13 +368,23 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); + VolumePtr data_part_volume = createVolumeFromReservation(reservation, volume); + + auto data_part_storage = std::make_shared( + data_part_volume, + data.relative_data_path, + TMP_PREFIX + part_name); + + auto data_part_storage_builder = std::make_shared( + data_part_volume, + data.relative_data_path, + TMP_PREFIX + part_name); auto new_data_part = data.createPart( part_name, data.choosePartType(expected_size, block.rows()), new_part_info, - createVolumeFromReservation(reservation, volume), - TMP_PREFIX + part_name); + data_part_storage); if (data.storage_settings.get()->assign_part_uuids) new_data_part->uuid = UUIDHelpers::generateV4(); @@ -395,15 +406,15 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( if (new_data_part->isStoredOnDisk()) { /// The name could be non-unique in case of stale files from previous runs. - String full_path = new_data_part->getFullRelativePath(); + String full_path = new_data_part->data_part_storage->getFullPath(); - if (new_data_part->volume->getDisk()->exists(full_path)) + if (new_data_part->data_part_storage->exists()) { - LOG_WARNING(log, "Removing old temporary directory {}", fullPath(new_data_part->volume->getDisk(), full_path)); - new_data_part->volume->getDisk()->removeRecursive(full_path); + LOG_WARNING(log, "Removing old temporary directory {}", new_data_part->data_part_storage->getFullPath()); + data_part_volume->getDisk()->removeRecursive(full_path); } - const auto disk = new_data_part->volume->getDisk(); + const auto disk = data_part_volume->getDisk(); disk->createDirectories(full_path); if (data.getSettings()->fsync_part_directory) @@ -433,11 +444,10 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0); const auto & index_factory = MergeTreeIndexFactory::instance(); - auto out = std::make_unique(new_data_part, metadata_snapshot, columns, + auto out = std::make_unique(new_data_part, data_part_storage_builder, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, context->getCurrentTransaction(), false, false, context->getWriteSettings()); - out->writeWithPermutation(block, perm_ptr); for (const auto & projection : metadata_snapshot->getProjections()) @@ -445,7 +455,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( auto projection_block = projection.calculate(block, context); if (projection_block.rows()) { - auto proj_temp_part = writeProjectionPart(data, log, projection_block, projection, new_data_part.get()); + auto proj_temp_part = writeProjectionPart(data, log, projection_block, projection, data_part_storage_builder, new_data_part.get()); new_data_part->addProjectionPart(projection.name, std::move(proj_temp_part.part)); for (auto & stream : proj_temp_part.streams) 
temp_part.streams.emplace_back(std::move(stream)); @@ -482,6 +492,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const String & part_name, MergeTreeDataPartType part_type, const String & relative_path, + const DataPartStorageBuilderPtr & data_part_storage_builder, bool is_temp, const IMergeTreeDataPart * parent_part, const MergeTreeData & data, @@ -492,13 +503,15 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( TemporaryPart temp_part; const StorageMetadataPtr & metadata_snapshot = projection.metadata; MergeTreePartInfo new_part_info("all", 0, 0, 0); + auto projection_part_storage = parent_part->data_part_storage->getProjection(relative_path); auto new_data_part = data.createPart( part_name, part_type, new_part_info, - parent_part->volume, - relative_path, + projection_part_storage, parent_part); + + auto projection_part_storage_builder = data_part_storage_builder->getProjection(relative_path); new_data_part->is_temp = is_temp; NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); @@ -512,15 +525,13 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( if (new_data_part->isStoredOnDisk()) { /// The name could be non-unique in case of stale files from previous runs. - String full_path = new_data_part->getFullRelativePath(); - - if (new_data_part->volume->getDisk()->exists(full_path)) + if (projection_part_storage->exists()) { - LOG_WARNING(log, "Removing old temporary directory {}", fullPath(new_data_part->volume->getDisk(), full_path)); - new_data_part->volume->getDisk()->removeRecursive(full_path); + LOG_WARNING(log, "Removing old temporary directory {}", projection_part_storage->getFullPath()); + projection_part_storage_builder->removeRecursive(); } - new_data_part->volume->getDisk()->createDirectories(full_path); + projection_part_storage_builder->createDirectories(); } /// If we need to calculate some columns to sort. 
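Stepping back to the getProjection() calls above, a hypothetical illustration of the directory layout they imply (part and projection names are invented):

// Illustrative only: for a parent part "all_1_1_0" and a projection named "agg",
// writeProjectionPart() passes relative_path = "agg.proj", so
//   parent_part->data_part_storage->getProjection("agg.proj")
// is expected to describe .../all_1_1_0/agg.proj/, while the paired
//   data_part_storage_builder->getProjection("agg.proj")
// creates and writes files inside that same nested directory.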
@@ -564,6 +575,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( auto out = std::make_unique( new_data_part, + projection_part_storage_builder, metadata_snapshot, columns, MergeTreeIndices{}, @@ -585,7 +597,12 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( } MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPart( - MergeTreeData & data, Poco::Logger * log, Block block, const ProjectionDescription & projection, const IMergeTreeDataPart * parent_part) + MergeTreeData & data, + Poco::Logger * log, + Block block, + const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, + const IMergeTreeDataPart * parent_part) { String part_name = projection.name; MergeTreeDataPartType part_type; @@ -598,7 +615,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPart( /// Size of part would not be greater than block.bytes() + epsilon size_t expected_size = block.bytes(); // just check if there is enough space on parent volume - data.reserveSpace(expected_size, parent_part->volume); + data.reserveSpace(expected_size, data_part_storage_builder); part_type = data.choosePartTypeOnDisk(expected_size, block.rows()); } @@ -606,6 +623,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPart( part_name, part_type, part_name + ".proj" /* relative_path */, + data_part_storage_builder, false /* is_temp */, parent_part, data, @@ -621,6 +639,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempProjectionPart( Poco::Logger * log, Block block, const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, const IMergeTreeDataPart * parent_part, size_t block_num) { @@ -635,7 +654,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempProjectionPart( /// Size of part would not be greater than block.bytes() + epsilon size_t expected_size = block.bytes(); // just check if there is enough space on parent volume - data.reserveSpace(expected_size, parent_part->volume); + data.reserveSpace(expected_size, data_part_storage_builder); part_type = data.choosePartTypeOnDisk(expected_size, block.rows()); } @@ -643,6 +662,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempProjectionPart( part_name, part_type, part_name + ".tmp_proj" /* relative_path */, + data_part_storage_builder, true /* is_temp */, parent_part, data, @@ -656,12 +676,14 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeInMemoryProjectionP Poco::Logger * log, Block block, const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, const IMergeTreeDataPart * parent_part) { return writeProjectionPartImpl( projection.name, MergeTreeDataPartType::InMemory, projection.name + ".proj" /* relative_path */, + data_part_storage_builder, false /* is_temp */, parent_part, data, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 7b6bf8fb1db..147b38e828a 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -74,6 +74,7 @@ public: Poco::Logger * log, Block block, const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, const IMergeTreeDataPart * parent_part); /// For mutation: MATERIALIZE PROJECTION. 
@@ -82,6 +83,7 @@ public: Poco::Logger * log, Block block, const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, const IMergeTreeDataPart * parent_part, size_t block_num); @@ -91,6 +93,7 @@ public: Poco::Logger * log, Block block, const ProjectionDescription & projection, + const DataPartStorageBuilderPtr & data_part_storage_builder, const IMergeTreeDataPart * parent_part); static Block mergeBlock( @@ -105,6 +108,7 @@ private: const String & part_name, MergeTreeDataPartType part_type, const String & relative_path, + const DataPartStorageBuilderPtr & data_part_storage_builder, bool is_temp, const IMergeTreeDataPart * parent_part, const MergeTreeData & data, diff --git a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp index 07f6e3f3be7..d0f4d8b3604 100644 --- a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp +++ b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp index 9f791db0b69..6ae58dc4584 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp @@ -13,13 +13,13 @@ namespace ErrorCodes extern const int UNKNOWN_PART_TYPE; } -std::optional MergeTreeIndexGranularityInfo::getMarksExtensionFromFilesystem(const DiskPtr & disk, const String & path_to_part) +std::optional MergeTreeIndexGranularityInfo::getMarksExtensionFromFilesystem(const DataPartStoragePtr & data_part_storage) { - if (disk->exists(path_to_part)) + if (data_part_storage->exists()) { - for (DirectoryIteratorPtr it = disk->iterateDirectory(path_to_part); it->isValid(); it->next()) + for (auto it = data_part_storage->iterate(); it->isValid(); it->next()) { - const auto & ext = fs::path(it->path()).extension(); + const auto & ext = fs::path(it->name()).extension(); if (ext == getNonAdaptiveMrkExtension() || ext == getAdaptiveMrkExtension(MergeTreeDataPartType::Wide) || ext == getAdaptiveMrkExtension(MergeTreeDataPartType::Compact)) @@ -46,9 +46,9 @@ MergeTreeIndexGranularityInfo::MergeTreeIndexGranularityInfo(const MergeTreeData setAdaptive(storage_settings->index_granularity_bytes); } -void MergeTreeIndexGranularityInfo::changeGranularityIfRequired(const DiskPtr & disk, const String & path_to_part) +void MergeTreeIndexGranularityInfo::changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage) { - auto mrk_ext = getMarksExtensionFromFilesystem(disk, path_to_part); + auto mrk_ext = getMarksExtensionFromFilesystem(data_part_storage); if (mrk_ext && *mrk_ext == getNonAdaptiveMrkExtension()) setNonAdaptive(); } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h index 92e889ec762..dbb027c244e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -28,7 +29,7 @@ public: MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_); - void changeGranularityIfRequired(const DiskPtr & disk, const String & path_to_part); + void changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage); String getMarksFilePath(const String & path_prefix) const { @@ -37,7 +38,7 @@ public: size_t 
getMarkSizeInBytes(size_t columns_num = 1) const; - static std::optional getMarksExtensionFromFilesystem(const DiskPtr & disk, const String & path_to_part); + static std::optional getMarksExtensionFromFilesystem(const DataPartStoragePtr & data_part_storage); private: MergeTreeDataPartType type; diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp index b257a1db090..05319ecc62e 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.cpp @@ -213,11 +213,11 @@ bool MergeTreeIndexMinMax::mayBenefitFromIndexForIn(const ASTPtr & node) const return false; } -MergeTreeIndexFormat MergeTreeIndexMinMax::getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const +MergeTreeIndexFormat MergeTreeIndexMinMax::getDeserializedFormat(const DataPartStoragePtr & data_part_storage, const std::string & relative_path_prefix) const { - if (disk->exists(relative_path_prefix + ".idx2")) + if (data_part_storage->exists(relative_path_prefix + ".idx2")) return {2, ".idx2"}; - else if (disk->exists(relative_path_prefix + ".idx")) + else if (data_part_storage->exists(relative_path_prefix + ".idx")) return {1, ".idx"}; return {0 /* unknown */, ""}; } diff --git a/src/Storages/MergeTree/MergeTreeIndexMinMax.h b/src/Storages/MergeTree/MergeTreeIndexMinMax.h index 9f78c86a498..0566a15d535 100644 --- a/src/Storages/MergeTree/MergeTreeIndexMinMax.h +++ b/src/Storages/MergeTree/MergeTreeIndexMinMax.h @@ -83,7 +83,7 @@ public: bool mayBenefitFromIndexForIn(const ASTPtr & node) const override; const char* getSerializedFileExtension() const override { return ".idx2"; } - MergeTreeIndexFormat getDeserializedFormat(const DiskPtr disk, const std::string & path_prefix) const override; /// NOLINT + MergeTreeIndexFormat getDeserializedFormat(const DataPartStoragePtr & data_part_storage, const std::string & path_prefix) const override; /// NOLINT }; } diff --git a/src/Storages/MergeTree/MergeTreeIndexReader.cpp b/src/Storages/MergeTree/MergeTreeIndexReader.cpp index 732c7a82209..c43c75035e4 100644 --- a/src/Storages/MergeTree/MergeTreeIndexReader.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexReader.cpp @@ -16,8 +16,8 @@ std::unique_ptr makeIndexReader( MergeTreeReaderSettings settings) { return std::make_unique( - part->volume->getDisk(), - part->getFullRelativePath() + index->getFileName(), extension, marks_count, + part->data_part_storage, + index->getFileName(), extension, marks_count, all_mark_ranges, std::move(settings), mark_cache, uncompressed_cache, part->getFileSizeOrZero(index->getFileName() + extension), @@ -40,8 +40,7 @@ MergeTreeIndexReader::MergeTreeIndexReader( MergeTreeReaderSettings settings) : index(index_) { - const std::string & path_prefix = part_->getFullRelativePath() + index->getFileName(); - auto index_format = index->getDeserializedFormat(part_->volume->getDisk(), path_prefix); + auto index_format = index->getDeserializedFormat(part_->data_part_storage, index->getFileName()); stream = makeIndexReader( index_format.extension, diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index d76216e1598..051edd630cb 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -147,9 +148,9 @@ struct IMergeTreeIndex /// Returns extension for deserialization. /// /// Return pair. 
- virtual MergeTreeIndexFormat getDeserializedFormat(const DiskPtr disk, const std::string & relative_path_prefix) const + virtual MergeTreeIndexFormat getDeserializedFormat(const DataPartStoragePtr & data_part_storage, const std::string & relative_path_prefix) const { - if (disk->exists(relative_path_prefix + ".idx")) + if (data_part_storage->exists(relative_path_prefix + ".idx")) return {1, ".idx"}; return {0 /*unknown*/, ""}; } diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp index e7ead4dc8bb..aeb00bfda79 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.cpp +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.cpp @@ -16,14 +16,14 @@ namespace ErrorCodes } MergeTreeMarksLoader::MergeTreeMarksLoader( - DiskPtr disk_, + DataPartStoragePtr data_part_storage_, MarkCache * mark_cache_, const String & mrk_path_, size_t marks_count_, const MergeTreeIndexGranularityInfo & index_granularity_info_, bool save_marks_in_cache_, size_t columns_in_mark_) - : disk(std::move(disk_)) + : data_part_storage(std::move(data_part_storage_)) , mark_cache(mark_cache_) , mrk_path(mrk_path_) , marks_count(marks_count_) @@ -50,21 +50,23 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() /// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache. MemoryTrackerBlockerInThread temporarily_disable_memory_tracker; - size_t file_size = disk->getFileSize(mrk_path); + size_t file_size = data_part_storage->getFileSize(mrk_path); size_t mark_size = index_granularity_info.getMarkSizeInBytes(columns_in_mark); size_t expected_file_size = mark_size * marks_count; if (expected_file_size != file_size) throw Exception( - "Bad size of marks file '" + fullPath(disk, mrk_path) + "': " + std::to_string(file_size) + ", must be: " + std::to_string(expected_file_size), - ErrorCodes::CORRUPTED_DATA); + ErrorCodes::CORRUPTED_DATA, + "Bad size of marks file '{}': {}, must be: {}", + std::string(fs::path(data_part_storage->getFullPath()) / mrk_path), + std::to_string(file_size), std::to_string(expected_file_size)); auto res = std::make_shared(marks_count * columns_in_mark); if (!index_granularity_info.is_adaptive) { /// Read directly to marks. 
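A small worked example of the marks-file size check above (the per-mark size is an assumption for illustration; the real value comes from getMarkSizeInBytes()):

// Illustrative only: suppose a wide part with non-adaptive granularity, one column per
// mark file and 1000 marks, and assume each mark holds two UInt64 offsets
// (compressed / decompressed), i.e. mark_size = 16 bytes.
size_t mark_size = 16;                                  // assumed for the example
size_t marks_count = 1000;
size_t expected_file_size = mark_size * marks_count;    // 16000 bytes
// Any other on-disk size of the .mrk file means it is truncated or stale,
// and loadMarksImpl() throws CORRUPTED_DATA with the full path in the message.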
- auto buffer = disk->readFile(mrk_path, ReadSettings().adjustBufferSize(file_size), file_size); + auto buffer = data_part_storage->readFile(mrk_path, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); buffer->readStrict(reinterpret_cast(res->data()), file_size); if (!buffer->eof()) @@ -73,7 +75,7 @@ MarkCache::MappedPtr MergeTreeMarksLoader::loadMarksImpl() } else { - auto buffer = disk->readFile(mrk_path, ReadSettings().adjustBufferSize(file_size), file_size); + auto buffer = data_part_storage->readFile(mrk_path, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); size_t i = 0; while (!buffer->eof()) { @@ -93,7 +95,7 @@ void MergeTreeMarksLoader::loadMarks() { if (mark_cache) { - auto key = mark_cache->hash(mrk_path); + auto key = mark_cache->hash(fs::path(data_part_storage->getFullPath()) / mrk_path); if (save_marks_in_cache) { auto callback = [this]{ return loadMarksImpl(); }; @@ -110,7 +112,7 @@ void MergeTreeMarksLoader::loadMarks() marks = loadMarksImpl(); if (!marks) - throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR); + throw Exception("Failed to load marks: " + std::string(fs::path(data_part_storage->getFullPath()) / mrk_path), ErrorCodes::LOGICAL_ERROR); } } diff --git a/src/Storages/MergeTree/MergeTreeMarksLoader.h b/src/Storages/MergeTree/MergeTreeMarksLoader.h index 062eb720583..3effeeec12b 100644 --- a/src/Storages/MergeTree/MergeTreeMarksLoader.h +++ b/src/Storages/MergeTree/MergeTreeMarksLoader.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include namespace DB @@ -13,7 +13,7 @@ public: using MarksPtr = MarkCache::MappedPtr; MergeTreeMarksLoader( - DiskPtr disk_, + DataPartStoragePtr data_part_storage_, MarkCache * mark_cache_, const String & mrk_path, size_t marks_count_, @@ -26,7 +26,7 @@ public: bool initialized() const { return marks != nullptr; } private: - DiskPtr disk; + DataPartStoragePtr data_part_storage; MarkCache * mark_cache = nullptr; String mrk_path; size_t marks_count; diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index ddbb536d7da..81026989f95 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -381,21 +381,20 @@ void MergeTreePartition::load(const MergeTreeData & storage, const PartMetadataM partition_key_sample.getByPosition(i).type->getDefaultSerialization()->deserializeBinary(value[i], *file); } -std::unique_ptr MergeTreePartition::store(const MergeTreeData & storage, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums) const +std::unique_ptr MergeTreePartition::store(const MergeTreeData & storage, const DataPartStorageBuilderPtr & data_part_storage_builder, MergeTreeDataPartChecksums & checksums) const { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); const auto & context = storage.getContext(); - const auto & partition_key_sample = adjustPartitionKey(metadata_snapshot, context).sample_block; - return store(partition_key_sample, disk, part_path, checksums, context->getWriteSettings()); + const auto & partition_key_sample = adjustPartitionKey(metadata_snapshot, storage.getContext()).sample_block; + return store(partition_key_sample, data_part_storage_builder, checksums, context->getWriteSettings()); } -std::unique_ptr MergeTreePartition::store( - const Block & partition_key_sample, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const 
+std::unique_ptr MergeTreePartition::store(const Block & partition_key_sample, const DataPartStorageBuilderPtr & data_part_storage_builder, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const { if (!partition_key_sample) return nullptr; - auto out = disk->writeFile(std::filesystem::path(part_path) / "partition.dat", DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile("partition.dat", DBMS_DEFAULT_BUFFER_SIZE, settings); HashingWriteBuffer out_hashing(*out); for (size_t i = 0; i < value.size(); ++i) { diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 1afb833498c..6394641dfa3 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -15,8 +15,10 @@ class MergeTreeData; struct FormatSettings; struct MergeTreeDataPartChecksums; struct StorageInMemoryMetadata; +class IDataPartStorageBuilder; using StorageMetadataPtr = std::shared_ptr; +using DataPartStorageBuilderPtr = std::shared_ptr; /// This class represents a partition value of a single part and encapsulates its loading/storing logic. struct MergeTreePartition @@ -42,8 +44,8 @@ public: /// Store functions return write buffer with written but not finalized data. /// User must call finish() for returned object. - [[nodiscard]] std::unique_ptr store(const MergeTreeData & storage, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums) const; - [[nodiscard]] std::unique_ptr store(const Block & partition_key_sample, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const; + [[nodiscard]] std::unique_ptr store(const MergeTreeData & storage, const DataPartStorageBuilderPtr & data_part_storage_builder, MergeTreeDataPartChecksums & checksums) const; + [[nodiscard]] std::unique_ptr store(const Block & partition_key_sample, const DataPartStorageBuilderPtr & data_part_storage_builder, MergeTreeDataPartChecksums & checksums, const WriteSettings & settings) const; void assign(const MergeTreePartition & other) { value = other.value; } diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index bb625d74ead..2df17702c03 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -100,6 +100,7 @@ bool MergeTreePartsMover::selectPartsForMove( return false; std::unordered_map need_to_move; + std::unordered_set need_to_move_disks; const auto policy = data->getStoragePolicy(); const auto & volumes = policy->getVolumes(); @@ -114,7 +115,10 @@ bool MergeTreePartsMover::selectPartsForMove( UInt64 unreserved_space = disk->getUnreservedSpace(); if (unreserved_space < required_maximum_available_space && !disk->isBroken()) + { need_to_move.emplace(disk, required_maximum_available_space - unreserved_space); + need_to_move_disks.emplace(disk); + } } } } @@ -135,7 +139,10 @@ bool MergeTreePartsMover::selectPartsForMove( auto ttl_entry = selectTTLDescriptionForTTLInfos(metadata_snapshot->getMoveTTLs(), part->ttl_infos.moves_ttl, time_of_move, true); - auto to_insert = need_to_move.find(part->volume->getDisk()); + auto to_insert = need_to_move.end(); + if (auto disk_it = part->data_part_storage->isStoredOnDisk(need_to_move_disks); disk_it != need_to_move_disks.end()) + to_insert = need_to_move.find(*disk_it); + ReservationPtr reservation; if (ttl_entry) { @@ -200,7 +207,9 @@ 
MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt auto settings = data->getSettings(); auto part = moving_part.part; auto disk = moving_part.reserved_space->getDisk(); - LOG_DEBUG(log, "Cloning part {} from '{}' to '{}'", part->name, part->volume->getDisk()->getName(), disk->getName()); + LOG_DEBUG(log, "Cloning part {} from '{}' to '{}'", part->name, part->data_part_storage->getDiskName(), disk->getName()); + + DataPartStoragePtr cloned_part_storage; const String directory_to_move = "moving"; if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) @@ -208,7 +217,7 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt /// Try zero-copy replication and fallback to default copy if it's not possible moving_part.part->assertOnDisk(); String path_to_clone = fs::path(data->getRelativeDataPath()) / directory_to_move / ""; - String relative_path = part->relative_path; + String relative_path = part->data_part_storage->getPartDirectory(); if (disk->exists(path_to_clone + relative_path)) { LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path)); @@ -216,27 +225,26 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt } disk->createDirectories(path_to_clone); - bool is_fetched = data->tryToFetchIfShared(*part, disk, fs::path(path_to_clone) / part->name); - if (!is_fetched) + + cloned_part_storage = data->tryToFetchIfShared(*part, disk, fs::path(path_to_clone) / part->name); + + if (!cloned_part_storage) { LOG_INFO(log, "Part {} was not fetched, we are the first who move it to another disk, so we will copy it", part->name); - part->volume->getDisk()->copy(fs::path(data->getRelativeDataPath()) / relative_path / "", disk, path_to_clone); + cloned_part_storage = part->data_part_storage->clone(path_to_clone, part->data_part_storage->getPartDirectory(), disk, log); } - part->volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); } else { - part->makeCloneOnDisk(disk, directory_to_move); + cloned_part_storage = part->makeCloneOnDisk(disk, directory_to_move); } - auto single_disk_volume = std::make_shared("volume_" + part->name, moving_part.reserved_space->getDisk(), 0); - MergeTreeData::MutableDataPartPtr cloned_part = - data->createPart(part->name, single_disk_volume, fs::path(directory_to_move) / part->name); - LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->getFullPath()); + MergeTreeData::MutableDataPartPtr cloned_part = data->createPart(part->name, cloned_part_storage); + LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->data_part_storage->getFullPath()); cloned_part->loadColumnsChecksumsIndexes(true, true); cloned_part->loadVersionMetadata(); - cloned_part->modification_time = disk->getLastModified(cloned_part->getFullRelativePath()).epochTime(); + cloned_part->modification_time = cloned_part->data_part_storage->getLastModified().epochTime(); return cloned_part; } @@ -252,7 +260,7 @@ void MergeTreePartsMover::swapClonedPart(const MergeTreeData::DataPartPtr & clon /// It's ok, because we don't block moving parts for merges or mutations if (!active_part || active_part->name != cloned_part->name) { - LOG_INFO(log, "Failed to swap {}. Active part doesn't exist. Possible it was merged or mutated. Will remove copy on path '{}'.", cloned_part->name, cloned_part->getFullPath()); + LOG_INFO(log, "Failed to swap {}. 
Active part doesn't exist. Possible it was merged or mutated. Will remove copy on path '{}'.", cloned_part->name, cloned_part->data_part_storage->getFullPath()); return; } @@ -262,7 +270,7 @@ void MergeTreePartsMover::swapClonedPart(const MergeTreeData::DataPartPtr & clon /// TODO what happen if server goes down here? data->swapActivePart(cloned_part); - LOG_TRACE(log, "Part {} was moved to {}", cloned_part->name, cloned_part->getFullPath()); + LOG_TRACE(log, "Part {} was moved to {}", cloned_part->name, cloned_part->data_part_storage->getFullPath()); } } diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 134eba91000..d9a1f742609 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -261,7 +261,7 @@ void MergeTreeReadPool::fillPerThreadInfo( { PartInfo part_info{parts[i], per_part_sum_marks[i], i}; if (parts[i].data_part->isStoredOnDisk()) - parts_per_disk[parts[i].data_part->volume->getDisk()->getName()].push_back(std::move(part_info)); + parts_per_disk[parts[i].data_part->data_part_storage->getDiskName()].push_back(std::move(part_info)); else parts_per_disk[""].push_back(std::move(part_info)); } diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index b943c3c8718..8e2e1d02836 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -35,9 +35,9 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( settings_, avg_value_size_hints_) , marks_loader( - data_part->volume->getDisk(), + data_part->data_part_storage, mark_cache, - data_part->index_granularity_info.getMarksFilePath(data_part->getFullRelativePath() + MergeTreeDataPartCompact::DATA_FILE_NAME), + data_part->index_granularity_info.getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME), data_part->getMarksCount(), data_part->index_granularity_info, settings.save_marks_in_cache, @@ -83,16 +83,17 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( if (!settings.read_settings.local_fs_buffer_size || !settings.read_settings.remote_fs_buffer_size) throw Exception(ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read to empty buffer."); - const String full_data_path = data_part->getFullRelativePath() + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION; + const String path = MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION; if (uncompressed_cache) { auto buffer = std::make_unique( - fullPath(data_part->volume->getDisk(), full_data_path), - [this, full_data_path]() + std::string(fs::path(data_part->data_part_storage->getFullPath()) / path), + [this, path]() { - return data_part->volume->getDisk()->readFile( - full_data_path, - settings.read_settings); + return data_part->data_part_storage->readFile( + path, + settings.read_settings, + std::nullopt, std::nullopt); }, uncompressed_cache, /* allow_different_codecs = */ true); @@ -111,9 +112,10 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( { auto buffer = std::make_unique( - data_part->volume->getDisk()->readFile( - full_data_path, - settings.read_settings), + data_part->data_part_storage->readFile( + path, + settings.read_settings, + std::nullopt, std::nullopt), /* allow_different_codecs = */ true); if (profile_callback_) diff --git a/src/Storages/MergeTree/MergeTreeReaderStream.cpp b/src/Storages/MergeTree/MergeTreeReaderStream.cpp index aad3787bb36..6b43b8ac5c0 100644 --- a/src/Storages/MergeTree/MergeTreeReaderStream.cpp +++ 
b/src/Storages/MergeTree/MergeTreeReaderStream.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes } MergeTreeReaderStream::MergeTreeReaderStream( - DiskPtr disk_, + DataPartStoragePtr data_part_storage_, const String & path_prefix_, const String & data_file_extension_, size_t marks_count_, const MarkRanges & all_mark_ranges_, const MergeTreeReaderSettings & settings_, @@ -30,7 +30,7 @@ MergeTreeReaderStream::MergeTreeReaderStream( , all_mark_ranges(all_mark_ranges_) , file_size(file_size_) , uncompressed_cache(uncompressed_cache_) - , disk(std::move(disk_)) + , data_part_storage(std::move(data_part_storage_)) , path_prefix(path_prefix_) , data_file_extension(data_file_extension_) , is_low_cardinality_dictionary(is_low_cardinality_dictionary_) @@ -38,7 +38,7 @@ MergeTreeReaderStream::MergeTreeReaderStream( , mark_cache(mark_cache_) , save_marks_in_cache(settings.save_marks_in_cache) , index_granularity_info(index_granularity_info_) - , marks_loader(disk, mark_cache, index_granularity_info->getMarksFilePath(path_prefix), + , marks_loader(data_part_storage, mark_cache, index_granularity_info->getMarksFilePath(path_prefix), marks_count, *index_granularity_info, save_marks_in_cache) {} void MergeTreeReaderStream::init() @@ -79,13 +79,13 @@ void MergeTreeReaderStream::init() if (uncompressed_cache) { auto buffer = std::make_unique( - fullPath(disk, path_prefix + data_file_extension), + std::string(fs::path(data_part_storage->getFullPath()) / (path_prefix + data_file_extension)), [this, estimated_sum_mark_range_bytes, read_settings]() { - return disk->readFile( + return data_part_storage->readFile( path_prefix + data_file_extension, read_settings, - estimated_sum_mark_range_bytes); + estimated_sum_mark_range_bytes, std::nullopt); }, uncompressed_cache); @@ -102,10 +102,11 @@ void MergeTreeReaderStream::init() else { auto buffer = std::make_unique( - disk->readFile( + data_part_storage->readFile( path_prefix + data_file_extension, read_settings, - estimated_sum_mark_range_bytes)); + estimated_sum_mark_range_bytes, + std::nullopt)); if (profile_callback) buffer->setProfileCallback(profile_callback, clock_type); @@ -158,9 +159,22 @@ size_t MergeTreeReaderStream::getRightOffset(size_t right_mark_non_included) /// So, that's why we have to read one extra granule to the right, /// while reading dictionary of LowCardinality. - size_t right_mark_included = is_low_cardinality_dictionary - ? right_mark_non_included - : right_mark_non_included - 1; + /// If right_mark_non_included has a non-zero offset in the decompressed block, we have to + /// read its compressed block as a whole, because it may contain data from the previous granule. + /// + /// For example: + /// Mark 10: (758287, 0) <--- right_mark_included + /// Mark 11: (908457, 53477) <--- right_mark_non_included + /// Mark 12: (1064746, 20742) <--- what we are looking for + /// Mark 13: (2009333, 40123) + /// + /// Since mark 11 starts from offset 53477 in the decompressed block, + /// it has some data from mark 10 and we have to read + /// the compressed block [908457; 1064746) as a whole.
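Tracing the example above through the new boundary logic (values taken from the comment; purely illustrative):

// right_mark_non_included = 11, i.e. mark 11 = (908457, 53477)
size_t right_mark_included = 11 - 1;            // start from mark 10
bool nonzero_offset = (53477 != 0);             // mark 11 begins mid-block
if (/* is_low_cardinality_dictionary || */ nonzero_offset)
    ++right_mark_included;                      // -> 11: its compressed block must be read fully
// The right offset is then taken from the first later mark with a different compressed
// offset, here mark 12 at 1064746, so the stream reads the compressed range up to 1064746.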
+ + size_t right_mark_included = right_mark_non_included - 1; + if (is_low_cardinality_dictionary || marks_loader.getMark(right_mark_non_included).offset_in_decompressed_block != 0) + ++right_mark_included; auto indices = collections::range(right_mark_included, marks_count); auto it = std::upper_bound(indices.begin(), indices.end(), right_mark_included, diff --git a/src/Storages/MergeTree/MergeTreeReaderStream.h b/src/Storages/MergeTree/MergeTreeReaderStream.h index 74922b9c236..dc2d99144ce 100644 --- a/src/Storages/MergeTree/MergeTreeReaderStream.h +++ b/src/Storages/MergeTree/MergeTreeReaderStream.h @@ -19,7 +19,7 @@ class MergeTreeReaderStream { public: MergeTreeReaderStream( - DiskPtr disk_, + DataPartStoragePtr data_part_storage_, const String & path_prefix_, const String & data_file_extension_, size_t marks_count_, const MarkRanges & all_mark_ranges, const MergeTreeReaderSettings & settings_, @@ -52,7 +52,7 @@ private: size_t file_size; UncompressedCache * uncompressed_cache; - DiskPtr disk; + DataPartStoragePtr data_part_storage; std::string path_prefix; std::string data_file_extension; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 0f5cf8de669..93c1e23884a 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -47,7 +47,6 @@ MergeTreeReaderWide::MergeTreeReaderWide( { try { - disk = data_part->volume->getDisk(); for (const NameAndTypePair & column : columns) { auto column_from_part = getColumnFromPart(column); @@ -74,7 +73,7 @@ size_t MergeTreeReaderWide::readRows( std::unordered_map caches; std::unordered_set prefetched_streams; - if (disk->isRemote() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) + if (data_part->data_part_storage->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) { /// Request reading of data in advance, /// so if reading can be asynchronous, it will also be performed in parallel for all columns. @@ -147,7 +146,7 @@ size_t MergeTreeReaderWide::readRows( storage.reportBrokenPart(data_part); /// Better diagnostics. 
- e.addMessage("(while reading from part " + data_part->getFullPath() + " " + e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + " " "from mark " + toString(from_mark) + " " "with max_rows_to_read = " + toString(max_rows_to_read) + ")"); throw; @@ -183,7 +182,7 @@ void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type, bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; streams.emplace(stream_name, std::make_unique( - disk, data_part->getFullRelativePath() + stream_name, DATA_FILE_EXTENSION, + data_part->data_part_storage, stream_name, DATA_FILE_EXTENSION, data_part->getMarksCount(), all_mark_ranges, settings, mark_cache, uncompressed_cache, data_part->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), &data_part->index_granularity_info, diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index 41219560ecc..7bb1ccfd173 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -37,7 +37,6 @@ public: private: FileStreams streams; - DiskPtr disk; void addStreams(const NameAndTypePair & name_and_type, const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 50d7577b7d6..1d41c2c254d 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -64,11 +64,11 @@ void MergeTreeSelectProcessor::initializeReaders() owned_mark_cache = storage.getContext()->getMarkCache(); reader = data_part->getReader(task_columns.columns, storage_snapshot->getMetadataForQuery(), - all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); + all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {}); if (prewhere_info) pre_reader = data_part->getReader(task_columns.pre_columns, storage_snapshot->getMetadataForQuery(), - all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); + all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {}); } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 22ef2db2e15..7c4cc04ba52 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -67,7 +67,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, MarkRanges{MarkRange(0, data_part->getMarksCount())}, - /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings); + /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings, {}, {}); } Chunk MergeTreeSequentialSource::generate() diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 4dc4d62c2a2..fbc916ddb2c 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -76,7 +76,7 @@ void MergeTreeSink::consume(Chunk chunk) if (!temp_part.part) continue; - if (!support_parallel_write && temp_part.part->volume->getDisk()->supportParallelWrite()) + if (!support_parallel_write && temp_part.part->data_part_storage->supportParallelWrite()) 
support_parallel_write = true; if (storage.getDeduplicationLog()) diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp index 06c56e88694..a9dce5b5ebe 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "Storages/MergeTree/DataPartStorageOnDisk.h" #include namespace DB @@ -132,6 +133,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor while (!in->eof()) { MergeTreeData::MutableDataPartPtr part; + DataPartStorageBuilderPtr data_part_storage_builder; UInt8 version; String part_name; Block block; @@ -157,13 +159,14 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor else if (action_type == ActionType::ADD_PART) { auto single_disk_volume = std::make_shared("volume_" + part_name, disk, 0); + auto data_part_storage = std::make_shared(single_disk_volume, storage.getRelativeDataPath(), part_name); + data_part_storage_builder = std::make_shared(single_disk_volume, storage.getRelativeDataPath(), part_name); part = storage.createPart( part_name, MergeTreeDataPartType::InMemory, MergeTreePartInfo::fromPartName(part_name, storage.format_version), - single_disk_volume, - part_name); + data_part_storage); part->uuid = metadata.part_uuid; @@ -199,6 +202,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor { MergedBlockOutputStream part_out( part, + data_part_storage_builder, metadata_snapshot, block.getNamesAndTypesList(), {}, @@ -216,7 +220,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor for (const auto & projection : metadata_snapshot->getProjections()) { auto projection_block = projection.calculate(block, context); - auto temp_part = MergeTreeDataWriter::writeInMemoryProjectionPart(storage, log, projection_block, projection, part.get()); + auto temp_part = MergeTreeDataWriter::writeInMemoryProjectionPart(storage, log, projection_block, projection, data_part_storage_builder, part.get()); temp_part.finalize(); if (projection_block.rows()) part->addProjectionPart(projection.name, std::move(temp_part.part)); diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 09711e512a5..f7da1eb2585 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes MergedBlockOutputStream::MergedBlockOutputStream( const MergeTreeDataPartPtr & data_part, + DataPartStorageBuilderPtr data_part_storage_builder_, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, @@ -23,7 +24,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( bool reset_columns_, bool blocks_are_granules_size, const WriteSettings & write_settings) - : IMergedBlockOutputStream(data_part, metadata_snapshot_, columns_list_, reset_columns_) + : IMergedBlockOutputStream(std::move(data_part_storage_builder_), data_part, metadata_snapshot_, columns_list_, reset_columns_) , columns_list(columns_list_) , default_codec(default_codec_) { @@ -35,8 +36,8 @@ MergedBlockOutputStream::MergedBlockOutputStream( /* rewrite_primary_key = */ true, blocks_are_granules_size); - if (!part_path.empty()) - volume->getDisk()->createDirectories(part_path); + if (data_part->isStoredOnDisk()) + 
data_part_storage_builder->createDirectories(); /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; @@ -45,7 +46,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( data_part->version.setCreationTID(tid, nullptr); data_part->storeVersionMetadata(); - writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, writer_settings); + writer = data_part->getWriter(data_part_storage_builder, columns_list, metadata_snapshot, skip_indices, default_codec, writer_settings, {}); } /// If data is pre-sorted. @@ -66,13 +67,15 @@ struct MergedBlockOutputStream::Finalizer::Impl { IMergeTreeDataPartWriter & writer; MergeTreeData::MutableDataPartPtr part; + DataPartStorageBuilderPtr data_part_storage_builder; NameSet files_to_remove_after_finish; std::vector> written_files; bool sync; - Impl(IMergeTreeDataPartWriter & writer_, MergeTreeData::MutableDataPartPtr part_, const NameSet & files_to_remove_after_finish_, bool sync_) + Impl(IMergeTreeDataPartWriter & writer_, MergeTreeData::MutableDataPartPtr part_, DataPartStorageBuilderPtr data_part_storage_builder_, const NameSet & files_to_remove_after_finish_, bool sync_) : writer(writer_) , part(std::move(part_)) + , data_part_storage_builder(std::move(data_part_storage_builder_)) , files_to_remove_after_finish(files_to_remove_after_finish_) , sync(sync_) {} @@ -90,9 +93,8 @@ void MergedBlockOutputStream::Finalizer::Impl::finish() { writer.finish(sync); - auto disk = part->volume->getDisk(); for (const auto & file_name: files_to_remove_after_finish) - disk->removeFile(part->getFullRelativePath() + file_name); + data_part_storage_builder->removeFile(file_name); for (auto & file : written_files) { @@ -164,7 +166,7 @@ MergedBlockOutputStream::Finalizer MergedBlockOutputStream::finalizePartAsync( new_part->setSerializationInfos(serialization_infos); } - auto finalizer = std::make_unique(*writer, new_part, files_to_remove_after_sync, sync); + auto finalizer = std::make_unique(*writer, new_part, data_part_storage_builder, files_to_remove_after_sync, sync); if (new_part->isStoredOnDisk()) finalizer->written_files = finalizePartOnDisk(new_part, checksums, write_settings); @@ -192,7 +194,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis { if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part)) { - auto count_out = volume->getDisk()->writeFile(part_path + "count.txt", 4096, WriteMode::Rewrite, settings); + auto count_out = data_part_storage_builder->writeFile("count.txt", 4096, settings); HashingWriteBuffer count_out_hashing(*count_out); writeIntText(rows_count, count_out_hashing); count_out_hashing.next(); @@ -206,7 +208,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis { if (new_part->uuid != UUIDHelpers::Nil) { - auto out = volume->getDisk()->writeFile(fs::path(part_path) / IMergeTreeDataPart::UUID_FILE_NAME, 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::UUID_FILE_NAME, 4096, settings); HashingWriteBuffer out_hashing(*out); writeUUIDText(new_part->uuid, out_hashing); checksums.files[IMergeTreeDataPart::UUID_FILE_NAME].file_size = out_hashing.count(); @@ -217,12 +219,12 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (storage.format_version >= 
MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { - if (auto file = new_part->partition.store(storage, volume->getDisk(), part_path, checksums)) + if (auto file = new_part->partition.store(storage, data_part_storage_builder, checksums)) written_files.emplace_back(std::move(file)); if (new_part->minmax_idx->initialized) { - auto files = new_part->minmax_idx->store(storage, volume->getDisk(), part_path, checksums); + auto files = new_part->minmax_idx->store(storage, data_part_storage_builder, checksums); for (auto & file : files) written_files.emplace_back(std::move(file)); } @@ -232,7 +234,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis } { - auto count_out = volume->getDisk()->writeFile(fs::path(part_path) / "count.txt", 4096, WriteMode::Rewrite, settings); + auto count_out = data_part_storage_builder->writeFile("count.txt", 4096, settings); HashingWriteBuffer count_out_hashing(*count_out); writeIntText(rows_count, count_out_hashing); count_out_hashing.next(); @@ -246,7 +248,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (!new_part->ttl_infos.empty()) { /// Write a file with ttl infos in json format. - auto out = volume->getDisk()->writeFile(fs::path(part_path) / "ttl.txt", 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile("ttl.txt", 4096, settings); HashingWriteBuffer out_hashing(*out); new_part->ttl_infos.write(out_hashing); checksums.files["ttl.txt"].file_size = out_hashing.count(); @@ -257,7 +259,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (!new_part->getSerializationInfos().empty()) { - auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096, settings); HashingWriteBuffer out_hashing(*out); new_part->getSerializationInfos().writeJSON(out_hashing); checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count(); @@ -268,7 +270,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis { /// Write a file with a description of columns. - auto out = volume->getDisk()->writeFile(fs::path(part_path) / "columns.txt", 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile("columns.txt", 4096, settings); new_part->getColumns().writeText(*out); out->preFinalize(); written_files.emplace_back(std::move(out)); @@ -276,7 +278,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis if (default_codec != nullptr) { - auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096, settings); DB::writeText(queryToString(default_codec->getFullCodecDesc()), *out); out->preFinalize(); written_files.emplace_back(std::move(out)); @@ -289,7 +291,7 @@ MergedBlockOutputStream::WrittenFiles MergedBlockOutputStream::finalizePartOnDis { /// Write file with checksums. 
- auto out = volume->getDisk()->writeFile(fs::path(part_path) / "checksums.txt", 4096, WriteMode::Rewrite, settings); + auto out = data_part_storage_builder->writeFile("checksums.txt", 4096, settings); checksums.write(*out); out->preFinalize(); written_files.emplace_back(std::move(out)); diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 67dec1923e8..70c4bc3b49f 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -16,6 +16,7 @@ class MergedBlockOutputStream final : public IMergedBlockOutputStream public: MergedBlockOutputStream( const MergeTreeDataPartPtr & data_part, + DataPartStorageBuilderPtr data_part_storage_builder_, const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index f0d241baa2f..740e57a136e 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -11,6 +11,7 @@ namespace ErrorCodes } MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( + DataPartStorageBuilderPtr data_part_storage_builder_, const MergeTreeDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, const Block & header_, @@ -19,7 +20,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) - : IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) + : IMergedBlockOutputStream(std::move(data_part_storage_builder_), data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true) , header(header_) { const auto & global_settings = data_part->storage.getContext()->getSettings(); @@ -33,6 +34,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( /* rewrite_primary_key = */false); writer = data_part->getWriter( + data_part_storage_builder, header.getNamesAndTypesList(), metadata_snapshot_, indices_to_recalc, @@ -77,13 +79,11 @@ MergedColumnOnlyOutputStream::fillChecksums( auto removed_files = removeEmptyColumnsFromPart(new_part, columns, serialization_infos, checksums); - auto disk = new_part->volume->getDisk(); for (const String & removed_file : removed_files) { - auto file_path = new_part->getFullRelativePath() + removed_file; /// Can be called multiple times, don't need to remove file twice - if (disk->exists(file_path)) - disk->removeFile(file_path); + if (data_part_storage_builder->exists(removed_file)) + data_part_storage_builder->removeFile(removed_file); if (all_checksums.files.contains(removed_file)) all_checksums.files.erase(removed_file); diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h index 7b587d01dab..1fd1c752226 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h @@ -14,6 +14,7 @@ public: /// Pass empty 'already_written_offset_columns' first time then and pass the same object to subsequent instances of MergedColumnOnlyOutputStream /// if you want to serialize elements of Nested data structure in different instances of MergedColumnOnlyOutputStream. 
MergedColumnOnlyOutputStream( + DataPartStorageBuilderPtr data_part_storage_builder_, const MergeTreeDataPartPtr & data_part, const StorageMetadataPtr & metadata_snapshot_, const Block & header_, diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index f46bce1c87b..403d77165d4 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -92,7 +92,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() /// Once we mutate part, we must reserve space on the same disk, because mutations can possibly create hardlinks. /// Can throw an exception. - reserved_space = storage.reserveSpace(estimated_space_for_result, source_part->volume); + reserved_space = storage.reserveSpace(estimated_space_for_result, source_part->data_part_storage); table_lock_holder = storage.lockForShare( RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 5df0f6eab68..d39f3f704a1 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -541,17 +542,18 @@ static NameToNameVector collectFilesForRenames( /// Initialize and write to disk new part fields like checksums, columns, etc. void finalizeMutatedPart( const MergeTreeDataPartPtr & source_part, + const DataPartStorageBuilderPtr & data_part_storage_builder, MergeTreeData::MutableDataPartPtr new_data_part, ExecuteTTLType execute_ttl_type, const CompressionCodecPtr & codec, ContextPtr context) { - auto disk = new_data_part->volume->getDisk(); - auto part_path = fs::path(new_data_part->getFullRelativePath()); + //auto disk = new_data_part->volume->getDisk(); + //auto part_path = fs::path(new_data_part->getRelativePath()); if (new_data_part->uuid != UUIDHelpers::Nil) { - auto out = disk->writeFile(part_path / IMergeTreeDataPart::UUID_FILE_NAME, 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::UUID_FILE_NAME, 4096, context->getWriteSettings()); HashingWriteBuffer out_hashing(*out); writeUUIDText(new_data_part->uuid, out_hashing); new_data_part->checksums.files[IMergeTreeDataPart::UUID_FILE_NAME].file_size = out_hashing.count(); @@ -561,7 +563,7 @@ void finalizeMutatedPart( if (execute_ttl_type != ExecuteTTLType::NONE) { /// Write a file with ttl infos in json format. 
- auto out_ttl = disk->writeFile(part_path / "ttl.txt", 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out_ttl = data_part_storage_builder->writeFile("ttl.txt", 4096, context->getWriteSettings()); HashingWriteBuffer out_hashing(*out_ttl); new_data_part->ttl_infos.write(out_hashing); new_data_part->checksums.files["ttl.txt"].file_size = out_hashing.count(); @@ -570,7 +572,7 @@ void finalizeMutatedPart( if (!new_data_part->getSerializationInfos().empty()) { - auto out = disk->writeFile(part_path / IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096, context->getWriteSettings()); HashingWriteBuffer out_hashing(*out); new_data_part->getSerializationInfos().writeJSON(out_hashing); new_data_part->checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count(); @@ -579,18 +581,18 @@ void finalizeMutatedPart( { /// Write file with checksums. - auto out_checksums = disk->writeFile(part_path / "checksums.txt", 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out_checksums = data_part_storage_builder->writeFile("checksums.txt", 4096, context->getWriteSettings()); new_data_part->checksums.write(*out_checksums); } /// close fd { - auto out = disk->writeFile(part_path / IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out = data_part_storage_builder->writeFile(IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096, context->getWriteSettings()); DB::writeText(queryToString(codec->getFullCodecDesc()), *out); } { /// Write a file with a description of columns. - auto out_columns = disk->writeFile(part_path / "columns.txt", 4096, WriteMode::Rewrite, context->getWriteSettings()); + auto out_columns = data_part_storage_builder->writeFile("columns.txt", 4096, context->getWriteSettings()); new_data_part->getColumns().writeText(*out_columns); } /// close fd @@ -600,8 +602,7 @@ void finalizeMutatedPart( new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); new_data_part->loadProjections(false, false); - new_data_part->setBytesOnDisk( - MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), part_path)); + new_data_part->setBytesOnDisk(new_data_part->data_part_storage->calculateTotalSizeOnDisk()); new_data_part->default_codec = codec; new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); } @@ -653,10 +654,8 @@ struct MutationContext MutationsInterpreter::MutationKind::MutationKindEnum mutation_kind = MutationsInterpreter::MutationKind::MutationKindEnum::MUTATE_UNKNOWN; - VolumePtr single_disk_volume; MergeTreeData::MutableDataPartPtr new_data_part; - DiskPtr disk; - String new_part_tmp_path; + DataPartStorageBuilderPtr data_part_storage_builder; IMergedBlockOutputStreamPtr out{nullptr}; @@ -784,6 +783,7 @@ public: projection_merging_params, NO_TRANSACTION_PTR, ctx->new_data_part.get(), + ctx->data_part_storage_builder.get(), ".tmp_proj"); next_level_parts.push_back(executeHere(tmp_part_merge_task)); @@ -943,7 +943,7 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections() if (projection_block) { auto tmp_part = MergeTreeDataWriter::writeTempProjectionPart( - *ctx->data, ctx->log, projection_block, projection, ctx->new_data_part.get(), ++block_num); + *ctx->data, ctx->log, projection_block, projection, ctx->data_part_storage_builder, 
ctx->new_data_part.get(), ++block_num); tmp_part.finalize(); projection_parts[projection.name].emplace_back(std::move(tmp_part.part)); } @@ -965,7 +965,7 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections() if (projection_block) { auto temp_part = MergeTreeDataWriter::writeTempProjectionPart( - *ctx->data, ctx->log, projection_block, projection, ctx->new_data_part.get(), ++block_num); + *ctx->data, ctx->log, projection_block, projection, ctx->data_part_storage_builder, ctx->new_data_part.get(), ++block_num); temp_part.finalize(); projection_parts[projection.name].emplace_back(std::move(temp_part.part)); } @@ -1065,7 +1065,7 @@ private: void prepare() { - ctx->disk->createDirectories(ctx->new_part_tmp_path); + ctx->data_part_storage_builder->createDirectories(); /// Note: this is done before creating input streams, because otherwise data.data_parts_mutex /// (which is locked in data.getTotalActiveSizeInBytes()) @@ -1100,6 +1100,7 @@ private: ctx->out = std::make_shared( ctx->new_data_part, + ctx->data_part_storage_builder, ctx->metadata_snapshot, ctx->new_data_part->getColumns(), skip_part_indices, @@ -1192,7 +1193,7 @@ private: if (ctx->execute_ttl_type != ExecuteTTLType::NONE) ctx->files_to_skip.insert("ttl.txt"); - ctx->disk->createDirectories(ctx->new_part_tmp_path); + ctx->data_part_storage_builder->createDirectories(); /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. TransactionID tid = ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID; @@ -1203,12 +1204,12 @@ private: NameSet hardlinked_files; /// Create hardlinks for unchanged files - for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) + for (auto it = ctx->source_part->data_part_storage->iterate(); it->isValid(); it->next()) { if (ctx->files_to_skip.contains(it->name())) continue; - String destination = ctx->new_part_tmp_path; + String destination; String file_name = it->name(); auto rename_it = std::find_if(ctx->files_to_rename.begin(), ctx->files_to_rename.end(), [&file_name](const auto & rename_pair) @@ -1220,29 +1221,31 @@ private: { if (rename_it->second.empty()) continue; - - destination += rename_it->second; + destination = rename_it->second; } else { - destination += it->name(); + destination = it->name(); } - - if (!ctx->disk->isDirectory(it->path())) + if (it->isFile()) { - ctx->disk->createHardLink(it->path(), destination); + ctx->data_part_storage_builder->createHardLinkFrom( + *ctx->source_part->data_part_storage, it->name(), destination); hardlinked_files.insert(it->name()); } - else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir { // it's a projection part directory - ctx->disk->createDirectories(destination); - for (auto p_it = ctx->disk->iterateDirectory(it->path()); p_it->isValid(); p_it->next()) + ctx->data_part_storage_builder->createProjection(destination); + + auto projection_data_part_storage = ctx->source_part->data_part_storage->getProjection(destination); + auto projection_data_part_storage_builder = ctx->data_part_storage_builder->getProjection(destination); + + for (auto p_it = projection_data_part_storage->iterate(); p_it->isValid(); p_it->next()) { - String p_destination = fs::path(destination) / p_it->name(); - ctx->disk->createHardLink(p_it->path(), p_destination); + projection_data_part_storage_builder->createHardLinkFrom( + *projection_data_part_storage, p_it->name(), p_it->name()); hardlinked_files.insert(p_it->name()); } } @@ 
-1272,6 +1275,7 @@ private: builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); ctx->out = std::make_shared( + ctx->data_part_storage_builder, ctx->new_data_part, ctx->metadata_snapshot, ctx->updated_header, @@ -1323,7 +1327,7 @@ private: } } - MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); + MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->data_part_storage_builder, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); } @@ -1458,11 +1462,22 @@ bool MutateTask::prepare() ctx->progress_callback = MergeProgressCallback((*ctx->mutate_entry)->ptr(), ctx->watch_prev_elapsed, *ctx->stage_progress); } - ctx->single_disk_volume = std::make_shared("volume_" + ctx->future_part->name, ctx->space_reservation->getDisk(), 0); + auto single_disk_volume = std::make_shared("volume_" + ctx->future_part->name, ctx->space_reservation->getDisk(), 0); /// FIXME new_data_part is not used in the case when we clone part with cloneAndLoadDataPartOnSameDisk and return false /// Is it possible to handle this case earlier? + + auto data_part_storage = std::make_shared( + single_disk_volume, + ctx->data->getRelativeDataPath(), + "tmp_mut_" + ctx->future_part->name); + + ctx->data_part_storage_builder = std::make_shared( + single_disk_volume, + ctx->data->getRelativeDataPath(), + "tmp_mut_" + ctx->future_part->name); + ctx->new_data_part = ctx->data->createPart( - ctx->future_part->name, ctx->future_part->type, ctx->future_part->part_info, ctx->single_disk_volume, "tmp_mut_" + ctx->future_part->name); + ctx->future_part->name, ctx->future_part->type, ctx->future_part->part_info, data_part_storage); ctx->new_data_part->uuid = ctx->future_part->uuid; ctx->new_data_part->is_temp = true; @@ -1479,9 +1494,6 @@ bool MutateTask::prepare() ctx->new_data_part->setSerializationInfos(new_infos); ctx->new_data_part->partition.assign(ctx->source_part->partition); - ctx->disk = ctx->new_data_part->volume->getDisk(); - ctx->new_part_tmp_path = ctx->new_data_part->getFullRelativePath(); - /// Don't change granularity type while mutating subset of columns ctx->mrk_extension = ctx->source_part->index_granularity_info.is_adaptive ? 
getAdaptiveMrkExtension(ctx->new_data_part->getType()) : getNonAdaptiveMrkExtension(); diff --git a/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp b/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp index 184521cb6cf..da147ff1f0e 100644 --- a/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerOrdinary.cpp @@ -7,10 +7,10 @@ namespace DB { -static std::unique_ptr openForReading(const DiskPtr & disk, const String & path) +static std::unique_ptr openForReading(const DataPartStoragePtr & data_part_storage, const String & path) { - size_t file_size = disk->getFileSize(path); - return disk->readFile(path, ReadSettings().adjustBufferSize(file_size), file_size); + size_t file_size = data_part_storage->getFileSize(path); + return data_part_storage->readFile(path, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); } PartMetadataManagerOrdinary::PartMetadataManagerOrdinary(const IMergeTreeDataPart * part_) : IPartMetadataManager(part_) @@ -20,13 +20,12 @@ PartMetadataManagerOrdinary::PartMetadataManagerOrdinary(const IMergeTreeDataPar std::unique_ptr PartMetadataManagerOrdinary::read(const String & file_name) const { - String file_path = fs::path(part->getFullRelativePath()) / file_name; - return openForReading(disk, file_path); + return openForReading(part->data_part_storage, file_name); } bool PartMetadataManagerOrdinary::exists(const String & file_name) const { - return disk->exists(fs::path(part->getFullRelativePath()) / file_name); + return part->data_part_storage->exists(file_name); } diff --git a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp index 3d68497f5b0..9930aca2576 100644 --- a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp @@ -30,24 +30,24 @@ PartMetadataManagerWithCache::PartMetadataManagerWithCache(const IMergeTreeDataP String PartMetadataManagerWithCache::getKeyFromFilePath(const String & file_path) const { - return disk->getName() + ":" + file_path; + return part->data_part_storage->getDiskName() + ":" + file_path; } String PartMetadataManagerWithCache::getFilePathFromKey(const String & key) const { - return key.substr(disk->getName().size() + 1); + return key.substr(part->data_part_storage->getDiskName().size() + 1); } std::unique_ptr PartMetadataManagerWithCache::read(const String & file_name) const { - String file_path = fs::path(part->getFullRelativePath()) / file_name; + String file_path = fs::path(part->data_part_storage->getRelativePath()) / file_name; String key = getKeyFromFilePath(file_path); String value; auto status = cache->get(key, value); if (!status.ok()) { ProfileEvents::increment(ProfileEvents::MergeTreeMetadataCacheMiss); - auto in = disk->readFile(file_path); + auto in = part->data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt); readStringUntilEOF(value, *in); cache->put(key, value); } @@ -60,7 +60,7 @@ std::unique_ptr PartMetadataManagerWithCache::read(const Str bool PartMetadataManagerWithCache::exists(const String & file_name) const { - String file_path = fs::path(part->getFullRelativePath()) / file_name; + String file_path = fs::path(part->data_part_storage->getRelativePath()) / file_name; String key = getKeyFromFilePath(file_path); String value; auto status = cache->get(key, value); @@ -72,7 +72,7 @@ bool PartMetadataManagerWithCache::exists(const String & file_name) const else { 
ProfileEvents::increment(ProfileEvents::MergeTreeMetadataCacheMiss); - return disk->exists(fs::path(part->getFullRelativePath()) / file_name); + return part->data_part_storage->exists(file_name); } } @@ -84,7 +84,7 @@ void PartMetadataManagerWithCache::deleteAll(bool include_projection) String value; for (const auto & file_name : file_names) { - String file_path = fs::path(part->getFullRelativePath()) / file_name; + String file_path = fs::path(part->data_part_storage->getRelativePath()) / file_name; String key = getKeyFromFilePath(file_path); auto status = cache->del(key); if (!status.ok()) @@ -112,10 +112,10 @@ void PartMetadataManagerWithCache::updateAll(bool include_projection) String read_value; for (const auto & file_name : file_names) { - String file_path = fs::path(part->getFullRelativePath()) / file_name; - if (!disk->exists(file_path)) + String file_path = fs::path(part->data_part_storage->getRelativePath()) / file_name; + if (!part->data_part_storage->exists(file_name)) continue; - auto in = disk->readFile(file_path); + auto in = part->data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt); readStringUntilEOF(value, *in); String key = getKeyFromFilePath(file_path); @@ -152,7 +152,7 @@ void PartMetadataManagerWithCache::assertAllDeleted(bool include_projection) con file_name = fs::path(file_path).filename(); /// Metadata file belongs to current part - if (fs::path(part->getFullRelativePath()) / file_name == file_path) + if (fs::path(part->data_part_storage->getRelativePath()) / file_name == file_path) throw Exception( ErrorCodes::LOGICAL_ERROR, "Data part {} with type {} with meta file {} still in cache", @@ -166,7 +166,7 @@ void PartMetadataManagerWithCache::assertAllDeleted(bool include_projection) con const auto & projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) { - if (fs::path(projection_part->getFullRelativePath()) / file_name == file_path) + if (fs::path(part->data_part_storage->getRelativePath()) / (projection_name + ".proj") / file_name == file_path) { throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -183,7 +183,7 @@ void PartMetadataManagerWithCache::assertAllDeleted(bool include_projection) con void PartMetadataManagerWithCache::getKeysAndCheckSums(Strings & keys, std::vector & checksums) const { - String prefix = getKeyFromFilePath(fs::path(part->getFullRelativePath()) / ""); + String prefix = getKeyFromFilePath(fs::path(part->data_part_storage->getRelativePath()) / ""); Strings values; cache->getByPrefix(prefix, keys, values); size_t size = keys.size(); @@ -217,14 +217,14 @@ std::unordered_map PartMetadataManagerWit results.emplace(file_name, cache_checksums[i]); /// File belongs to normal part - if (fs::path(part->getFullRelativePath()) / file_name == file_path) + if (fs::path(part->data_part_storage->getRelativePath()) / file_name == file_path) { - auto disk_checksum = part->getActualChecksumByFile(file_path); + auto disk_checksum = part->getActualChecksumByFile(file_name); if (disk_checksum != cache_checksums[i]) throw Exception( ErrorCodes::CORRUPTED_DATA, - "Checksums doesn't match in part {}. Expected: {}. Found {}.", - part->name, + "Checksums doesn't match in part {} for {}. Expected: {}. 
Found {}.", + part->name, file_path, getHexUIntUppercase(disk_checksum.first) + getHexUIntUppercase(disk_checksum.second), getHexUIntUppercase(cache_checksums[i].first) + getHexUIntUppercase(cache_checksums[i].second)); @@ -256,7 +256,7 @@ std::unordered_map PartMetadataManagerWit proj_name, part->name, file_path); } - auto disk_checksum = it->second->getActualChecksumByFile(file_path); + auto disk_checksum = it->second->getActualChecksumByFile(file_name); if (disk_checksum != cache_checksums[i]) throw Exception( ErrorCodes::CORRUPTED_DATA, diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.cpp index 6e57fe55878..37d04541dfd 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.cpp @@ -4,7 +4,7 @@ namespace DB { -int ReplicatedMergeTreeAltersSequence::getHeadAlterVersion(std::lock_guard & /*state_lock*/) const +int ReplicatedMergeTreeAltersSequence::getHeadAlterVersion(std::unique_lock & /*state_lock*/) const { /// If queue empty, than we don't have version if (!queue_state.empty()) @@ -66,7 +66,7 @@ void ReplicatedMergeTreeAltersSequence::finishDataAlter(int alter_version, std:: } } -bool ReplicatedMergeTreeAltersSequence::canExecuteDataAlter(int alter_version, std::lock_guard & /*state_lock*/) const +bool ReplicatedMergeTreeAltersSequence::canExecuteDataAlter(int alter_version, std::unique_lock & /*state_lock*/) const { /// Queue maybe empty when we start after server shutdown /// and have some MUTATE_PART records in queue @@ -80,7 +80,7 @@ bool ReplicatedMergeTreeAltersSequence::canExecuteDataAlter(int alter_version, s return queue_state.at(alter_version).metadata_finished; } -bool ReplicatedMergeTreeAltersSequence::canExecuteMetaAlter(int alter_version, std::lock_guard & /*state_lock*/) const +bool ReplicatedMergeTreeAltersSequence::canExecuteMetaAlter(int alter_version, std::unique_lock & /*state_lock*/) const { assert(!queue_state.empty()); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h b/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h index aa58e16a716..c104109bd4c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h @@ -49,13 +49,13 @@ public: void finishDataAlter(int alter_version, std::lock_guard & /*state_lock*/); /// Check that we can execute this data alter. If it's metadata stage finished. - bool canExecuteDataAlter(int alter_version, std::lock_guard & /*state_lock*/) const; + bool canExecuteDataAlter(int alter_version, std::unique_lock & /*state_lock*/) const; /// Check that we can execute metadata alter with version. 
- bool canExecuteMetaAlter(int alter_version, std::lock_guard & /*state_lock*/) const; + bool canExecuteMetaAlter(int alter_version, std::unique_lock & /*state_lock*/) const; /// Just returns smallest alter version in sequence (first entry) - int getHeadAlterVersion(std::lock_guard & /*state_lock*/) const; + int getHeadAlterVersion(std::unique_lock & /*state_lock*/) const; }; } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 9f679f121b8..add1ba875aa 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1066,8 +1066,9 @@ void ReplicatedMergeTreeQueue::removePartProducingOpsInRange( } -bool ReplicatedMergeTreeQueue::isNotCoveredByFuturePartsImpl(const LogEntry & entry, const String & new_part_name, - String & out_reason, std::lock_guard & /* queue_lock */) const +bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry, const String & new_part_name, + String & out_reason, std::unique_lock & /* queue_lock */, + std::vector * covered_entries_to_wait) const { /// Let's check if the same part is now being created by another action. auto entry_for_same_part_it = future_parts.find(new_part_name); @@ -1080,7 +1081,7 @@ bool ReplicatedMergeTreeQueue::isNotCoveredByFuturePartsImpl(const LogEntry & en entry.znode_name, entry.type, entry.new_part_name, another_entry.znode_name, another_entry.type, another_entry.new_part_name); LOG_INFO(log, fmt::runtime(out_reason)); - return false; + return true; /** When the corresponding action is completed, then `isNotCoveredByFuturePart` next time, will succeed, * and queue element will be processed. @@ -1098,24 +1099,50 @@ bool ReplicatedMergeTreeQueue::isNotCoveredByFuturePartsImpl(const LogEntry & en { auto future_part = MergeTreePartInfo::fromPartName(future_part_elem.first, format_version); - if (future_part.contains(result_part)) + if (future_part.isDisjoint(result_part)) + continue; + + /// Parts are not disjoint, so new_part_name either contains or covers future_part. + chassert(future_part.contains(result_part) || result_part.contains(future_part)); + /// We cannot execute `entry` (or upgrade its actual_part_name to `new_part_name`) + /// while any covered or covering parts are processed. + /// But we also cannot simply return true and postpone entry processing, because it may lead to kind of livelock. + /// Since queue is processed in multiple threads, it's likely that there will be at least one thread + /// executing faulty entry for some small part, so bigger covering part will never be processed. + /// That's why it's better to wait for covered entry to be executed (does not matter successfully or not) + /// instead of exiting and postponing covering entry. 
+ + if (covered_entries_to_wait) { - out_reason = fmt::format( - "Not executing log entry {} for part {} " - "because it is covered by part {} that is currently executing.", - entry.znode_name, new_part_name, future_part_elem.first); - LOG_TRACE(log, fmt::runtime(out_reason)); - return false; + if (entry.znode_name < future_part_elem.second->znode_name) + { + out_reason = fmt::format( + "Not executing log entry {} for part {} " + "because it is not disjoint with part {} that is currently executing and another entry {} is newer.", + entry.znode_name, new_part_name, future_part_elem.first, future_part_elem.second->znode_name); + LOG_TRACE(log, fmt::runtime(out_reason)); + return true; + } + + covered_entries_to_wait->push_back(future_part_elem.second); + continue; } + + out_reason = fmt::format( + "Not executing log entry {} for part {} " + "because it is not disjoint with part {} that is currently executing.", + entry.znode_name, new_part_name, future_part_elem.first); + LOG_TRACE(log, fmt::runtime(out_reason)); + return true; } - return true; + return false; } bool ReplicatedMergeTreeQueue::addFuturePartIfNotCoveredByThem(const String & part_name, LogEntry & entry, String & reject_reason) { /// We have found `part_name` on some replica and are going to fetch it instead of covered `entry->new_part_name`. - std::lock_guard lock(state_mutex); + std::unique_lock lock(state_mutex); if (virtual_parts.getContainingPart(part_name).empty()) { @@ -1137,13 +1164,13 @@ bool ReplicatedMergeTreeQueue::addFuturePartIfNotCoveredByThem(const String & pa if (drop_ranges.isAffectedByDropRange(part_name, reject_reason)) return false; - if (isNotCoveredByFuturePartsImpl(entry, part_name, reject_reason, lock)) - { - CurrentlyExecuting::setActualPartName(entry, part_name, *this, lock); - return true; - } + std::vector covered_entries_to_wait; + if (isCoveredByFuturePartsImpl(entry, part_name, reject_reason, lock, &covered_entries_to_wait)) + return false; + + CurrentlyExecuting::setActualPartName(entry, part_name, *this, lock, covered_entries_to_wait); + return true; - return false; } @@ -1152,13 +1179,15 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry( String & out_postpone_reason, MergeTreeDataMergerMutator & merger_mutator, MergeTreeData & data, - std::lock_guard & state_lock) const + std::unique_lock & state_lock) const { /// If our entry produce part which is already covered by /// some other entry which is currently executing, then we can postpone this entry. for (const String & new_part_name : entry.getVirtualPartNames(format_version)) { - if (!isNotCoveredByFuturePartsImpl(entry, new_part_name, out_postpone_reason, state_lock)) + /// Do not wait for any entries here, because we have only one thread that scheduling queue entries. + /// We can wait in worker threads, but not in scheduler. 
+ if (isCoveredByFuturePartsImpl(entry, new_part_name, out_postpone_reason, state_lock, /* covered_entries_to_wait */ nullptr)) return false; } @@ -1409,7 +1438,7 @@ Int64 ReplicatedMergeTreeQueue::getCurrentMutationVersion(const String & partiti ReplicatedMergeTreeQueue::CurrentlyExecuting::CurrentlyExecuting( - const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, std::lock_guard & /* state_lock */) + const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, std::unique_lock & /* state_lock */) : entry(entry_), queue(queue_) { if (entry->type == ReplicatedMergeTreeLogEntry::DROP_RANGE || entry->type == ReplicatedMergeTreeLogEntry::REPLACE_RANGE) @@ -1435,7 +1464,8 @@ void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName( ReplicatedMergeTreeQueue::LogEntry & entry, const String & actual_part_name, ReplicatedMergeTreeQueue & queue, - std::lock_guard & /* state_lock */) + std::unique_lock & state_lock, + std::vector & covered_entries_to_wait) { if (!entry.actual_new_part_name.empty()) throw Exception("Entry actual part isn't empty yet. This is a bug.", ErrorCodes::LOGICAL_ERROR); @@ -1450,6 +1480,15 @@ void ReplicatedMergeTreeQueue::CurrentlyExecuting::setActualPartName( throw Exception(ErrorCodes::LOGICAL_ERROR, "Attaching already existing future part {}. This is a bug. " "It happened on attempt to execute {}: {}", entry.actual_new_part_name, entry.znode_name, entry.toString()); + + for (LogEntryPtr & covered_entry : covered_entries_to_wait) + { + if (&entry == covered_entry.get()) + continue; + LOG_TRACE(queue.log, "Waiting for {} producing {} to finish before executing {} producing not disjoint part {}", + covered_entry->znode_name, covered_entry->new_part_name, entry.znode_name, entry.new_part_name); + covered_entry->execution_complete.wait(state_lock, [&covered_entry] { return !covered_entry->currently_executing; }); + } } @@ -1491,7 +1530,7 @@ ReplicatedMergeTreeQueue::SelectedEntryPtr ReplicatedMergeTreeQueue::selectEntry { LogEntryPtr entry; - std::lock_guard lock(state_mutex); + std::unique_lock lock(state_mutex); for (auto it = queue.begin(); it != queue.end(); ++it) { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index dea4d0573db..a88d9182bbf 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -202,17 +202,18 @@ private: bool shouldExecuteLogEntry( const LogEntry & entry, String & out_postpone_reason, MergeTreeDataMergerMutator & merger_mutator, MergeTreeData & data, - std::lock_guard & state_lock) const; + std::unique_lock & state_lock) const; Int64 getCurrentMutationVersionImpl(const String & partition_id, Int64 data_version, std::lock_guard & /* state_lock */) const; /** Check that part isn't in currently generating parts and isn't covered by them. * Should be called under state_mutex. */ - bool isNotCoveredByFuturePartsImpl( + bool isCoveredByFuturePartsImpl( const LogEntry & entry, const String & new_part_name, String & out_reason, - std::lock_guard & state_lock) const; + std::unique_lock & state_lock, + std::vector * covered_entries_to_wait) const; /// After removing the queue element, update the insertion times in the RAM. Running under state_mutex. /// Returns information about what times have changed - this information can be passed to updateTimesInZooKeeper. 
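The switch from std::lock_guard to std::unique_lock in these queue signatures is what makes the new covered_entries_to_wait logic possible: setActualPartName now blocks on each covered entry's execution_complete condition variable while still holding state_mutex, and std::condition_variable::wait can only temporarily release and re-acquire a std::unique_lock. A minimal, self-contained sketch of that pattern follows; only the member names Entry-like currently_executing and execution_complete mirror identifiers seen in the diff, everything else is assumed for illustration and is not part of the patch.

    #include <condition_variable>
    #include <mutex>
    #include <thread>

    struct Entry
    {
        bool currently_executing = true;
        std::condition_variable execution_complete;
    };

    int main()
    {
        std::mutex state_mutex;
        Entry covered_entry;

        /// Worker thread: finishes the covered entry and wakes up waiters.
        std::thread worker([&]
        {
            {
                std::lock_guard<std::mutex> lock(state_mutex);
                covered_entry.currently_executing = false;
            }
            covered_entry.execution_complete.notify_all();
        });

        /// Waiting side: must hold a std::unique_lock so wait() can release
        /// state_mutex while sleeping; the predicate guards against spurious wake-ups.
        {
            std::unique_lock<std::mutex> state_lock(state_mutex);
            covered_entry.execution_complete.wait(state_lock, [&] { return !covered_entry.currently_executing; });
        }

        worker.join();
        return 0;
    }

With a std::lock_guard the wait() call would not compile, since a lock_guard cannot be unlocked and re-locked mid-scope; that is the practical reason these declarations change type rather than a behavioral change in locking scope.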
@@ -254,14 +255,15 @@ private: CurrentlyExecuting( const ReplicatedMergeTreeQueue::LogEntryPtr & entry_, ReplicatedMergeTreeQueue & queue_, - std::lock_guard & state_lock); + std::unique_lock & state_lock); /// In case of fetch, we determine actual part during the execution, so we need to update entry. It is called under state_mutex. static void setActualPartName( ReplicatedMergeTreeQueue::LogEntry & entry, const String & actual_part_name, ReplicatedMergeTreeQueue & queue, - std::lock_guard & state_lock); + std::unique_lock & state_lock, + std::vector & covered_entries_to_wait); public: ~CurrentlyExecuting(); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index de893d59b05..d217e16c830 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -306,7 +306,7 @@ void ReplicatedMergeTreeSink::commitPart( metadata_snapshot->check(part->getColumns()); assertSessionIsNotExpired(zookeeper); - String temporary_part_relative_path = part->relative_path; + String temporary_part_relative_path = part->data_part_storage->getPartDirectory(); /// There is one case when we need to retry transaction in a loop. /// But don't do it too many times - just as defensive measure. diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 1f68580fa33..d5a838668d2 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -46,8 +46,7 @@ bool isNotEnoughMemoryErrorCode(int code) IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, - const DiskPtr & disk, - const String & full_relative_path, + const DataPartStoragePtr & data_part_storage, const NamesAndTypesList & columns_list, const MergeTreeDataPartType & part_type, const NameSet & files_without_checksums, @@ -62,20 +61,16 @@ IMergeTreeDataPart::Checksums checkDataPart( CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks}; - String path = full_relative_path; - if (!path.empty() && path.back() != '/') - path += "/"; - NamesAndTypesList columns_txt; { - auto buf = disk->readFile(fs::path(path) / "columns.txt"); + auto buf = data_part_storage->readFile("columns.txt", {}, std::nullopt, std::nullopt); columns_txt.readText(*buf); assertEOF(*buf); } if (columns_txt != columns_list) - throw Exception("Columns doesn't match in part " + path + throw Exception("Columns doesn't match in part " + data_part_storage->getFullPath() + ". Expected: " + columns_list.toString() + ". Found: " + columns_txt.toString(), ErrorCodes::CORRUPTED_DATA); @@ -83,9 +78,9 @@ IMergeTreeDataPart::Checksums checkDataPart( IMergeTreeDataPart::Checksums checksums_data; /// This function calculates checksum for both compressed and decompressed contents of compressed file. 
- auto checksum_compressed_file = [](const DiskPtr & disk_, const String & file_path) + auto checksum_compressed_file = [](const DataPartStoragePtr & data_part_storage_, const String & file_path) { - auto file_buf = disk_->readFile(file_path); + auto file_buf = data_part_storage_->readFile(file_path, {}, std::nullopt, std::nullopt); HashingReadBuffer compressed_hashing_buf(*file_buf); CompressedReadBuffer uncompressing_buf(compressed_hashing_buf); HashingReadBuffer uncompressed_hashing_buf(uncompressing_buf); @@ -100,11 +95,10 @@ IMergeTreeDataPart::Checksums checkDataPart( auto ratio_of_defaults = data_part->storage.getSettings()->ratio_of_defaults_for_sparse_serialization; SerializationInfoByName serialization_infos(columns_txt, SerializationInfo::Settings{ratio_of_defaults, false}); - auto serialization_path = path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME; - if (disk->exists(serialization_path)) + if (data_part_storage->exists(IMergeTreeDataPart::SERIALIZATION_FILE_NAME)) { - auto serialization_file = disk->readFile(serialization_path); + auto serialization_file = data_part_storage->readFile(IMergeTreeDataPart::SERIALIZATION_FILE_NAME, {}, std::nullopt, std::nullopt); serialization_infos.readJSON(*serialization_file); } @@ -118,9 +112,9 @@ IMergeTreeDataPart::Checksums checkDataPart( /// This function calculates only checksum of file content (compressed or uncompressed). /// It also calculates checksum of projections. - auto checksum_file = [&](const String & file_path, const String & file_name) + auto checksum_file = [&](const String & file_name) { - if (disk->isDirectory(file_path) && endsWith(file_name, ".proj")) + if (data_part_storage->isDirectory(file_name) && endsWith(file_name, ".proj")) { auto projection_name = file_name.substr(0, file_name.size() - sizeof(".proj") + 1); auto pit = data_part->getProjectionParts().find(projection_name); @@ -134,12 +128,12 @@ IMergeTreeDataPart::Checksums checkDataPart( const auto & projection = pit->second; IMergeTreeDataPart::Checksums projection_checksums_data; - const auto & projection_path = file_path; + + auto projection_part_storage = data_part_storage->getProjection(file_name); if (projection->getType() == MergeTreeDataPartType::Compact) { - auto proj_path = file_path + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION; - auto file_buf = disk->readFile(proj_path); + auto file_buf = projection_part_storage->readFile(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION, {}, std::nullopt, std::nullopt); HashingReadBuffer hashing_buf(*file_buf); hashing_buf.ignoreAll(); projection_checksums_data.files[MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION] @@ -154,22 +148,22 @@ IMergeTreeDataPart::Checksums checkDataPart( [&](const ISerialization::SubstreamPath & substream_path) { String projection_file_name = ISerialization::getFileNameForStream(projection_column, substream_path) + ".bin"; - projection_checksums_data.files[projection_file_name] = checksum_compressed_file(disk, projection_path + projection_file_name); + projection_checksums_data.files[projection_file_name] = checksum_compressed_file(projection_part_storage, projection_file_name); }); } } IMergeTreeDataPart::Checksums projection_checksums_txt; - if (require_checksums || disk->exists(projection_path + "checksums.txt")) + if (require_checksums || projection_part_storage->exists("checksums.txt")) { - auto buf = disk->readFile(projection_path + "checksums.txt"); + auto buf = projection_part_storage->readFile("checksums.txt", {}, std::nullopt, std::nullopt); 
projection_checksums_txt.read(*buf); assertEOF(*buf); } const auto & projection_checksum_files_txt = projection_checksums_txt.files; - for (auto projection_it = disk->iterateDirectory(projection_path); projection_it->isValid(); projection_it->next()) + for (auto projection_it = projection_part_storage->iterate(); projection_it->isValid(); projection_it->next()) { const String & projection_file_name = projection_it->name(); auto projection_checksum_it = projection_checksums_data.files.find(projection_file_name); @@ -181,7 +175,7 @@ IMergeTreeDataPart::Checksums checkDataPart( if (projection_txt_checksum_it == projection_checksum_files_txt.end() || projection_txt_checksum_it->second.uncompressed_size == 0) { - auto projection_file_buf = disk->readFile(projection_it->path()); + auto projection_file_buf = projection_part_storage->readFile(projection_file_name, {}, std::nullopt, std::nullopt); HashingReadBuffer projection_hashing_buf(*projection_file_buf); projection_hashing_buf.ignoreAll(); projection_checksums_data.files[projection_file_name] = IMergeTreeDataPart::Checksums::Checksum( @@ -189,7 +183,7 @@ IMergeTreeDataPart::Checksums checkDataPart( } else { - projection_checksums_data.files[projection_file_name] = checksum_compressed_file(disk, projection_it->path()); + projection_checksums_data.files[projection_file_name] = checksum_compressed_file(projection_part_storage, projection_file_name); } } } @@ -201,7 +195,7 @@ IMergeTreeDataPart::Checksums checkDataPart( } else { - auto file_buf = disk->readFile(file_path); + auto file_buf = data_part_storage->readFile(file_name, {}, std::nullopt, std::nullopt); HashingReadBuffer hashing_buf(*file_buf); hashing_buf.ignoreAll(); checksums_data.files[file_name] = IMergeTreeDataPart::Checksums::Checksum(hashing_buf.count(), hashing_buf.getHash()); @@ -212,8 +206,7 @@ IMergeTreeDataPart::Checksums checkDataPart( /// First calculate checksums for columns data if (part_type == MergeTreeDataPartType::Compact) { - const auto & file_name = MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION; - checksum_file(path + file_name, file_name); + checksum_file(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION); /// Uncompressed checksums in compact parts are computed in a complex way. /// We check only checksum of compressed file. check_uncompressed = false; @@ -225,27 +218,27 @@ IMergeTreeDataPart::Checksums checkDataPart( get_serialization(column)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin"; - checksums_data.files[file_name] = checksum_compressed_file(disk, path + file_name); + checksums_data.files[file_name] = checksum_compressed_file(data_part_storage, file_name); }); } } else { - throw Exception("Unknown type in part " + path, ErrorCodes::UNKNOWN_PART_TYPE); + throw Exception("Unknown type in part " + data_part_storage->getFullPath(), ErrorCodes::UNKNOWN_PART_TYPE); } /// Checksums from the rest files listed in checksums.txt. May be absent. If present, they are subsequently compared with the actual data checksums. 
IMergeTreeDataPart::Checksums checksums_txt; - if (require_checksums || disk->exists(fs::path(path) / "checksums.txt")) + if (require_checksums || data_part_storage->exists("checksums.txt")) { - auto buf = disk->readFile(fs::path(path) / "checksums.txt"); + auto buf = data_part_storage->readFile("checksums.txt", {}, std::nullopt, std::nullopt); checksums_txt.read(*buf); assertEOF(*buf); } const auto & checksum_files_txt = checksums_txt.files; - for (auto it = disk->iterateDirectory(path); it->isValid(); it->next()) + for (auto it = data_part_storage->iterate(); it->isValid(); it->next()) { const String & file_name = it->name(); auto checksum_it = checksums_data.files.find(file_name); @@ -257,11 +250,11 @@ IMergeTreeDataPart::Checksums checkDataPart( if (txt_checksum_it == checksum_files_txt.end() || txt_checksum_it->second.uncompressed_size == 0) { /// The file is not compressed. - checksum_file(it->path(), file_name); + checksum_file(file_name); } else /// If we have both compressed and uncompressed in txt, then calculate them { - checksums_data.files[file_name] = checksum_compressed_file(disk, it->path()); + checksums_data.files[file_name] = checksum_compressed_file(data_part_storage, file_name); } } } @@ -292,8 +285,7 @@ IMergeTreeDataPart::Checksums checkDataPart( return checkDataPart( data_part, - data_part->volume->getDisk(), - data_part->getFullRelativePath(), + data_part->data_part_storage, data_part->getColumns(), data_part->getType(), data_part->getFileNamesWithoutChecksums(), diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 6d8c3b313d4..43e1af21eac 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -348,9 +348,10 @@ static StoragePtr create(const StorageFactory::Arguments & args) { /// Try use default values if arguments are not specified. /// Note: {uuid} macro works for ON CLUSTER queries when database engine is Atomic. - zookeeper_path = args.getContext()->getConfigRef().getString("default_replica_path", "/clickhouse/tables/{uuid}/{shard}"); + const auto & config = args.getContext()->getConfigRef(); + zookeeper_path = StorageReplicatedMergeTree::getDefaultZooKeeperPath(config); /// TODO maybe use hostname if {replica} is not defined? 
- replica_name = args.getContext()->getConfigRef().getString("default_replica_name", "{replica}"); + replica_name = StorageReplicatedMergeTree::getDefaultReplicaName(config); /// Modify query, so default values will be written to metadata assert(arg_num == 0); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index d466096c8ba..2fa6003c0eb 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -208,7 +208,8 @@ std::unique_ptr createReadBuffer( in.setProgressCallback(context); } - return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method); + auto zstd_window_log_max = context->getSettingsRef().zstd_window_log_max; + return wrapReadBufferWithCompressionMethod(std::move(nested_buffer), method, zstd_window_log_max); } } diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 7b18b5fc7c6..1324ebf5b28 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -25,10 +25,11 @@ #include #include +#include #include #include #include -#include +#include #include #include @@ -50,7 +51,6 @@ namespace ErrorCodes extern const int SIZES_OF_MARKS_FILES_ARE_INCONSISTENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INCORRECT_FILE_NAME; - extern const int NOT_IMPLEMENTED; } /// NOTE: The lock `StorageLog::rwlock` is NOT kept locked while reading, @@ -921,12 +921,12 @@ std::optional StorageLog::totalBytes(const Settings &) const return total_bytes; } -BackupEntries StorageLog::backupData(ContextPtr context, const ASTs & partitions) +void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); - auto lock_timeout = getLockTimeout(context); + auto lock_timeout = getLockTimeout(backup_entries_collector.getContext()); loadMarks(lock_timeout); ReadLock lock{rwlock, lock_timeout}; @@ -934,23 +934,22 @@ BackupEntries StorageLog::backupData(ContextPtr context, const ASTs & partitions throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); if (!num_data_files || !file_checker.getFileSize(data_files[INDEX_WITH_REAL_ROW_COUNT].path)) - return {}; + return; + fs::path data_path_in_backup_fs = data_path_in_backup; auto temp_dir_owner = std::make_shared(disk, "tmp/backup_"); - auto temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getPath(); disk->createDirectories(temp_dir); - BackupEntries backup_entries; - /// *.bin for (const auto & data_file : data_files) { /// We make a copy of the data file because it can be changed later in write() or in truncate(). String data_file_name = fileName(data_file.path); - String hardlink_file_path = temp_dir + "/" + data_file_name; + String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file.path, hardlink_file_path); - backup_entries.emplace_back( - data_file_name, + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / data_file_name, std::make_unique( disk, hardlink_file_path, file_checker.getFileSize(data_file.path), std::nullopt, temp_dir_owner)); } @@ -960,151 +959,131 @@ BackupEntries StorageLog::backupData(ContextPtr context, const ASTs & partitions { /// We make a copy of the data file because it can be changed later in write() or in truncate(). 
String marks_file_name = fileName(marks_file_path); - String hardlink_file_path = temp_dir + "/" + marks_file_name; + String hardlink_file_path = temp_dir / marks_file_name; disk->createHardLink(marks_file_path, hardlink_file_path); - backup_entries.emplace_back( - marks_file_name, + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / marks_file_name, std::make_unique( disk, hardlink_file_path, file_checker.getFileSize(marks_file_path), std::nullopt, temp_dir_owner)); } /// sizes.json String files_info_path = file_checker.getPath(); - backup_entries.emplace_back(fileName(files_info_path), std::make_unique(disk, files_info_path)); + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path)); /// columns.txt - backup_entries.emplace_back( - "columns.txt", std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / "columns.txt", + std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); /// count.txt if (use_marks_file) { size_t num_rows = data_files[INDEX_WITH_REAL_ROW_COUNT].marks.empty() ? 0 : data_files[INDEX_WITH_REAL_ROW_COUNT].marks.back().rows; - backup_entries.emplace_back("count.txt", std::make_unique(toString(num_rows))); + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / "count.txt", std::make_unique(toString(num_rows))); } - - return backup_entries; } -class LogRestoreTask : public IRestoreTask +void StorageLog::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) { - using WriteLock = StorageLog::WriteLock; - using Mark = StorageLog::Mark; + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); -public: - LogRestoreTask( - std::shared_ptr storage_, const BackupPtr & backup_, const String & data_path_in_backup_, std::chrono::seconds lock_timeout_) - : storage(storage_), backup(backup_), data_path_in_backup(data_path_in_backup_), lock_timeout(lock_timeout_) - { - } + if (!num_data_files) + return; - RestoreTasks run() override - { - WriteLock lock{storage->rwlock, lock_timeout}; - if (!lock) - throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + auto backup = restorer.getBackup(); + if (!restorer.isNonEmptyTableAllowed() && total_bytes && backup->hasFiles(data_path_in_backup)) + RestorerFromBackup::throwTableIsNotEmpty(getStorageID()); - const auto num_data_files = storage->num_data_files; - if (!num_data_files) - return {}; - - auto & file_checker = storage->file_checker; - - /// Load the marks if not loaded yet. We have to do that now because we're going to update these marks. - storage->loadMarks(lock); - - /// If there were no files, save zero file sizes to be able to rollback in case of error. - storage->saveFileSizes(lock); - - try - { - /// Append data files. - auto & data_files = storage->data_files; - for (const auto & data_file : data_files) - { - String file_path_in_backup = data_path_in_backup + fileName(data_file.path); - auto backup_entry = backup->readFile(file_path_in_backup); - const auto & disk = storage->disk; - auto in = backup_entry->getReadBuffer(); - auto out = disk->writeFile(data_file.path, storage->max_compress_block_size, WriteMode::Append); - copyData(*in, *out); - } - - const bool use_marks_file = storage->use_marks_file; - if (use_marks_file) - { - /// Append marks. 
- size_t num_extra_marks = 0; - const auto & marks_file_path = storage->marks_file_path; - String file_path_in_backup = data_path_in_backup + fileName(marks_file_path); - size_t file_size = backup->getFileSize(file_path_in_backup); - if (file_size % (num_data_files * sizeof(Mark)) != 0) - throw Exception("Size of marks file is inconsistent", ErrorCodes::SIZES_OF_MARKS_FILES_ARE_INCONSISTENT); - - num_extra_marks = file_size / (num_data_files * sizeof(Mark)); - - size_t num_marks = data_files[0].marks.size(); - for (auto & data_file : data_files) - data_file.marks.reserve(num_marks + num_extra_marks); - - std::vector old_data_sizes; - std::vector old_num_rows; - old_data_sizes.resize(num_data_files); - old_num_rows.resize(num_data_files); - for (size_t i = 0; i != num_data_files; ++i) - { - old_data_sizes[i] = file_checker.getFileSize(data_files[i].path); - old_num_rows[i] = num_marks ? data_files[i].marks[num_marks - 1].rows : 0; - } - - auto backup_entry = backup->readFile(file_path_in_backup); - auto marks_rb = backup_entry->getReadBuffer(); - - for (size_t i = 0; i != num_extra_marks; ++i) - { - for (size_t j = 0; j != num_data_files; ++j) - { - Mark mark; - mark.read(*marks_rb); - mark.rows += old_num_rows[j]; /// Adjust the number of rows. - mark.offset += old_data_sizes[j]; /// Adjust the offset. - data_files[j].marks.push_back(mark); - } - } - } - - /// Finish writing. - storage->saveMarks(lock); - storage->saveFileSizes(lock); - storage->updateTotalRows(lock); - } - catch (...) - { - /// Rollback partial writes. - file_checker.repair(); - storage->removeUnsavedMarks(lock); - throw; - } - - return {}; - } - -private: - std::shared_ptr storage; - BackupPtr backup; - String data_path_in_backup; - std::chrono::seconds lock_timeout; -}; - -RestoreTaskPtr StorageLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) -{ - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); - - return std::make_unique( - typeid_cast>(shared_from_this()), backup, data_path_in_backup, getLockTimeout(context)); + auto lock_timeout = getLockTimeout(restorer.getContext()); + restorer.addDataRestoreTask( + [storage = std::static_pointer_cast(shared_from_this()), backup, data_path_in_backup, lock_timeout] + { storage->restoreDataImpl(backup, data_path_in_backup, lock_timeout); }); } +void StorageLog::restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup, std::chrono::seconds lock_timeout) +{ + WriteLock lock{rwlock, lock_timeout}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + /// Load the marks if not loaded yet. We have to do that now because we're going to update these marks. + loadMarks(lock); + + /// If there were no files, save zero file sizes to be able to rollback in case of error. + saveFileSizes(lock); + + try + { + fs::path data_path_in_backup_fs = data_path_in_backup; + + /// Append data files. + for (const auto & data_file : data_files) + { + String file_path_in_backup = data_path_in_backup_fs / fileName(data_file.path); + auto backup_entry = backup->readFile(file_path_in_backup); + auto in = backup_entry->getReadBuffer(); + auto out = disk->writeFile(data_file.path, max_compress_block_size, WriteMode::Append); + copyData(*in, *out); + } + + if (use_marks_file) + { + /// Append marks. 
+ size_t num_extra_marks = 0; + String file_path_in_backup = data_path_in_backup_fs / fileName(marks_file_path); + size_t file_size = backup->getFileSize(file_path_in_backup); + if (file_size % (num_data_files * sizeof(Mark)) != 0) + throw Exception("Size of marks file is inconsistent", ErrorCodes::SIZES_OF_MARKS_FILES_ARE_INCONSISTENT); + + num_extra_marks = file_size / (num_data_files * sizeof(Mark)); + + size_t num_marks = data_files[0].marks.size(); + for (auto & data_file : data_files) + data_file.marks.reserve(num_marks + num_extra_marks); + + std::vector old_data_sizes; + std::vector old_num_rows; + old_data_sizes.resize(num_data_files); + old_num_rows.resize(num_data_files); + for (size_t i = 0; i != num_data_files; ++i) + { + old_data_sizes[i] = file_checker.getFileSize(data_files[i].path); + old_num_rows[i] = num_marks ? data_files[i].marks[num_marks - 1].rows : 0; + } + + auto backup_entry = backup->readFile(file_path_in_backup); + auto marks_rb = backup_entry->getReadBuffer(); + + for (size_t i = 0; i != num_extra_marks; ++i) + { + for (size_t j = 0; j != num_data_files; ++j) + { + Mark mark; + mark.read(*marks_rb); + mark.rows += old_num_rows[j]; /// Adjust the number of rows. + mark.offset += old_data_sizes[j]; /// Adjust the offset. + data_files[j].marks.push_back(mark); + } + } + } + + /// Finish writing. + saveMarks(lock); + saveFileSizes(lock); + updateTotalRows(lock); + } + catch (...) + { + /// Rollback partial writes. + file_checker.repair(); + removeUnsavedMarks(lock); + throw; + } +} void registerStorageLog(StorageFactory & factory) { diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index a67915cca52..778633440a4 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -12,6 +12,9 @@ namespace DB { +class IBackup; +using BackupPtr = std::shared_ptr; + /** Implements Log - a simple table engine without support of indices. * The data is stored in a compressed form. * @@ -22,7 +25,6 @@ class StorageLog final : public IStorage { friend class LogSource; friend class LogSink; - friend class LogRestoreTask; public: /** Attach the table with the appropriate name, along the appropriate path (with / at the end), @@ -68,9 +70,8 @@ public: std::optional totalRows(const Settings & settings) const override; std::optional totalBytes(const Settings & settings) const override; - bool hasDataToBackup() const override { return true; } - BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; private: using ReadLock = std::shared_lock; @@ -97,6 +98,9 @@ private: /// Recalculates the number of rows stored in this table. void updateTotalRows(const WriteLock &); + /// Restores the data of this table from backup. + void restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup, std::chrono::seconds lock_timeout); + /** Offsets to some row number in a file for column in table. * They are needed so that you can read the data in several threads. 
*/ diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index b32f77d825b..d0685c263f8 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -25,8 +25,7 @@ #include #include -#include -#include +#include namespace DB { @@ -409,18 +408,17 @@ Strings StorageMaterializedView::getDataPaths() const return {}; } -BackupEntries StorageMaterializedView::backupData(ContextPtr context_, const ASTs & partitions_) +void StorageMaterializedView::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { - if (!hasInnerTable()) - return {}; - return getTargetTable()->backupData(context_, partitions_); + /// We backup the target table's data only if it's inner. + if (hasInnerTable()) + getTargetTable()->backupData(backup_entries_collector, data_path_in_backup, partitions); } -RestoreTaskPtr StorageMaterializedView::restoreData(ContextMutablePtr context_, const ASTs & partitions_, const BackupPtr & backup_, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_, const std::shared_ptr & restore_coordination_) +void StorageMaterializedView::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) { - if (!hasInnerTable()) - return {}; - return getTargetTable()->restoreData(context_, partitions_, backup_, data_path_in_backup_, restore_settings_, restore_coordination_); + if (hasInnerTable()) + return getTargetTable()->restoreDataFromBackup(restorer, data_path_in_backup, partitions); } std::optional StorageMaterializedView::totalRows(const Settings & settings) const diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index e7c01297f67..8aec0313ecb 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -95,9 +95,8 @@ public: Strings getDataPaths() const override; - bool hasDataToBackup() const override { return hasInnerTable(); } - BackupEntries backupData(ContextPtr context_, const ASTs & partitions_) override; - RestoreTaskPtr restoreData(ContextMutablePtr context_, const ASTs & partitions_, const BackupPtr & backup, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_, const std::shared_ptr & restore_coordination_) override; + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; std::optional totalRows(const Settings & settings) const override; std::optional totalBytes(const Settings & settings) const override; diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 1e032f78635..5de8c3bda43 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -23,9 +23,10 @@ #include #include #include +#include #include #include -#include +#include #include #include #include @@ -37,7 +38,6 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int NOT_IMPLEMENTED; } @@ -379,187 +379,185 @@ void StorageMemory::truncate( } -class MemoryBackupEntriesBatch : public IBackupEntriesBatch, boost::noncopyable +namespace { -public: - MemoryBackupEntriesBatch( - const StorageMetadataPtr & metadata_snapshot_, const 
std::shared_ptr blocks_, UInt64 max_compress_block_size_) - : IBackupEntriesBatch({"data.bin", "index.mrk", "sizes.json"}) - , metadata_snapshot(metadata_snapshot_) - , blocks(blocks_) - , max_compress_block_size(max_compress_block_size_) + class MemoryBackupEntriesBatch : public IBackupEntriesBatch, boost::noncopyable { - } - -private: - static constexpr const size_t kDataBinPos = 0; - static constexpr const size_t kIndexMrkPos = 1; - static constexpr const size_t kSizesJsonPos = 2; - static constexpr const size_t kSize = 3; - - void initialize() - { - std::call_once(initialized_flag, [this]() + public: + MemoryBackupEntriesBatch( + const StorageMetadataPtr & metadata_snapshot_, + const std::shared_ptr blocks_, + const String & data_path_in_backup, + UInt64 max_compress_block_size_) + : IBackupEntriesBatch( + {fs::path{data_path_in_backup} / "data.bin", + fs::path{data_path_in_backup} / "index.mrk", + fs::path{data_path_in_backup} / "sizes.json"}) + , metadata_snapshot(metadata_snapshot_) + , blocks(blocks_) + , max_compress_block_size(max_compress_block_size_) { - temp_dir_owner.emplace(); - auto temp_dir = temp_dir_owner->path(); - fs::create_directories(temp_dir); + } - /// Writing data.bin - constexpr char data_file_name[] = "data.bin"; - String data_file_path = temp_dir + "/" + data_file_name; - IndexForNativeFormat index; + private: + static constexpr const size_t kDataBinPos = 0; + static constexpr const size_t kIndexMrkPos = 1; + static constexpr const size_t kSizesJsonPos = 2; + static constexpr const size_t kSize = 3; + + void initialize() + { + std::call_once(initialized_flag, [this]() { - auto data_out_compressed = std::make_unique(data_file_path); - CompressedWriteBuffer data_out{*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size}; - NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; - for (const auto & block : *blocks) - block_out.write(block); - } + temp_dir_owner.emplace(); + fs::path temp_dir = temp_dir_owner->path(); + fs::create_directories(temp_dir); - /// Writing index.mrk - constexpr char index_file_name[] = "index.mrk"; - String index_file_path = temp_dir + "/" + index_file_name; - { - auto index_out_compressed = std::make_unique(index_file_path); - CompressedWriteBuffer index_out{*index_out_compressed}; - index.write(index_out); - } + /// Writing data.bin + constexpr char data_file_name[] = "data.bin"; + auto data_file_path = temp_dir / data_file_name; + IndexForNativeFormat index; + { + auto data_out_compressed = std::make_unique(data_file_path); + CompressedWriteBuffer data_out{*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size}; + NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; + for (const auto & block : *blocks) + block_out.write(block); + } - /// Writing sizes.json - constexpr char sizes_file_name[] = "sizes.json"; - String sizes_file_path = temp_dir + "/" + sizes_file_name; - FileChecker file_checker{sizes_file_path}; - file_checker.update(data_file_path); - file_checker.update(index_file_path); - file_checker.save(); + /// Writing index.mrk + constexpr char index_file_name[] = "index.mrk"; + auto index_file_path = temp_dir / index_file_name; + { + auto index_out_compressed = std::make_unique(index_file_path); + CompressedWriteBuffer index_out{*index_out_compressed}; + index.write(index_out); + } - file_paths[kDataBinPos] = data_file_path; - file_sizes[kDataBinPos] = 
file_checker.getFileSize(data_file_path); + /// Writing sizes.json + constexpr char sizes_file_name[] = "sizes.json"; + auto sizes_file_path = temp_dir / sizes_file_name; + FileChecker file_checker{sizes_file_path}; + file_checker.update(data_file_path); + file_checker.update(index_file_path); + file_checker.save(); - file_paths[kIndexMrkPos] = index_file_path; - file_sizes[kIndexMrkPos] = file_checker.getFileSize(index_file_path); + file_paths[kDataBinPos] = data_file_path; + file_sizes[kDataBinPos] = file_checker.getFileSize(data_file_path); - file_paths[kSizesJsonPos] = sizes_file_path; - file_sizes[kSizesJsonPos] = fs::file_size(sizes_file_path); + file_paths[kIndexMrkPos] = index_file_path; + file_sizes[kIndexMrkPos] = file_checker.getFileSize(index_file_path); - /// We don't need to keep `blocks` any longer. - blocks.reset(); - metadata_snapshot.reset(); - }); - } + file_paths[kSizesJsonPos] = sizes_file_path; + file_sizes[kSizesJsonPos] = fs::file_size(sizes_file_path); - std::unique_ptr getReadBuffer(size_t index) override - { - initialize(); - return createReadBufferFromFileBase(file_paths[index], {}); - } + /// We don't need to keep `blocks` any longer. + blocks.reset(); + metadata_snapshot.reset(); + }); + } - UInt64 getSize(size_t index) override - { - initialize(); - return file_sizes[index]; - } + std::unique_ptr getReadBuffer(size_t index) override + { + initialize(); + return createReadBufferFromFileBase(file_paths[index], {}); + } - StorageMetadataPtr metadata_snapshot; - std::shared_ptr blocks; - UInt64 max_compress_block_size; - std::once_flag initialized_flag; - std::optional temp_dir_owner; - std::array file_paths; - std::array file_sizes; -}; + UInt64 getSize(size_t index) override + { + initialize(); + return file_sizes[index]; + } - -BackupEntries StorageMemory::backupData(ContextPtr context, const ASTs & partitions) -{ - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); - - return std::make_shared(getInMemoryMetadataPtr(), data.get(), context->getSettingsRef().max_compress_block_size) - ->getBackupEntries(); + StorageMetadataPtr metadata_snapshot; + std::shared_ptr blocks; + UInt64 max_compress_block_size; + std::once_flag initialized_flag; + std::optional temp_dir_owner; + std::array file_paths; + std::array file_sizes; + }; } - -class MemoryRestoreTask : public IRestoreTask +void StorageMemory::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { -public: - MemoryRestoreTask( - std::shared_ptr storage_, const BackupPtr & backup_, const String & data_path_in_backup_) - : storage(storage_), backup(backup_), data_path_in_backup(data_path_in_backup_) + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + auto max_compress_block_size = backup_entries_collector.getContext()->getSettingsRef().max_compress_block_size; + backup_entries_collector.addBackupEntries( + std::make_shared(getInMemoryMetadataPtr(), data.get(), data_path_in_backup, max_compress_block_size) + ->getBackupEntries()); +} + +void StorageMemory::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto backup = restorer.getBackup(); + if (!restorer.isNonEmptyTableAllowed() && total_size_bytes && 
backup->hasFiles(data_path_in_backup)) + RestorerFromBackup::throwTableIsNotEmpty(getStorageID()); + + restorer.addDataRestoreTask( + [storage = std::static_pointer_cast(shared_from_this()), backup, data_path_in_backup] + { storage->restoreDataImpl(backup, data_path_in_backup); }); +} + +void StorageMemory::restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup) +{ + /// Our data are in the StripeLog format. + + fs::path data_path_in_backup_fs = data_path_in_backup; + + /// Reading index.mrk + IndexForNativeFormat index; { + String index_file_path = data_path_in_backup_fs / "index.mrk"; + auto backup_entry = backup->readFile(index_file_path); + auto in = backup_entry->getReadBuffer(); + CompressedReadBuffer compressed_in{*in}; + index.read(compressed_in); } - RestoreTasks run() override + /// Reading data.bin + Blocks new_blocks; + size_t new_bytes = 0; + size_t new_rows = 0; { - /// Our data are in the StripeLog format. - - /// Reading index.mrk - IndexForNativeFormat index; + String data_file_path = data_path_in_backup_fs / "data.bin"; + auto backup_entry = backup->readFile(data_file_path); + std::unique_ptr in = backup_entry->getReadBuffer(); + std::optional temp_data_copy; + if (!dynamic_cast(in.get())) { - String index_file_path = data_path_in_backup + "index.mrk"; - auto backup_entry = backup->readFile(index_file_path); - auto in = backup_entry->getReadBuffer(); - CompressedReadBuffer compressed_in{*in}; - index.read(compressed_in); + temp_data_copy.emplace(); + auto temp_data_copy_out = std::make_unique(temp_data_copy->path()); + copyData(*in, *temp_data_copy_out); + temp_data_copy_out.reset(); + in = createReadBufferFromFileBase(temp_data_copy->path(), {}); } + std::unique_ptr in_from_file{static_cast(in.release())}; + CompressedReadBufferFromFile compressed_in{std::move(in_from_file)}; + NativeReader block_in{compressed_in, 0, index.blocks.begin(), index.blocks.end()}; - /// Reading data.bin - Blocks new_blocks; - size_t new_bytes = 0; - size_t new_rows = 0; + while (auto block = block_in.read()) { - String data_file_path = data_path_in_backup + "data.bin"; - auto backup_entry = backup->readFile(data_file_path); - std::unique_ptr in = backup_entry->getReadBuffer(); - std::optional temp_data_copy; - if (!dynamic_cast(in.get())) - { - temp_data_copy.emplace(); - auto temp_data_copy_out = std::make_unique(temp_data_copy->path()); - copyData(*in, *temp_data_copy_out); - temp_data_copy_out.reset(); - in = createReadBufferFromFileBase(temp_data_copy->path(), {}); - } - std::unique_ptr in_from_file{static_cast(in.release())}; - CompressedReadBufferFromFile compressed_in{std::move(in_from_file)}; - NativeReader block_in{compressed_in, 0, index.blocks.begin(), index.blocks.end()}; - - while (auto block = block_in.read()) - { - new_bytes += block.bytes(); - new_rows += block.rows(); - new_blocks.push_back(std::move(block)); - } + new_bytes += block.bytes(); + new_rows += block.rows(); + new_blocks.push_back(std::move(block)); } - - /// Append old blocks with the new ones. - auto old_blocks = storage->data.get(); - Blocks old_and_new_blocks = *old_blocks; - old_and_new_blocks.insert(old_and_new_blocks.end(), std::make_move_iterator(new_blocks.begin()), std::make_move_iterator(new_blocks.end())); - - /// Finish restoring. 
- storage->data.set(std::make_unique(std::move(old_and_new_blocks))); - storage->total_size_bytes += new_bytes; - storage->total_size_rows += new_rows; - - return {}; } -private: - std::shared_ptr storage; - BackupPtr backup; - String data_path_in_backup; -}; + /// Append old blocks with the new ones. + auto old_blocks = data.get(); + Blocks old_and_new_blocks = *old_blocks; + old_and_new_blocks.insert(old_and_new_blocks.end(), std::make_move_iterator(new_blocks.begin()), std::make_move_iterator(new_blocks.end())); - -RestoreTaskPtr StorageMemory::restoreData(ContextMutablePtr, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) -{ - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); - - return std::make_unique( - typeid_cast>(shared_from_this()), backup, data_path_in_backup); + /// Finish restoring. + data.set(std::make_unique(std::move(old_and_new_blocks))); + total_size_bytes += new_bytes; + total_size_rows += new_rows; } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 4f0cf03b759..91dadcb884e 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -11,6 +11,8 @@ namespace DB { +class IBackup; +using BackupPtr = std::shared_ptr; /** Implements storage in the RAM. * Suitable for temporary data. @@ -20,7 +22,6 @@ namespace DB class StorageMemory final : public IStorage { friend class MemorySink; -friend class MemoryRestoreTask; public: StorageMemory( @@ -70,9 +71,8 @@ public: void truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) override; - bool hasDataToBackup() const override { return true; } - BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; std::optional totalRows(const Settings &) const override; std::optional totalBytes(const Settings &) const override; @@ -115,6 +115,9 @@ public: void delayReadForGlobalSubqueries() { delay_read_for_global_subqueries = true; } private: + /// Restores the data of this table from backup. + void restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup); + /// MultiVersion data storage, so that we can copy the vector of blocks to readers. 
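// StorageMemory::restoreDataImpl above appends to this member with a copy-and-swap: take a
// snapshot, copy the blocks, add the restored ones, publish a new version. A minimal
// copy-on-write sketch of that pattern (a simplified stand-in, not ClickHouse's MultiVersion):
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

template <typename T>
class SimpleMultiVersion
{
public:
    /// Readers take a cheap snapshot; they keep seeing it even if a new version is published.
    std::shared_ptr<const T> get() const
    {
        std::lock_guard lock(mutex);
        return current;
    }

    /// Writers publish a whole new version; existing snapshots stay valid.
    void set(std::unique_ptr<const T> new_version)
    {
        std::shared_ptr<const T> shared = std::move(new_version);
        std::lock_guard lock(mutex);
        current = std::move(shared);
    }

private:
    mutable std::mutex mutex;
    std::shared_ptr<const T> current = std::make_shared<T>();
};

int main()
{
    using Blocks = std::vector<int>;  /// stand-in for a vector of Block objects
    SimpleMultiVersion<Blocks> data;

    /// Restore-style append: copy the old blocks, add the new ones, publish the result.
    auto old_blocks = data.get();
    Blocks old_and_new_blocks = *old_blocks;
    old_and_new_blocks.insert(old_and_new_blocks.end(), {1, 2, 3});
    data.set(std::make_unique<const Blocks>(std::move(old_and_new_blocks)));

    std::cout << "blocks after restore: " << data.get()->size() << '\n';  /// 3
}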
MultiVersion data; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index b841c72fdbc..4cebd95bab9 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -231,6 +230,12 @@ void StorageMergeTree::read( if (auto plan = reader.read( column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, nullptr, enable_parallel_reading)) query_plan = std::move(*plan); + + /// Now, copy of parts that is required for the query, stored in the processors, + /// while snapshot_data.parts includes all parts, even one that had been filtered out with partition pruning, + /// reset them to avoid holding them. + auto & snapshot_data = assert_cast(*storage_snapshot->data); + snapshot_data.parts = {}; } std::optional StorageMergeTree::totalRows(const Settings &) const @@ -279,8 +284,9 @@ void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, Cont /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); - auto parts_to_remove = getVisibleDataPartsVector(local_context); - removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true); + auto data_parts_lock = lockParts(); + auto parts_to_remove = getVisibleDataPartsVectorUnlocked(local_context, data_parts_lock); + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true, data_parts_lock); LOG_INFO(log, "Removed {} parts.", parts_to_remove.size()); } @@ -364,7 +370,7 @@ CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger( /// if we mutate part, than we should reserve space on the same disk, because mutations possible can create hardlinks if (is_mutation) - reserved_space = storage.tryReserveSpace(total_size, future_part->parts[0]->volume); + reserved_space = storage.tryReserveSpace(total_size, future_part->parts[0]->data_part_storage); else { IMergeTreeDataPart::TTLInfos ttl_infos; @@ -372,7 +378,7 @@ CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger( for (auto & part_ptr : future_part->parts) { ttl_infos.update(part_ptr->ttl_infos); - max_volume_index = std::max(max_volume_index, storage.getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk())); + max_volume_index = std::max(max_volume_index, part_ptr->data_part_storage->getVolumeIndex(*storage.getStoragePolicy())); } reserved_space = storage.balancedReservation( @@ -1468,16 +1474,17 @@ void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont /// Asks to complete merges and does not allow them to start. /// This protects against "revival" of data for a removed partition after completion of merge. 
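// Both truncate() above and dropPartition() just below now take the parts lock once and pass it
// to the *Unlocked selection helpers and to removePartsFromWorkingSet, so the set of parts cannot
// change between selecting them and removing them. A generic sketch of that lock-passing pattern
// (hypothetical names, not the real MergeTree interfaces):
#include <cassert>
#include <mutex>
#include <string>
#include <vector>

class PartsHolder
{
public:
    using DataPartsLock = std::unique_lock<std::mutex>;

    DataPartsLock lockParts() { return DataPartsLock(parts_mutex); }

    /// The caller proves it already holds the lock by passing the holder in.
    std::vector<std::string> getVisiblePartsUnlocked(const DataPartsLock & lock) const
    {
        assert(lock.owns_lock());
        return parts;
    }

    void removePartsFromWorkingSet(const std::vector<std::string> & to_remove, const DataPartsLock & lock)
    {
        assert(lock.owns_lock());
        for (const auto & name : to_remove)
            std::erase(parts, name);
    }

    void dropAllParts()
    {
        auto lock = lockParts();                         /// one lock for the whole operation
        auto to_remove = getVisiblePartsUnlocked(lock);  /// selection ...
        removePartsFromWorkingSet(to_remove, lock);      /// ... and removal see the same state
    }

private:
    std::mutex parts_mutex;
    std::vector<std::string> parts = {"all_1_1_0", "all_2_2_0"};
};

int main()
{
    PartsHolder holder;
    holder.dropAllParts();
}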
auto merge_blocker = stopMergesAndWait(); + auto data_parts_lock = lockParts(); const auto * partition_ast = partition->as(); if (partition_ast && partition_ast->all) - parts_to_remove = getVisibleDataPartsVector(local_context); + parts_to_remove = getVisibleDataPartsVectorUnlocked(local_context, data_parts_lock); else { - String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getVisibleDataPartsVectorInPartition(local_context, partition_id); + String partition_id = getPartitionIDFromQuery(partition, local_context, &data_parts_lock); + parts_to_remove = getVisibleDataPartsVectorInPartition(local_context, partition_id, data_parts_lock); } /// TODO should we throw an exception if parts_to_remove is empty? - removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true); + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true, data_parts_lock); } dropPartsImpl(std::move(parts_to_remove), detach); @@ -1493,7 +1500,7 @@ void StorageMergeTree::dropPartsImpl(DataPartsVector && parts_to_remove, bool de /// NOTE: no race with background cleanup until we hold pointers to parts for (const auto & part : parts_to_remove) { - LOG_INFO(log, "Detaching {}", part->relative_path); + LOG_INFO(log, "Detaching {}", part->data_part_storage->getPartDirectory()); part->makeCloneInDetached("", metadata_snapshot); } } @@ -1736,29 +1743,23 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ for (auto & part : data_parts) { - auto disk = part->volume->getDisk(); - String part_path = part->getFullRelativePath(); /// If the checksums file is not present, calculate the checksums and write them to disk. - String checksums_path = fs::path(part_path) / "checksums.txt"; - String tmp_checksums_path = fs::path(part_path) / "checksums.txt.tmp"; - if (part->isStoredOnDisk() && !disk->exists(checksums_path)) + String checksums_path = "checksums.txt"; + String tmp_checksums_path = "checksums.txt.tmp"; + if (part->isStoredOnDisk() && !part->data_part_storage->exists(checksums_path)) { try { auto calculated_checksums = checkDataPart(part, false); calculated_checksums.checkEqual(part->checksums, true); - auto out = disk->writeFile(tmp_checksums_path, 4096, WriteMode::Rewrite, local_context->getWriteSettings()); - part->checksums.write(*out); - disk->moveFile(tmp_checksums_path, checksums_path); + + part->data_part_storage->writeChecksums(part->checksums, local_context->getWriteSettings()); part->checkMetadata(); results.emplace_back(part->name, true, "Checksums recounted and written to disk."); } catch (const Exception & ex) { - if (disk->exists(tmp_checksums_path)) - disk->removeFile(tmp_checksums_path); - results.emplace_back(part->name, false, "Check of part finished with error: '" + ex.message() + "'"); } @@ -1781,9 +1782,10 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ } -RestoreTaskPtr StorageMergeTree::restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) +void StorageMergeTree::attachRestoredParts(MutableDataPartsVector && parts) { - return restoreDataParts(getPartitionIDsFromQuery(partitions, local_context), backup, data_path_in_backup, &increment); + for (auto part : parts) + renameTempPartAndAdd(part, NO_TRANSACTION_RAW, &increment); } diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h 
index 18c4c9d3533..ccfe03d012a 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -105,8 +105,6 @@ public: CheckResults checkData(const ASTPtr & query, ContextPtr context) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; - bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; MergeTreeDeduplicationLog * getDeduplicationLog() { return deduplication_log.get(); } @@ -255,6 +253,9 @@ private: void startBackgroundMovesIfNeeded() override; + /// Attaches restored parts to the storage. + void attachRestoredParts(MutableDataPartsVector && parts) override; + std::unique_ptr getDefaultSettings() const override; friend class MergeTreeSink; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 02b37225209..e93399918ef 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -71,12 +72,12 @@ #include #include +#include #include +#include #include -#include #include -#include -#include +#include #include @@ -478,6 +479,18 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } +String StorageReplicatedMergeTree::getDefaultZooKeeperPath(const Poco::Util::AbstractConfiguration & config) +{ + return config.getString("default_replica_path", "/clickhouse/tables/{uuid}/{shard}"); +} + + +String StorageReplicatedMergeTree::getDefaultReplicaName(const Poco::Util::AbstractConfiguration & config) +{ + return config.getString("default_replica_name", "{replica}"); +} + + bool StorageReplicatedMergeTree::checkFixedGranularityInZookeeper() { auto zookeeper = getZooKeeper(); @@ -1566,12 +1579,16 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFo continue; const String part_old_name = part_info->getPartName(); - const String part_path = fs::path("detached") / part_old_name; const VolumePtr volume = std::make_shared("volume_" + part_old_name, disk); + auto data_part_storage = std::make_shared( + volume, + fs::path(relative_data_path) / "detached", + part_old_name); + /// actual_part_info is more recent than part_info so we use it - MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, volume, part_path); + MergeTreeData::MutableDataPartPtr part = createPart(part_new_name, actual_part_info, data_part_storage); try { @@ -1586,7 +1603,7 @@ MergeTreeData::MutableDataPartPtr StorageReplicatedMergeTree::attachPartHelperFo if (entry.part_checksum == part->checksums.getTotalChecksumHex()) { - part->modification_time = disk->getLastModified(part->getFullRelativePath()).epochTime(); + part->modification_time = data_part_storage->getLastModified().epochTime(); return part; } } @@ -1884,7 +1901,7 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che } -bool StorageReplicatedMergeTree::executeFetchShared( +DataPartStoragePtr StorageReplicatedMergeTree::executeFetchShared( const String & source_replica, const String & new_part_name, const DiskPtr & disk, @@ -1893,7 +1910,7 @@ bool StorageReplicatedMergeTree::executeFetchShared( if (source_replica.empty()) { LOG_INFO(log, "No active replica has part {} on shared storage.", new_part_name); - return false; + return nullptr; } const auto storage_settings_ptr = 
getSettings(); @@ -1901,8 +1918,7 @@ bool StorageReplicatedMergeTree::executeFetchShared( try { - if (!fetchExistsPart(new_part_name, metadata_snapshot, fs::path(zookeeper_path) / "replicas" / source_replica, disk, path)) - return false; + return fetchExistsPart(new_part_name, metadata_snapshot, fs::path(zookeeper_path) / "replicas" / source_replica, disk, path); } catch (Exception & e) { @@ -1911,8 +1927,6 @@ bool StorageReplicatedMergeTree::executeFetchShared( tryLogCurrentException(log, __PRETTY_FUNCTION__); throw; } - - return true; } @@ -1953,7 +1967,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) /// If DETACH clone parts to detached/ directory for (const auto & part : parts_to_remove) { - LOG_INFO(log, "Detaching {}", part->relative_path); + LOG_INFO(log, "Detaching {}", part->data_part_storage->getPartDirectory()); part->makeCloneInDetached("", metadata_snapshot); } } @@ -2621,7 +2635,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo for (const auto & part : parts_to_remove_from_working_set) { - LOG_INFO(log, "Detaching {}", part->relative_path); + LOG_INFO(log, "Detaching {}", part->data_part_storage->getPartDirectory()); part->makeCloneInDetached("clone", metadata_snapshot); } } @@ -4140,7 +4154,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora } -bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, +DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, const String & source_replica_path, DiskPtr replaced_disk, String replaced_part_path) { auto zookeeper = getZooKeeper(); @@ -4151,7 +4165,7 @@ bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const LOG_DEBUG(log, "Part {} should be deleted after previous attempt before fetch", part->name); /// Force immediate parts cleanup to delete the part that was left from the previous fetch attempt. cleanup_thread.wakeup(); - return false; + return nullptr; } { @@ -4159,7 +4173,7 @@ bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const if (!currently_fetching_parts.insert(part_name).second) { LOG_DEBUG(log, "Part {} is already fetching right now", part_name); - return false; + return nullptr; } } @@ -4211,17 +4225,18 @@ bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const { part = get_part(); - if (part->volume->getDisk()->getName() != replaced_disk->getName()) - throw Exception("Part " + part->name + " fetched on wrong disk " + part->volume->getDisk()->getName(), ErrorCodes::LOGICAL_ERROR); - replaced_disk->removeFileIfExists(replaced_part_path); - replaced_disk->moveDirectory(part->getFullRelativePath(), replaced_part_path); + if (part->data_part_storage->getDiskName() != replaced_disk->getName()) + throw Exception("Part " + part->name + " fetched on wrong disk " + part->data_part_storage->getDiskName(), ErrorCodes::LOGICAL_ERROR); + + auto replaced_path = fs::path(replaced_part_path); + part->data_part_storage->rename(replaced_path.parent_path(), replaced_path.filename(), nullptr, true, false); } catch (const Exception & e) { /// The same part is being written right now (but probably it's not committed yet). /// We will check the need for fetch later. 
if (e.code() == ErrorCodes::DIRECTORY_ALREADY_EXISTS) - return false; + return nullptr; throw; } @@ -4235,7 +4250,7 @@ bool StorageReplicatedMergeTree::fetchExistsPart(const String & part_name, const LOG_DEBUG(log, "Fetched part {} from {}", part_name, source_replica_path); - return true; + return part->data_part_storage; } @@ -4409,6 +4424,14 @@ void StorageReplicatedMergeTree::read( /// If true, then we will ask initiator if we can read chosen ranges const bool enable_parallel_reading = local_context->getClientInfo().collaborate_with_initiator; + SCOPE_EXIT({ + /// Now, copy of parts that is required for the query, stored in the processors, + /// while snapshot_data.parts includes all parts, even one that had been filtered out with partition pruning, + /// reset them to avoid holding them. + auto & snapshot_data = assert_cast(*storage_snapshot->data); + snapshot_data.parts = {}; + }); + /** The `select_sequential_consistency` setting has two meanings: * 1. To throw an exception if on a replica there are not all parts which have been written down on quorum of remaining replicas. * 2. Do not read parts that have not yet been written to the quorum of the replicas. @@ -7392,7 +7415,7 @@ void StorageReplicatedMergeTree::checkBrokenDisks() for (auto & part : *parts) { - if (part->volume && part->volume->getDisk()->getName() == disk_ptr->getName()) + if (part->data_part_storage && part->data_part_storage->getDiskName() == disk_ptr->getName()) broken_part_callback(part->name); } continue; @@ -7495,7 +7518,7 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam String id = part_id; boost::replace_all(id, "/", "_"); - Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), toString(disk->getType()), getTableSharedID(), part_name, zookeeper_path); for (const auto & zc_zookeeper_path : zc_zookeeper_paths) @@ -7511,11 +7534,10 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, { auto settings = getSettings(); - if (!part.volume || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication) + if (!part.data_part_storage || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication) return; - DiskPtr disk = part.volume->getDisk(); - if (!disk || !disk->supportZeroCopyReplication()) + if (!part.data_part_storage->supportZeroCopyReplication()) return; zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); @@ -7526,7 +7548,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, boost::replace_all(id, "/", "_"); Strings zc_zookeeper_paths = getZeroCopyPartPath( - *getSettings(), disk->getType(), getTableSharedID(), + *getSettings(), part.data_part_storage->getDiskType(), getTableSharedID(), part.name, zookeeper_path); String path_to_set_hardlinked_files; @@ -7535,7 +7557,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty()) { path_to_set_hardlinked_files = getZeroCopyPartPath( - *getSettings(), disk->getType(), hardlinked_files->source_table_shared_id, + *getSettings(), part.data_part_storage->getDiskType(), hardlinked_files->source_table_shared_id, hardlinked_files->source_part_name, zookeeper_path)[0]; hardlinks = hardlinked_files->hardlinks_from_source_part; @@ -7555,18 +7577,16 @@ void StorageReplicatedMergeTree::lockSharedData(const 
IMergeTreeDataPart & part, std::pair StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const { - if (!part.volume || !part.isStoredOnDisk()) + if (!part.data_part_storage || !part.isStoredOnDisk()) return std::make_pair(true, NameSet{}); - DiskPtr disk = part.volume->getDisk(); - if (!disk || !disk->supportZeroCopyReplication()) + if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication()) return std::make_pair(true, NameSet{}); /// If part is temporary refcount file may be absent - auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK; - if (disk->exists(ref_count_path)) + if (part.data_part_storage->exists(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK)) { - auto ref_count = disk->getRefCount(ref_count_path); + auto ref_count = part.data_part_storage->getRefCount(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK); if (ref_count > 0) /// Keep part shard info for frozen backups return std::make_pair(false, NameSet{}); } @@ -7576,18 +7596,18 @@ std::pair StorageReplicatedMergeTree::unlockSharedData(const IMer return std::make_pair(true, NameSet{}); } - return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log, + return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, part.data_part_storage->getDiskType(), getZooKeeper(), *getSettings(), log, zookeeper_path); } std::pair StorageReplicatedMergeTree::unlockSharedDataByID( String part_id, const String & table_uuid, const String & part_name, - const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, + const String & replica_name_, std::string disk_type, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old) { boost::replace_all(part_id, "/", "_"); - Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); + Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk_type, table_uuid, part_name, zookeeper_path_old); bool part_has_no_more_locks = true; NameSet files_not_to_remove; @@ -7682,7 +7702,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( } -bool StorageReplicatedMergeTree::tryToFetchIfShared( +DataPartStoragePtr StorageReplicatedMergeTree::tryToFetchIfShared( const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) @@ -7690,13 +7710,13 @@ bool StorageReplicatedMergeTree::tryToFetchIfShared( const auto settings = getSettings(); auto disk_type = disk->getType(); if (!(disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication)) - return false; + return nullptr; String replica = getSharedDataReplica(part, disk_type); /// We can't fetch part when none replicas have this part on a same type remote disk if (replica.empty()) - return false; + return nullptr; return executeFetchShared(replica, part.name, disk, path); } @@ -7711,7 +7731,7 @@ String StorageReplicatedMergeTree::getSharedDataReplica( if (!zookeeper) return ""; - Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk_type, getTableSharedID(), part.name, + Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), toString(disk_type), getTableSharedID(), part.name, zookeeper_path); std::set replicas; @@ -7783,12 +7803,12 @@ String StorageReplicatedMergeTree::getSharedDataReplica( Strings 
StorageReplicatedMergeTree::getZeroCopyPartPath( - const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + const MergeTreeSettings & settings, std::string disk_type, const String & table_uuid, const String & part_name, const String & zookeeper_path_old) { Strings res; - String zero_copy = fmt::format("zero_copy_{}", toString(disk_type)); + String zero_copy = fmt::format("zero_copy_{}", disk_type); String new_path = fs::path(settings.remote_fs_zero_copy_zookeeper_path.toString()) / zero_copy / table_uuid / part_name; res.push_back(new_path); @@ -7822,7 +7842,7 @@ std::optional StorageReplicatedMergeTree::getZeroCopyPartPath(const Stri if (!disk || !disk->supportZeroCopyReplication()) return std::nullopt; - return getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), part_name, zookeeper_path)[0]; + return getZeroCopyPartPath(*getSettings(), toString(disk->getType()), getTableSharedID(), part_name, zookeeper_path)[0]; } std::optional StorageReplicatedMergeTree::tryCreateZeroCopyExclusiveLock(const String & part_name, const DiskPtr & disk) @@ -7917,12 +7937,22 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP auto minmax_idx = std::make_shared(); minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey())); + auto new_volume = createVolumeFromReservation(reservation, volume); + auto data_part_storage = std::make_shared( + new_volume, + relative_data_path, + TMP_PREFIX + lost_part_name); + + DataPartStorageBuilderPtr data_part_storage_builder = std::make_shared( + new_volume, + relative_data_path, + TMP_PREFIX + lost_part_name); + auto new_data_part = createPart( lost_part_name, choosePartType(0, block.rows()), new_part_info, - createVolumeFromReservation(reservation, volume), - TMP_PREFIX + lost_part_name); + data_part_storage); if (settings->assign_part_uuids) new_data_part->uuid = UUIDHelpers::generateV4(); @@ -7959,19 +7989,16 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP if (new_data_part->isStoredOnDisk()) { /// The name could be non-unique in case of stale files from previous runs. 
- String full_path = new_data_part->getFullRelativePath(); - - if (new_data_part->volume->getDisk()->exists(full_path)) + if (data_part_storage_builder->exists()) { - LOG_WARNING(log, "Removing old temporary directory {}", fullPath(new_data_part->volume->getDisk(), full_path)); - new_data_part->volume->getDisk()->removeRecursive(full_path); + LOG_WARNING(log, "Removing old temporary directory {}", new_data_part->data_part_storage->getFullPath()); + data_part_storage_builder->removeRecursive(); } - const auto disk = new_data_part->volume->getDisk(); - disk->createDirectories(full_path); + data_part_storage_builder->createDirectories(); if (getSettings()->fsync_part_directory) - sync_guard = disk->getDirectorySyncGuard(full_path); + sync_guard = data_part_storage_builder->getDirectorySyncGuard(); } /// This effectively chooses minimal compression method: @@ -7979,7 +8006,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP auto compression_codec = getContext()->chooseCompressionCodec(0, 0); const auto & index_factory = MergeTreeIndexFactory::instance(); - MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, + MergedBlockOutputStream out(new_data_part, data_part_storage_builder, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, NO_TRANSACTION_PTR); bool sync_on_insert = settings->fsync_after_insert; @@ -8201,7 +8228,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St String id = disk->getUniqueId(checksums); bool can_remove = false; std::tie(can_remove, files_not_to_remove) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, - detached_replica_name, disk, zookeeper, local_context->getReplicatedMergeTreeSettings(), &Poco::Logger::get("StorageReplicatedMergeTree"), + detached_replica_name, toString(disk->getType()), zookeeper, local_context->getReplicatedMergeTreeSettings(), &Poco::Logger::get("StorageReplicatedMergeTree"), detached_zookeeper_path); keep_shared = !can_remove; @@ -8227,139 +8254,163 @@ void StorageReplicatedMergeTree::createAndStoreFreezeMetadata(DiskPtr disk, Data } -class ReplicatedMergeTreeRestoreTask : public IRestoreTask +ASTPtr StorageReplicatedMergeTree::getCreateQueryForBackup(const ContextPtr & local_context, DatabasePtr * database) const { -public: - ReplicatedMergeTreeRestoreTask( - const std::shared_ptr & storage_, - const std::unordered_set & partition_ids_, - const BackupPtr & backup_, - const StorageRestoreSettings & restore_settings_, - const std::shared_ptr & restore_coordination_) - : storage(storage_) - , partition_ids(partition_ids_) - , backup(backup_) - , restore_settings(restore_settings_) - , restore_coordination(restore_coordination_) + ASTPtr query = MergeTreeData::getCreateQueryForBackup(local_context, database); + + /// Before storing the metadata in a backup we have to find a zookeeper path in its definition and turn the table's UUID in there + /// back into "{uuid}", and also we probably can remove the zookeeper path and replica name if they're default. + /// So we're kind of reverting what we had done to the table's definition in registerStorageMergeTree.cpp before we created this table. 
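// The substitution described in the comment above boils down to finding the table's UUID inside
// the stored zookeeper path argument and putting the "{uuid}" macro back. A minimal sketch with a
// made-up UUID (the real code additionally drops the first two engine arguments when they match
// the configured defaults):
#include <iostream>
#include <string>

std::string revertUuidInZooKeeperPath(std::string zookeeper_path_arg, const std::string & table_uuid_str)
{
    if (size_t uuid_pos = zookeeper_path_arg.find(table_uuid_str); uuid_pos != std::string::npos)
        zookeeper_path_arg.replace(uuid_pos, table_uuid_str.size(), "{uuid}");
    return zookeeper_path_arg;
}

int main()
{
    const std::string uuid = "612025bb-1e04-42da-9f10-3f1e6e4a4b55";
    std::cout << revertUuidInZooKeeperPath("/clickhouse/tables/" + uuid + "/{shard}", uuid) << '\n';
    /// prints: /clickhouse/tables/{uuid}/{shard}
}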
+ auto & create = query->as(); + if (create.storage && create.storage->engine && (create.uuid != UUIDHelpers::Nil)) { + auto & engine = *(create.storage->engine); + if (auto * engine_args_ast = typeid_cast(engine.arguments.get())) + { + auto & engine_args = engine_args_ast->children; + if (engine_args.size() >= 2) + { + auto * zookeeper_path_ast = typeid_cast(engine_args[0].get()); + auto * replica_name_ast = typeid_cast(engine_args[1].get()); + if (zookeeper_path_ast && (zookeeper_path_ast->value.getType() == Field::Types::String) && + replica_name_ast && (replica_name_ast->value.getType() == Field::Types::String)) + { + String & zookeeper_path_arg = zookeeper_path_ast->value.get(); + String & replica_name_arg = replica_name_ast->value.get(); + String table_uuid_str = toString(create.uuid); + if (size_t uuid_pos = zookeeper_path_arg.find(table_uuid_str); uuid_pos != String::npos) + zookeeper_path_arg.replace(uuid_pos, table_uuid_str.size(), "{uuid}"); + const auto & config = getContext()->getConfigRef(); + if ((zookeeper_path_arg == getDefaultZooKeeperPath(config)) && (replica_name_arg == getDefaultReplicaName(config)) + && ((engine_args.size() == 2) || !engine_args[2]->as())) + { + engine_args.erase(engine_args.begin(), engine_args.begin() + 2); + } + } + } + } } - RestoreTasks run() override + return query; +} + +void StorageReplicatedMergeTree::backupData( + BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) +{ + /// First we generate backup entries in the same way as an ordinary MergeTree does. + /// But then we don't add them to the BackupEntriesCollector right away, + /// because we need to coordinate them with other replicas (other replicas can have better parts). + auto backup_entries = backupParts(backup_entries_collector.getContext(), "", partitions); + + auto coordination = backup_entries_collector.getBackupCoordination(); + String full_zk_path = getZooKeeperName() + getZooKeeperPath(); + coordination->addReplicatedDataPath(full_zk_path, data_path_in_backup); + + std::unordered_map part_names_with_hashes_calculating; + for (auto & [relative_path, backup_entry] : backup_entries) { - RestoreTasks restore_part_tasks; - - String full_zk_path = storage->getZooKeeperName() + storage->getZooKeeperPath(); - String data_path_in_backup = restore_coordination->getReplicatedTableDataPath(full_zk_path); - - auto storage_id = storage->getStorageID(); - DatabaseAndTableName table_name = {storage_id.database_name, storage_id.table_name}; - std::unordered_map partitions_restored_by_us; - - Strings part_names = backup->listFiles(data_path_in_backup); - - auto metadata_snapshot = storage->getInMemoryMetadataPtr(); - auto sink = std::make_shared(*storage, metadata_snapshot, 0, 0, 0, false, false, storage->getContext(), /*is_attach*/true); - - for (const String & part_name : part_names) + size_t slash_pos = relative_path.find('/'); + if (slash_pos != String::npos) { - const auto part_info = MergeTreePartInfo::tryParsePartName(part_name, storage->format_version); - if (!part_info) - continue; - - if (!partition_ids.empty() && !partition_ids.contains(part_info->partition_id)) - continue; - - auto it = partitions_restored_by_us.find(part_info->partition_id); - if (it == partitions_restored_by_us.end()) + String part_name = relative_path.substr(0, slash_pos); + if (MergeTreePartInfo::tryParsePartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)) { - it = partitions_restored_by_us.emplace( - 
part_info->partition_id, - restore_coordination->startInsertingDataToPartitionInReplicatedTable( - restore_settings.host_id, table_name, full_zk_path, part_info->partition_id)).first; + auto & hash = part_names_with_hashes_calculating[part_name]; + if (relative_path.ends_with(".bin")) + { + auto checksum = backup_entry->getChecksum(); + hash.update(relative_path); + hash.update(backup_entry->getSize()); + hash.update(*checksum); + } + continue; } - - if (!it->second) - continue; /// Other replica is already restoring this partition. - - restore_part_tasks.push_back( - std::make_unique(storage, sink, part_name, *part_info, backup, data_path_in_backup)); } - return restore_part_tasks; + /// Not a part name, probably error. + throw Exception(ErrorCodes::LOGICAL_ERROR, "{} doesn't follow the format /", quoteString(relative_path)); } -private: - std::shared_ptr storage; - std::unordered_set partition_ids; - BackupPtr backup; - StorageRestoreSettings restore_settings; - std::shared_ptr restore_coordination; - - class RestorePartTask : public IRestoreTask + std::vector part_names_with_hashes; + part_names_with_hashes.reserve(part_names_with_hashes_calculating.size()); + for (auto & [part_name, hash] : part_names_with_hashes_calculating) { - public: - RestorePartTask( - const std::shared_ptr & storage_, - const std::shared_ptr & sink_, - const String & part_name_, - const MergeTreePartInfo & part_info_, - const BackupPtr & backup_, - const String & data_path_in_backup_) - : storage(storage_) - , sink(sink_) - , part_name(part_name_) - , part_info(part_info_) - , backup(backup_) - , data_path_in_backup(data_path_in_backup_) + UInt128 checksum; + hash.get128(checksum); + auto & part_name_with_hash = part_names_with_hashes.emplace_back(); + part_name_with_hash.part_name = part_name; + part_name_with_hash.checksum = checksum; + } + + /// Send our list of part names to the coordination (to compare with other replicas). + coordination->addReplicatedPartNames(full_zk_path, getStorageID().getFullTableName(), getReplicaName(), part_names_with_hashes); + + /// This task will be executed after all replicas have collected their parts and the coordination is ready to + /// give us the final list of parts to add to the BackupEntriesCollector. 
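// The post-collecting task registered below only forwards the entries whose part was assigned to
// this replica by the coordination, and writes each of them under every replicated data path. A
// minimal sketch of that filtering (entry payloads are plain strings here, not IBackupEntry
// objects, and the paths are made up):
#include <filesystem>
#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

namespace fs = std::filesystem;

int main()
{
    /// Entries are keyed by "<part_name>/<file name>".
    const std::vector<std::pair<std::string, std::string>> backup_entries = {
        {"all_1_1_0/data.bin", "..."},
        {"all_1_1_0/checksums.txt", "..."},
        {"all_2_2_0/data.bin", "..."}};

    const std::unordered_set<std::string> part_names_set = {"all_1_1_0"};  /// chosen by the coordination
    const std::vector<fs::path> data_paths = {"data/default/table1", "shards/1/default/table1"};

    for (const auto & [relative_path, backup_entry] : backup_entries)
    {
        const std::string part_name = relative_path.substr(0, relative_path.find('/'));
        if (!part_names_set.contains(part_name))
            continue;  /// some other replica will back this part up
        for (const auto & data_path : data_paths)
            std::cout << "add entry: " << (data_path / relative_path).string() << '\n';
    }
}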
+ auto post_collecting_task = [full_zk_path, + replica_name = getReplicaName(), + coordination, + backup_entries = std::move(backup_entries), + &backup_entries_collector]() + { + Strings data_paths = coordination->getReplicatedDataPaths(full_zk_path); + std::vector data_paths_fs; + data_paths_fs.reserve(data_paths.size()); + for (const auto & data_path : data_paths) + data_paths_fs.push_back(data_path); + + Strings part_names = coordination->getReplicatedPartNames(full_zk_path, replica_name); + std::unordered_set part_names_set{part_names.begin(), part_names.end()}; + + for (const auto & [relative_path, backup_entry] : backup_entries) { + size_t slash_pos = relative_path.find('/'); + String part_name = relative_path.substr(0, slash_pos); + if (!part_names_set.contains(part_name)) + continue; + for (const auto & data_path : data_paths_fs) + backup_entries_collector.addBackupEntry(data_path / relative_path, backup_entry); } - - RestoreTasks run() override - { - UInt64 total_size_of_part = 0; - Strings filenames = backup->listFiles(data_path_in_backup + part_name + "/", ""); - for (const String & filename : filenames) - total_size_of_part += backup->getFileSize(data_path_in_backup + part_name + "/" + filename); - - std::shared_ptr reservation = storage->getStoragePolicy()->reserveAndCheck(total_size_of_part); - auto disk = reservation->getDisk(); - String relative_data_path = storage->getRelativeDataPath(); - - auto temp_part_dir_owner = std::make_shared(disk, relative_data_path + "restoring_" + part_name + "_"); - String temp_part_dir = temp_part_dir_owner->getPath(); - disk->createDirectories(temp_part_dir); - - assert(temp_part_dir.starts_with(relative_data_path)); - String relative_temp_part_dir = temp_part_dir.substr(relative_data_path.size()); - - for (const String & filename : filenames) - { - auto backup_entry = backup->readFile(fs::path(data_path_in_backup) / part_name / filename); - auto read_buffer = backup_entry->getReadBuffer(); - auto write_buffer = disk->writeFile(fs::path(temp_part_dir) / filename); - copyData(*read_buffer, *write_buffer); - reservation->update(reservation->getSize() - backup_entry->getSize()); - } - - auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); - auto part = storage->createPart(part_name, part_info, single_disk_volume, relative_temp_part_dir); - /// TODO Transactions: Decide what to do with version metadata (if any). Let's just remove it for now. - disk->removeFileIfExists(fs::path(temp_part_dir) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); - part->version.setCreationTID(Tx::PrehistoricTID, nullptr); - part->loadColumnsChecksumsIndexes(false, true); - sink->writeExistingPart(part); - return {}; - } - - private: - std::shared_ptr storage; - std::shared_ptr sink; - String part_name; - MergeTreePartInfo part_info; - BackupPtr backup; - String data_path_in_backup; }; -}; + backup_entries_collector.addPostCollectingTask(post_collecting_task); +} +void StorageReplicatedMergeTree::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + String full_zk_path = getZooKeeperName() + getZooKeeperPath(); + if (!restorer.getRestoreCoordination()->acquireInsertingDataIntoReplicatedTable(full_zk_path)) + { + /// Other replica is already restoring the data of this table. + /// We'll get them later due to replication, it's not necessary to read it from the backup. 
+ return; + } + + if (!restorer.isNonEmptyTableAllowed()) + { + bool empty = !getTotalActiveSizeInBytes(); + if (empty) + { + /// New parts could be in the replication queue but not fetched yet. + /// In that case we consider the table as not empty. + StorageReplicatedMergeTree::Status status; + getStatus(status, /* with_zk_fields = */ false); + if (status.queue.inserts_in_queue) + empty = false; + } + auto backup = restorer.getBackup(); + if (!empty && backup->hasFiles(data_path_in_backup)) + restorer.throwTableIsNotEmpty(getStorageID()); + } + + restorePartsFromBackup(restorer, data_path_in_backup, partitions); +} + +void StorageReplicatedMergeTree::attachRestoredParts(MutableDataPartsVector && parts) +{ + auto metadata_snapshot = getInMemoryMetadataPtr(); + auto sink = std::make_shared(*this, metadata_snapshot, 0, 0, 0, false, false, getContext(), /*is_attach*/true); + for (auto part : parts) + sink->writeExistingPart(part); +} #if 0 PartsTemporaryRename renamed_parts(*this, "detached/"); @@ -8387,21 +8438,4 @@ for (size_t i = 0; i < loaded_parts.size(); ++i) } #endif - -RestoreTaskPtr StorageReplicatedMergeTree::restoreData( - ContextMutablePtr local_context, - const ASTs & partitions, - const BackupPtr & backup, - const String & /* data_path_in_backup */, - const StorageRestoreSettings & restore_settings, - const std::shared_ptr & restore_coordination) -{ - return std::make_unique( - std::static_pointer_cast(shared_from_this()), - getPartitionIDsFromQuery(partitions, local_context), - backup, - restore_settings, - restore_coordination); -} - } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 78d14ed8b58..73a08a2b921 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -111,6 +111,9 @@ public: void flush() override; ~StorageReplicatedMergeTree() override; + static String getDefaultZooKeeperPath(const Poco::Util::AbstractConfiguration & config); + static String getDefaultReplicaName(const Poco::Util::AbstractConfiguration & config); + std::string getName() const override { return "Replicated" + merging_params.getModeName() + "MergeTree"; } bool supportsParallelInsert() const override { return true; } @@ -219,8 +222,25 @@ public: /// Checks ability to use granularity bool canUseAdaptiveGranularity() const override; + /// Returns the default path to the table in ZooKeeper. + /// It's used if not set in engine's arguments while creating a replicated table. + static String getDefaultReplicaPath(const ContextPtr & context_); + + /// Returns the default replica name in ZooKeeper. + /// It's used if not set in engine's arguments while creating a replicated table. + static String getDefaultReplicaName(const ContextPtr & context_); + int getMetadataVersion() const { return metadata_version; } + /// Returns a slightly changed version of the CREATE TABLE query which must be written to a backup. + ASTPtr getCreateQueryForBackup(const ContextPtr & context, DatabasePtr * database) const override; + + /// Makes backup entries to backup the data of the storage. + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + + /// Extract data from the backup and put it to the storage. + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + /** Remove a specific replica from zookeeper. 
*/ static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, @@ -230,9 +250,6 @@ public: static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); - /// Extract data from the backup and put it to the storage. - RestoreTaskPtr restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; - /// Schedules job to execute in background pool (merge, mutate, drop range and so on) bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; @@ -241,7 +258,7 @@ public: bool canExecuteFetch(const ReplicatedMergeTreeLogEntry & entry, String & disable_reason) const; /// Fetch part only when it stored on shared storage like S3 - bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path); + DataPartStoragePtr executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path); /// Lock part in zookeeper for use shared data in several nodes void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files) const override; @@ -257,11 +274,11 @@ public: /// Return true if data unlocked /// Return false if data is still used by another node static std::pair unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, - DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, + std::string disk_type, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old); /// Fetch part only if some replica has it on shared storage like S3 - bool tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override; + DataPartStoragePtr tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override; /// Get best replica having this partition on a same type remote disk String getSharedDataReplica(const IMergeTreeDataPart & part, DiskType disk_type) const; @@ -641,7 +658,7 @@ private: * Used for replace local part on the same s3-shared part in hybrid storage. * Returns false if part is already fetching right now. */ - bool fetchExistsPart( + DataPartStoragePtr fetchExistsPart( const String & part_name, const StorageMetadataPtr & metadata_snapshot, const String & replica_path, @@ -780,12 +797,15 @@ private: void startBackgroundMovesIfNeeded() override; + /// Attaches restored parts to the storage. 
+ void attachRestoredParts(MutableDataPartsVector && parts) override; + std::unique_ptr getDefaultSettings() const override; PartitionBlockNumbersHolder allocateBlockNumbersInAffectedPartitions( const MutationCommands & commands, ContextPtr query_context, const zkutil::ZooKeeperPtr & zookeeper) const; - static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, + static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, std::string disk_type, const String & table_uuid, const String & part_name, const String & zookeeper_path_old); static void createZeroCopyLockNode( diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f524a405c9b..b4b97570ad1 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -434,7 +434,8 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); + auto zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint), zstd_window_log_max); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -1170,10 +1171,12 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( read_keys_in_distributed_processing->push_back(key); first = false; + const auto zstd_window_log_max = ctx->getSettingsRef().zstd_window_log_max; return wrapReadBufferWithCompressionMethod( std::make_unique( s3_configuration.client, s3_configuration.uri.bucket, key, s3_configuration.uri.version_id, s3_configuration.rw_settings.max_single_read_retries, ctx->getReadSettings()), - chooseCompressionMethod(key, compression_method)); + chooseCompressionMethod(key, compression_method), + zstd_window_log_max); }; return readSchemaFromFormat(format, format_settings, read_buffer_iterator, is_key_with_globs, ctx); diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 5b76a4b37e5..6dad82dffd2 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -22,8 +22,8 @@ struct StorageSnapshot virtual ~Data() = default; }; - using DataPtr = std::unique_ptr; - const DataPtr data; + using DataPtr = std::unique_ptr; + DataPtr data; /// Projection that is used in query. 
mutable const ProjectionDescription * projection = nullptr; @@ -87,6 +87,6 @@ private: std::unordered_map virtual_columns; }; -using StorageSnapshotPtr = std::shared_ptr; +using StorageSnapshotPtr = std::shared_ptr; } diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index d66ac088a08..d569a81c4a7 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -35,10 +35,11 @@ #include #include +#include #include #include #include -#include +#include #include #include @@ -54,7 +55,6 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int INCORRECT_FILE_NAME; extern const int TIMEOUT_EXCEEDED; - extern const int NOT_IMPLEMENTED; } @@ -527,12 +527,12 @@ std::optional StorageStripeLog::totalBytes(const Settings &) const } -BackupEntries StorageStripeLog::backupData(ContextPtr context, const ASTs & partitions) +void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); - auto lock_timeout = getLockTimeout(context); + auto lock_timeout = getLockTimeout(backup_entries_collector.getContext()); loadIndices(lock_timeout); ReadLock lock{rwlock, lock_timeout}; @@ -540,22 +540,21 @@ BackupEntries StorageStripeLog::backupData(ContextPtr context, const ASTs & part throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); if (!file_checker.getFileSize(data_file_path)) - return {}; + return; + fs::path data_path_in_backup_fs = data_path_in_backup; auto temp_dir_owner = std::make_shared(disk, "tmp/backup_"); - auto temp_dir = temp_dir_owner->getPath(); + fs::path temp_dir = temp_dir_owner->getPath(); disk->createDirectories(temp_dir); - BackupEntries backup_entries; - /// data.bin { /// We make a copy of the data file because it can be changed later in write() or in truncate(). String data_file_name = fileName(data_file_path); - String hardlink_file_path = temp_dir + "/" + data_file_name; + String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file_path, hardlink_file_path); - backup_entries.emplace_back( - data_file_name, + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / data_file_name, std::make_unique( disk, hardlink_file_path, file_checker.getFileSize(data_file_path), std::nullopt, temp_dir_owner)); } @@ -564,123 +563,104 @@ BackupEntries StorageStripeLog::backupData(ContextPtr context, const ASTs & part { /// We make a copy of the data file because it can be changed later in write() or in truncate(). 
String index_file_name = fileName(index_file_path); - String hardlink_file_path = temp_dir + "/" + index_file_name; + String hardlink_file_path = temp_dir / index_file_name; disk->createHardLink(index_file_path, hardlink_file_path); - backup_entries.emplace_back( - index_file_name, + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / index_file_name, std::make_unique( disk, hardlink_file_path, file_checker.getFileSize(index_file_path), std::nullopt, temp_dir_owner)); } /// sizes.json String files_info_path = file_checker.getPath(); - backup_entries.emplace_back(fileName(files_info_path), std::make_unique(disk, files_info_path)); + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path)); /// columns.txt - backup_entries.emplace_back( - "columns.txt", std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / "columns.txt", + std::make_unique(getInMemoryMetadata().getColumns().getAllPhysical().toString())); /// count.txt size_t num_rows = 0; for (const auto & block : indices.blocks) num_rows += block.num_rows; - backup_entries.emplace_back("count.txt", std::make_unique(toString(num_rows))); - - return backup_entries; + backup_entries_collector.addBackupEntry( + data_path_in_backup_fs / "count.txt", std::make_unique(toString(num_rows))); } -class StripeLogRestoreTask : public IRestoreTask +void StorageStripeLog::restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) { - using WriteLock = StorageStripeLog::WriteLock; + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); -public: - StripeLogRestoreTask( - const std::shared_ptr storage_, - const BackupPtr & backup_, - const String & data_path_in_backup_, - std::chrono::seconds lock_timeout_) - : storage(storage_), backup(backup_), data_path_in_backup(data_path_in_backup_), lock_timeout(lock_timeout_) + auto backup = restorer.getBackup(); + if (!restorer.isNonEmptyTableAllowed() && total_bytes && backup->hasFiles(data_path_in_backup)) + RestorerFromBackup::throwTableIsNotEmpty(getStorageID()); + + auto lock_timeout = getLockTimeout(restorer.getContext()); + restorer.addDataRestoreTask( + [storage = std::static_pointer_cast(shared_from_this()), backup, data_path_in_backup, lock_timeout] + { storage->restoreDataImpl(backup, data_path_in_backup, lock_timeout); }); +} + +void StorageStripeLog::restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup, std::chrono::seconds lock_timeout) +{ + WriteLock lock{rwlock, lock_timeout}; + if (!lock) + throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); + + /// Load the indices if not loaded yet. We have to do that now because we're going to update these indices. + loadIndices(lock); + + /// If there were no files, save zero file sizes to be able to rollback in case of error. + saveFileSizes(lock); + + try { - } + fs::path data_path_in_backup_fs = data_path_in_backup; - RestoreTasks run() override - { - WriteLock lock{storage->rwlock, lock_timeout}; - if (!lock) - throw Exception("Lock timeout exceeded", ErrorCodes::TIMEOUT_EXCEEDED); - - auto & file_checker = storage->file_checker; - - /// Load the indices if not loaded yet. We have to do that now because we're going to update these indices. 
- storage->loadIndices(lock); - - /// If there were no files, save zero file sizes to be able to rollback in case of error. - storage->saveFileSizes(lock); - - try + /// Append the data file. + auto old_data_size = file_checker.getFileSize(data_file_path); { - /// Append the data file. - auto old_data_size = file_checker.getFileSize(storage->data_file_path); + String file_path_in_backup = data_path_in_backup_fs / fileName(data_file_path); + auto backup_entry = backup->readFile(file_path_in_backup); + auto in = backup_entry->getReadBuffer(); + auto out = disk->writeFile(data_file_path, max_compress_block_size, WriteMode::Append); + copyData(*in, *out); + } + + /// Append the index. + { + String index_path_in_backup = data_path_in_backup_fs / fileName(index_file_path); + IndexForNativeFormat extra_indices; + auto backup_entry = backup->readFile(index_path_in_backup); + auto index_in = backup_entry->getReadBuffer(); + CompressedReadBuffer index_compressed_in{*index_in}; + extra_indices.read(index_compressed_in); + + /// Adjust the offsets. + for (auto & block : extra_indices.blocks) { - const auto & data_file_path = storage->data_file_path; - String file_path_in_backup = data_path_in_backup + fileName(data_file_path); - auto backup_entry = backup->readFile(file_path_in_backup); - const auto & disk = storage->disk; - auto in = backup_entry->getReadBuffer(); - auto out = disk->writeFile(data_file_path, storage->max_compress_block_size, WriteMode::Append); - copyData(*in, *out); + for (auto & column : block.columns) + column.location.offset_in_compressed_file += old_data_size; } - /// Append the index. - { - const auto & index_file_path = storage->index_file_path; - String index_path_in_backup = data_path_in_backup + fileName(index_file_path); - IndexForNativeFormat extra_indices; - auto backup_entry = backup->readFile(index_path_in_backup); - auto index_in = backup_entry->getReadBuffer(); - CompressedReadBuffer index_compressed_in{*index_in}; - extra_indices.read(index_compressed_in); - - /// Adjust the offsets. - for (auto & block : extra_indices.blocks) - { - for (auto & column : block.columns) - column.location.offset_in_compressed_file += old_data_size; - } - - insertAtEnd(storage->indices.blocks, std::move(extra_indices.blocks)); - } - - /// Finish writing. - storage->saveIndices(lock); - storage->saveFileSizes(lock); - storage->updateTotalRows(lock); - return {}; - } - catch (...) - { - /// Rollback partial writes. - file_checker.repair(); - storage->removeUnsavedIndices(lock); - throw; + insertAtEnd(indices.blocks, std::move(extra_indices.blocks)); } + + /// Finish writing. + saveIndices(lock); + saveFileSizes(lock); + updateTotalRows(lock); + } + catch (...) + { + /// Rollback partial writes. 
+ file_checker.repair(); + removeUnsavedIndices(lock); + throw; } - -private: - std::shared_ptr storage; - BackupPtr backup; - String data_path_in_backup; - std::chrono::seconds lock_timeout; -}; - - -RestoreTaskPtr StorageStripeLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) -{ - if (!partitions.empty()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); - - return std::make_unique( - typeid_cast>(shared_from_this()), backup, data_path_in_backup, getLockTimeout(context)); } diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index c0eb2a0b864..3faffff381d 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -14,6 +14,8 @@ namespace DB { struct IndexForNativeFormat; +class IBackup; +using BackupPtr = std::shared_ptr; /** Implements a table engine that is suitable for small chunks of the log. * In doing so, stores all the columns in a single Native file, with a nearby index. @@ -22,7 +24,6 @@ class StorageStripeLog final : public IStorage { friend class StripeLogSource; friend class StripeLogSink; -friend class StripeLogRestoreTask; public: StorageStripeLog( @@ -62,9 +63,8 @@ public: std::optional totalRows(const Settings & settings) const override; std::optional totalBytes(const Settings & settings) const override; - bool hasDataToBackup() const override { return true; } - BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; private: using ReadLock = std::shared_lock; @@ -87,6 +87,9 @@ private: /// Recalculates the number of rows stored in this table. void updateTotalRows(const WriteLock &); + /// Restores the data of this table from backup. + void restoreDataImpl(const BackupPtr & backup, const String & data_path_in_backup, std::chrono::seconds lock_timeout); + const DiskPtr disk; String table_path; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index cd55c32fb9c..a90b6974c74 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -350,7 +350,8 @@ namespace std::move(read_buffer_factory), threadPoolCallbackRunner(IOThreadPool::get()), download_threads), - chooseCompressionMethod(request_uri.getPath(), compression_method)); + chooseCompressionMethod(request_uri.getPath(), compression_method), + settings.zstd_window_log_max); } } catch (const Poco::Exception & e) @@ -381,7 +382,8 @@ namespace delay_initialization, /* use_external_buffer */ false, /* skip_url_not_found_error */ skip_url_not_found_error), - chooseCompressionMethod(request_uri.getPath(), compression_method)); + chooseCompressionMethod(request_uri.getPath(), compression_method), + settings.zstd_window_log_max); } catch (...) 
{ diff --git a/src/Storages/System/StorageSystemFilesystemCache.cpp b/src/Storages/System/StorageSystemFilesystemCache.cpp index f3ead8a95f0..4b76163363a 100644 --- a/src/Storages/System/StorageSystemFilesystemCache.cpp +++ b/src/Storages/System/StorageSystemFilesystemCache.cpp @@ -2,7 +2,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -43,7 +44,8 @@ void StorageSystemFilesystemCache::fillData(MutableColumns & res_columns, Contex for (const auto & file_segment : file_segments) { res_columns[0]->insert(cache_base_path); - res_columns[1]->insert(cache->getPathInLocalCache(file_segment->key(), file_segment->offset())); + res_columns[1]->insert( + cache->getPathInLocalCache(file_segment->key(), file_segment->offset(), file_segment->isPersistent())); const auto & range = file_segment->range(); res_columns[2]->insert(range.left); diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index a8edb8dd78b..01bba669c0e 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -195,9 +195,9 @@ void StorageSystemParts::processNextStorage( if (part->isStoredOnDisk()) { if (columns_mask[src_index++]) - columns[res_index++]->insert(part->volume->getDisk()->getName()); + columns[res_index++]->insert(part->data_part_storage->getDiskName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getFullPath()); + columns[res_index++]->insert(part->data_part_storage->getFullPath()); } else { diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 7f648054da2..cebcfc492bf 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -190,9 +190,9 @@ void StorageSystemPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(info.engine); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->volume->getDisk()->getName()); + columns[res_index++]->insert(part->data_part_storage->getDiskName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getFullPath()); + columns[res_index++]->insert(part->data_part_storage->getFullPath()); if (columns_mask[src_index++]) columns[res_index++]->insert(column.name); diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 21ca1f57703..7314c1e5012 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -200,9 +200,9 @@ void StorageSystemProjectionParts::processNextStorage( if (part->isStoredOnDisk()) { if (columns_mask[src_index++]) - columns[res_index++]->insert(part->volume->getDisk()->getName()); + columns[res_index++]->insert(part->data_part_storage->getDiskName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getFullPath()); + columns[res_index++]->insert(part->data_part_storage->getFullPath()); } else { diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index 1eec6825d5a..78a6df58761 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -211,9 +211,9 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(info.engine); if 
(columns_mask[src_index++]) - columns[res_index++]->insert(part->volume->getDisk()->getName()); + columns[res_index++]->insert(part->data_part_storage->getDiskName()); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getFullPath()); + columns[res_index++]->insert(part->data_part_storage->getFullPath()); if (columns_mask[src_index++]) columns[res_index++]->insert(column.name); diff --git a/src/Storages/System/StorageSystemQuotas.cpp b/src/Storages/System/StorageSystemQuotas.cpp index fa262f22d2c..efe6b93fe57 100644 --- a/src/Storages/System/StorageSystemQuotas.cpp +++ b/src/Storages/System/StorageSystemQuotas.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include @@ -118,4 +120,25 @@ void StorageSystemQuotas::fillData(MutableColumns & res_columns, ContextPtr cont add_row(quota->getName(), id, storage->getStorageName(), quota->all_limits, quota->key_type, quota->to_roles); } } + +void StorageSystemQuotas::backupData( + BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + const auto & access_control = backup_entries_collector.getContext()->getAccessControl(); + access_control.backup(backup_entries_collector, AccessEntityType::QUOTA, data_path_in_backup); +} + +void StorageSystemQuotas::restoreDataFromBackup( + RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto & access_control = restorer.getContext()->getAccessControl(); + access_control.restore(restorer, data_path_in_backup); +} + } diff --git a/src/Storages/System/StorageSystemQuotas.h b/src/Storages/System/StorageSystemQuotas.h index 3a0e3b0957b..28c873aa734 100644 --- a/src/Storages/System/StorageSystemQuotas.h +++ b/src/Storages/System/StorageSystemQuotas.h @@ -15,6 +15,9 @@ public: std::string getName() const override { return "SystemQuotas"; } static NamesAndTypesList getNamesAndTypes(); + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index dbce4c25773..a009f9d25c9 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -1,7 +1,7 @@ #include "StorageSystemRemoteDataPaths.h" #include #include -#include +#include #include #include #include diff --git a/src/Storages/System/StorageSystemRoles.cpp b/src/Storages/System/StorageSystemRoles.cpp index fcc45d1374f..ff3490ce8ba 100644 --- a/src/Storages/System/StorageSystemRoles.cpp +++ b/src/Storages/System/StorageSystemRoles.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include @@ -57,4 +59,24 @@ void StorageSystemRoles::fillData(MutableColumns & res_columns, ContextPtr conte } } +void StorageSystemRoles::backupData( + BackupEntriesCollector & backup_entries_collector, const String & 
data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + const auto & access_control = backup_entries_collector.getContext()->getAccessControl(); + access_control.backup(backup_entries_collector, AccessEntityType::ROLE, data_path_in_backup); +} + +void StorageSystemRoles::restoreDataFromBackup( + RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto & access_control = restorer.getContext()->getAccessControl(); + access_control.restore(restorer, data_path_in_backup); +} + } diff --git a/src/Storages/System/StorageSystemRoles.h b/src/Storages/System/StorageSystemRoles.h index 1a8aee61dcb..d9de9db5c65 100644 --- a/src/Storages/System/StorageSystemRoles.h +++ b/src/Storages/System/StorageSystemRoles.h @@ -14,6 +14,9 @@ public: std::string getName() const override { return "SystemRoles"; } static NamesAndTypesList getNamesAndTypes(); + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; diff --git a/src/Storages/System/StorageSystemRowPolicies.cpp b/src/Storages/System/StorageSystemRowPolicies.cpp index cd4f3dab109..680f90adff7 100644 --- a/src/Storages/System/StorageSystemRowPolicies.cpp +++ b/src/Storages/System/StorageSystemRowPolicies.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include @@ -136,4 +138,25 @@ void StorageSystemRowPolicies::fillData(MutableColumns & res_columns, ContextPtr add_row(policy->getName(), policy->getFullName(), id, storage->getStorageName(), policy->filters, policy->isRestrictive(), policy->to_roles); } } + +void StorageSystemRowPolicies::backupData( + BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + const auto & access_control = backup_entries_collector.getContext()->getAccessControl(); + access_control.backup(backup_entries_collector, AccessEntityType::ROW_POLICY, data_path_in_backup); +} + +void StorageSystemRowPolicies::restoreDataFromBackup( + RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto & access_control = restorer.getContext()->getAccessControl(); + access_control.restore(restorer, data_path_in_backup); +} + } diff --git a/src/Storages/System/StorageSystemRowPolicies.h b/src/Storages/System/StorageSystemRowPolicies.h index 6520f426e4e..9f94f7df65b 100644 --- a/src/Storages/System/StorageSystemRowPolicies.h +++ b/src/Storages/System/StorageSystemRowPolicies.h @@ -16,6 +16,9 @@ public: std::string getName() const override { return "SystemRowPolicies"; } static NamesAndTypesList getNamesAndTypes(); + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & 
partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; diff --git a/src/Storages/System/StorageSystemSettingsProfiles.cpp b/src/Storages/System/StorageSystemSettingsProfiles.cpp index 132f10ea194..7c3ccfe863a 100644 --- a/src/Storages/System/StorageSystemSettingsProfiles.cpp +++ b/src/Storages/System/StorageSystemSettingsProfiles.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include @@ -84,4 +86,24 @@ void StorageSystemSettingsProfiles::fillData(MutableColumns & res_columns, Conte } } +void StorageSystemSettingsProfiles::backupData( + BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + const auto & access_control = backup_entries_collector.getContext()->getAccessControl(); + access_control.backup(backup_entries_collector, AccessEntityType::SETTINGS_PROFILE, data_path_in_backup); +} + +void StorageSystemSettingsProfiles::restoreDataFromBackup( + RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto & access_control = restorer.getContext()->getAccessControl(); + access_control.restore(restorer, data_path_in_backup); +} + } diff --git a/src/Storages/System/StorageSystemSettingsProfiles.h b/src/Storages/System/StorageSystemSettingsProfiles.h index f93322daeae..6edaa02a4c3 100644 --- a/src/Storages/System/StorageSystemSettingsProfiles.h +++ b/src/Storages/System/StorageSystemSettingsProfiles.h @@ -14,6 +14,9 @@ public: std::string getName() const override { return "SystemSettingsProfiles"; } static NamesAndTypesList getNamesAndTypes(); + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; diff --git a/src/Storages/System/StorageSystemUsers.cpp b/src/Storages/System/StorageSystemUsers.cpp index d9b94f21c61..f2cae638d45 100644 --- a/src/Storages/System/StorageSystemUsers.cpp +++ b/src/Storages/System/StorageSystemUsers.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include #include #include @@ -212,4 +214,24 @@ void StorageSystemUsers::fillData(MutableColumns & res_columns, ContextPtr conte } } +void StorageSystemUsers::backupData( + BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + BackupEntriesCollector::throwPartitionsNotSupported(getStorageID(), getName()); + + const auto & access_control = backup_entries_collector.getContext()->getAccessControl(); + access_control.backup(backup_entries_collector, AccessEntityType::USER, data_path_in_backup); +} + +void StorageSystemUsers::restoreDataFromBackup( + RestorerFromBackup & restorer, const String & 
data_path_in_backup, const std::optional & partitions) +{ + if (partitions) + RestorerFromBackup::throwPartitionsNotSupported(getStorageID(), getName()); + + auto & access_control = restorer.getContext()->getAccessControl(); + access_control.restore(restorer, data_path_in_backup); +} + } diff --git a/src/Storages/System/StorageSystemUsers.h b/src/Storages/System/StorageSystemUsers.h index b18d99e7400..536f0482480 100644 --- a/src/Storages/System/StorageSystemUsers.h +++ b/src/Storages/System/StorageSystemUsers.h @@ -14,6 +14,9 @@ public: std::string getName() const override { return "SystemUsers"; } static NamesAndTypesList getNamesAndTypes(); + void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) override; + void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional & partitions) override; + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index b745da13484..41c9c1996b1 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -92,7 +92,7 @@ public: { /// Do not throw if found aggregate function inside another aggregate function, /// because it will be checked, while creating expressions. - if (AggregateFunctionFactory::instance().isAggregateFunctionName(func.name)) + if (AggregateUtils::isAggregateFunction(func)) has_aggregate_function = true; } }; diff --git a/src/Storages/WindowView/StorageWindowView.cpp b/src/Storages/WindowView/StorageWindowView.cpp index 305abe23434..cfb19869074 100644 --- a/src/Storages/WindowView/StorageWindowView.cpp +++ b/src/Storages/WindowView/StorageWindowView.cpp @@ -459,7 +459,6 @@ void StorageWindowView::alter( auto inner_query = initInnerQuery(new_select_query->as(), local_context); - input_header.clear(); output_header.clear(); InterpreterDropQuery::executeDropQuery( @@ -1230,7 +1229,6 @@ StorageWindowView::StorageWindowView( ASTPtr StorageWindowView::initInnerQuery(ASTSelectQuery query, ContextPtr context_) { select_query = query.clone(); - input_header.clear(); output_header.clear(); String select_database_name = getContext()->getCurrentDatabase(); @@ -1627,15 +1625,10 @@ void StorageWindowView::dropInnerTableIfAny(bool no_delay, ContextPtr local_cont } } -const Block & StorageWindowView::getInputHeader() const +Block StorageWindowView::getInputHeader() const { - std::lock_guard lock(sample_block_lock); - if (!input_header) - { - auto metadata = getSourceTable()->getInMemoryMetadataPtr(); - input_header = metadata->getSampleBlockNonMaterialized(); - } - return input_header; + auto metadata = getSourceTable()->getInMemoryMetadataPtr(); + return metadata->getSampleBlockNonMaterialized(); } const Block & StorageWindowView::getOutputHeader() const diff --git a/src/Storages/WindowView/StorageWindowView.h b/src/Storages/WindowView/StorageWindowView.h index c4a7b98a22f..86cc80ee8ea 100644 --- a/src/Storages/WindowView/StorageWindowView.h +++ b/src/Storages/WindowView/StorageWindowView.h @@ -170,7 +170,7 @@ public: ASTPtr getSourceTableSelectQuery(); - const Block & getInputHeader() const; + Block getInputHeader() const; const Block & getOutputHeader() const; @@ -193,7 +193,6 @@ private: std::atomic modifying_query{false}; bool has_inner_table{true}; bool has_inner_target_table{false}; - mutable 
Block input_header; mutable Block output_header; UInt64 fire_signal_timeout_s; UInt64 clean_interval_usec; diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 9730ac2cc46..3976e2ba916 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# + import subprocess import logging import json @@ -7,9 +7,16 @@ import os import sys import time from shutil import rmtree -from typing import List, Optional, Tuple +from typing import List, Tuple -from env_helper import GITHUB_JOB, REPO_COPY, TEMP_PATH, CACHES_PATH, IMAGES_PATH +from env_helper import ( + CACHES_PATH, + GITHUB_JOB, + IMAGES_PATH, + REPO_COPY, + S3_BUILDS_BUCKET, + TEMP_PATH, +) from s3_helper import S3Helper from pr_info import PRInfo from version_helper import ( @@ -24,6 +31,7 @@ from docker_pull_helper import get_image_with_version from tee_popen import TeePopen IMAGE_NAME = "clickhouse/binary-builder" +BUILD_LOG_NAME = "build_log.log" def _can_export_binaries(build_config: BuildConfig) -> bool: @@ -86,7 +94,7 @@ def get_packager_cmd( def build_clickhouse( packager_cmd: str, logs_path: str, build_output_path: str ) -> Tuple[str, bool]: - build_log_path = os.path.join(logs_path, "build_log.log") + build_log_path = os.path.join(logs_path, BUILD_LOG_NAME) success = False with TeePopen(packager_cmd, build_log_path) as process: retcode = process.wait() @@ -108,15 +116,56 @@ def build_clickhouse( return build_log_path, success -def get_build_results_if_exists( - s3_helper: S3Helper, s3_prefix: str -) -> Optional[List[str]]: +def check_for_success_run( + s3_helper: S3Helper, + s3_prefix: str, + build_name: str, + build_config: BuildConfig, +): + logged_prefix = os.path.join(S3_BUILDS_BUCKET, s3_prefix) + logging.info("Checking for artifacts in %s", logged_prefix) try: - content = s3_helper.list_prefix(s3_prefix) - return content + # TODO: theoretically, it would miss performance artifact for pr==0, + # but luckily we rerun only really failed tasks now, so we're safe + build_results = s3_helper.list_prefix(s3_prefix) except Exception as ex: - logging.info("Got exception %s listing %s", ex, s3_prefix) - return None + logging.info("Got exception while listing %s: %s\nRerun", logged_prefix, ex) + return + + if build_results is None or len(build_results) == 0: + logging.info("Nothing found in %s, rerun", logged_prefix) + return + + logging.info("Some build results found:\n%s", build_results) + build_urls = [] + log_url = "" + for url in build_results: + url_escaped = url.replace("+", "%2B").replace(" ", "%20") + if BUILD_LOG_NAME in url: + log_url = f"https://s3.amazonaws.com/{S3_BUILDS_BUCKET}/{url_escaped}" + else: + build_urls.append( + f"https://s3.amazonaws.com/{S3_BUILDS_BUCKET}/{url_escaped}" + ) + if not log_url: + # log is uploaded the last, so if there's no log we need to rerun the build + return + + success = len(build_urls) > 0 + create_json_artifact( + TEMP_PATH, + build_name, + log_url, + build_urls, + build_config, + 0, + success, + ) + # Fail build job if not successeded + if not success: + sys.exit(1) + else: + sys.exit(0) def create_json_artifact( @@ -213,37 +262,8 @@ def main(): ) # If this is rerun, then we try to find already created artifacts and just - # put them as github actions artifcat (result) - build_results = get_build_results_if_exists(s3_helper, s3_path_prefix) - if build_results is not None and len(build_results) > 0: - logging.info("Some build results found %s", build_results) - build_urls = [] - log_url = "" - for url in build_results: - if 
"build_log.log" in url: - log_url = "https://s3.amazonaws.com/clickhouse-builds/" + url.replace( - "+", "%2B" - ).replace(" ", "%20") - else: - build_urls.append( - "https://s3.amazonaws.com/clickhouse-builds/" - + url.replace("+", "%2B").replace(" ", "%20") - ) - success = len(build_urls) > 0 - create_json_artifact( - TEMP_PATH, - build_name, - log_url, - build_urls, - build_config, - 0, - success, - ) - # Fail build job if not successeded - if not success: - sys.exit(1) - else: - sys.exit(0) + # put them as github actions artifact (result) + check_for_success_run(s3_helper, s3_path_prefix, build_name, build_config) docker_image = get_image_with_version(IMAGES_PATH, IMAGE_NAME) image_version = docker_image.version @@ -295,14 +315,12 @@ def main(): logging.info("Going to run packager with %s", packager_cmd) - build_clickhouse_log = os.path.join(TEMP_PATH, "build_log") - if not os.path.exists(build_clickhouse_log): - os.makedirs(build_clickhouse_log) + logs_path = os.path.join(TEMP_PATH, "build_log") + if not os.path.exists(logs_path): + os.makedirs(logs_path) start = time.time() - log_path, success = build_clickhouse( - packager_cmd, build_clickhouse_log, build_output_path - ) + log_path, success = build_clickhouse(packager_cmd, logs_path, build_output_path) elapsed = int(time.time() - start) subprocess.check_call( f"sudo chown -R ubuntu:ubuntu {build_output_path}", shell=True @@ -310,17 +328,10 @@ def main(): subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {ccache_path}", shell=True) logging.info("Build finished with %s, log path %s", success, log_path) + # Upload the ccache first to have the least build time in case of problems logging.info("Will upload cache") upload_ccache(ccache_path, s3_helper, pr_info.number, TEMP_PATH) - if os.path.exists(log_path): - log_url = s3_helper.upload_build_file_to_s3( - log_path, s3_path_prefix + "/" + os.path.basename(log_path) - ) - logging.info("Log url %s", log_url) - else: - logging.info("Build log doesn't exist") - # FIXME performance performance_urls = [] performance_path = os.path.join(build_output_path, "performance.tgz") @@ -347,6 +358,14 @@ def main(): print("::notice ::Build URLs: {}".format("\n".join(build_urls))) + if os.path.exists(log_path): + log_url = s3_helper.upload_build_file_to_s3( + log_path, s3_path_prefix + "/" + os.path.basename(log_path) + ) + logging.info("Log url %s", log_url) + else: + logging.info("Build log doesn't exist") + print(f"::notice ::Log URL: {log_url}") create_json_artifact( diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 4bbd30cd186..745284b2b29 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -1,48 +1,72 @@ #!/usr/bin/env python3 -import sys +import argparse import logging import os import subprocess from env_helper import GITHUB_WORKSPACE, TEMP_PATH -from get_robot_token import get_parameter_from_ssm +from get_robot_token import get_best_robot_token from ssh import SSHKey from cherry_pick_utils.backport import Backport from cherry_pick_utils.cherrypick import CherryPick +def parse_args(): + parser = argparse.ArgumentParser("Create cherry-pick and backport PRs") + parser.add_argument("--token", help="github token, if not set, used from smm") + parser.add_argument("--dry-run", action="store_true", help="do not create anything") + return parser.parse_args() + + +def main(): + args = parse_args() + token = args.token or get_best_robot_token() + + bp = Backport( + token, + os.environ.get("REPO_OWNER"), + os.environ.get("REPO_NAME"), + os.environ.get("REPO_TEAM"), + ) + 
+ cherry_pick = CherryPick( + token, + os.environ.get("REPO_OWNER"), + os.environ.get("REPO_NAME"), + os.environ.get("REPO_TEAM"), + 1, + "master", + ) + # Use the same _gh in both objects to have a proper cost + # pylint: disable=protected-access + for key in bp._gh.api_costs: + if key in cherry_pick._gh.api_costs: + bp._gh.api_costs[key] += cherry_pick._gh.api_costs[key] + for key in cherry_pick._gh.api_costs: + if key not in bp._gh.api_costs: + bp._gh.api_costs[key] = cherry_pick._gh.api_costs[key] + cherry_pick._gh = bp._gh + # pylint: enable=protected-access + + def cherrypick_run(pr_data, branch): + cherry_pick.update_pr_branch(pr_data, branch) + return cherry_pick.execute(GITHUB_WORKSPACE, args.dry_run) + + try: + bp.execute(GITHUB_WORKSPACE, "origin", None, cherrypick_run) + except subprocess.CalledProcessError as e: + logging.error(e.output) + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - repo_path = GITHUB_WORKSPACE - temp_path = TEMP_PATH - if not os.path.exists(temp_path): - os.makedirs(temp_path) + if not os.path.exists(TEMP_PATH): + os.makedirs(TEMP_PATH) - sys.path.append(os.path.join(repo_path, "utils/github")) - - with SSHKey("ROBOT_CLICKHOUSE_SSH_KEY"): - token = get_parameter_from_ssm("github_robot_token_1") - - bp = Backport( - token, - os.environ.get("REPO_OWNER"), - os.environ.get("REPO_NAME"), - os.environ.get("REPO_TEAM"), - ) - - def cherrypick_run(token, pr, branch): - return CherryPick( - token, - os.environ.get("REPO_OWNER"), - os.environ.get("REPO_NAME"), - os.environ.get("REPO_TEAM"), - pr, - branch, - ).execute(repo_path, False) - - try: - bp.execute(repo_path, "origin", None, cherrypick_run) - except subprocess.CalledProcessError as e: - logging.error(e.output) + if os.getenv("ROBOT_CLICKHOUSE_SSH_KEY", ""): + with SSHKey("ROBOT_CLICKHOUSE_SSH_KEY"): + main() + else: + main() diff --git a/tests/ci/cherry_pick_utils/__init__.py b/tests/ci/cherry_pick_utils/__init__.py index 40a96afc6ff..faa18be5bbf 100644 --- a/tests/ci/cherry_pick_utils/__init__.py +++ b/tests/ci/cherry_pick_utils/__init__.py @@ -1 +1,2 @@ +#!/usr/bin/env python # -*- coding: utf-8 -*- diff --git a/tests/ci/cherry_pick_utils/backport.py b/tests/ci/cherry_pick_utils/backport.py index 615c0d19ffa..1bc910886de 100644 --- a/tests/ci/cherry_pick_utils/backport.py +++ b/tests/ci/cherry_pick_utils/backport.py @@ -1,24 +1,22 @@ # -*- coding: utf-8 -*- -try: - from clickhouse.utils.github.cherrypick import CherryPick - from clickhouse.utils.github.query import Query as RemoteRepo - from clickhouse.utils.github.local import Repository as LocalRepo -except: - from .cherrypick import CherryPick - from .query import Query as RemoteRepo - from .local import Repository as LocalRepo - import argparse import logging +import os import re import sys +sys.path.append(os.path.dirname(__file__)) + +from cherrypick import CherryPick +from query import Query as RemoteRepo +from local import Repository as LocalRepo + class Backport: def __init__(self, token, owner, name, team): self._gh = RemoteRepo( - token, owner=owner, name=name, team=team, max_page_size=30, min_page_size=7 + token, owner=owner, name=name, team=team, max_page_size=60, min_page_size=7 ) self._token = token self.default_branch_name = self._gh.default_branch @@ -49,14 +47,16 @@ class Backport: logging.info("No release branches found!") return - for branch in branches: - logging.info("Found release branch: %s", branch[0]) + logging.info( + "Found release branches: %s", ", ".join([br[0] for br in branches]) + ) if not 
until_commit: until_commit = branches[0][1] pull_requests = self.getPullRequests(until_commit) backport_map = {} + pr_map = {pr["number"]: pr for pr in pull_requests} RE_MUST_BACKPORT = re.compile(r"^v(\d+\.\d+)-must-backport$") RE_NO_BACKPORT = re.compile(r"^v(\d+\.\d+)-no-backport$") @@ -68,17 +68,17 @@ class Backport: pr["mergeCommit"]["oid"] ): logging.info( - "PR #{} is already inside {}. Dropping this branch for further PRs".format( - pr["number"], branches[-1][0] - ) + "PR #%s is already inside %s. Dropping this branch for further PRs", + pr["number"], + branches[-1][0], ) branches.pop() - logging.info("Processing PR #{}".format(pr["number"])) + logging.info("Processing PR #%s", pr["number"]) - assert len(branches) + assert len(branches) != 0 - branch_set = set([branch[0] for branch in branches]) + branch_set = {branch[0] for branch in branches} # First pass. Find all must-backports for label in pr["labels"]["nodes"]: @@ -120,16 +120,16 @@ class Backport: ) for pr, branches in list(backport_map.items()): - logging.info("PR #%s needs to be backported to:", pr) + statuses = [] for branch in branches: - logging.info( - "\t%s, and the status is: %s", - branch, - run_cherrypick(self._token, pr, branch), - ) + branch_status = run_cherrypick(pr_map[pr], branch) + statuses.append(f"{branch}, and the status is: {branch_status}") + logging.info( + "PR #%s needs to be backported to:\n\t%s", pr, "\n\t".join(statuses) + ) # print API costs - logging.info("\nGitHub API total costs per query:") + logging.info("\nGitHub API total costs for backporting per query:") for name, value in list(self._gh.api_costs.items()): logging.info("%s : %s", name, value) @@ -178,8 +178,13 @@ if __name__ == "__main__": else: logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO) - cherrypick_run = lambda token, pr, branch: CherryPick( - token, "ClickHouse", "ClickHouse", "core", pr, branch - ).execute(args.repo, args.dry_run) + cherry_pick = CherryPick( + args.token, "ClickHouse", "ClickHouse", "core", 1, "master" + ) + + def cherrypick_run(pr_data, branch): + cherry_pick.update_pr_branch(pr_data, branch) + return cherry_pick.execute(args.repo, args.dry_run) + bp = Backport(args.token, "ClickHouse", "ClickHouse", "core") bp.execute(args.repo, args.upstream, args.til, cherrypick_run) diff --git a/tests/ci/cherry_pick_utils/cherrypick.py b/tests/ci/cherry_pick_utils/cherrypick.py index c6469fa62a9..92c87800828 100644 --- a/tests/ci/cherry_pick_utils/cherrypick.py +++ b/tests/ci/cherry_pick_utils/cherrypick.py @@ -14,10 +14,6 @@ Second run checks PR from previous run to be merged or at least being mergeable. Third run creates PR from backport branch (with merged previous PR) to release branch. """ -try: - from clickhouse.utils.github.query import Query as RemoteRepo -except: - from .query import Query as RemoteRepo import argparse from enum import Enum @@ -26,6 +22,10 @@ import os import subprocess import sys +sys.path.append(os.path.dirname(__file__)) + +from query import Query as RemoteRepo + class CherryPick: class Status(Enum): @@ -45,20 +45,21 @@ class CherryPick: def __init__(self, token, owner, name, team, pr_number, target_branch): self._gh = RemoteRepo(token, owner=owner, name=name, team=team) self._pr = self._gh.get_pull_request(pr_number) + self.target_branch = target_branch self.ssh_url = self._gh.ssh_url # TODO: check if pull-request is merged. 
+ self.update_pr_branch(self._pr, self.target_branch) + def update_pr_branch(self, pr_data, target_branch): + """The method is here to avoid unnecessary creation of new objects""" + self._pr = pr_data + self.target_branch = target_branch self.merge_commit_oid = self._pr["mergeCommit"]["oid"] - self.target_branch = target_branch - self.backport_branch = "backport/{branch}/{pr}".format( - branch=target_branch, pr=pr_number - ) - self.cherrypick_branch = "cherrypick/{branch}/{oid}".format( - branch=target_branch, oid=self.merge_commit_oid - ) + self.backport_branch = f"backport/{target_branch}/{pr_data['number']}" + self.cherrypick_branch = f"cherrypick/{target_branch}/{self.merge_commit_oid}" def getCherryPickPullRequest(self): return self._gh.find_pull_request( @@ -118,17 +119,16 @@ class CherryPick: ) # Create pull-request like a local cherry-pick + title = self._pr["title"].replace('"', r"\"") pr = self._gh.create_pull_request( source=self.cherrypick_branch, target=self.backport_branch, - title="Cherry pick #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ), - description="Original pull-request #{}\n\n{}".format( - self._pr["number"], DESCRIPTION + title=( + f'Cherry pick #{self._pr["number"]} ' + f"to {self.target_branch}: " + f"{title}" ), + description=f'Original pull-request #{self._pr["number"]}\n\n{DESCRIPTION}', ) # FIXME: use `team` to leave a single eligible assignee. @@ -165,11 +165,8 @@ class CherryPick: "user.name=robot-clickhouse", ] - pr_title = "Backport #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ) + title = (self._pr["title"].replace('"', r"\""),) + pr_title = f"Backport #{self._pr['number']} to {self.target_branch}: {title}" self._run(git_prefix + ["checkout", "-f", self.backport_branch]) self._run(git_prefix + ["pull", "--ff-only", "origin", self.backport_branch]) @@ -203,9 +200,8 @@ class CherryPick: source=self.backport_branch, target=self.target_branch, title=pr_title, - description="Original pull-request #{}\nCherry-pick pull-request #{}\n\n{}".format( - self._pr["number"], cherrypick_pr["number"], DESCRIPTION - ), + description=f"Original pull-request #{self._pr['number']}\n" + f"Cherry-pick pull-request #{cherrypick_pr['number']}\n\n{DESCRIPTION}", ) # FIXME: use `team` to leave a single eligible assignee. 
diff --git a/tests/ci/cherry_pick_utils/local.py b/tests/ci/cherry_pick_utils/local.py index 571c9102ba0..71923b63c35 100644 --- a/tests/ci/cherry_pick_utils/local.py +++ b/tests/ci/cherry_pick_utils/local.py @@ -5,10 +5,11 @@ import logging import os import re +import git + class RepositoryBase: def __init__(self, repo_path): - import git self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) @@ -23,22 +24,22 @@ class RepositoryBase: self.comparator = functools.cmp_to_key(cmp) - def get_head_commit(self): - return self._repo.commit(self._default) - def iterate(self, begin, end): - rev_range = "{}...{}".format(begin, end) + rev_range = f"{begin}...{end}" for commit in self._repo.iter_commits(rev_range, first_parent=True): yield commit class Repository(RepositoryBase): def __init__(self, repo_path, remote_name, default_branch_name): - super(Repository, self).__init__(repo_path) + super().__init__(repo_path) self._remote = self._repo.remotes[remote_name] self._remote.fetch() self._default = self._remote.refs[default_branch_name] + def get_head_commit(self): + return self._repo.commit(self._default) + def get_release_branches(self): """ Returns sorted list of tuples: @@ -73,7 +74,7 @@ class Repository(RepositoryBase): class BareRepository(RepositoryBase): def __init__(self, repo_path, default_branch_name): - super(BareRepository, self).__init__(repo_path) + super().__init__(repo_path) self._default = self._repo.branches[default_branch_name] def get_release_branches(self): diff --git a/tests/ci/cherry_pick_utils/query.py b/tests/ci/cherry_pick_utils/query.py index 40eb5bf3604..917f9901287 100644 --- a/tests/ci/cherry_pick_utils/query.py +++ b/tests/ci/cherry_pick_utils/query.py @@ -1,7 +1,13 @@ # -*- coding: utf-8 -*- -import requests +import json +import inspect +import logging import time +from urllib3.util.retry import Retry # type: ignore + +import requests # type: ignore +from requests.adapters import HTTPAdapter # type: ignore class Query: @@ -10,43 +16,43 @@ class Query: """ _PULL_REQUEST = """ - author {{ - ... on User {{ - id - login - }} - }} - - baseRepository {{ - nameWithOwner - }} - - mergeCommit {{ - oid - parents(first: {min_page_size}) {{ - totalCount - nodes {{ - oid - }} - }} - }} - - mergedBy {{ - ... on User {{ - id - login - }} - }} - - baseRefName - closed - headRefName +author {{ + ... on User {{ id - mergeable - merged - number - title - url + login + }} +}} + +baseRepository {{ + nameWithOwner +}} + +mergeCommit {{ + oid + parents(first: {min_page_size}) {{ + totalCount + nodes {{ + oid + }} + }} +}} + +mergedBy {{ + ... 
on User {{ + id + login + }} +}} + +baseRefName +closed +headRefName +id +mergeable +merged +number +title +url """ def __init__(self, token, owner, name, team, max_page_size=100, min_page_size=10): @@ -56,6 +62,7 @@ class Query: self._owner = owner self._name = name self._team = team + self._session = None self._max_page_size = max_page_size self._min_page_size = min_page_size @@ -71,13 +78,13 @@ class Query: def get_repository(self): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - name - }} - id - sshUrl - }} +repository(owner: "{owner}" name: "{name}") {{ + defaultBranchRef {{ + name + }} + id + sshUrl +}} """ query = _QUERY.format(owner=self._owner, name=self._name) @@ -91,20 +98,20 @@ class Query: """ _QUERY = """ - organization(login: "{organization}") {{ - team(slug: "{team}") {{ - members(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - login - }} - }} - }} +organization(login: "{organization}") {{ + team(slug: "{team}") {{ + members(first: {max_page_size} {next}) {{ + pageInfo {{ + hasNextPage + endCursor }} + nodes {{ + id + login + }} + }} + }} +}} """ members = {} @@ -126,20 +133,24 @@ class Query: organization=self._owner, team=self._team, max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) - members += dict([(node["login"], node["id"]) for node in result["nodes"]]) + # Update members with new nodes compatible with py3.8-py3.10 + members = { + **members, + **{node["login"]: node["id"] for node in result["nodes"]}, + } return members def get_pull_request(self, number): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequest(number: {number}) {{ - {pull_request_data} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequest(number: {number}) {{ + {pull_request_data} + }} +}} """ query = _QUERY.format( @@ -153,14 +164,16 @@ class Query: def find_pull_request(self, base, head): _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} baseRefName: "{base}" headRefName: "{head}") {{ - nodes {{ - {pull_request_data} - }} - totalCount - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequests( + first: {min_page_size} baseRefName: "{base}" headRefName: "{head}" + ) {{ + nodes {{ + {pull_request_data} + }} + totalCount + }} +}} """ query = _QUERY.format( @@ -182,13 +195,13 @@ class Query: Get all pull-requests filtered by label name """ _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ - nodes {{ - {pull_request_data} - }} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ + nodes {{ + {pull_request_data} + }} + }} +}} """ query = _QUERY.format( @@ -206,35 +219,32 @@ class Query: """ _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - target {{ - ... on Commit {{ - history(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - oid - associatedPullRequests(first: {min_page_size}) {{ - totalCount - nodes {{ - ... on PullRequest {{ - {pull_request_data} +repository(owner: "{owner}" name: "{name}") {{ + defaultBranchRef {{ + target {{ + ... 
on Commit {{ + history(first: {max_page_size} {next}) {{ + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + oid + associatedPullRequests(first: {min_page_size}) {{ + totalCount + nodes {{ + ... on PullRequest {{ + {pull_request_data} - labels(first: {min_page_size}) {{ - totalCount - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - name - color - }} - }} - }} + labels(first: {min_page_size}) {{ + totalCount + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + name + color }} }} }} @@ -243,6 +253,9 @@ class Query: }} }} }} + }} + }} +}} """ pull_requests = [] @@ -267,7 +280,7 @@ class Query: max_page_size=self._max_page_size, min_page_size=self._min_page_size, pull_request_data=self._PULL_REQUEST, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) for commit in result["nodes"]: @@ -285,7 +298,7 @@ class Query: for pull_request in commit["associatedPullRequests"]["nodes"]: if ( pull_request["baseRepository"]["nameWithOwner"] - == "{}/{}".format(self._owner, self._name) + == f"{self._owner}/{self._name}" and pull_request["baseRefName"] == self.default_branch and pull_request["mergeCommit"]["oid"] == commit["oid"] ): @@ -297,19 +310,19 @@ class Query: self, source, target, title, description="", draft=False, can_modify=True ): _QUERY = """ - createPullRequest(input: {{ - baseRefName: "{target}", - headRefName: "{source}", - repositoryId: "{id}", - title: "{title}", - body: "{body}", - draft: {draft}, - maintainerCanModify: {modify} - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} +createPullRequest(input: {{ + baseRefName: "{target}", + headRefName: "{source}", + repositoryId: "{id}", + title: "{title}", + body: "{body}", + draft: {draft}, + maintainerCanModify: {modify} +}}) {{ + pullRequest {{ + {pull_request_data} + }} +}} """ query = _QUERY.format( @@ -324,29 +337,29 @@ class Query: ) return self._run(query, is_mutation=True)["createPullRequest"]["pullRequest"] - def merge_pull_request(self, id): + def merge_pull_request(self, pr_id): _QUERY = """ - mergePullRequest(input: {{ - pullRequestId: "{id}" - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} +mergePullRequest(input: {{ + pullRequestId: "{pr_id}" +}}) {{ + pullRequest {{ + {pull_request_data} + }} +}} """ - query = _QUERY.format(id=id, pull_request_data=self._PULL_REQUEST) + query = _QUERY.format(pr_id=pr_id, pull_request_data=self._PULL_REQUEST) return self._run(query, is_mutation=True)["mergePullRequest"]["pullRequest"] # FIXME: figure out how to add more assignees at once def add_assignee(self, pr, assignee): _QUERY = """ - addAssigneesToAssignable(input: {{ - assignableId: "{id1}", - assigneeIds: "{id2}" - }}) {{ - clientMutationId - }} +addAssigneesToAssignable(input: {{ + assignableId: "{id1}", + assigneeIds: "{id2}" +}}) {{ + clientMutationId +}} """ query = _QUERY.format(id1=pr["id"], id2=assignee["id"]) @@ -362,28 +375,28 @@ class Query: """ _GET_LABEL = """ - repository(owner: "{owner}" name: "{name}") {{ - labels(first: {max_page_size} {next} query: "{label_name}") {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - name - color - }} - }} - }} +repository(owner: "{owner}" name: "{name}") {{ + labels(first: {max_page_size} {next} query: "{label_name}") {{ + pageInfo {{ + hasNextPage + endCursor + }} + nodes {{ + id + name + color + }} + }} +}} """ _SET_LABEL = """ - addLabelsToLabelable(input: {{ - labelableId: "{pr_id}", - labelIds: "{label_id}" - }}) {{ - clientMutationId - }} +addLabelsToLabelable(input: 
{{ + labelableId: "{pr_id}", + labelIds: "{label_id}" +}}) {{ + clientMutationId +}} """ labels = [] @@ -404,10 +417,10 @@ class Query: name=self._name, label_name=label_name, max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), + next=f'after: "{result["pageInfo"]["endCursor"]}"', ) - labels += [label for label in result["nodes"]] + labels += list(result["nodes"]) if not labels: return @@ -415,83 +428,105 @@ class Query: query = _SET_LABEL.format(pr_id=pull_request["id"], label_id=labels[0]["id"]) self._run(query, is_mutation=True) - def _run(self, query, is_mutation=False): - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - - # sleep a little, because we querying github too often - print("Request, is mutation", is_mutation) - time.sleep(0.5) - - def requests_retry_session( - retries=5, - backoff_factor=0.5, + @property + def session(self): + if self._session is not None: + return self._session + retries = 5 + self._session = requests.Session() + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=1, status_forcelist=(403, 500, 502, 504), - session=None, - ): - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session + ) + adapter = HTTPAdapter(max_retries=retry) + self._session.mount("http://", adapter) + self._session.mount("https://", adapter) + return self._session - headers = {"Authorization": "bearer {}".format(self._token)} + def _run(self, query, is_mutation=False): + # Get caller and parameters from the stack to track the progress + frame = inspect.getouterframes(inspect.currentframe(), 2)[1] + caller = frame[3] + f_parameters = inspect.signature(getattr(self, caller)).parameters + parameters = ", ".join(str(frame[0].f_locals[p]) for p in f_parameters) + mutation = "" if is_mutation: - query = """ - mutation {{ - {query} - }} - """.format( - query=query - ) - else: - query = """ - query {{ - {query} - rateLimit {{ - cost - remaining - }} - }} - """.format( - query=query - ) + mutation = ", is mutation" + print(f"---GraphQL request for {caller}({parameters}){mutation}---") - while True: - request = requests_retry_session().post( + headers = {"Authorization": f"bearer {self._token}"} + if is_mutation: + query = f""" +mutation {{ + {query} +}} + """ + else: + query = f""" +query {{ + {query} + rateLimit {{ + cost + remaining + }} +}} + """ + + def request_with_retry(retry=0): + max_retries = 5 + # From time to time we face some concrete errors, when it worth to + # retry instead of failing competely + # We should sleep progressively + progressive_sleep = 5 * sum(i + 1 for i in range(retry)) + if progressive_sleep: + logging.warning( + "Retry GraphQL request %s time, sleep %s seconds", + retry, + progressive_sleep, + ) + time.sleep(progressive_sleep) + response = self.session.post( "https://api.github.com/graphql", json={"query": query}, headers=headers ) - if request.status_code == 200: - result = request.json() + result = response.json() + if response.status_code == 200: if "errors" in result: raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) + f"Errors occurred: {result['errors']}\nOriginal query: {query}" ) if not is_mutation: - import inspect - - caller = 
inspect.getouterframes(inspect.currentframe(), 2)[1][3] - if caller not in list(self.api_costs.keys()): + if caller not in self.api_costs: self.api_costs[caller] = 0 self.api_costs[caller] += result["data"]["rateLimit"]["cost"] return result["data"] - else: - import json - - raise Exception( - "Query failed with code {code}:\n{json}".format( - code=request.status_code, - json=json.dumps(request.json(), indent=4), - ) + elif ( + response.status_code == 403 + and "secondary rate limit" in result["message"] + ): + if retry <= max_retries: + logging.warning("Secondary rate limit reached") + return request_with_retry(retry + 1) + elif response.status_code == 502 and "errors" in result: + too_many_data = any( + True + for err in result["errors"] + if "message" in err + and "This may be the result of a timeout" in err["message"] ) + if too_many_data: + logging.warning( + "Too many data is requested, decreasing page size %s by 10%%", + self._max_page_size, + ) + self._max_page_size = int(self._max_page_size * 0.9) + return request_with_retry(retry) + + data = json.dumps(result, indent=4) + raise Exception(f"Query failed with code {response.status_code}:\n{data}") + + return request_with_retry() diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 3b61e2077cf..a530b395130 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -12,6 +12,7 @@ CI_CONFIG = { "build_type": "", "sanitizer": "", "package_type": "deb", + "static_binary_name": "amd64", "bundled": "bundled", "splitted": "unsplitted", "additional_pkgs": True, @@ -34,6 +35,7 @@ CI_CONFIG = { "build_type": "", "sanitizer": "", "package_type": "deb", + "static_binary_name": "aarch64", "bundled": "bundled", "splitted": "unsplitted", "additional_pkgs": True, @@ -95,7 +97,6 @@ CI_CONFIG = { "build_type": "", "sanitizer": "", "package_type": "binary", - "static_binary_name": "amd64", "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", @@ -138,7 +139,6 @@ CI_CONFIG = { "build_type": "", "sanitizer": "", "package_type": "binary", - "static_binary_name": "aarch64", "bundled": "bundled", "splitted": "unsplitted", "tidy": "disable", diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 355f7b7a712..3e0d4e822b4 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -385,6 +385,7 @@ class FailureReason(enum.Enum): NO_LONG = "not running long tests" REPLICATED_DB = "replicated-database" S3_STORAGE = "s3-storage" + STRESS = "stress" BUILD = "not running for current build" BACKWARD_INCOMPATIBLE = "test is backward incompatible" @@ -420,6 +421,10 @@ class SettingsRandomizer: "max_block_size": lambda: random.randint(8000, 100000), "max_threads": lambda: random.randint(1, 64), "optimize_or_like_chain": lambda: random.randint(0, 1), + "optimize_read_in_order": lambda: random.randint(0, 1), + "read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100), + "optimize_aggregation_in_order": lambda: random.randint(0, 1), + "aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000), } @staticmethod @@ -679,6 +684,9 @@ class TestCase: elif tags and ("no-s3-storage" in tags) and args.s3_storage: return FailureReason.S3_STORAGE + elif tags and ("no-stress" in tags) and args.stress: + return FailureReason.STRESS + elif tags: for build_flag in args.build_flags: if "no-" + build_flag in tags: @@ -879,7 +887,7 @@ class TestCase: "test": self.case_file, "stdout": self.stdout_file, "stderr": self.stderr_file, - "secure": "--secure" if args.secure else "" + "secure": "--secure" if 
args.secure else "", } # >> append to stderr (but not stdout since it is not used there), @@ -1214,7 +1222,9 @@ class TestSuite: try: return int(clickhouse_execute(args, "EXISTS TABLE test.hits")) except Exception as e: - print("Cannot check if dataset is available, assuming it's not: ", str(e)) + print( + "Cannot check if dataset is available, assuming it's not: ", str(e) + ) return False base_dir = os.path.abspath(args.queries) @@ -1955,6 +1965,12 @@ if __name__ == "__main__": default=False, help="Run tests over s3 storage", ) + parser.add_argument( + "--stress", + action="store_true", + default=False, + help="Run stress tests", + ) parser.add_argument( "--no-random-settings", action="store_true", diff --git a/tests/config/test_function.xml b/tests/config/test_function.xml index a50ab69422a..928cbd75c78 100644 --- a/tests/config/test_function.xml +++ b/tests/config/test_function.xml @@ -13,18 +13,4 @@ cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y FROM table" 0 - - executable - test_function_with_parameter - UInt64 - - UInt64 - - - UInt64 - - TabSeparated - cd /; clickhouse-local --input-format TabSeparated --output-format TabSeparated --structure 'x UInt64, y UInt64' --query "SELECT x + y + {test_parameter : UInt64} FROM table" - 0 - diff --git a/tests/integration/helpers/client.py b/tests/integration/helpers/client.py index 41c5608081d..f03a6d2ab23 100644 --- a/tests/integration/helpers/client.py +++ b/tests/integration/helpers/client.py @@ -25,6 +25,7 @@ class Client: user=None, password=None, database=None, + host=None, ignore_error=False, query_id=None, ): @@ -36,6 +37,7 @@ class Client: user=user, password=password, database=database, + host=host, ignore_error=ignore_error, query_id=query_id, ).get_answer() @@ -49,6 +51,7 @@ class Client: user=None, password=None, database=None, + host=None, ignore_error=False, query_id=None, ): @@ -66,13 +69,12 @@ class Client: if user is not None: command += ["--user", user] - if password is not None: command += ["--password", password] - if database is not None: command += ["--database", database] - + if host is not None: + command += ["--host", host] if query_id is not None: command += ["--query_id", query_id] diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index f8ad9213e5b..e3a9089d201 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2876,6 +2876,7 @@ class ClickHouseInstance: user=None, password=None, database=None, + host=None, ignore_error=False, query_id=None, ): @@ -2890,6 +2891,7 @@ class ClickHouseInstance: database=database, ignore_error=ignore_error, query_id=query_id, + host=host, ) def query_with_retry( @@ -2901,6 +2903,7 @@ class ClickHouseInstance: user=None, password=None, database=None, + host=None, ignore_error=False, retry_count=20, sleep_time=0.5, @@ -2918,6 +2921,7 @@ class ClickHouseInstance: user=user, password=password, database=database, + host=host, ignore_error=ignore_error, ) if check_callback(result): @@ -2985,6 +2989,7 @@ class ClickHouseInstance: self, sql, data=None, + method=None, params=None, user=None, password=None, @@ -3016,10 +3021,11 @@ class ClickHouseInstance: requester = requests.Session() requester.mount("https://", adapter) requester.mount("http://", adapter) - if data: - r = requester.post(url, data, auth=auth, timeout=timeout) - else: - r = requester.get(url, auth=auth, timeout=timeout) + + if method is None: + method = 
"POST" if data else "GET" + + r = requester.request(method, url, data=data, auth=auth, timeout=timeout) def http_code_and_message(): code = r.status_code diff --git a/tests/integration/runner b/tests/integration/runner index ef48dfe687d..7a02ec309a0 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import subprocess import os import getpass @@ -15,15 +15,14 @@ import random def random_str(length=6): alphabet = string.ascii_lowercase + string.digits - return "".join( - random.SystemRandom().choice(alphabet) for _ in range(length) - ) + return "".join(random.SystemRandom().choice(alphabet) for _ in range(length)) CUR_FILE_DIR = os.path.dirname(os.path.realpath(__file__)) DEFAULT_CLICKHOUSE_ROOT = os.path.abspath(os.path.join(CUR_FILE_DIR, "../../")) CURRENT_WORK_DIR = os.getcwd() -CONTAINER_NAME = f"clickhouse_integration_tests_{random_str()}" +VOLUME_NAME = "clickhouse_integration_tests" +CONTAINER_NAME = f"{VOLUME_NAME}_{random_str()}" CONFIG_DIR_IN_REPO = "programs/server" INTERGATION_DIR_IN_REPO = "tests/integration" @@ -31,6 +30,7 @@ SRC_DIR_IN_REPO = "src" DIND_INTEGRATION_TESTS_IMAGE_NAME = "clickhouse/integration-tests-runner" + def check_args_and_update_paths(args): if args.clickhouse_root: if not os.path.isabs(args.clickhouse_root): @@ -38,34 +38,54 @@ def check_args_and_update_paths(args): else: CLICKHOUSE_ROOT = args.clickhouse_root else: - logging.info("ClickHouse root is not set. Will use %s" % (DEFAULT_CLICKHOUSE_ROOT)) + logging.info( + "ClickHouse root is not set. Will use %s" % (DEFAULT_CLICKHOUSE_ROOT) + ) CLICKHOUSE_ROOT = DEFAULT_CLICKHOUSE_ROOT if not os.path.isabs(args.binary): args.binary = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.binary)) if not args.odbc_bridge_binary: - args.odbc_bridge_binary = os.path.join(os.path.dirname(args.binary), 'clickhouse-odbc-bridge') + args.odbc_bridge_binary = os.path.join( + os.path.dirname(args.binary), "clickhouse-odbc-bridge" + ) elif not os.path.isabs(args.odbc_bridge_binary): - args.odbc_bridge_binary = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.odbc_bridge_binary)) + args.odbc_bridge_binary = os.path.abspath( + os.path.join(CURRENT_WORK_DIR, args.odbc_bridge_binary) + ) if not args.library_bridge_binary: - args.library_bridge_binary = os.path.join(os.path.dirname(args.binary), 'clickhouse-library-bridge') + args.library_bridge_binary = os.path.join( + os.path.dirname(args.binary), "clickhouse-library-bridge" + ) elif not os.path.isabs(args.library_bridge_binary): - args.library_bridge_binary = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.library_bridge_binary)) + args.library_bridge_binary = os.path.abspath( + os.path.join(CURRENT_WORK_DIR, args.library_bridge_binary) + ) if args.base_configs_dir: if not os.path.isabs(args.base_configs_dir): - args.base_configs_dir = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.base_configs_dir)) + args.base_configs_dir = os.path.abspath( + os.path.join(CURRENT_WORK_DIR, args.base_configs_dir) + ) else: - args.base_configs_dir = os.path.abspath(os.path.join(CLICKHOUSE_ROOT, CONFIG_DIR_IN_REPO)) - logging.info("Base configs dir is not set. Will use %s" % (args.base_configs_dir)) + args.base_configs_dir = os.path.abspath( + os.path.join(CLICKHOUSE_ROOT, CONFIG_DIR_IN_REPO) + ) + logging.info( + "Base configs dir is not set. 
Will use %s" % (args.base_configs_dir) + ) if args.cases_dir: if not os.path.isabs(args.cases_dir): - args.cases_dir = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.cases_dir)) + args.cases_dir = os.path.abspath( + os.path.join(CURRENT_WORK_DIR, args.cases_dir) + ) else: - args.cases_dir = os.path.abspath(os.path.join(CLICKHOUSE_ROOT, INTERGATION_DIR_IN_REPO)) + args.cases_dir = os.path.abspath( + os.path.join(CLICKHOUSE_ROOT, INTERGATION_DIR_IN_REPO) + ) logging.info("Cases dir is not set. Will use %s" % (args.cases_dir)) if args.src_dir: @@ -75,26 +95,54 @@ def check_args_and_update_paths(args): args.src_dir = os.path.abspath(os.path.join(CLICKHOUSE_ROOT, SRC_DIR_IN_REPO)) logging.info("src dir is not set. Will use %s" % (args.src_dir)) - logging.info("base_configs_dir: {}, binary: {}, cases_dir: {} ".format(args.base_configs_dir, args.binary, args.cases_dir)) + logging.info( + "base_configs_dir: {}, binary: {}, cases_dir: {} ".format( + args.base_configs_dir, args.binary, args.cases_dir + ) + ) - for path in [args.binary, args.odbc_bridge_binary, args.library_bridge_binary, args.base_configs_dir, args.cases_dir, CLICKHOUSE_ROOT]: + for path in [ + args.binary, + args.odbc_bridge_binary, + args.library_bridge_binary, + args.base_configs_dir, + args.cases_dir, + CLICKHOUSE_ROOT, + ]: if not os.path.exists(path): raise Exception("Path {} doesn't exist".format(path)) if args.dockerd_volume: if not os.path.isabs(args.dockerd_volume): - args.src_dir = os.path.abspath(os.path.join(CURRENT_WORK_DIR, args.dockerd_volume)) + args.src_dir = os.path.abspath( + os.path.join(CURRENT_WORK_DIR, args.dockerd_volume) + ) - if (not os.path.exists(os.path.join(args.base_configs_dir, "config.xml"))) and (not os.path.exists(os.path.join(args.base_configs_dir, "config.yaml"))): - raise Exception("No config.xml or config.yaml in {}".format(args.base_configs_dir)) + if (not os.path.exists(os.path.join(args.base_configs_dir, "config.xml"))) and ( + not os.path.exists(os.path.join(args.base_configs_dir, "config.yaml")) + ): + raise Exception( + "No config.xml or config.yaml in {}".format(args.base_configs_dir) + ) + + if (not os.path.exists(os.path.join(args.base_configs_dir, "users.xml"))) and ( + not os.path.exists(os.path.join(args.base_configs_dir, "users.yaml")) + ): + raise Exception( + "No users.xml or users.yaml in {}".format(args.base_configs_dir) + ) - if (not os.path.exists(os.path.join(args.base_configs_dir, "users.xml"))) and (not os.path.exists(os.path.join(args.base_configs_dir, "users.yaml"))): - raise Exception("No users.xml or users.yaml in {}".format(args.base_configs_dir)) def docker_kill_handler_handler(signum, frame): - subprocess.check_call('docker kill $(docker ps -a -q --filter name={name} --format="{{{{.ID}}}}")'.format(name=CONTAINER_NAME), shell=True) + subprocess.check_call( + 'docker kill $(docker ps -a -q --filter name={name} --format="{{{{.ID}}}}")'.format( + name=CONTAINER_NAME + ), + shell=True, + ) raise KeyboardInterrupt("Killed by Ctrl+C") + signal.signal(signal.SIGINT, docker_kill_handler_handler) # Integration tests runner should allow to run tests on several versions of ClickHouse. 
@@ -110,109 +158,132 @@ signal.signal(signal.SIGINT, docker_kill_handler_handler) # 2) path of runner script is used to determine paths for trivial case, when we run it from repository if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format='%(asctime)s [ %(process)d ] %(levelname)s : %(message)s (%(filename)s:%(lineno)s, %(funcName)s)') + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [ %(process)d ] %(levelname)s : %(message)s (%(filename)s:%(lineno)s, %(funcName)s)", + ) parser = argparse.ArgumentParser(description="ClickHouse integration tests runner") parser.add_argument( "--binary", - default=os.environ.get("CLICKHOUSE_TESTS_SERVER_BIN_PATH", os.environ.get("CLICKHOUSE_TESTS_CLIENT_BIN_PATH", "/usr/bin/clickhouse")), - help="Path to clickhouse binary. For example /usr/bin/clickhouse") + default=os.environ.get( + "CLICKHOUSE_TESTS_SERVER_BIN_PATH", + os.environ.get("CLICKHOUSE_TESTS_CLIENT_BIN_PATH", "/usr/bin/clickhouse"), + ), + help="Path to clickhouse binary. For example /usr/bin/clickhouse", + ) parser.add_argument( "--odbc-bridge-binary", default=os.environ.get("CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH", ""), - help="Path to clickhouse-odbc-bridge binary. Defaults to clickhouse-odbc-bridge in the same dir as clickhouse.") + help="Path to clickhouse-odbc-bridge binary. Defaults to clickhouse-odbc-bridge in the same dir as clickhouse.", + ) parser.add_argument( "--library-bridge-binary", default=os.environ.get("CLICKHOUSE_TESTS_LIBRARY_BRIDGE_BIN_PATH", ""), - help="Path to clickhouse-library-bridge binary. Defaults to clickhouse-library-bridge in the same dir as clickhouse.") + help="Path to clickhouse-library-bridge binary. Defaults to clickhouse-library-bridge in the same dir as clickhouse.", + ) parser.add_argument( "--base-configs-dir", default=os.environ.get("CLICKHOUSE_TESTS_BASE_CONFIG_DIR"), - help="Path to clickhouse base configs directory with config.xml/users.xml") + help="Path to clickhouse base configs directory with config.xml/users.xml", + ) parser.add_argument( "--cases-dir", default=os.environ.get("CLICKHOUSE_TESTS_INTEGRATION_PATH"), - help="Path to integration tests cases and configs directory. For example tests/integration in repository") + help="Path to integration tests cases and configs directory. For example tests/integration in repository", + ) parser.add_argument( "--src-dir", default=os.environ.get("CLICKHOUSE_SRC_DIR"), - help="Path to the 'src' directory in repository. Used to provide schemas (e.g. *.proto) for some tests when those schemas are located in the 'src' directory") + help="Path to the 'src' directory in repository. Used to provide schemas (e.g. *.proto) for some tests when those schemas are located in the 'src' directory", + ) parser.add_argument( "--clickhouse-root", - help="Path to repository root folder. Used to take configuration from repository default paths.") + help="Path to repository root folder. 
Used to take configuration from repository default paths.", + ) parser.add_argument( "--command", - default='', - help="Set it to run some other command in container (for example bash)") + default="", + help="Set it to run some other command in container (for example bash)", + ) parser.add_argument( "--disable-net-host", - action='store_true', + action="store_true", default=False, - help="Don't use net host in parent docker container") + help="Don't use net host in parent docker container", + ) parser.add_argument( "--network", - help="Set network driver for runnner container (defaults to `host`)") + help="Set network driver for runnner container (defaults to `host`)", + ) parser.add_argument( "--docker-image-version", default="latest", - help="Version of docker image which runner will use to run tests") + help="Version of docker image which runner will use to run tests", + ) parser.add_argument( "--docker-compose-images-tags", action="append", - help="Set non-default tags for images used in docker compose recipes(yandex/my_container:my_tag)") + help="Set non-default tags for images used in docker compose recipes(yandex/my_container:my_tag)", + ) parser.add_argument( - "-n", "--parallel", - action="store", - dest="parallel", - help="Parallelism") + "-n", "--parallel", action="store", dest="parallel", help="Parallelism" + ) parser.add_argument( - "-t", "--tests_list", + "-t", + "--tests_list", action="store", - nargs='+', + nargs="+", default=[], dest="tests_list", - help="List of tests to run") + help="List of tests to run", + ) parser.add_argument( - "-k", "--keyword_expression", + "-k", + "--keyword_expression", action="store", dest="keyword_expression", - help="pytest keyword expression") + help="pytest keyword expression", + ) parser.add_argument( "--tmpfs", - action='store_true', + action="store_true", default=False, dest="tmpfs", - help="Use tmpfs for dockerd files") + help="Use tmpfs for dockerd files", + ) parser.add_argument( "--cleanup-containers", - action='store_true', + action="store_true", default=False, dest="cleanup_containers", - help="Remove all running containers on test session start") + help="Remove all running containers on test session start", + ) parser.add_argument( "--dockerd-volume-dir", - action='store', + action="store", dest="dockerd_volume", - help="Bind volume to this dir to use for dockerd files") + help="Bind volume to this dir to use for dockerd files", + ) - parser.add_argument('pytest_args', nargs='*', help="args for pytest command") + parser.add_argument("pytest_args", nargs="*", help="args for pytest command") args = parser.parse_args() @@ -263,13 +334,19 @@ if __name__ == "__main__": if args.tmpfs: dockerd_internal_volume = "--tmpfs /var/lib/docker -e DOCKER_RAMDISK=true" elif args.dockerd_volume: - dockerd_internal_volume = "--mount type=bind,source={},target=/var/lib/docker".format(args.dockerd_volume) + dockerd_internal_volume = ( + "--mount type=bind,source={},target=/var/lib/docker".format( + args.dockerd_volume + ) + ) else: try: - subprocess.check_call('docker volume create {name}_volume'.format(name=CONTAINER_NAME), shell=True) + subprocess.check_call( + f"docker volume create {VOLUME_NAME}_volume", shell=True + ) except Exception as ex: print("Volume creationg failed, probably it already exists, exception", ex) - dockerd_internal_volume = "--volume={}_volume:/var/lib/docker".format(CONTAINER_NAME) + dockerd_internal_volume = f"--volume={VOLUME_NAME}_volume:/var/lib/docker" # If enabled we kill and remove containers before pytest session run. 
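(Aside: the hunk above selects storage for the inner dockerd in one of three ways depending on the flags. A condensed sketch under the same assumptions; `pick_dockerd_storage` is a hypothetical name, the option strings match the runner.)

import subprocess

def pick_dockerd_storage(tmpfs: bool, dockerd_volume: str, volume_name: str) -> str:
    # --tmpfs keeps the inner dockerd's /var/lib/docker in RAM: fastest, nothing persists.
    if tmpfs:
        return "--tmpfs /var/lib/docker -e DOCKER_RAMDISK=true"
    # --dockerd-volume-dir bind-mounts an explicit host directory instead.
    if dockerd_volume:
        return f"--mount type=bind,source={dockerd_volume},target=/var/lib/docker"
    # Otherwise create (or silently reuse) a named volume shared between runs.
    subprocess.run(f"docker volume create {volume_name}_volume", shell=True, check=False)
    return f"--volume={volume_name}_volume:/var/lib/docker"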
env_cleanup = "" @@ -285,7 +362,7 @@ if __name__ == "__main__": os.remove(old_log_path) if args.keyword_expression: - args.pytest_args += ['-k', args.keyword_expression] + args.pytest_args += ["-k", args.keyword_expression] cmd = "docker run {net} {tty} --rm --name {name} --privileged \ --volume={odbc_bridge_bin}:/clickhouse-odbc-bridge --volume={bin}:/clickhouse \ @@ -307,17 +384,20 @@ if __name__ == "__main__": env_tags=env_tags, env_cleanup=env_cleanup, parallel=parallel_args, - opts=' '.join(args.pytest_args).replace('\'', '\\\''), - tests_list=' '.join(args.tests_list), + opts=" ".join(args.pytest_args).replace("'", "\\'"), + tests_list=" ".join(args.tests_list), dockerd_internal_volume=dockerd_internal_volume, img=DIND_INTEGRATION_TESTS_IMAGE_NAME + ":" + args.docker_image_version, name=CONTAINER_NAME, - command=args.command + command=args.command, ) try: print("Trying to kill container", CONTAINER_NAME, "if it's already running") - subprocess.check_call(f'docker kill $(docker ps -a -q --filter name={CONTAINER_NAME} --format="{{{{.ID}}}}")', shell=True) + subprocess.check_call( + f'docker kill $(docker ps -a -q --filter name={CONTAINER_NAME} --format="{{{{.ID}}}}")', + shell=True, + ) print("Container killed") except: print("Nothing to kill") diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 869f36f194a..31a85379bbe 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -2,21 +2,23 @@ import pytest import re import os.path from helpers.cluster import ClickHouseCluster -from helpers.test_tools import assert_eq_with_retry +from helpers.test_tools import assert_eq_with_retry, TSV cluster = ClickHouseCluster(__file__) instance = cluster.add_instance( - "instance", main_configs=["configs/backups_disk.xml"], external_dirs=["/backups/"] + "instance", + main_configs=["configs/backups_disk.xml"], + external_dirs=["/backups/"], ) -def create_and_fill_table(engine="MergeTree"): +def create_and_fill_table(engine="MergeTree", n=100): if engine == "MergeTree": engine = "MergeTree ORDER BY y PARTITION BY x%10" instance.query("CREATE DATABASE test") instance.query(f"CREATE TABLE test.table(x UInt32, y String) ENGINE={engine}") instance.query( - "INSERT INTO test.table SELECT number, toString(number) FROM numbers(100)" + f"INSERT INTO test.table SELECT number, toString(number) FROM numbers({n})" ) @@ -35,6 +37,13 @@ def cleanup_after_test(): yield finally: instance.query("DROP DATABASE IF EXISTS test") + instance.query("DROP DATABASE IF EXISTS test2") + instance.query("DROP DATABASE IF EXISTS test3") + instance.query("DROP USER IF EXISTS u1") + instance.query("DROP ROLE IF EXISTS r1, r2") + instance.query("DROP SETTINGS PROFILE IF EXISTS prof1") + instance.query("DROP ROW POLICY IF EXISTS rowpol1 ON test.table") + instance.query("DROP QUOTA IF EXISTS q1") backup_id_counter = 0 @@ -51,6 +60,15 @@ def get_path_to_backup(backup_name): return os.path.join(instance.cluster.instances_dir, "backups", name) +session_id_counter = 0 + + +def new_session_id(): + global session_id_counter + session_id_counter += 1 + return "Session #" + str(session_id_counter) + + @pytest.mark.parametrize( "engine", ["MergeTree", "Log", "TinyLog", "StripeLog", "Memory"] ) @@ -78,9 +96,7 @@ def test_restore_table_into_existing_table(engine): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - 
expected_error = ( - "Cannot restore table test.table because it already contains some data" - ) + expected_error = "already contains some data" assert expected_error in instance.query_and_get_error( f"RESTORE TABLE test.table FROM {backup_name}" ) @@ -278,7 +294,7 @@ def test_async(): [id, _, status] = instance.query( f"BACKUP TABLE test.table TO {backup_name} ASYNC" ).split("\t") - assert status == "MAKING_BACKUP\n" + assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" assert_eq_with_retry( instance, f"SELECT status FROM system.backups WHERE uuid='{id}'", @@ -290,9 +306,378 @@ def test_async(): [id, _, status] = instance.query( f"RESTORE TABLE test.table FROM {backup_name} ASYNC" ).split("\t") - assert status == "RESTORING\n" + assert status == "RESTORING\n" or status == "RESTORED\n" assert_eq_with_retry( instance, f"SELECT status FROM system.backups WHERE uuid='{id}'", "RESTORED\n" ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" + + +def test_empty_files_in_backup(): + instance.query("CREATE DATABASE test") + instance.query( + "CREATE TABLE test.tbl1(x Array(UInt8)) ENGINE=MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0" + ) + instance.query("INSERT INTO test.tbl1 VALUES ([])") + + backup_name = new_backup_name() + instance.query(f"BACKUP TABLE test.tbl1 TO {backup_name}") + + instance.query("DROP TABLE test.tbl1") + instance.query(f"RESTORE ALL FROM {backup_name}") + + assert instance.query("SELECT * FROM test.tbl1") == "[]\n" + + +def test_dependencies(): + create_and_fill_table() + instance.query("CREATE VIEW test.view AS SELECT x, y AS w FROM test.table") + instance.query( + "CREATE DICTIONARY test.dict1(x UInt32, w String) PRIMARY KEY x SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() DB 'test' TABLE 'view')) LAYOUT(FLAT()) LIFETIME(0)" + ) + instance.query( + "CREATE DICTIONARY test.dict2(x UInt32, w String) PRIMARY KEY w SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() DB 'test' TABLE 'dict1')) LAYOUT(FLAT()) LIFETIME(0)" + ) + instance.query( + "CREATE TABLE test.table2(k String, v Int32 DEFAULT dictGet('test.dict2', 'x', k) - 1) ENGINE=MergeTree ORDER BY tuple()" + ) + instance.query("INSERT INTO test.table2 (k) VALUES ('7'), ('96'), ('124')") + assert instance.query("SELECT * FROM test.table2 ORDER BY k") == TSV( + [["124", -1], ["7", 6], ["96", 95]] + ) + + backup_name = new_backup_name() + instance.query(f"BACKUP DATABASE test AS test2 TO {backup_name}") + + instance.query("DROP DATABASE test") + + instance.query(f"RESTORE DATABASE test2 AS test3 FROM {backup_name}") + + assert instance.query("SELECT * FROM test3.table2 ORDER BY k") == TSV( + [["124", -1], ["7", 6], ["96", 95]] + ) + instance.query("INSERT INTO test3.table2 (k) VALUES ('63'), ('152'), ('71')") + assert instance.query("SELECT * FROM test3.table2 ORDER BY k") == TSV( + [["124", -1], ["152", -1], ["63", 62], ["7", 6], ["71", 70], ["96", 95]] + ) + + +def test_materialized_view(): + create_and_fill_table(n=5) + instance.query( + "CREATE MATERIALIZED VIEW test.view ENGINE=MergeTree ORDER BY tuple() POPULATE AS SELECT y, x FROM test.table" + ) + instance.query("INSERT INTO test.table VALUES (990, 'a')") + + backup_name = new_backup_name() + instance.query(f"BACKUP DATABASE test TO {backup_name}") + + assert sorted( + os.listdir(os.path.join(get_path_to_backup(backup_name), "metadata/test")) + ) == ["table.sql", "view.sql"] + assert sorted( + os.listdir(os.path.join(get_path_to_backup(backup_name), "data/test")) + ) == ["table", "view"] + 
view_create_query = open( + os.path.join(get_path_to_backup(backup_name), "metadata/test/view.sql") + ).read() + assert view_create_query.startswith("CREATE MATERIALIZED VIEW test.view") + assert "POPULATE" not in view_create_query + + instance.query("DROP DATABASE test") + + instance.query(f"RESTORE DATABASE test FROM {backup_name}") + + instance.query("INSERT INTO test.table VALUES (991, 'b')") + + assert instance.query("SELECT * FROM test.view ORDER BY x") == TSV( + [["0", 0], ["1", 1], ["2", 2], ["3", 3], ["4", 4], ["a", 990], ["b", 991]] + ) + + +def test_materialized_view_with_target_table(): + create_and_fill_table(n=5) + instance.query( + "CREATE TABLE test.target(x Int64, y String) ENGINE=MergeTree ORDER BY tuple()" + ) + instance.query( + "CREATE MATERIALIZED VIEW test.view TO test.target AS SELECT y, x FROM test.table" + ) + instance.query("INSERT INTO test.table VALUES (990, 'a')") + + backup_name = new_backup_name() + instance.query(f"BACKUP DATABASE test TO {backup_name}") + + assert sorted( + os.listdir(os.path.join(get_path_to_backup(backup_name), "metadata/test")) + ) == ["table.sql", "target.sql", "view.sql"] + assert sorted( + os.listdir(os.path.join(get_path_to_backup(backup_name), "data/test")) + ) == ["table", "target"] + + instance.query("DROP DATABASE test") + + instance.query(f"RESTORE DATABASE test FROM {backup_name}") + + instance.query("INSERT INTO test.table VALUES (991, 'b')") + + assert instance.query("SELECT * FROM test.view ORDER BY x") == TSV( + [["a", 990], ["b", 991]] + ) + + +def test_temporary_table(): + session_id = new_session_id() + instance.http_query( + "CREATE TEMPORARY TABLE temp_tbl(s String)", params={"session_id": session_id} + ) + instance.http_query( + "INSERT INTO temp_tbl VALUES ('q')", params={"session_id": session_id} + ) + instance.http_query( + "INSERT INTO temp_tbl VALUES ('w'), ('e')", params={"session_id": session_id} + ) + + backup_name = new_backup_name() + instance.http_query( + f"BACKUP TEMPORARY TABLE temp_tbl TO {backup_name}", + params={"session_id": session_id}, + ) + + session_id = new_session_id() + instance.http_query( + f"RESTORE TEMPORARY TABLE temp_tbl FROM {backup_name}", + params={"session_id": session_id}, + ) + + assert instance.http_query( + "SELECT * FROM temp_tbl ORDER BY s", params={"session_id": session_id} + ) == TSV([["e"], ["q"], ["w"]]) + + +# "BACKUP DATABASE _temporary_and_external_tables" is allowed but the backup must not contain these tables. 
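(Aside: the listdir assertions in these tests rely on the layout of a backup on the backups disk: table definitions under metadata/<database>/<table>.sql and table data under data/<database>/<table>/. A small helper sketch of what the tests inspect; `backup_contents` is purely illustrative.)

import os

def backup_contents(backup_path: str, database: str = "test"):
    # Definitions (.sql files) and data directories as the assertions above list them.
    metadata = sorted(os.listdir(os.path.join(backup_path, "metadata", database)))
    data_dir = os.path.join(backup_path, "data", database)
    data = sorted(os.listdir(data_dir)) if os.path.isdir(data_dir) else []
    return metadata, data

# For the plain materialized view this yields (["table.sql", "view.sql"], ["table", "view"]);
# with an explicit TO target the view keeps its metadata but has no data directory of its own.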
+def test_temporary_tables_database(): + session_id = new_session_id() + instance.http_query( + "CREATE TEMPORARY TABLE temp_tbl(s String)", params={"session_id": session_id} + ) + + backup_name = new_backup_name() + instance.query(f"BACKUP DATABASE _temporary_and_external_tables TO {backup_name}") + + assert os.listdir(os.path.join(get_path_to_backup(backup_name), "metadata/")) == [ + "_temporary_and_external_tables.sql" # database metadata only + ] + + +def test_restore_all_restores_temporary_tables(): + session_id = new_session_id() + instance.http_query( + "CREATE TEMPORARY TABLE temp_tbl(s String)", params={"session_id": session_id} + ) + instance.http_query( + "INSERT INTO temp_tbl VALUES ('q'), ('w'), ('e')", + params={"session_id": session_id}, + ) + + backup_name = new_backup_name() + instance.http_query( + f"BACKUP TEMPORARY TABLE temp_tbl TO {backup_name}", + params={"session_id": session_id}, + ) + + session_id = new_session_id() + instance.http_query( + f"RESTORE ALL FROM {backup_name}", + params={"session_id": session_id}, + method="POST", + ) + + assert instance.http_query( + "SELECT * FROM temp_tbl ORDER BY s", params={"session_id": session_id} + ) == TSV([["e"], ["q"], ["w"]]) + + +def test_required_privileges(): + create_and_fill_table(n=5) + + instance.query("CREATE USER u1") + + backup_name = new_backup_name() + expected_error = "necessary to have grant BACKUP ON test.table" + assert expected_error in instance.query_and_get_error( + f"BACKUP TABLE test.table TO {backup_name}", user="u1" + ) + + instance.query("GRANT BACKUP ON test.table TO u1") + instance.query(f"BACKUP TABLE test.table TO {backup_name}", user="u1") + + expected_error = "necessary to have grant INSERT, CREATE TABLE ON test.table" + assert expected_error in instance.query_and_get_error( + f"RESTORE TABLE test.table FROM {backup_name}", user="u1" + ) + + expected_error = "necessary to have grant INSERT, CREATE TABLE ON test.table2" + assert expected_error in instance.query_and_get_error( + f"RESTORE TABLE test.table AS test.table2 FROM {backup_name}", user="u1" + ) + + instance.query("GRANT INSERT, CREATE ON test.table2 TO u1") + instance.query( + f"RESTORE TABLE test.table AS test.table2 FROM {backup_name}", user="u1" + ) + + instance.query("DROP TABLE test.table") + + expected_error = "necessary to have grant INSERT, CREATE TABLE ON test.table" + assert expected_error in instance.query_and_get_error( + f"RESTORE ALL FROM {backup_name}", user="u1" + ) + + instance.query("GRANT INSERT, CREATE ON test.table TO u1") + instance.query(f"RESTORE ALL FROM {backup_name}", user="u1") + + +def test_system_table(): + backup_name = new_backup_name() + instance.query(f"BACKUP TABLE system.numbers TO {backup_name}") + + assert os.listdir( + os.path.join(get_path_to_backup(backup_name), "metadata/system") + ) == ["numbers.sql"] + + assert not os.path.isdir(os.path.join(get_path_to_backup(backup_name), "data")) + + create_query = open( + os.path.join(get_path_to_backup(backup_name), "metadata/system/numbers.sql") + ).read() + + assert create_query == "CREATE TABLE system.numbers ENGINE = SystemNumbers" + + instance.query(f"RESTORE TABLE system.numbers FROM {backup_name}") + + +def test_system_database(): + backup_name = new_backup_name() + instance.query(f"BACKUP DATABASE system TO {backup_name}") + + assert "numbers.sql" in os.listdir( + os.path.join(get_path_to_backup(backup_name), "metadata/system") + ) + + create_query = open( + os.path.join(get_path_to_backup(backup_name), "metadata/system/numbers.sql") + ).read() 
+ + assert create_query == "CREATE TABLE system.numbers ENGINE = SystemNumbers" + + +def test_system_users(): + instance.query( + "CREATE USER u1 IDENTIFIED BY 'qwe123' SETTINGS PROFILE 'default', custom_a = 1" + ) + instance.query("GRANT SELECT ON test.* TO u1") + + instance.query("CREATE ROLE r1, r2") + instance.query("GRANT r1 TO r2 WITH ADMIN OPTION") + instance.query("GRANT r2 TO u1") + + instance.query("CREATE SETTINGS PROFILE prof1 SETTINGS custom_b=2 TO u1") + instance.query("CREATE ROW POLICY rowpol1 ON test.table USING x<50 TO u1") + instance.query("CREATE QUOTA q1 TO r1") + + backup_name = new_backup_name() + instance.query( + f"BACKUP TABLE system.users, TABLE system.roles, TABLE system.settings_profiles, TABLE system.row_policies, TABLE system.quotas TO {backup_name}" + ) + + instance.query("DROP USER u1") + instance.query("DROP ROLE r1, r2") + instance.query("DROP SETTINGS PROFILE prof1") + instance.query("DROP ROW POLICY rowpol1 ON test.table") + instance.query("DROP QUOTA q1") + + instance.query( + f"RESTORE TABLE system.users, TABLE system.roles, TABLE system.settings_profiles, TABLE system.row_policies, TABLE system.quotas FROM {backup_name}" + ) + + assert ( + instance.query("SHOW CREATE USER u1") + == "CREATE USER u1 IDENTIFIED WITH sha256_password SETTINGS PROFILE default, custom_a = 1\n" + ) + assert instance.query("SHOW GRANTS FOR u1") == TSV( + ["GRANT SELECT ON test.* TO u1", "GRANT r2 TO u1"] + ) + assert instance.query("SHOW CREATE ROLE r1") == "CREATE ROLE r1\n" + assert instance.query("SHOW GRANTS FOR r1") == "" + assert instance.query("SHOW CREATE ROLE r2") == "CREATE ROLE r2\n" + assert instance.query("SHOW GRANTS FOR r2") == TSV( + ["GRANT r1 TO r2 WITH ADMIN OPTION"] + ) + + assert ( + instance.query("SHOW CREATE SETTINGS PROFILE prof1") + == "CREATE SETTINGS PROFILE prof1 SETTINGS custom_b = 2 TO u1\n" + ) + assert ( + instance.query("SHOW CREATE ROW POLICY rowpol1") + == "CREATE ROW POLICY rowpol1 ON test.table FOR SELECT USING x < 50 TO u1\n" + ) + assert instance.query("SHOW CREATE QUOTA q1") == "CREATE QUOTA q1 TO r1\n" + + +def test_system_users_required_privileges(): + instance.query("CREATE ROLE r1") + instance.query("CREATE USER u1 DEFAULT ROLE r1") + instance.query("GRANT SELECT ON test.* TO u1") + + # SETTINGS allow_backup=false means the following user won't be included in backups. 
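(Aside: as the next test walks through step by step, a user restoring access entities needs both the administrative grants and, for every privilege being restored, the same privilege WITH GRANT OPTION. A compact summary of that prerequisite sequence, assuming the same u1/u2/r1 entities as the test; illustrative only.)

# Grants the restoring user u2 must hold before `RESTORE ALL` may recreate
# user u1 (who has SELECT ON test.*) and role r1, per the assertions below.
RESTORE_PREREQUISITES = [
    "GRANT CREATE USER, CREATE ROLE, ROLE ADMIN ON *.* TO u2",  # recreate users and roles
    "GRANT SELECT ON test.* TO u2 WITH GRANT OPTION",           # re-issue u1's own grant
]
for statement in RESTORE_PREREQUISITES:
    instance.query(statement)  # `instance` is the cluster instance used throughout the test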
+ instance.query("CREATE USER u2 SETTINGS allow_backup=false") + + backup_name = new_backup_name() + + expected_error = "necessary to have grant BACKUP ON system.users" + assert expected_error in instance.query_and_get_error( + f"BACKUP TABLE system.users, TABLE system.roles TO {backup_name}", user="u2" + ) + + instance.query("GRANT BACKUP ON system.users TO u2") + + expected_error = "necessary to have grant BACKUP ON system.roles" + assert expected_error in instance.query_and_get_error( + f"BACKUP TABLE system.users, TABLE system.roles TO {backup_name}", user="u2" + ) + + instance.query("GRANT BACKUP ON system.roles TO u2") + instance.query( + f"BACKUP TABLE system.users, TABLE system.roles TO {backup_name}", user="u2" + ) + + instance.query("DROP USER u1") + instance.query("DROP ROLE r1") + + expected_error = ( + "necessary to have grant CREATE USER, CREATE ROLE, ROLE ADMIN ON *.*" + ) + assert expected_error in instance.query_and_get_error( + f"RESTORE ALL FROM {backup_name}", user="u2" + ) + + instance.query("GRANT CREATE USER, CREATE ROLE, ROLE ADMIN ON *.* TO u2") + + expected_error = "necessary to have grant SELECT ON test.* WITH GRANT OPTION" + assert expected_error in instance.query_and_get_error( + f"RESTORE ALL FROM {backup_name}", user="u2" + ) + + instance.query("GRANT SELECT ON test.* TO u2 WITH GRANT OPTION") + instance.query(f"RESTORE ALL FROM {backup_name}", user="u2") + + assert instance.query("SHOW CREATE USER u1") == "CREATE USER u1 DEFAULT ROLE r1\n" + assert instance.query("SHOW GRANTS FOR u1") == TSV( + ["GRANT SELECT ON test.* TO u1", "GRANT r1 TO u1"] + ) + + assert instance.query("SHOW CREATE ROLE r1") == "CREATE ROLE r1\n" + assert instance.query("SHOW GRANTS FOR r1") == "" diff --git a/tests/integration/test_backup_restore_on_cluster/configs/replicated_access_storage.xml b/tests/integration/test_backup_restore_on_cluster/configs/replicated_access_storage.xml new file mode 100644 index 00000000000..b4d385451d1 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/replicated_access_storage.xml @@ -0,0 +1,10 @@ + + + + /clickhouse/access + + + users.xml + + + diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 70dccd242c4..6264959fbce 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -9,7 +9,11 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( "node1", - main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], + main_configs=[ + "configs/remote_servers.xml", + "configs/replicated_access_storage.xml", + "configs/backups_disk.xml", + ], user_configs=["configs/allow_experimental_database_replicated.xml"], external_dirs=["/backups/"], macros={"replica": "node1", "shard": "shard1"}, @@ -18,7 +22,11 @@ node1 = cluster.add_instance( node2 = cluster.add_instance( "node2", - main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], + main_configs=[ + "configs/remote_servers.xml", + "configs/replicated_access_storage.xml", + "configs/backups_disk.xml", + ], user_configs=["configs/allow_experimental_database_replicated.xml"], external_dirs=["/backups/"], macros={"replica": "node2", "shard": "shard1"}, @@ -28,7 +36,11 @@ node2 = cluster.add_instance( node3 = cluster.add_instance( "node3", - main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], + main_configs=[ + "configs/remote_servers.xml", + 
"configs/replicated_access_storage.xml", + "configs/backups_disk.xml", + ], user_configs=["configs/allow_experimental_database_replicated.xml"], external_dirs=["/backups/"], macros={"replica": "node3", "shard": "shard1"}, @@ -51,7 +63,9 @@ def drop_after_test(): yield finally: node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster3' NO DELAY") + node1.query("DROP TABLE IF EXISTS tbl2 ON CLUSTER 'cluster3' NO DELAY") node1.query("DROP DATABASE IF EXISTS mydb ON CLUSTER 'cluster3' NO DELAY") + node1.query("DROP USER IF EXISTS u1, u2 ON CLUSTER 'cluster3'") backup_id_counter = 0 @@ -210,9 +224,7 @@ def test_backup_restore_on_single_replica(): node1.query("DROP DATABASE mydb NO DELAY") # Cannot restore table because it already contains data on other replicas. - expected_error = ( - "Cannot restore table mydb.test because it already contains some data" - ) + expected_error = "already contains some data" assert expected_error in node1.query_and_get_error( f"RESTORE DATABASE mydb FROM {backup_name}" ) @@ -254,9 +266,7 @@ def test_table_with_parts_in_queue_considered_non_empty(): node1.query("DROP DATABASE mydb NO DELAY") # Cannot restore table because it already contains data on other replicas. - expected_error = ( - "Cannot restore table mydb.test because it already contains some data" - ) + expected_error = "already contains some data" assert expected_error in node1.query_and_get_error( f"RESTORE DATABASE mydb FROM {backup_name}" ) @@ -389,7 +399,7 @@ def test_replicated_database_async(): f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} ASYNC" ).split("\t") - assert status == "MAKING_BACKUP\n" + assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" assert_eq_with_retry( node1, @@ -403,7 +413,7 @@ def test_replicated_database_async(): f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} ASYNC" ).split("\t") - assert status == "RESTORING\n" + assert status == "RESTORING\n" or status == "RESTORED\n" assert_eq_with_retry( node1, f"SELECT status FROM system.backups WHERE uuid='{id}'", "RESTORED\n" @@ -413,3 +423,94 @@ def test_replicated_database_async(): assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV([1, 22]) assert node2.query("SELECT * FROM mydb.tbl2 ORDER BY y") == TSV(["a", "bb"]) + + +def test_required_privileges(): + node1.query( + "CREATE TABLE tbl ON CLUSTER 'cluster' (" + "x UInt8" + ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" + "ORDER BY x" + ) + + node1.query("INSERT INTO tbl VALUES (100)") + + node1.query("CREATE USER u1") + + backup_name = new_backup_name() + expected_error = "necessary to have grant BACKUP ON default.tbl" + assert expected_error in node1.query_and_get_error( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}", user="u1" + ) + + node1.query("GRANT BACKUP ON tbl TO u1") + node1.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}", user="u1") + + node1.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + + expected_error = "necessary to have grant INSERT, CREATE TABLE ON default.tbl2" + assert expected_error in node1.query_and_get_error( + f"RESTORE TABLE tbl AS tbl2 ON CLUSTER 'cluster' FROM {backup_name}", user="u1" + ) + + node1.query("GRANT INSERT, CREATE TABLE ON tbl2 TO u1") + node1.query( + f"RESTORE TABLE tbl AS tbl2 ON CLUSTER 'cluster' FROM {backup_name}", user="u1" + ) + + assert node2.query("SELECT * FROM tbl2") == "100\n" + + node1.query(f"DROP TABLE tbl2 ON CLUSTER 'cluster' NO DELAY") + node1.query("REVOKE ALL FROM u1") + + expected_error = "necessary 
to have grant INSERT, CREATE TABLE ON default.tbl" + assert expected_error in node1.query_and_get_error( + f"RESTORE ALL ON CLUSTER 'cluster' FROM {backup_name}", user="u1" + ) + + node1.query("GRANT INSERT, CREATE TABLE ON tbl TO u1") + node1.query(f"RESTORE ALL ON CLUSTER 'cluster' FROM {backup_name}", user="u1") + + assert node2.query("SELECT * FROM tbl") == "100\n" + + +def test_system_users(): + node1.query("CREATE USER u1 SETTINGS custom_a=123") + node1.query("GRANT SELECT ON tbl TO u1") + + backup_name = new_backup_name() + node1.query("CREATE USER u2 SETTINGS allow_backup=false") + + expected_error = "necessary to have grant BACKUP ON system.users" + assert expected_error in node1.query_and_get_error( + f"BACKUP TABLE system.users ON CLUSTER 'cluster' TO {backup_name}", user="u2" + ) + + node1.query("GRANT BACKUP ON system.users TO u2") + node1.query( + f"BACKUP TABLE system.users ON CLUSTER 'cluster' TO {backup_name}", user="u2" + ) + + node1.query("DROP USER u1") + + expected_error = "necessary to have grant CREATE USER ON *.*" + assert expected_error in node1.query_and_get_error( + f"RESTORE TABLE system.users ON CLUSTER 'cluster' FROM {backup_name}", user="u2" + ) + + node1.query("GRANT CREATE USER ON *.* TO u2") + + expected_error = "necessary to have grant SELECT ON default.tbl WITH GRANT OPTION" + assert expected_error in node1.query_and_get_error( + f"RESTORE TABLE system.users ON CLUSTER 'cluster' FROM {backup_name}", user="u2" + ) + + node1.query("GRANT SELECT ON tbl TO u2 WITH GRANT OPTION") + node1.query( + f"RESTORE TABLE system.users ON CLUSTER 'cluster' FROM {backup_name}", user="u2" + ) + + assert ( + node1.query("SHOW CREATE USER u1") == "CREATE USER u1 SETTINGS custom_a = 123\n" + ) + assert node1.query("SHOW GRANTS FOR u1") == "GRANT SELECT ON default.tbl TO u1\n" diff --git a/tests/integration/test_backward_compatibility/test_insert_profile_events.py b/tests/integration/test_backward_compatibility/test_insert_profile_events.py new file mode 100644 index 00000000000..8047c088e4c --- /dev/null +++ b/tests/integration/test_backward_compatibility/test_insert_profile_events.py @@ -0,0 +1,42 @@ +# pylint: disable=line-too-long +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__, name="insert_profile_events") +upstream_node = cluster.add_instance("upstream_node") +old_node = cluster.add_instance( + "old_node", + image="clickhouse/clickhouse-server", + tag="22.5.1.2079", + with_installed_binary=True, +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def test_old_client_compatible(start_cluster): + old_node.query("INSERT INTO FUNCTION null('foo String') VALUES ('foo')('bar')") + old_node.query( + "INSERT INTO FUNCTION null('foo String') VALUES ('foo')('bar')", + host=upstream_node.ip_address, + ) + + +def test_new_client_compatible(start_cluster): + upstream_node.query( + "INSERT INTO FUNCTION null('foo String') VALUES ('foo')('bar')", + host=old_node.ip_address, + ) + upstream_node.query("INSERT INTO FUNCTION null('foo String') VALUES ('foo')('bar')") diff --git a/tests/integration/test_distributed_format/configs/another_remote_servers.xml b/tests/integration/test_distributed_format/configs/another_remote_servers.xml new file mode 100644 index 00000000000..2655f1d864e --- /dev/null +++ 
b/tests/integration/test_distributed_format/configs/another_remote_servers.xml @@ -0,0 +1,25 @@ + + + + + + not_existing + 9000 + + + not_existing2 + 9000 + + + + + + + + 127.0.0.1 + 9000 + + + + + diff --git a/tests/integration/test_distributed_format/test.py b/tests/integration/test_distributed_format/test.py index 415141be021..5611f465e8b 100644 --- a/tests/integration/test_distributed_format/test.py +++ b/tests/integration/test_distributed_format/test.py @@ -6,7 +6,11 @@ import pytest from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) -node = cluster.add_instance("node", main_configs=["configs/remote_servers.xml"]) +node = cluster.add_instance( + "node", + main_configs=["configs/remote_servers.xml", "configs/another_remote_servers.xml"], + stay_alive=True, +) cluster_param = pytest.mark.parametrize( "cluster", @@ -143,3 +147,37 @@ def test_single_file_old(started_cluster, cluster): assert out == "1\ta\n2\tbb\n3\tccc\n" node.query("drop table test.distr_3") + + +def test_remove_replica(started_cluster): + node.query( + "create table test.local_4 (x UInt64, s String) engine = MergeTree order by x" + ) + node.query( + "create table test.distr_4 (x UInt64, s String) engine = Distributed('test_cluster_remove_replica1', test, local_4)" + ) + node.query( + "insert into test.distr_4 values (1, 'a'), (2, 'bb'), (3, 'ccc'), (4, 'dddd')" + ) + node.query("detach table test.distr_4") + + node.exec_in_container( + [ + "sed", + "-i", + "s/test_cluster_remove_replica1/test_cluster_remove_replica_tmp/g", + "/etc/clickhouse-server/config.d/another_remote_servers.xml", + ] + ) + node.exec_in_container( + [ + "sed", + "-i", + "s/test_cluster_remove_replica2/test_cluster_remove_replica1/g", + "/etc/clickhouse-server/config.d/another_remote_servers.xml", + ] + ) + node.query("SYSTEM RELOAD CONFIG") + node.query("attach table test.distr_4", ignore_error=True) + node.query("SYSTEM FLUSH DISTRIBUTED test.distr_4", ignore_error=True) + assert node.query("select 1") == "1\n" diff --git a/tests/integration/test_grant_and_revoke/test.py b/tests/integration/test_grant_and_revoke/test.py index 966620b37b8..d0a2f2ab933 100644 --- a/tests/integration/test_grant_and_revoke/test.py +++ b/tests/integration/test_grant_and_revoke/test.py @@ -179,7 +179,7 @@ def test_grant_all_on_table(): assert ( instance.query("SHOW GRANTS FOR B") == "GRANT SHOW TABLES, SHOW COLUMNS, SHOW DICTIONARIES, SELECT, INSERT, ALTER TABLE, ALTER VIEW, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY, " - "DROP TABLE, DROP VIEW, DROP DICTIONARY, TRUNCATE, OPTIMIZE, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, SHOW ROW POLICIES, " + "DROP TABLE, DROP VIEW, DROP DICTIONARY, TRUNCATE, OPTIMIZE, BACKUP, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, SHOW ROW POLICIES, " "SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, " "SYSTEM RESTART REPLICA, SYSTEM RESTORE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON test.table TO B\n" ) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj index dfccf7dd635..f82d3f4c348 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/counter.clj @@ -43,7 +43,9 @@ "A generator, client, and checker for a set test." [opts] {:client (CounterClient. 
nil nil) - :checker (checker/counter) + :checker (checker/compose + {:counter (checker/counter) + :perf (checker/perf)}) :generator (->> (range) (map (fn [x] (->> (gen/mix [r add]))))) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/queue.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/queue.clj index 30ff7c01ec4..3aeaa328606 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/queue.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/queue.clj @@ -59,7 +59,8 @@ {:client (QueueClient. nil nil) :checker (checker/compose {:total-queue (checker/total-queue) - :timeline (timeline/html)}) + :perf (checker/perf) + :timeline (timeline/html)}) :generator (->> (sorted-str-range 50000) (map (fn [x] (rand-nth [{:type :invoke, :f :enqueue :value x} @@ -72,6 +73,7 @@ :checker (checker/compose {:linear (checker/linearizable {:model (model/unordered-queue) :algorithm :linear}) + :perf (checker/perf) :timeline (timeline/html)}) :generator (->> (sorted-str-range 10000) (map (fn [x] diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj index b2f381168bd..a1605192b51 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/register.clj @@ -55,6 +55,7 @@ (checker/compose {:linear (checker/linearizable {:model (model/cas-register) :algorithm :linear}) + :perf (checker/perf) :timeline (timeline/html)})) :generator (independent/concurrent-generator 10 diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/set.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/set.clj index 79ec4f824bb..b992a6abcbb 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/set.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/set.clj @@ -44,7 +44,9 @@ "A generator, client, and checker for a set test." [opts] {:client (SetClient. "/a-set" nil nil) - :checker (checker/set) + :checker (checker/compose + {:set (checker/set) + :perf (checker/perf)}) :generator (->> (range) (map (fn [x] {:type :invoke, :f :add, :value x}))) :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/unique.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/unique.clj index c50f33924e0..752240722d8 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/unique.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/unique.clj @@ -36,7 +36,9 @@ "A generator, client, and checker for a set test." [opts] {:client (UniqueClient. 
nil nil) - :checker (checker/unique-ids) + :checker (checker/compose + {:perf (checker/perf) + :unique (checker/unique-ids)}) :generator (->> (range) (map (fn [_] {:type :invoke, :f :generate})))}) diff --git a/tests/performance/join_append_block.xml b/tests/performance/join_append_block.xml new file mode 100644 index 00000000000..15859e95941 --- /dev/null +++ b/tests/performance/join_append_block.xml @@ -0,0 +1,3 @@ +<test> + <query>SELECT count(c) FROM numbers_mt(100000000) AS a INNER JOIN (SELECT number, toString(number) AS c FROM numbers(2000000)) AS b ON (a.number % 10000000) = b.number</query> +</test> diff --git a/tests/queries/0_stateless/00019_shard_quantiles_totals_distributed.sql b/tests/queries/0_stateless/00019_shard_quantiles_totals_distributed.sql index 84956f0d97e..e712b028a82 100644 --- a/tests/queries/0_stateless/00019_shard_quantiles_totals_distributed.sql +++ b/tests/queries/0_stateless/00019_shard_quantiles_totals_distributed.sql @@ -1,3 +1,4 @@ -- Tags: distributed -SELECT quantilesTiming(0.1, 0.5, 0.9)(dummy) FROM remote('127.0.0.{2,3}', system, one) GROUP BY 1 WITH TOTALS +SET enable_positional_arguments = 0; +SELECT quantilesTiming(0.1, 0.5, 0.9)(dummy) FROM remote('127.0.0.{2,3}', system, one) GROUP BY 1 WITH TOTALS; diff --git a/tests/queries/0_stateless/00038_totals_limit.sql b/tests/queries/0_stateless/00038_totals_limit.sql index 09960c92eb4..804378068d9 100644 --- a/tests/queries/0_stateless/00038_totals_limit.sql +++ b/tests/queries/0_stateless/00038_totals_limit.sql @@ -1 +1,2 @@ -SELECT count() GROUP BY 1 WITH TOTALS LIMIT 1 +SET enable_positional_arguments = 0; +SELECT count() GROUP BY 1 WITH TOTALS LIMIT 1; diff --git a/tests/queries/0_stateless/00209_insert_select_extremes.sql b/tests/queries/0_stateless/00209_insert_select_extremes.sql index 8c6e547131c..98dfe8e2658 100644 --- a/tests/queries/0_stateless/00209_insert_select_extremes.sql +++ b/tests/queries/0_stateless/00209_insert_select_extremes.sql @@ -1,6 +1,8 @@ DROP TABLE IF EXISTS test_00209; CREATE TABLE test_00209 (x UInt8) ENGINE = Log; +SET enable_positional_arguments = 0; + INSERT INTO test_00209 SELECT 1 AS x; INSERT INTO test_00209 SELECT 1 AS x SETTINGS extremes = 1; INSERT INTO test_00209 SELECT 1 AS x GROUP BY 1 WITH TOTALS; diff --git a/tests/queries/0_stateless/00700_decimal_round.reference b/tests/queries/0_stateless/00700_decimal_round.reference index d0f03c07849..4b4994106fe 100644 --- a/tests/queries/0_stateless/00700_decimal_round.reference +++ b/tests/queries/0_stateless/00700_decimal_round.reference @@ -73,3 +73,15 @@ 12345678901234567890123456789.123456789 -12345678901234567890123456789.123456789 12345678901234567890123456790 -12345678901234567890123456789 12345678901234567890123457000 -12345678901234567890123456000 12345678901234567890123456789.123456789 -12345678901234567890123456789.123456789 12345678901234567890123456789 -12345678901234567890123456790 12345678901234567890123456000 -12345678901234567890123457000 12345678901234567890123456789.123456789 -12345678901234567890123456789.123456789 12345678901234567890123456789 -12345678901234567890123456789 12345678901234567890123456000 -12345678901234567890123456000 +-- Decimal128, Scale 20 +round() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234568000 -1234568000 +roundBankers() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234568000 -1234568000 +ceil() : 1234567890.123456789 -1234567890.123456789 1234567891 -1234567890 1234568000 -1234567000 +floor() : 1234567890.123456789 -1234567890.123456789 1234567890
-1234567891 1234567000 -1234568000 +trunc() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234567000 -1234567000 +-- Decimal256, Scale 40 +round() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234568000 -1234568000 +roundBankers() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234568000 -1234568000 +ceil() : 1234567890.123456789 -1234567890.123456789 1234567891 -1234567890 1234568000 -1234567000 +floor() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567891 1234567000 -1234568000 +trunc() : 1234567890.123456789 -1234567890.123456789 1234567890 -1234567890 1234567000 -1234567000 diff --git a/tests/queries/0_stateless/00700_decimal_round.sql b/tests/queries/0_stateless/00700_decimal_round.sql index c5b8dbb520b..bf2749ac03f 100644 --- a/tests/queries/0_stateless/00700_decimal_round.sql +++ b/tests/queries/0_stateless/00700_decimal_round.sql @@ -78,3 +78,17 @@ SELECT toDecimal128('12345678901234567890123456789.123456789', 9) AS x, -x AS y, SELECT toDecimal128('12345678901234567890123456789.123456789', 9) AS x, -x AS y, ceil(x), ceil(y), ceil(x, -3), ceil(y, -3); SELECT toDecimal128('12345678901234567890123456789.123456789', 9) AS x, -x AS y, floor(x), floor(y), floor(x, -3), floor(y, -3); SELECT toDecimal128('12345678901234567890123456789.123456789', 9) AS x, -x AS y, trunc(x), trunc(y), trunc(x, -3), trunc(y, -3); + +select '-- Decimal128, Scale 20'; +SELECT 'round() : ', toDecimal128('1234567890.123456789', 20) AS x, -x AS y, round(x), round(y), round(x, -3), round(y, -3); +SELECT 'roundBankers() : ', toDecimal128('1234567890.123456789', 20) AS x, -x AS y, roundBankers(x), roundBankers(y), roundBankers(x, -3), roundBankers(y, -3); +SELECT 'ceil() : ', toDecimal128('1234567890.123456789', 20) AS x, -x AS y, ceil(x), ceil(y), ceil(x, -3), ceil(y, -3); +SELECT 'floor() : ', toDecimal128('1234567890.123456789', 20) AS x, -x AS y, floor(x), floor(y), floor(x, -3), floor(y, -3); +SELECT 'trunc() : ', toDecimal128('1234567890.123456789', 20) AS x, -x AS y, trunc(x), trunc(y), trunc(x, -3), trunc(y, -3); + +select '-- Decimal256, Scale 40'; +SELECT 'round() : ', toDecimal256('1234567890.123456789', 40) AS x, -x AS y, round(x), round(y), round(x, -3), round(y, -3); +SELECT 'roundBankers() : ', toDecimal256('1234567890.123456789', 40) AS x, -x AS y, roundBankers(x), roundBankers(y), roundBankers(x, -3), roundBankers(y, -3); +SELECT 'ceil() : ', toDecimal256('1234567890.123456789', 40) AS x, -x AS y, ceil(x), ceil(y), ceil(x, -3), ceil(y, -3); +SELECT 'floor() : ', toDecimal256('1234567890.123456789', 40) AS x, -x AS y, floor(x), floor(y), floor(x, -3), floor(y, -3); +SELECT 'trunc() : ', toDecimal256('1234567890.123456789', 40) AS x, -x AS y, trunc(x), trunc(y), trunc(x, -3), trunc(y, -3); diff --git a/tests/queries/0_stateless/00804_rollup_with_having.reference b/tests/queries/0_stateless/00804_rollup_with_having.reference index 0f708e8d900..aab16ed800b 100644 --- a/tests/queries/0_stateless/00804_rollup_with_having.reference +++ b/tests/queries/0_stateless/00804_rollup_with_having.reference @@ -1,4 +1,4 @@ a b 1 -a \N 2 a \N 1 +a \N 2 a b 1 diff --git a/tests/queries/0_stateless/00804_rollup_with_having.sql b/tests/queries/0_stateless/00804_rollup_with_having.sql index 29b9ae19041..852e7d32f6b 100644 --- a/tests/queries/0_stateless/00804_rollup_with_having.sql +++ b/tests/queries/0_stateless/00804_rollup_with_having.sql @@ -8,7 +8,7 @@ INSERT INTO rollup_having VALUES (NULL, NULL); INSERT INTO rollup_having VALUES ('a', 
NULL); INSERT INTO rollup_having VALUES ('a', 'b'); -SELECT a, b, count(*) FROM rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL ORDER BY a, b; -SELECT a, b, count(*) FROM rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL and b IS NOT NULL ORDER BY a, b; +SELECT a, b, count(*) as count FROM rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL ORDER BY a, b, count; +SELECT a, b, count(*) as count FROM rollup_having GROUP BY a, b WITH ROLLUP HAVING a IS NOT NULL and b IS NOT NULL ORDER BY a, b, count; DROP TABLE rollup_having; diff --git a/tests/queries/0_stateless/00984_parser_stack_overflow.sh b/tests/queries/0_stateless/00984_parser_stack_overflow.sh index 329e51e774a..168ef155d9b 100755 --- a/tests/queries/0_stateless/00984_parser_stack_overflow.sh +++ b/tests/queries/0_stateless/00984_parser_stack_overflow.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan # Such a huge timeout mostly for debug build. CLICKHOUSE_CURL_TIMEOUT=60 diff --git a/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh b/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh index a46fedb4533..bb76f3978cc 100755 --- a/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh +++ b/tests/queries/0_stateless/01045_order_by_pk_special_storages.sh @@ -24,7 +24,7 @@ $CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 5" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM m ORDER BY a, s LIMIT 10" # Not a single .sql test with max_rows_to_read because it doesn't work with Merge storage -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM m ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep "rows_read" | sed 's/[^0-9]*//g') # Expected number of read rows with a bit margin if [[ $rows_read -lt 500 ]] @@ -36,7 +36,7 @@ fi $CLICKHOUSE_CLIENT -q "SELECT '---StorageBuffer---'" $CLICKHOUSE_CLIENT -q "CREATE TABLE buf (a UInt32, s String) engine = Buffer('$CLICKHOUSE_DATABASE', s2, 16, 10, 100, 10000, 1000000, 10000000, 100000000)" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM buf ORDER BY a, s LIMIT 10" -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM buf ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a FROM buf ORDER BY a LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep "rows_read" | sed 's/[^0-9]*//g') # Expected number of read rows with a bit margin if [[ $rows_read -lt 500 ]] @@ -48,7 +48,7 @@ fi $CLICKHOUSE_CLIENT -q "SELECT '---MaterializedView---'" $CLICKHOUSE_CLIENT -q "CREATE MATERIALIZED VIEW mv (a UInt32, s String) engine = MergeTree ORDER BY s SETTINGS min_bytes_for_wide_part = 0 POPULATE AS SELECT a, s FROM s1 WHERE a % 7 = 0" $CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10" -rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 | grep "rows_read" | sed 's/[^0-9]*//g') +rows_read=$($CLICKHOUSE_CLIENT -q "SELECT a, s FROM mv ORDER BY s LIMIT 10 FORMAT JSON" --max_threads=1 --max_block_size=20 --optimize_read_in_order=1 | grep "rows_read" | sed 's/[^0-9]*//g') if [[ $rows_read -lt 500 ]] then echo "OK" diff --git 
a/tests/queries/0_stateless/01098_temporary_and_external_tables.reference b/tests/queries/0_stateless/01098_temporary_and_external_tables.reference index 1ddb10581a8..7d6bdb6ab42 100644 --- a/tests/queries/0_stateless/01098_temporary_and_external_tables.reference +++ b/tests/queries/0_stateless/01098_temporary_and_external_tables.reference @@ -1,6 +1,6 @@ -CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory AS\nSELECT number AS n\nFROM numbers(42) -CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory AS\nSELECT number AS n\nFROM numbers(42) -CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory AS\nSELECT number AS n\nFROM numbers(42) +CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory +CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory +CREATE TEMPORARY TABLE tmp_table\n(\n `n` UInt64\n)\nENGINE = Memory 42 OK OK diff --git a/tests/queries/0_stateless/01162_strange_mutations.sh b/tests/queries/0_stateless/01162_strange_mutations.sh index c759d113f84..e480a653b74 100755 --- a/tests/queries/0_stateless/01162_strange_mutations.sh +++ b/tests/queries/0_stateless/01162_strange_mutations.sh @@ -28,16 +28,16 @@ do $CLICKHOUSE_CLIENT -q "CREATE TABLE test ENGINE=$engine AS SELECT number + 100 AS n, 0 AS test FROM numbers(50)" 2>&1| grep -Ev "Removing leftovers from table|removed by another replica" $CLICKHOUSE_CLIENT -q "select count(), sum(n), sum(test) from test" if [[ $engine == *"ReplicatedMergeTree"* ]]; then - $CLICKHOUSE_CLIENT -q "ALTER TABLE test - UPDATE test = (SELECT groupArray(id) FROM t1 GROUP BY 1)[n - 99] WHERE 1" 2>&1| grep -Fa "DB::Exception: " | grep -Fv "statement with subquery may be nondeterministic" - $CLICKHOUSE_CLIENT --allow_nondeterministic_mutations=1 --mutations_sync=1 -q "ALTER TABLE test - UPDATE test = (SELECT groupArray(id) FROM t1 GROUP BY 1)[n - 99] WHERE 1" + $CLICKHOUSE_CLIENT --enable_positional_arguments 0 -q "ALTER TABLE test + UPDATE test = (SELECT groupArray(id) FROM t1)[n - 99] WHERE 1" 2>&1| grep -Fa "DB::Exception: " | grep -Fv "statement with subquery may be nondeterministic" + $CLICKHOUSE_CLIENT --enable_positional_arguments 0 --allow_nondeterministic_mutations=1 --mutations_sync=1 -q "ALTER TABLE test + UPDATE test = (SELECT groupArray(id) FROM t1)[n - 99] WHERE 1" elif [[ $engine == *"Join"* ]]; then - $CLICKHOUSE_CLIENT -q "ALTER TABLE test - UPDATE test = (SELECT groupArray(id) FROM t1 GROUP BY 1)[n - 99] WHERE 1" 2>&1| grep -Fa "DB::Exception: " | grep -Fv "Table engine Join supports only DELETE mutations" + $CLICKHOUSE_CLIENT --enable_positional_arguments 0 -q "ALTER TABLE test + UPDATE test = (SELECT groupArray(id) FROM t1)[n - 99] WHERE 1" 2>&1| grep -Fa "DB::Exception: " | grep -Fv "Table engine Join supports only DELETE mutations" else - $CLICKHOUSE_CLIENT --mutations_sync=1 -q "ALTER TABLE test - UPDATE test = (SELECT groupArray(id) FROM t1 GROUP BY 1)[n - 99] WHERE 1" + $CLICKHOUSE_CLIENT --enable_positional_arguments 0 --mutations_sync=1 -q "ALTER TABLE test + UPDATE test = (SELECT groupArray(id) FROM t1)[n - 99] WHERE 1" fi $CLICKHOUSE_CLIENT -q "select count(), sum(n), sum(test) from test" $CLICKHOUSE_CLIENT -q "drop table test" diff --git a/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh index 653333dcb96..3657a93f1fd 100755 --- a/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh +++ 
b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh @@ -18,9 +18,7 @@ function thread_insert() { set -e val=1 - trap "STOP_THE_LOOP=1" INT - STOP_THE_LOOP=0 - while [[ $STOP_THE_LOOP != 1 ]]; do + while true; do $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; INSERT INTO src VALUES /* ($val, 1) */ ($val, 1); @@ -93,9 +91,7 @@ function thread_partition_dst_to_src() function thread_select() { set -e - trap "STOP_THE_LOOP=1" INT - STOP_THE_LOOP=0 - while [[ $STOP_THE_LOOP != 1 ]]; do + while true; do $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; -- no duplicates @@ -122,9 +118,10 @@ thread_partition_src_to_dst & PID_3=$! thread_partition_dst_to_src & PID_4=$! wait $PID_3 && wait $PID_4 -kill -INT $PID_1 -kill -INT $PID_2 +kill -TERM $PID_1 +kill -TERM $PID_2 wait +wait_for_queries_to_finish $CLICKHOUSE_CLIENT -q "SELECT type, count(n) = countDistinct(n) FROM merge(currentDatabase(), '') GROUP BY type ORDER BY type" $CLICKHOUSE_CLIENT -q "SELECT DISTINCT arraySort(groupArrayIf(n, type=1)) = arraySort(groupArrayIf(n, type=2)) FROM merge(currentDatabase(), '') GROUP BY _table ORDER BY _table" diff --git a/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh index 261fa480491..30dbab6073c 100755 --- a/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh +++ b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh @@ -50,9 +50,7 @@ function thread_insert_rollback() function thread_optimize() { set -e - trap "STOP_THE_LOOP=1" INT - STOP_THE_LOOP=0 - while [[ $STOP_THE_LOOP != 1 ]]; do + while true; do optimize_query="OPTIMIZE TABLE src" partition_id=$(( RANDOM % 2 )) if (( RANDOM % 2 )); then @@ -82,7 +80,6 @@ function thread_optimize() function thread_select() { set -e - trap "exit 0" INT while true; do $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; @@ -103,9 +100,7 @@ function thread_select() function thread_select_insert() { set -e - trap "STOP_THE_LOOP=1" INT - STOP_THE_LOOP=0 - while [[ $STOP_THE_LOOP != 1 ]]; do + while true; do $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; SELECT throwIf((SELECT count() FROM tmp) != 0) FORMAT Null; @@ -139,12 +134,13 @@ thread_select & PID_7=$! thread_select_insert & PID_8=$! 
wait $PID_1 && wait $PID_2 && wait $PID_3 -kill -INT $PID_4 -kill -INT $PID_5 -kill -INT $PID_6 -kill -INT $PID_7 -kill -INT $PID_8 +kill -TERM $PID_4 +kill -TERM $PID_5 +kill -TERM $PID_6 +kill -TERM $PID_7 +kill -TERM $PID_8 wait +wait_for_queries_to_finish $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql index b84a7b25c47..8e04b6c89bd 100644 --- a/tests/queries/0_stateless/01172_transaction_counters.sql +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -1,5 +1,6 @@ --- Tags: no-s3-storage +-- Tags: no-s3-storage, no-tsan -- FIXME this test fails with S3 due to a bug in DiskCacheWrapper +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan drop table if exists txn_counters; create table txn_counters (n Int64, creation_tid DEFAULT transactionID()) engine=MergeTree order by n; diff --git a/tests/queries/0_stateless/01174_select_insert_isolation.sh b/tests/queries/0_stateless/01174_select_insert_isolation.sh index 4bce09cf1d5..5de42cbc4c5 100755 --- a/tests/queries/0_stateless/01174_select_insert_isolation.sh +++ b/tests/queries/0_stateless/01174_select_insert_isolation.sh @@ -35,9 +35,7 @@ function thread_insert_rollback() function thread_select() { - trap "STOP_THE_LOOP=1" INT - STOP_THE_LOOP=0 - while [[ $STOP_THE_LOOP != 1 ]]; do + while true; do # Result of `uniq | wc -l` must be 1 if the first and the last queries got the same result $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; @@ -55,8 +53,9 @@ thread_insert_commit 2 & PID_2=$! thread_insert_rollback 3 & PID_3=$! thread_select & PID_4=$! wait $PID_1 && wait $PID_2 && wait $PID_3 -kill -INT $PID_4 +kill -TERM $PID_4 wait +wait_for_queries_to_finish $CLICKHOUSE_CLIENT --multiquery --query " BEGIN TRANSACTION; diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh index e4a23055ae6..e632841bd01 100755 --- a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh +++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.sh @@ -36,7 +36,7 @@ function run_until_out_contains() RAND_COMMENT="01175_DDL_$RANDOM" LOG_COMMENT="${CLICKHOUSE_LOG_COMMENT}_$RAND_COMMENT" -CLICKHOUSE_CLIENT_WITH_SETTINGS=${CLICKHOUSE_CLIENT/--log_comment=\'${CLICKHOUSE_LOG_COMMENT}\'/--log_comment=\'${LOG_COMMENT}\'} +CLICKHOUSE_CLIENT_WITH_SETTINGS=${CLICKHOUSE_CLIENT/--log_comment ${CLICKHOUSE_LOG_COMMENT}/--log_comment ${LOG_COMMENT}} CLICKHOUSE_CLIENT_WITH_SETTINGS+=" --output_format_parallel_formatting=0 " CLICKHOUSE_CLIENT_WITH_SETTINGS+=" --distributed_ddl_entry_format_version=2 " diff --git a/tests/queries/0_stateless/01183_custom_separated_format_http.sh b/tests/queries/0_stateless/01183_custom_separated_format_http.sh index 8eaa22f4ecc..744cf0c08bd 100755 --- a/tests/queries/0_stateless/01183_custom_separated_format_http.sh +++ b/tests/queries/0_stateless/01183_custom_separated_format_http.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh index 09a43d13a42..f4bad961f21 100755 --- 
a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh +++ b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -# Tags: long +# Tags: long, no-tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01232_untuple.reference b/tests/queries/0_stateless/01232_untuple.reference index 21fd0c4a8a5..8e1f97d2585 100644 --- a/tests/queries/0_stateless/01232_untuple.reference +++ b/tests/queries/0_stateless/01232_untuple.reference @@ -3,11 +3,11 @@ hello 1 3 world 9 9 (0,1) key tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 1) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 2) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 3) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 4) tupleElement(argMax(tuple(v1, v2, v3, v4, v5), v1), 5) -4 10 20 10 20 30 -3 70 20 10 20 30 -2 11 20 10 20 30 -5 10 20 10 20 30 1 20 20 10 20 30 +2 11 20 10 20 30 +3 70 20 10 20 30 +4 10 20 10 20 30 +5 10 20 10 20 30 6 10 20 10 20 30 7 18 20 10 20 30 8 30 20 10 20 30 diff --git a/tests/queries/0_stateless/01232_untuple.sql b/tests/queries/0_stateless/01232_untuple.sql index 39ee9e82fa7..92150e92b29 100644 --- a/tests/queries/0_stateless/01232_untuple.sql +++ b/tests/queries/0_stateless/01232_untuple.sql @@ -6,5 +6,5 @@ select argMax(untuple(x)), min(x) from (select (number, number + 1) as x from nu drop table if exists kv; create table kv (key int, v1 int, v2 int, v3 int, v4 int, v5 int) engine MergeTree order by key; insert into kv values (1, 10, 20, 10, 20, 30), (2, 11, 20, 10, 20, 30), (1, 18, 20, 10, 20, 30), (1, 20, 20, 10, 20, 30), (3, 70, 20, 10, 20, 30), (4, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (5, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (8, 30, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (6, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (7, 18, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (7, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30), (8, 10, 20, 10, 20, 30), (1, 10, 20, 10, 20, 30); -select key, untuple(argMax((* except (key),), v1)) from kv group by key format TSVWithNames; +select key, untuple(argMax((* except (key),), v1)) from kv group by key order by key format TSVWithNames; drop table if exists kv; diff --git a/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql b/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql index 134c6763fab..291910ed43f 100644 --- a/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql +++ b/tests/queries/0_stateless/01244_optimize_distributed_group_by_sharding_key.sql @@ -15,6 +15,7 @@ create table dist_01247 as data_01247 engine=Distributed(test_cluster_two_shards set max_distributed_connections=1; set prefer_localhost_replica=0; +set enable_positional_arguments=0; select '-'; select * from dist_01247; diff --git a/tests/queries/0_stateless/01268_mergine_sorted_limit.sql b/tests/queries/0_stateless/01268_mergine_sorted_limit.sql index fbe047a3a77..49d8161bf83 100644 --- a/tests/queries/0_stateless/01268_mergine_sorted_limit.sql +++ b/tests/queries/0_stateless/01268_mergine_sorted_limit.sql @@ -6,7 +6,7 @@ INSERT INTO tab VALUES (1,1),(1,2),(1,3),(1,4),(1,5); INSERT INTO tab VALUES (2,6),(2,7),(2,8),(2,9),(2,0); -SELECT * FROM tab ORDER BY x LIMIT 3; -SELECT * FROM tab ORDER BY x LIMIT 4; +SELECT * FROM tab ORDER BY x LIMIT 3 SETTINGS 
optimize_read_in_order=1; +SELECT * FROM tab ORDER BY x LIMIT 4 SETTINGS optimize_read_in_order=1; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 5a58b3b8825..93f93683fc4 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -59,6 +59,7 @@ DROP FUNCTION [] GLOBAL DROP DROP [] \N ALL TRUNCATE ['TRUNCATE TABLE'] TABLE ALL OPTIMIZE ['OPTIMIZE TABLE'] TABLE ALL +BACKUP [] TABLE ALL KILL QUERY [] GLOBAL ALL KILL TRANSACTION [] GLOBAL ALL MOVE PARTITION BETWEEN SHARDS [] GLOBAL ALL diff --git a/tests/queries/0_stateless/01283_max_threads_simple_query_optimization.sql b/tests/queries/0_stateless/01283_max_threads_simple_query_optimization.sql index 58522942641..d5f731568d1 100644 --- a/tests/queries/0_stateless/01283_max_threads_simple_query_optimization.sql +++ b/tests/queries/0_stateless/01283_max_threads_simple_query_optimization.sql @@ -1,6 +1,7 @@ DROP TABLE IF EXISTS data_01283; -set remote_filesystem_read_method='read'; +set remote_filesystem_read_method = 'read'; +set local_filesystem_read_method = 'pread'; CREATE TABLE data_01283 engine=MergeTree() ORDER BY key diff --git a/tests/queries/0_stateless/01323_add_scalars_in_time.reference b/tests/queries/0_stateless/01323_add_scalars_in_time.reference index 408efa7f823..bffe4d46ab2 100644 --- a/tests/queries/0_stateless/01323_add_scalars_in_time.reference +++ b/tests/queries/0_stateless/01323_add_scalars_in_time.reference @@ -1,5 +1,5 @@ -[0,2,3] id2 [1,2,3] id1 +[0,2,3] id2 test [1,2,3,4] 2 fre 3 jhg diff --git a/tests/queries/0_stateless/01323_add_scalars_in_time.sql b/tests/queries/0_stateless/01323_add_scalars_in_time.sql index 2ee5603f760..c337cd86f5b 100644 --- a/tests/queries/0_stateless/01323_add_scalars_in_time.sql +++ b/tests/queries/0_stateless/01323_add_scalars_in_time.sql @@ -16,7 +16,8 @@ WITH SELECT arraySort(arrayIntersect(argMax(seqs, create_time), arr1)) AS common, id FROM tags WHERE id LIKE 'id%' -GROUP BY id; +GROUP BY id +ORDER BY id; DROP TABLE tags; diff --git a/tests/queries/0_stateless/01323_too_many_threads_bug.sql b/tests/queries/0_stateless/01323_too_many_threads_bug.sql index 5dbb5aca2ec..d3254d49728 100644 --- a/tests/queries/0_stateless/01323_too_many_threads_bug.sql +++ b/tests/queries/0_stateless/01323_too_many_threads_bug.sql @@ -1,6 +1,7 @@ drop table if exists table_01323_many_parts; -set remote_filesystem_read_method='read'; +set remote_filesystem_read_method = 'read'; +set local_filesystem_read_method = 'pread'; create table table_01323_many_parts (x UInt64) engine = MergeTree order by x partition by x % 100; set max_partitions_per_insert_block = 100; diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.reference b/tests/queries/0_stateless/01410_nullable_key_and_index.reference index da88fbddd7a..37456e6c8d6 100644 --- a/tests/queries/0_stateless/01410_nullable_key_and_index.reference +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.reference @@ -8,9 +8,9 @@ 14 21 16 24 18 27 -\N 0 -\N -1 \N -2 +\N -1 +\N 0 \N 0 \N -1 \N -2 diff --git a/tests/queries/0_stateless/01410_nullable_key_and_index.sql b/tests/queries/0_stateless/01410_nullable_key_and_index.sql index 969432eba01..905d997d95c 100644 --- a/tests/queries/0_stateless/01410_nullable_key_and_index.sql +++ b/tests/queries/0_stateless/01410_nullable_key_and_index.sql @@ -3,13 +3,14 @@ DROP TABLE IF EXISTS 
nullable_key_without_final_mark; DROP TABLE IF EXISTS nullable_minmax_index; SET max_threads = 1; +SET optimize_read_in_order=0; CREATE TABLE nullable_key (k Nullable(int), v int) ENGINE MergeTree ORDER BY k SETTINGS allow_nullable_key = 1, index_granularity = 1; INSERT INTO nullable_key SELECT number * 2, number * 3 FROM numbers(10); INSERT INTO nullable_key SELECT NULL, -number FROM numbers(3); -SELECT * FROM nullable_key ORDER BY k; +SELECT * FROM nullable_key ORDER BY k, v; SET force_primary_key = 1; SET max_rows_to_read = 3; diff --git a/tests/queries/0_stateless/01414_mutations_and_errors_zookeeper.sh b/tests/queries/0_stateless/01414_mutations_and_errors_zookeeper.sh index e47a7c4a40c..d58d57d4e52 100755 --- a/tests/queries/0_stateless/01414_mutations_and_errors_zookeeper.sh +++ b/tests/queries/0_stateless/01414_mutations_and_errors_zookeeper.sh @@ -76,4 +76,4 @@ $CLICKHOUSE_CLIENT --query "ALTER TABLE replicated_mutation_table MODIFY COLUMN $CLICKHOUSE_CLIENT --query "SELECT distinct(value) FROM replicated_mutation_table ORDER BY value" -$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS replicated_mutation_table" +$CLICKHOUSE_CLIENT --query "DROP TABLE replicated_mutation_table" diff --git a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql index cca994e8e4a..784dd73b865 100644 --- a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql +++ b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql @@ -12,7 +12,7 @@ set max_memory_usage='500M'; set max_threads=1; set max_block_size=500; -select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null; -- { serverError 241; } +select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null settings optimize_aggregation_in_order=0; -- { serverError 241; } select key, groupArray(repeat('a', 200)), count() from data_01513 group by key format Null settings optimize_aggregation_in_order=1; -- for WITH TOTALS previous groups should be kept. 
select key, groupArray(repeat('a', 200)), count() from data_01513 group by key with totals format Null settings optimize_aggregation_in_order=1; -- { serverError 241; } diff --git a/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql b/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql index 014790a61c1..b5391517c14 100644 --- a/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql +++ b/tests/queries/0_stateless/01521_alter_enum_and_reverse_read.sql @@ -8,6 +8,6 @@ ALTER TABLE enum_test MODIFY COLUMN e Enum8('IU' = 1, 'WS' = 2, 'PS' = 3); INSERT INTO enum_test SELECT '2020-10-09 00:00:00', 'h1', 'PS' from numbers(1); -SELECT * FROM enum_test ORDER BY timestamp, e desc; +SELECT * FROM enum_test ORDER BY timestamp, e desc SETTINGS optimize_read_in_order=1; DROP TABLE IF EXISTS enum_test; diff --git a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql index bffd12e5aca..a5423d1a3ff 100644 --- a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql +++ b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql @@ -32,6 +32,7 @@ INSERT INTO select_final SELECT toDate('2000-01-01'), number, '' FROM numbers(50 OPTIMIZE TABLE select_final FINAL; SET remote_filesystem_read_method = 'read'; +SET local_filesystem_read_method = 'pread'; SELECT max(x) FROM select_final FINAL; diff --git a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql index b31457d8f68..15ddb5a848f 100644 --- a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql +++ b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.sql @@ -1,4 +1,5 @@ SET optimize_monotonous_functions_in_order_by = 1; +SET optimize_read_in_order = 1; DROP TABLE IF EXISTS test_order_by; diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 0f21ba9b99c..aaa88d66ca0 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -62,9 +62,18 @@ select number, quantileExact(number) over (partition by intDiv(number, 3) AS val 7 7 8 7 9 9 --- can't reference it yet -- the window functions are calculated at the --- last stage of select, after all other functions. 
-select q * 10, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10); -- { serverError 47 } +-- now we should be able to compute expressions with window functions +select number, q * 10, quantileExact(number) over (partition by intDiv(number, 3) order by number rows unbounded preceding) q from numbers(10) order by number; +0 0 0 +1 10 1 +2 10 1 +3 30 3 +4 40 4 +5 40 4 +6 60 6 +7 70 7 +8 70 7 +9 90 9 -- must work in WHERE if you wrap it in a subquery select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) where c > 0; 1 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 31cfa181f9c..3f4a028eac2 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -20,9 +20,8 @@ select number, quantileExact(number) over (partition by intDiv(number, 3) AS val -- can add an alias after window spec select number, quantileExact(number) over (partition by intDiv(number, 3) AS value order by number rows unbounded preceding) q from numbers(10); --- can't reference it yet -- the window functions are calculated at the --- last stage of select, after all other functions. -select q * 10, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10); -- { serverError 47 } +-- now we should be able to compute expressions with window functions +select number, q * 10, quantileExact(number) over (partition by intDiv(number, 3) order by number rows unbounded preceding) q from numbers(10) order by number; -- must work in WHERE if you wrap it in a subquery select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) where c > 0; diff --git a/tests/queries/0_stateless/01621_sort_after_join_pipeline_stuck.sql b/tests/queries/0_stateless/01621_sort_after_join_pipeline_stuck.sql index ce9ae65e71e..e2f061f82ed 100644 --- a/tests/queries/0_stateless/01621_sort_after_join_pipeline_stuck.sql +++ b/tests/queries/0_stateless/01621_sort_after_join_pipeline_stuck.sql @@ -1 +1,2 @@ +SET enable_positional_arguments = 0; SELECT k FROM (SELECT NULL, nullIf(number, 3) AS k, '1048575', (65536, -9223372036854775808), toString(number) AS a FROM system.numbers LIMIT 1048577) AS js1 ANY RIGHT JOIN (SELECT 1.000100016593933, nullIf(number, NULL) AS k, toString(number) AS b FROM system.numbers LIMIT 2, 255) AS js2 USING (k) ORDER BY 257 ASC NULLS LAST FORMAT Null; diff --git a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql b/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql index 36b6c97460c..1c29ea83efc 100644 --- a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql +++ b/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql @@ -4,7 +4,9 @@ drop table if exists data_01641; -- Disable cache for s3 storage tests because it increases memory usage. 
set enable_filesystem_cache=0; -set remote_filesystem_read_method='read'; + +set remote_filesystem_read_method = 'read'; +set local_filesystem_read_method = 'pread'; create table data_01641 (key Int, value String) engine=MergeTree order by (key, repeat(value, 40)) settings old_parts_lifetime=0, min_bytes_for_wide_part=0; diff --git a/tests/queries/0_stateless/01646_system_restart_replicas_smoke.sql b/tests/queries/0_stateless/01646_system_restart_replicas_smoke.sql index 7307ba60e3f..39fb6266c43 100644 --- a/tests/queries/0_stateless/01646_system_restart_replicas_smoke.sql +++ b/tests/queries/0_stateless/01646_system_restart_replicas_smoke.sql @@ -1,5 +1,9 @@ --- Tags: replica, no-tsan, no-parallel +-- Tags: replica, no-tsan, no-parallel, no-stress -- Tag no-tsan: RESTART REPLICAS can acquire too much locks, while only 64 is possible from one thread under TSan +-- Tag no-stress: RESTART REPLICAS can leave some tables, +-- that may pollute error log, +-- like in 01414_mutations_and_errors_zookeeper. +-- no-stress is like worked no-parallel for stress testing DROP TABLE IF EXISTS data_01646; CREATE TABLE data_01646 (x Date, s String) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_01646/data_01646', 'r') ORDER BY s PARTITION BY x; diff --git a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql index 22532529812..ec2a1850594 100644 --- a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql +++ b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql @@ -1,3 +1,6 @@ +-- Tags: no-tsan +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan + drop table if exists perf_lc_num; CREATE TABLE perf_lc_num(  num UInt8,  arr Array(LowCardinality(Int64)) default [num]  ) ENGINE = TinyLog; diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference new file mode 100644 index 00000000000..7fcd29b5faf --- /dev/null +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference @@ -0,0 +1,12 @@ +Partial sorting plan + optimize_read_in_window_order=0 + Sort description: n ASC, x ASC + optimize_read_in_window_order=1 + Prefix sort description: n ASC + Result sort description: n ASC, x ASC +No sorting plan + optimize_read_in_window_order=0 + Sort description: n ASC, x ASC + optimize_read_in_window_order=1 + Prefix sort description: n ASC, x ASC + Result sort description: n ASC, x ASC diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh new file mode 100755 index 00000000000..418baea8113 --- /dev/null +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +name=test_01655_plan_optimizations_optimize_read_in_window_order + +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}" +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}_n" +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}_n_x" + +$CLICKHOUSE_CLIENT -q "create table ${name} engine=MergeTree order by tuple() as select toInt64((sin(number)+2)*65535)%10 as n, number as x from numbers_mt(100000)" +$CLICKHOUSE_CLIENT -q "create table ${name}_n engine=MergeTree order by n as select * from ${name} order by n" +$CLICKHOUSE_CLIENT -q "create table ${name}_n_x engine=MergeTree order by (n, x) as select * from ${name} order by n, x" + +$CLICKHOUSE_CLIENT -q "optimize table ${name}_n final" +$CLICKHOUSE_CLIENT -q "optimize table ${name}_n_x final" + +echo 'Partial sorting plan' +echo ' optimize_read_in_window_order=0' +$CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n SETTINGS optimize_read_in_window_order=0" | grep -i "sort description" + +echo ' optimize_read_in_window_order=1' +$CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n SETTINGS optimize_read_in_window_order=1" | grep -i "sort description" + +echo 'No sorting plan' +echo ' optimize_read_in_window_order=0' +$CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=0" | grep -i "sort description" + +echo ' optimize_read_in_window_order=1' +$CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=1" | grep -i "sort description" + +$CLICKHOUSE_CLIENT -q "drop table ${name}" +$CLICKHOUSE_CLIENT -q "drop table ${name}_n" +$CLICKHOUSE_CLIENT -q "drop table ${name}_n_x" diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.reference b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.reference new file mode 100644 index 00000000000..b462a5a7baa --- /dev/null +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.reference @@ -0,0 +1,4 @@ +OK +OK +OK +OK diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.sh b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.sh new file mode 100755 index 00000000000..297688a29c3 --- /dev/null +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order_long.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Tags: long + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +name=test_01655_plan_optimizations_optimize_read_in_window_order_long +max_memory_usage=20000000 + +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}" +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}_n" +$CLICKHOUSE_CLIENT -q "drop table if exists ${name}_n_x" + +$CLICKHOUSE_CLIENT -q "create table ${name} engine=MergeTree order by tuple() as select toInt64((sin(number)+2)*65535)%500 as n, number as x from numbers_mt(5000000)" +$CLICKHOUSE_CLIENT -q "create table ${name}_n engine=MergeTree order by n as select * from ${name} order by n" +$CLICKHOUSE_CLIENT -q "create table ${name}_n_x engine=MergeTree order by (n, x) as select * from ${name} order by n, x" + +$CLICKHOUSE_CLIENT -q "optimize table ${name}_n final" +$CLICKHOUSE_CLIENT -q "optimize table ${name}_n_x final" + +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n SETTINGS optimize_read_in_window_order=0, max_memory_usage=$max_memory_usage, max_threads=1 format Null" 2>&1 | grep -F -q "MEMORY_LIMIT_EXCEEDED" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n SETTINGS optimize_read_in_window_order=1, max_memory_usage=$max_memory_usage, max_threads=1 format Null" + +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=0, max_memory_usage=$max_memory_usage, max_threads=1 format Null" 2>&1 | grep -F -q "MEMORY_LIMIT_EXCEEDED" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=1, max_memory_usage=$max_memory_usage, max_threads=1 format Null" + +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (PARTITION BY n ORDER BY x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=0, max_memory_usage=$max_memory_usage, max_threads=1 format Null" 2>&1 | grep -F -q "MEMORY_LIMIT_EXCEEDED" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (PARTITION BY n ORDER BY x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=1, max_memory_usage=$max_memory_usage, max_threads=1 format Null" + +$CLICKHOUSE_CLIENT -q "select n, sum(x) OVER (PARTITION BY n+x%2 ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=1, max_memory_usage=$max_memory_usage, max_threads=1 format Null" 2>&1 | grep -F -q "MEMORY_LIMIT_EXCEEDED" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT -q "drop table ${name}" +$CLICKHOUSE_CLIENT -q "drop table ${name}_n" +$CLICKHOUSE_CLIENT -q "drop table ${name}_n_x" diff --git a/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql b/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql index add38dbd3f8..06f05e36237 100644 --- a/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql +++ b/tests/queries/0_stateless/01710_projection_aggregation_in_order.sql @@ -1,3 +1,6 @@ +-- Test that check the correctness of the result for optimize_aggregation_in_order and projections, +-- not that this optimization will take place. 
+ DROP TABLE IF EXISTS normal; CREATE TABLE normal diff --git a/tests/queries/0_stateless/01710_projections.sql b/tests/queries/0_stateless/01710_projections.sql index 54581b5ae11..5df16a93980 100644 --- a/tests/queries/0_stateless/01710_projections.sql +++ b/tests/queries/0_stateless/01710_projections.sql @@ -40,6 +40,7 @@ select toStartOfMinute(datetime) dt_m, domain, sum(retry_count) / sum(duration), select toStartOfHour(toStartOfMinute(datetime)) dt_h, uniqHLL12(x_id), uniqHLL12(y_id) from projection_test group by dt_h order by dt_h; -- found by fuzzer +SET enable_positional_arguments = 0; SELECT 2, -1 FROM projection_test PREWHERE domain_alias = 1. WHERE domain = NULL GROUP BY -9223372036854775808 ORDER BY countIf(first_time = 0) / count(-2147483649) DESC NULLS LAST, 1048576 DESC NULLS LAST; drop table if exists projection_test; diff --git a/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.reference b/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.reference new file mode 100644 index 00000000000..30b0f53ced8 --- /dev/null +++ b/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.reference @@ -0,0 +1,28 @@ +SELECT k1, k2, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=0 +1 0 0 1249950000 +1 0 2 1250000000 +1 1 1 1249975000 +1 1 3 1250025000 +Used processors: +AggregatingTransform +SELECT k1, k2, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=1 +1 0 0 1249950000 +1 0 2 1250000000 +1 1 1 1249975000 +1 1 3 1250025000 +Used processors: +AggregatingInOrderTransform +SELECT k1, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=0 +1 0 1249950000 +1 1 1249975000 +1 2 1250000000 +1 3 1250025000 +Used processors: +AggregatingTransform +SELECT k1, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=1 +1 0 1249950000 +1 1 1249975000 +1 2 1250000000 +1 3 1250025000 +Used processors: +AggregatingInOrderTransform diff --git a/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.sh b/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.sh new file mode 100755 index 00000000000..0cafa904a71 --- /dev/null +++ b/tests/queries/0_stateless/01710_projections_optimize_aggregation_in_order.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " + DROP TABLE IF EXISTS in_order_agg_01710; + + CREATE TABLE in_order_agg_01710 + ( + k1 UInt32, + k2 UInt32, + k3 UInt32, + value UInt32, + PROJECTION aaaa + ( + SELECT + k1, + k2, + k3, + sum(value) + GROUP BY k1, k2, k3 + ) + ) + ENGINE = MergeTree + ORDER BY tuple(); + + INSERT INTO in_order_agg_01710 SELECT 1, number%2, number%4, number FROM numbers(100000); +" + +function random_str() +{ + local n=$1 && shift + tr -cd '[:lower:]' < /dev/urandom | head -c"$n" +} + +function run_query() +{ + local query=$1 && shift + + local query_id + query_id="$CLICKHOUSE_TEST_UNIQUE_NAME-$(random_str 6)" + + echo "$query" + local opts=( + --allow_experimental_projection_optimization 1 + --force_optimize_projection 1 + --log_processors_profiles 1 + --query_id "$query_id" + ) + $CLICKHOUSE_CLIENT "${opts[@]}" "$@" -q "$query" + + $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" + + echo "Used processors:" + $CLICKHOUSE_CLIENT --param_query_id "$query_id" -q "SELECT DISTINCT name FROM system.processors_profile_log WHERE query_id = {query_id:String} AND name LIKE 'Aggregating%'" +} + +run_query "SELECT k1, k2, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=0" +run_query "SELECT k1, k2, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=1" +run_query "SELECT k1, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=0" +run_query "SELECT k1, k3, sum(value) v FROM in_order_agg_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=1" diff --git a/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.reference b/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.reference new file mode 100644 index 00000000000..6e0a46509bd --- /dev/null +++ b/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.reference @@ -0,0 +1,28 @@ +SELECT k1, k2, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=0 +1 0 0 1249950000 +1 0 2 1250000000 +1 1 1 1249975000 +1 1 3 1250025000 +Used processors: +AggregatingTransform +SELECT k1, k2, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=1 +1 0 0 1249950000 +1 0 2 1250000000 +1 1 1 1249975000 +1 1 3 1250025000 +Used processors: +AggregatingTransform +SELECT k1, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=0 +1 0 1249950000 +1 1 1249975000 +1 2 1250000000 +1 3 1250025000 +Used processors: +AggregatingTransform +SELECT k1, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=1 +1 0 1249950000 +1 1 1249975000 +1 2 1250000000 +1 3 1250025000 +Used processors: +AggregatingTransform diff --git a/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.sh b/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.sh new file mode 100755 index 00000000000..f66dc9ff872 --- /dev/null +++ b/tests/queries/0_stateless/01710_projections_partial_optimize_aggregation_in_order.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# Test for optimize_aggregation_in_order with partial projections, i.e.: +# - first 
part will not have projection +# - second part will have projection +# And so two different aggregation should be done. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " + DROP TABLE IF EXISTS in_order_agg_partial_01710; + + CREATE TABLE in_order_agg_partial_01710 + ( + k1 UInt32, + k2 UInt32, + k3 UInt32, + value UInt32 + ) + ENGINE = MergeTree + ORDER BY tuple(); + + INSERT INTO in_order_agg_partial_01710 SELECT 1, number%2, number%4, number FROM numbers(50000); + SYSTEM STOP MERGES in_order_agg_partial_01710; + ALTER TABLE in_order_agg_partial_01710 ADD PROJECTION aaaa ( + SELECT + k1, + k2, + k3, + sum(value) + GROUP BY k1, k2, k3 + ); + INSERT INTO in_order_agg_partial_01710 SELECT 1, number%2, number%4, number FROM numbers(100000) LIMIT 50000, 100000; +" + +function random_str() +{ + local n=$1 && shift + tr -cd '[:lower:]' < /dev/urandom | head -c"$n" +} + +function run_query() +{ + local query=$1 && shift + + local query_id + query_id="$CLICKHOUSE_TEST_UNIQUE_NAME-$(random_str 6)" + + echo "$query" + local opts=( + --allow_experimental_projection_optimization 1 + --force_optimize_projection 1 + --log_processors_profiles 1 + --query_id "$query_id" + ) + $CLICKHOUSE_CLIENT "${opts[@]}" "$@" -q "$query" + + $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" + + echo "Used processors:" + $CLICKHOUSE_CLIENT --param_query_id "$query_id" -q "SELECT DISTINCT name FROM system.processors_profile_log WHERE query_id = {query_id:String} AND name LIKE 'Aggregating%'" +} + +run_query "SELECT k1, k2, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=0" +run_query "SELECT k1, k2, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k2, k3 ORDER BY k1, k2, k3 SETTINGS optimize_aggregation_in_order=1" +run_query "SELECT k1, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=0" +run_query "SELECT k1, k3, sum(value) v FROM in_order_agg_partial_01710 GROUP BY k1, k3 ORDER BY k1, k3 SETTINGS optimize_aggregation_in_order=1" diff --git a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh index e10032e04fd..16f5211f012 100755 --- a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh +++ b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -# Tags: long, no-fasttest +# Tags: long, no-fasttest, no-tsan +# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01780_column_sparse_full.reference b/tests/queries/0_stateless/01780_column_sparse_full.reference index 03160f24a41..20d8de0d669 100644 --- a/tests/queries/0_stateless/01780_column_sparse_full.reference +++ b/tests/queries/0_stateless/01780_column_sparse_full.reference @@ -47,15 +47,15 @@ all_2_2_0 u Default 174250 ====== -174250 -58413 -57920 57917 +57920 +58413 +174250 ====== -174250 -58413 -57920 57917 +57920 +58413 +174250 ====== 508413 57920 diff --git a/tests/queries/0_stateless/01780_column_sparse_full.sql b/tests/queries/0_stateless/01780_column_sparse_full.sql index c190e8f0df4..08a1c0699a4 100644 --- a/tests/queries/0_stateless/01780_column_sparse_full.sql +++ 
b/tests/queries/0_stateless/01780_column_sparse_full.sql @@ -47,9 +47,9 @@ SELECT id, u, s FROM remote('127.0.0.{1,2}', currentDatabase(), t_sparse_full) O SELECT '======'; SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH TOTALS ORDER BY k; SELECT '======'; -SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH ROLLUP ORDER BY k; +SELECT sum(u) AS value FROM t_sparse_full GROUP BY id % 3 AS k WITH ROLLUP ORDER BY value; SELECT '======'; -SELECT sum(u) FROM t_sparse_full GROUP BY id % 3 AS k WITH CUBE ORDER BY k; +SELECT sum(u) AS value FROM t_sparse_full GROUP BY id % 3 AS k WITH CUBE ORDER BY value; SELECT '======'; SELECT sum(id) FROM t_sparse_full GROUP BY u % 3 AS k ORDER BY k; SELECT '======'; diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.sh b/tests/queries/0_stateless/01786_explain_merge_tree.sh index eb47f065044..138905c65e7 100755 --- a/tests/queries/0_stateless/01786_explain_merge_tree.sh +++ b/tests/queries/0_stateless/01786_explain_merge_tree.sh @@ -4,7 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --optimize_move_to_prewhere=1 --convert_query_to_cnf=0" +CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --optimize_move_to_prewhere=1 --convert_query_to_cnf=0 --optimize_read_in_order=1" $CLICKHOUSE_CLIENT -q "drop table if exists test_index" $CLICKHOUSE_CLIENT -q "drop table if exists idx" diff --git a/tests/queries/0_stateless/01921_with_fill_with_totals.sql b/tests/queries/0_stateless/01921_with_fill_with_totals.sql index 1821e5b2413..253e8219d9a 100644 --- a/tests/queries/0_stateless/01921_with_fill_with_totals.sql +++ b/tests/queries/0_stateless/01921_with_fill_with_totals.sql @@ -7,6 +7,8 @@ GROUP BY number WITH TOTALS ORDER BY number DESC WITH FILL FROM 15; +SET enable_positional_arguments = 0; + SELECT number, sum(number) diff --git a/tests/queries/0_stateless/01926_order_by_desc_limit.sql b/tests/queries/0_stateless/01926_order_by_desc_limit.sql index 86468b4fcd6..223dbf70fc4 100644 --- a/tests/queries/0_stateless/01926_order_by_desc_limit.sql +++ b/tests/queries/0_stateless/01926_order_by_desc_limit.sql @@ -1,4 +1,5 @@ --- Tags: no-random-settings +-- Tags: no-random-settings, no-tsan +-- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan DROP TABLE IF EXISTS order_by_desc; diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql index ee3297331cd..87cbdca1d06 100644 --- a/tests/queries/0_stateless/02067_lost_part_s3.sql +++ b/tests/queries/0_stateless/02067_lost_part_s3.sql @@ -1,3 +1,5 @@ +-- Tags: no-backward-compatibility-check:22.5.1 + DROP TABLE IF EXISTS partslost_0; DROP TABLE IF EXISTS partslost_1; DROP TABLE IF EXISTS partslost_2; diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index eee7c1fe936..100e72d9a61 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -3,7 +3,7 @@ CREATE TABLE system.aggregate_function_combinators `name` String, `is_internal` UInt8 ) -ENGINE = SystemAggregateFunctionCombinators() +ENGINE = SystemAggregateFunctionCombinators COMMENT 'SYSTEM TABLE is built on the fly.' 
CREATE TABLE system.asynchronous_inserts ( @@ -19,21 +19,21 @@ CREATE TABLE system.asynchronous_inserts `entries.finished` Array(UInt8), `entries.exception` Array(String) ) -ENGINE = SystemAsynchronousInserts() +ENGINE = SystemAsynchronousInserts COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.asynchronous_metrics ( `metric` String, `value` Float64 ) -ENGINE = SystemAsynchronousMetrics() +ENGINE = SystemAsynchronousMetrics COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.build_options ( `name` String, `value` String ) -ENGINE = SystemBuildOptions() +ENGINE = SystemBuildOptions COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.clusters ( @@ -51,14 +51,14 @@ CREATE TABLE system.clusters `slowdowns_count` UInt32, `estimated_recovery_time` UInt32 ) -ENGINE = SystemClusters() +ENGINE = SystemClusters COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.collations ( `name` String, `language` Nullable(String) ) -ENGINE = SystemTableCollations() +ENGINE = SystemTableCollations COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.columns ( @@ -84,13 +84,13 @@ CREATE TABLE system.columns `numeric_scale` Nullable(UInt64), `datetime_precision` Nullable(UInt64) ) -ENGINE = SystemColumns() +ENGINE = SystemColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.contributors ( `name` String ) -ENGINE = SystemContributors() +ENGINE = SystemContributors COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.current_roles ( @@ -98,7 +98,7 @@ CREATE TABLE system.current_roles `with_admin_option` UInt8, `is_default` UInt8 ) -ENGINE = SystemCurrentRoles() +ENGINE = SystemCurrentRoles COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.data_skipping_indices ( @@ -112,7 +112,7 @@ CREATE TABLE system.data_skipping_indices `data_uncompressed_bytes` UInt64, `marks` UInt64 ) -ENGINE = SystemDataSkippingIndices() +ENGINE = SystemDataSkippingIndices COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.data_type_families ( @@ -120,7 +120,7 @@ CREATE TABLE system.data_type_families `case_insensitive` UInt8, `alias_to` String ) -ENGINE = SystemTableDataTypeFamilies() +ENGINE = SystemTableDataTypeFamilies COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.databases ( @@ -132,7 +132,7 @@ CREATE TABLE system.databases `comment` String, `database` String ) -ENGINE = SystemDatabases() +ENGINE = SystemDatabases COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.detached_parts ( @@ -146,7 +146,7 @@ CREATE TABLE system.detached_parts `max_block_number` Nullable(Int64), `level` Nullable(UInt32) ) -ENGINE = SystemDetachedParts() +ENGINE = SystemDetachedParts COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.dictionaries ( @@ -176,7 +176,7 @@ CREATE TABLE system.dictionaries `last_exception` String, `comment` String ) -ENGINE = SystemDictionaries() +ENGINE = SystemDictionaries COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.disks ( @@ -188,7 +188,7 @@ CREATE TABLE system.disks `type` String, `cache_path` String ) -ENGINE = SystemDisks() +ENGINE = SystemDisks COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.distributed_ddl_queue ( @@ -208,7 +208,7 @@ CREATE TABLE system.distributed_ddl_queue `query_finish_time` Nullable(DateTime), `query_duration_ms` Nullable(UInt64) ) -ENGINE = SystemDDLWorkerQueue() +ENGINE = SystemDDLWorkerQueue COMMENT 'SYSTEM TABLE is built on the fly.' 
CREATE TABLE system.distribution_queue ( @@ -223,7 +223,7 @@ CREATE TABLE system.distribution_queue `broken_data_compressed_bytes` UInt64, `last_exception` String ) -ENGINE = SystemDistributionQueue() +ENGINE = SystemDistributionQueue COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.enabled_roles ( @@ -232,7 +232,7 @@ CREATE TABLE system.enabled_roles `is_current` UInt8, `is_default` UInt8 ) -ENGINE = SystemEnabledRoles() +ENGINE = SystemEnabledRoles COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.errors ( @@ -244,7 +244,7 @@ CREATE TABLE system.errors `last_error_trace` Array(UInt64), `remote` UInt8 ) -ENGINE = SystemErrors() +ENGINE = SystemErrors COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.events ( @@ -252,7 +252,7 @@ CREATE TABLE system.events `value` UInt64, `description` String ) -ENGINE = SystemEvents() +ENGINE = SystemEvents COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.formats ( @@ -260,7 +260,7 @@ CREATE TABLE system.formats `is_input` UInt8, `is_output` UInt8 ) -ENGINE = SystemFormats() +ENGINE = SystemFormats COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.functions ( @@ -271,20 +271,20 @@ CREATE TABLE system.functions `create_query` String, `origin` Enum8('System' = 0, 'SQLUserDefined' = 1, 'ExecutableUserDefined' = 2) ) -ENGINE = SystemFunctions() +ENGINE = SystemFunctions COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.grants ( `user_name` Nullable(String), `role_name` Nullable(String), - `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'KILL QUERY' = 61, 'KILL TRANSACTION' = 62, 'MOVE PARTITION BETWEEN SHARDS' = 63, 'CREATE USER' = 64, 'ALTER USER' = 65, 'DROP USER' = 66, 'CREATE ROLE' = 67, 'ALTER ROLE' = 68, 'DROP ROLE' = 69, 'ROLE ADMIN' = 70, 'CREATE ROW POLICY' = 71, 'ALTER ROW POLICY' = 72, 'DROP ROW POLICY' = 73, 'CREATE QUOTA' = 74, 'ALTER QUOTA' = 75, 'DROP QUOTA' = 76, 'CREATE SETTINGS PROFILE' = 77, 'ALTER SETTINGS PROFILE' = 78, 'DROP SETTINGS PROFILE' = 79, 'SHOW USERS' = 80, 'SHOW ROLES' = 81, 'SHOW ROW POLICIES' = 82, 'SHOW QUOTAS' = 83, 'SHOW SETTINGS 
PROFILES' = 84, 'SHOW ACCESS' = 85, 'ACCESS MANAGEMENT' = 86, 'SYSTEM SHUTDOWN' = 87, 'SYSTEM DROP DNS CACHE' = 88, 'SYSTEM DROP MARK CACHE' = 89, 'SYSTEM DROP UNCOMPRESSED CACHE' = 90, 'SYSTEM DROP MMAP CACHE' = 91, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 92, 'SYSTEM DROP CACHE' = 93, 'SYSTEM RELOAD CONFIG' = 94, 'SYSTEM RELOAD SYMBOLS' = 95, 'SYSTEM RELOAD DICTIONARY' = 96, 'SYSTEM RELOAD MODEL' = 97, 'SYSTEM RELOAD FUNCTION' = 98, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 99, 'SYSTEM RELOAD' = 100, 'SYSTEM RESTART DISK' = 101, 'SYSTEM MERGES' = 102, 'SYSTEM TTL MERGES' = 103, 'SYSTEM FETCHES' = 104, 'SYSTEM MOVES' = 105, 'SYSTEM DISTRIBUTED SENDS' = 106, 'SYSTEM REPLICATED SENDS' = 107, 'SYSTEM SENDS' = 108, 'SYSTEM REPLICATION QUEUES' = 109, 'SYSTEM DROP REPLICA' = 110, 'SYSTEM SYNC REPLICA' = 111, 'SYSTEM RESTART REPLICA' = 112, 'SYSTEM RESTORE REPLICA' = 113, 'SYSTEM SYNC DATABASE REPLICA' = 114, 'SYSTEM SYNC TRANSACTION LOG' = 115, 'SYSTEM FLUSH DISTRIBUTED' = 116, 'SYSTEM FLUSH LOGS' = 117, 'SYSTEM FLUSH' = 118, 'SYSTEM THREAD FUZZER' = 119, 'SYSTEM UNFREEZE' = 120, 'SYSTEM' = 121, 'dictGet' = 122, 'addressToLine' = 123, 'addressToLineWithInlines' = 124, 'addressToSymbol' = 125, 'demangle' = 126, 'INTROSPECTION' = 127, 'FILE' = 128, 'URL' = 129, 'REMOTE' = 130, 'MONGO' = 131, 'MEILISEARCH' = 132, 'MYSQL' = 133, 'POSTGRES' = 134, 'SQLITE' = 135, 'ODBC' = 136, 'JDBC' = 137, 'HDFS' = 138, 'S3' = 139, 'HIVE' = 140, 'SOURCES' = 141, 'CLUSTER' = 142, 'ALL' = 143, 'NONE' = 144), + `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'BACKUP' = 61, 'KILL QUERY' = 62, 'KILL TRANSACTION' = 63, 'MOVE PARTITION BETWEEN SHARDS' = 64, 'CREATE USER' = 65, 'ALTER USER' = 66, 'DROP USER' = 67, 'CREATE ROLE' = 68, 'ALTER ROLE' = 69, 'DROP ROLE' = 70, 'ROLE ADMIN' = 71, 'CREATE ROW POLICY' = 72, 'ALTER ROW POLICY' = 73, 'DROP ROW POLICY' = 74, 'CREATE QUOTA' = 75, 'ALTER QUOTA' = 76, 'DROP QUOTA' = 77, 'CREATE SETTINGS PROFILE' = 78, 'ALTER SETTINGS PROFILE' = 79, 'DROP SETTINGS PROFILE' = 80, 'SHOW USERS' = 81, 'SHOW ROLES' = 82, 'SHOW ROW POLICIES' = 83, 'SHOW QUOTAS' = 84, 'SHOW SETTINGS 
PROFILES' = 85, 'SHOW ACCESS' = 86, 'ACCESS MANAGEMENT' = 87, 'SYSTEM SHUTDOWN' = 88, 'SYSTEM DROP DNS CACHE' = 89, 'SYSTEM DROP MARK CACHE' = 90, 'SYSTEM DROP UNCOMPRESSED CACHE' = 91, 'SYSTEM DROP MMAP CACHE' = 92, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 93, 'SYSTEM DROP CACHE' = 94, 'SYSTEM RELOAD CONFIG' = 95, 'SYSTEM RELOAD SYMBOLS' = 96, 'SYSTEM RELOAD DICTIONARY' = 97, 'SYSTEM RELOAD MODEL' = 98, 'SYSTEM RELOAD FUNCTION' = 99, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 100, 'SYSTEM RELOAD' = 101, 'SYSTEM RESTART DISK' = 102, 'SYSTEM MERGES' = 103, 'SYSTEM TTL MERGES' = 104, 'SYSTEM FETCHES' = 105, 'SYSTEM MOVES' = 106, 'SYSTEM DISTRIBUTED SENDS' = 107, 'SYSTEM REPLICATED SENDS' = 108, 'SYSTEM SENDS' = 109, 'SYSTEM REPLICATION QUEUES' = 110, 'SYSTEM DROP REPLICA' = 111, 'SYSTEM SYNC REPLICA' = 112, 'SYSTEM RESTART REPLICA' = 113, 'SYSTEM RESTORE REPLICA' = 114, 'SYSTEM SYNC DATABASE REPLICA' = 115, 'SYSTEM SYNC TRANSACTION LOG' = 116, 'SYSTEM FLUSH DISTRIBUTED' = 117, 'SYSTEM FLUSH LOGS' = 118, 'SYSTEM FLUSH' = 119, 'SYSTEM THREAD FUZZER' = 120, 'SYSTEM UNFREEZE' = 121, 'SYSTEM' = 122, 'dictGet' = 123, 'addressToLine' = 124, 'addressToLineWithInlines' = 125, 'addressToSymbol' = 126, 'demangle' = 127, 'INTROSPECTION' = 128, 'FILE' = 129, 'URL' = 130, 'REMOTE' = 131, 'MONGO' = 132, 'MEILISEARCH' = 133, 'MYSQL' = 134, 'POSTGRES' = 135, 'SQLITE' = 136, 'ODBC' = 137, 'JDBC' = 138, 'HDFS' = 139, 'S3' = 140, 'HIVE' = 141, 'SOURCES' = 142, 'CLUSTER' = 143, 'ALL' = 144, 'NONE' = 145), `database` Nullable(String), `table` Nullable(String), `column` Nullable(String), `is_partial_revoke` UInt8, `grant_option` UInt8 ) -ENGINE = SystemGrants() +ENGINE = SystemGrants COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.graphite_retentions ( @@ -299,7 +299,7 @@ CREATE TABLE system.graphite_retentions `Tables.database` Array(String), `Tables.table` Array(String) ) -ENGINE = SystemGraphite() +ENGINE = SystemGraphite COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.licenses ( @@ -308,14 +308,14 @@ CREATE TABLE system.licenses `license_path` String, `license_text` String ) -ENGINE = SystemLicenses() +ENGINE = SystemLicenses COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.macros ( `macro` String, `substitution` String ) -ENGINE = SystemMacros() +ENGINE = SystemMacros COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.merge_tree_settings ( @@ -325,7 +325,7 @@ CREATE TABLE system.merge_tree_settings `description` String, `type` String ) -ENGINE = SystemMergeTreeSettings() +ENGINE = SystemMergeTreeSettings COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.merges ( @@ -352,7 +352,7 @@ CREATE TABLE system.merges `merge_type` String, `merge_algorithm` String ) -ENGINE = SystemMerges() +ENGINE = SystemMerges COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.metrics ( @@ -360,7 +360,7 @@ CREATE TABLE system.metrics `value` Int64, `description` String ) -ENGINE = SystemMetrics() +ENGINE = SystemMetrics COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.models ( @@ -372,7 +372,7 @@ CREATE TABLE system.models `loading_duration` Float32, `last_exception` String ) -ENGINE = SystemModels() +ENGINE = SystemModels COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.mutations ( @@ -390,25 +390,25 @@ CREATE TABLE system.mutations `latest_fail_time` DateTime, `latest_fail_reason` String ) -ENGINE = SystemMutations() +ENGINE = SystemMutations COMMENT 'SYSTEM TABLE is built on the fly.' 
CREATE TABLE system.numbers ( `number` UInt64 ) -ENGINE = SystemNumbers() +ENGINE = SystemNumbers COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.numbers_mt ( `number` UInt64 ) -ENGINE = SystemNumbers() +ENGINE = SystemNumbers COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.one ( `dummy` UInt8 ) -ENGINE = SystemOne() +ENGINE = SystemOne COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.part_moves_between_shards ( @@ -427,7 +427,7 @@ CREATE TABLE system.part_moves_between_shards `num_tries` UInt32, `last_exception` String ) -ENGINE = SystemShardMoves() +ENGINE = SystemShardMoves COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.parts ( @@ -493,7 +493,7 @@ CREATE TABLE system.parts `bytes` UInt64, `marks_size` UInt64 ) -ENGINE = SystemParts() +ENGINE = SystemParts COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.parts_columns ( @@ -547,16 +547,16 @@ CREATE TABLE system.parts_columns `bytes` UInt64, `marks_size` UInt64 ) -ENGINE = SystemPartsColumns() +ENGINE = SystemPartsColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.privileges ( - `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'KILL QUERY' = 61, 'KILL TRANSACTION' = 62, 'MOVE PARTITION BETWEEN SHARDS' = 63, 'CREATE USER' = 64, 'ALTER USER' = 65, 'DROP USER' = 66, 'CREATE ROLE' = 67, 'ALTER ROLE' = 68, 'DROP ROLE' = 69, 'ROLE ADMIN' = 70, 'CREATE ROW POLICY' = 71, 'ALTER ROW POLICY' = 72, 'DROP ROW POLICY' = 73, 'CREATE QUOTA' = 74, 'ALTER QUOTA' = 75, 'DROP QUOTA' = 76, 'CREATE SETTINGS PROFILE' = 77, 'ALTER SETTINGS PROFILE' = 78, 'DROP SETTINGS PROFILE' = 79, 'SHOW USERS' = 80, 'SHOW ROLES' = 81, 'SHOW ROW POLICIES' = 82, 'SHOW QUOTAS' = 83, 'SHOW SETTINGS PROFILES' = 84, 'SHOW ACCESS' = 85, 'ACCESS MANAGEMENT' = 86, 'SYSTEM SHUTDOWN' = 87, 'SYSTEM DROP DNS CACHE' = 88, 'SYSTEM DROP MARK CACHE' = 89, 'SYSTEM DROP UNCOMPRESSED CACHE' = 90, 'SYSTEM DROP MMAP CACHE' = 91, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 92, 'SYSTEM DROP CACHE' = 93, 'SYSTEM RELOAD CONFIG' = 94, 'SYSTEM RELOAD SYMBOLS' = 95, 'SYSTEM RELOAD DICTIONARY' = 96, 
'SYSTEM RELOAD MODEL' = 97, 'SYSTEM RELOAD FUNCTION' = 98, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 99, 'SYSTEM RELOAD' = 100, 'SYSTEM RESTART DISK' = 101, 'SYSTEM MERGES' = 102, 'SYSTEM TTL MERGES' = 103, 'SYSTEM FETCHES' = 104, 'SYSTEM MOVES' = 105, 'SYSTEM DISTRIBUTED SENDS' = 106, 'SYSTEM REPLICATED SENDS' = 107, 'SYSTEM SENDS' = 108, 'SYSTEM REPLICATION QUEUES' = 109, 'SYSTEM DROP REPLICA' = 110, 'SYSTEM SYNC REPLICA' = 111, 'SYSTEM RESTART REPLICA' = 112, 'SYSTEM RESTORE REPLICA' = 113, 'SYSTEM SYNC DATABASE REPLICA' = 114, 'SYSTEM SYNC TRANSACTION LOG' = 115, 'SYSTEM FLUSH DISTRIBUTED' = 116, 'SYSTEM FLUSH LOGS' = 117, 'SYSTEM FLUSH' = 118, 'SYSTEM THREAD FUZZER' = 119, 'SYSTEM UNFREEZE' = 120, 'SYSTEM' = 121, 'dictGet' = 122, 'addressToLine' = 123, 'addressToLineWithInlines' = 124, 'addressToSymbol' = 125, 'demangle' = 126, 'INTROSPECTION' = 127, 'FILE' = 128, 'URL' = 129, 'REMOTE' = 130, 'MONGO' = 131, 'MEILISEARCH' = 132, 'MYSQL' = 133, 'POSTGRES' = 134, 'SQLITE' = 135, 'ODBC' = 136, 'JDBC' = 137, 'HDFS' = 138, 'S3' = 139, 'HIVE' = 140, 'SOURCES' = 141, 'CLUSTER' = 142, 'ALL' = 143, 'NONE' = 144), + `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'BACKUP' = 61, 'KILL QUERY' = 62, 'KILL TRANSACTION' = 63, 'MOVE PARTITION BETWEEN SHARDS' = 64, 'CREATE USER' = 65, 'ALTER USER' = 66, 'DROP USER' = 67, 'CREATE ROLE' = 68, 'ALTER ROLE' = 69, 'DROP ROLE' = 70, 'ROLE ADMIN' = 71, 'CREATE ROW POLICY' = 72, 'ALTER ROW POLICY' = 73, 'DROP ROW POLICY' = 74, 'CREATE QUOTA' = 75, 'ALTER QUOTA' = 76, 'DROP QUOTA' = 77, 'CREATE SETTINGS PROFILE' = 78, 'ALTER SETTINGS PROFILE' = 79, 'DROP SETTINGS PROFILE' = 80, 'SHOW USERS' = 81, 'SHOW ROLES' = 82, 'SHOW ROW POLICIES' = 83, 'SHOW QUOTAS' = 84, 'SHOW SETTINGS PROFILES' = 85, 'SHOW ACCESS' = 86, 'ACCESS MANAGEMENT' = 87, 'SYSTEM SHUTDOWN' = 88, 'SYSTEM DROP DNS CACHE' = 89, 'SYSTEM DROP MARK CACHE' = 90, 'SYSTEM DROP UNCOMPRESSED CACHE' = 91, 'SYSTEM DROP MMAP CACHE' = 92, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 93, 'SYSTEM DROP CACHE' = 94, 'SYSTEM RELOAD CONFIG' = 95, 'SYSTEM RELOAD SYMBOLS' = 96, 'SYSTEM RELOAD DICTIONARY' = 97, 'SYSTEM 
RELOAD MODEL' = 98, 'SYSTEM RELOAD FUNCTION' = 99, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 100, 'SYSTEM RELOAD' = 101, 'SYSTEM RESTART DISK' = 102, 'SYSTEM MERGES' = 103, 'SYSTEM TTL MERGES' = 104, 'SYSTEM FETCHES' = 105, 'SYSTEM MOVES' = 106, 'SYSTEM DISTRIBUTED SENDS' = 107, 'SYSTEM REPLICATED SENDS' = 108, 'SYSTEM SENDS' = 109, 'SYSTEM REPLICATION QUEUES' = 110, 'SYSTEM DROP REPLICA' = 111, 'SYSTEM SYNC REPLICA' = 112, 'SYSTEM RESTART REPLICA' = 113, 'SYSTEM RESTORE REPLICA' = 114, 'SYSTEM SYNC DATABASE REPLICA' = 115, 'SYSTEM SYNC TRANSACTION LOG' = 116, 'SYSTEM FLUSH DISTRIBUTED' = 117, 'SYSTEM FLUSH LOGS' = 118, 'SYSTEM FLUSH' = 119, 'SYSTEM THREAD FUZZER' = 120, 'SYSTEM UNFREEZE' = 121, 'SYSTEM' = 122, 'dictGet' = 123, 'addressToLine' = 124, 'addressToLineWithInlines' = 125, 'addressToSymbol' = 126, 'demangle' = 127, 'INTROSPECTION' = 128, 'FILE' = 129, 'URL' = 130, 'REMOTE' = 131, 'MONGO' = 132, 'MEILISEARCH' = 133, 'MYSQL' = 134, 'POSTGRES' = 135, 'SQLITE' = 136, 'ODBC' = 137, 'JDBC' = 138, 'HDFS' = 139, 'S3' = 140, 'HIVE' = 141, 'SOURCES' = 142, 'CLUSTER' = 143, 'ALL' = 144, 'NONE' = 145), `aliases` Array(String), `level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5)), - `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'KILL QUERY' = 61, 'KILL TRANSACTION' = 62, 'MOVE PARTITION BETWEEN SHARDS' = 63, 'CREATE USER' = 64, 'ALTER USER' = 65, 'DROP USER' = 66, 'CREATE ROLE' = 67, 'ALTER ROLE' = 68, 'DROP ROLE' = 69, 'ROLE ADMIN' = 70, 'CREATE ROW POLICY' = 71, 'ALTER ROW POLICY' = 72, 'DROP ROW POLICY' = 73, 'CREATE QUOTA' = 74, 'ALTER QUOTA' = 75, 'DROP QUOTA' = 76, 'CREATE SETTINGS PROFILE' = 77, 'ALTER SETTINGS PROFILE' = 78, 'DROP SETTINGS PROFILE' = 79, 'SHOW USERS' = 80, 'SHOW ROLES' = 81, 'SHOW ROW POLICIES' = 82, 'SHOW QUOTAS' = 83, 'SHOW SETTINGS PROFILES' = 84, 'SHOW ACCESS' = 85, 'ACCESS MANAGEMENT' = 86, 'SYSTEM SHUTDOWN' = 87, 'SYSTEM DROP DNS CACHE' = 88, 'SYSTEM DROP MARK CACHE' = 89, 'SYSTEM DROP UNCOMPRESSED CACHE' = 90, 'SYSTEM DROP MMAP CACHE' = 91, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 92, 
'SYSTEM DROP CACHE' = 93, 'SYSTEM RELOAD CONFIG' = 94, 'SYSTEM RELOAD SYMBOLS' = 95, 'SYSTEM RELOAD DICTIONARY' = 96, 'SYSTEM RELOAD MODEL' = 97, 'SYSTEM RELOAD FUNCTION' = 98, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 99, 'SYSTEM RELOAD' = 100, 'SYSTEM RESTART DISK' = 101, 'SYSTEM MERGES' = 102, 'SYSTEM TTL MERGES' = 103, 'SYSTEM FETCHES' = 104, 'SYSTEM MOVES' = 105, 'SYSTEM DISTRIBUTED SENDS' = 106, 'SYSTEM REPLICATED SENDS' = 107, 'SYSTEM SENDS' = 108, 'SYSTEM REPLICATION QUEUES' = 109, 'SYSTEM DROP REPLICA' = 110, 'SYSTEM SYNC REPLICA' = 111, 'SYSTEM RESTART REPLICA' = 112, 'SYSTEM RESTORE REPLICA' = 113, 'SYSTEM SYNC DATABASE REPLICA' = 114, 'SYSTEM SYNC TRANSACTION LOG' = 115, 'SYSTEM FLUSH DISTRIBUTED' = 116, 'SYSTEM FLUSH LOGS' = 117, 'SYSTEM FLUSH' = 118, 'SYSTEM THREAD FUZZER' = 119, 'SYSTEM UNFREEZE' = 120, 'SYSTEM' = 121, 'dictGet' = 122, 'addressToLine' = 123, 'addressToLineWithInlines' = 124, 'addressToSymbol' = 125, 'demangle' = 126, 'INTROSPECTION' = 127, 'FILE' = 128, 'URL' = 129, 'REMOTE' = 130, 'MONGO' = 131, 'MEILISEARCH' = 132, 'MYSQL' = 133, 'POSTGRES' = 134, 'SQLITE' = 135, 'ODBC' = 136, 'JDBC' = 137, 'HDFS' = 138, 'S3' = 139, 'HIVE' = 140, 'SOURCES' = 141, 'CLUSTER' = 142, 'ALL' = 143, 'NONE' = 144)) + `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SELECT' = 5, 'INSERT' = 6, 'ALTER UPDATE' = 7, 'ALTER DELETE' = 8, 'ALTER ADD COLUMN' = 9, 'ALTER MODIFY COLUMN' = 10, 'ALTER DROP COLUMN' = 11, 'ALTER COMMENT COLUMN' = 12, 'ALTER CLEAR COLUMN' = 13, 'ALTER RENAME COLUMN' = 14, 'ALTER MATERIALIZE COLUMN' = 15, 'ALTER COLUMN' = 16, 'ALTER MODIFY COMMENT' = 17, 'ALTER ORDER BY' = 18, 'ALTER SAMPLE BY' = 19, 'ALTER ADD INDEX' = 20, 'ALTER DROP INDEX' = 21, 'ALTER MATERIALIZE INDEX' = 22, 'ALTER CLEAR INDEX' = 23, 'ALTER INDEX' = 24, 'ALTER ADD PROJECTION' = 25, 'ALTER DROP PROJECTION' = 26, 'ALTER MATERIALIZE PROJECTION' = 27, 'ALTER CLEAR PROJECTION' = 28, 'ALTER PROJECTION' = 29, 'ALTER ADD CONSTRAINT' = 30, 'ALTER DROP CONSTRAINT' = 31, 'ALTER CONSTRAINT' = 32, 'ALTER TTL' = 33, 'ALTER MATERIALIZE TTL' = 34, 'ALTER SETTINGS' = 35, 'ALTER MOVE PARTITION' = 36, 'ALTER FETCH PARTITION' = 37, 'ALTER FREEZE PARTITION' = 38, 'ALTER DATABASE SETTINGS' = 39, 'ALTER TABLE' = 40, 'ALTER DATABASE' = 41, 'ALTER VIEW REFRESH' = 42, 'ALTER VIEW MODIFY QUERY' = 43, 'ALTER VIEW' = 44, 'ALTER' = 45, 'CREATE DATABASE' = 46, 'CREATE TABLE' = 47, 'CREATE VIEW' = 48, 'CREATE DICTIONARY' = 49, 'CREATE TEMPORARY TABLE' = 50, 'CREATE FUNCTION' = 51, 'CREATE' = 52, 'DROP DATABASE' = 53, 'DROP TABLE' = 54, 'DROP VIEW' = 55, 'DROP DICTIONARY' = 56, 'DROP FUNCTION' = 57, 'DROP' = 58, 'TRUNCATE' = 59, 'OPTIMIZE' = 60, 'BACKUP' = 61, 'KILL QUERY' = 62, 'KILL TRANSACTION' = 63, 'MOVE PARTITION BETWEEN SHARDS' = 64, 'CREATE USER' = 65, 'ALTER USER' = 66, 'DROP USER' = 67, 'CREATE ROLE' = 68, 'ALTER ROLE' = 69, 'DROP ROLE' = 70, 'ROLE ADMIN' = 71, 'CREATE ROW POLICY' = 72, 'ALTER ROW POLICY' = 73, 'DROP ROW POLICY' = 74, 'CREATE QUOTA' = 75, 'ALTER QUOTA' = 76, 'DROP QUOTA' = 77, 'CREATE SETTINGS PROFILE' = 78, 'ALTER SETTINGS PROFILE' = 79, 'DROP SETTINGS PROFILE' = 80, 'SHOW USERS' = 81, 'SHOW ROLES' = 82, 'SHOW ROW POLICIES' = 83, 'SHOW QUOTAS' = 84, 'SHOW SETTINGS PROFILES' = 85, 'SHOW ACCESS' = 86, 'ACCESS MANAGEMENT' = 87, 'SYSTEM SHUTDOWN' = 88, 'SYSTEM DROP DNS CACHE' = 89, 'SYSTEM DROP MARK CACHE' = 90, 'SYSTEM DROP UNCOMPRESSED CACHE' = 91, 'SYSTEM DROP MMAP CACHE' = 92, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 
93, 'SYSTEM DROP CACHE' = 94, 'SYSTEM RELOAD CONFIG' = 95, 'SYSTEM RELOAD SYMBOLS' = 96, 'SYSTEM RELOAD DICTIONARY' = 97, 'SYSTEM RELOAD MODEL' = 98, 'SYSTEM RELOAD FUNCTION' = 99, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 100, 'SYSTEM RELOAD' = 101, 'SYSTEM RESTART DISK' = 102, 'SYSTEM MERGES' = 103, 'SYSTEM TTL MERGES' = 104, 'SYSTEM FETCHES' = 105, 'SYSTEM MOVES' = 106, 'SYSTEM DISTRIBUTED SENDS' = 107, 'SYSTEM REPLICATED SENDS' = 108, 'SYSTEM SENDS' = 109, 'SYSTEM REPLICATION QUEUES' = 110, 'SYSTEM DROP REPLICA' = 111, 'SYSTEM SYNC REPLICA' = 112, 'SYSTEM RESTART REPLICA' = 113, 'SYSTEM RESTORE REPLICA' = 114, 'SYSTEM SYNC DATABASE REPLICA' = 115, 'SYSTEM SYNC TRANSACTION LOG' = 116, 'SYSTEM FLUSH DISTRIBUTED' = 117, 'SYSTEM FLUSH LOGS' = 118, 'SYSTEM FLUSH' = 119, 'SYSTEM THREAD FUZZER' = 120, 'SYSTEM UNFREEZE' = 121, 'SYSTEM' = 122, 'dictGet' = 123, 'addressToLine' = 124, 'addressToLineWithInlines' = 125, 'addressToSymbol' = 126, 'demangle' = 127, 'INTROSPECTION' = 128, 'FILE' = 129, 'URL' = 130, 'REMOTE' = 131, 'MONGO' = 132, 'MEILISEARCH' = 133, 'MYSQL' = 134, 'POSTGRES' = 135, 'SQLITE' = 136, 'ODBC' = 137, 'JDBC' = 138, 'HDFS' = 139, 'S3' = 140, 'HIVE' = 141, 'SOURCES' = 142, 'CLUSTER' = 143, 'ALL' = 144, 'NONE' = 145)) ) -ENGINE = SystemPrivileges() +ENGINE = SystemPrivileges COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.processes ( @@ -603,7 +603,7 @@ CREATE TABLE system.processes `Settings.Names` Array(String), `Settings.Values` Array(String) ) -ENGINE = SystemProcesses() +ENGINE = SystemProcesses COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.projection_parts ( @@ -667,7 +667,7 @@ CREATE TABLE system.projection_parts `bytes` UInt64, `marks_size` UInt64 ) -ENGINE = SystemProjectionParts() +ENGINE = SystemProjectionParts COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.projection_parts_columns ( @@ -721,7 +721,7 @@ CREATE TABLE system.projection_parts_columns `bytes` UInt64, `marks_size` UInt64 ) -ENGINE = SystemProjectionPartsColumns() +ENGINE = SystemProjectionPartsColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.quota_limits ( @@ -739,7 +739,7 @@ CREATE TABLE system.quota_limits `max_execution_time` Nullable(Float64), `max_written_bytes` Nullable(UInt64) ) -ENGINE = SystemQuotaLimits() +ENGINE = SystemQuotaLimits COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.quota_usage ( @@ -769,7 +769,7 @@ CREATE TABLE system.quota_usage `written_bytes` Nullable(UInt64), `max_written_bytes` Nullable(UInt64) ) -ENGINE = SystemQuotaUsage() +ENGINE = SystemQuotaUsage COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.quotas ( @@ -782,7 +782,7 @@ CREATE TABLE system.quotas `apply_to_list` Array(String), `apply_to_except` Array(String) ) -ENGINE = SystemQuotas() +ENGINE = SystemQuotas COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.quotas_usage ( @@ -813,7 +813,7 @@ CREATE TABLE system.quotas_usage `written_bytes` Nullable(UInt64), `max_written_bytes` Nullable(UInt64) ) -ENGINE = SystemQuotasUsage() +ENGINE = SystemQuotasUsage COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.replicas ( @@ -851,7 +851,7 @@ CREATE TABLE system.replicas `zookeeper_exception` String, `replica_is_active` Map(String, UInt8) ) -ENGINE = SystemReplicas() +ENGINE = SystemReplicas COMMENT 'SYSTEM TABLE is built on the fly.' 
CREATE TABLE system.replicated_fetches ( @@ -872,7 +872,7 @@ CREATE TABLE system.replicated_fetches `to_detached` UInt8, `thread_id` UInt64 ) -ENGINE = SystemReplicatedFetches() +ENGINE = SystemReplicatedFetches COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.replicated_merge_tree_settings ( @@ -882,7 +882,7 @@ CREATE TABLE system.replicated_merge_tree_settings `description` String, `type` String ) -ENGINE = SystemReplicatedMergeTreeSettings() +ENGINE = SystemReplicatedMergeTreeSettings COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.replication_queue ( @@ -907,7 +907,7 @@ CREATE TABLE system.replication_queue `last_postpone_time` DateTime, `merge_type` String ) -ENGINE = SystemReplicationQueue() +ENGINE = SystemReplicationQueue COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.role_grants ( @@ -917,7 +917,7 @@ CREATE TABLE system.role_grants `granted_role_is_default` UInt8, `with_admin_option` UInt8 ) -ENGINE = SystemRoleGrants() +ENGINE = SystemRoleGrants COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.roles ( @@ -925,7 +925,7 @@ CREATE TABLE system.roles `id` UUID, `storage` String ) -ENGINE = SystemRoles() +ENGINE = SystemRoles COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.row_policies ( @@ -941,7 +941,7 @@ CREATE TABLE system.row_policies `apply_to_list` Array(String), `apply_to_except` Array(String) ) -ENGINE = SystemRowPolicies() +ENGINE = SystemRowPolicies COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.settings ( @@ -954,7 +954,7 @@ CREATE TABLE system.settings `readonly` UInt8, `type` String ) -ENGINE = SystemSettings() +ENGINE = SystemSettings COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.settings_profile_elements ( @@ -969,7 +969,7 @@ CREATE TABLE system.settings_profile_elements `readonly` Nullable(UInt8), `inherit_profile` Nullable(String) ) -ENGINE = SystemSettingsProfileElements() +ENGINE = SystemSettingsProfileElements COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.settings_profiles ( @@ -981,7 +981,7 @@ CREATE TABLE system.settings_profiles `apply_to_list` Array(String), `apply_to_except` Array(String) ) -ENGINE = SystemSettingsProfiles() +ENGINE = SystemSettingsProfiles COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.stack_trace ( @@ -990,7 +990,7 @@ CREATE TABLE system.stack_trace `query_id` String, `trace` Array(UInt64) ) -ENGINE = SystemStackTrace() +ENGINE = SystemStackTrace COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.storage_policies ( @@ -1003,7 +1003,7 @@ CREATE TABLE system.storage_policies `move_factor` Float32, `prefer_not_to_merge` UInt8 ) -ENGINE = SystemStoragePolicies() +ENGINE = SystemStoragePolicies COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.table_engines ( @@ -1017,13 +1017,13 @@ CREATE TABLE system.table_engines `supports_deduplication` UInt8, `supports_parallel_insert` UInt8 ) -ENGINE = SystemTableEngines() +ENGINE = SystemTableEngines COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.table_functions ( `name` String ) -ENGINE = SystemTableFunctions() +ENGINE = SystemTableFunctions COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.tables ( @@ -1057,13 +1057,13 @@ CREATE TABLE system.tables `loading_dependent_table` Array(String), `table` String ) -ENGINE = SystemTables() +ENGINE = SystemTables COMMENT 'SYSTEM TABLE is built on the fly.' 
CREATE TABLE system.time_zones ( `time_zone` String ) -ENGINE = SystemTimeZones() +ENGINE = SystemTimeZones COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.user_directories ( @@ -1072,7 +1072,7 @@ CREATE TABLE system.user_directories `params` String, `precedence` UInt64 ) -ENGINE = SystemUserDirectories() +ENGINE = SystemUserDirectories COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.users ( @@ -1093,23 +1093,23 @@ CREATE TABLE system.users `grantees_except` Array(String), `default_database` String ) -ENGINE = SystemUsers() +ENGINE = SystemUsers COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.warnings ( `message` String ) -ENGINE = SystemWarnings() +ENGINE = SystemWarnings COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.zeros ( `zero` UInt8 ) -ENGINE = SystemZeros() +ENGINE = SystemZeros COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.zeros_mt ( `zero` UInt8 ) -ENGINE = SystemZeros() +ENGINE = SystemZeros COMMENT 'SYSTEM TABLE is built on the fly.' diff --git a/tests/queries/0_stateless/02118_show_create_table_rocksdb.reference b/tests/queries/0_stateless/02118_show_create_table_rocksdb.reference index 9e487824e3e..848abb332bb 100644 --- a/tests/queries/0_stateless/02118_show_create_table_rocksdb.reference +++ b/tests/queries/0_stateless/02118_show_create_table_rocksdb.reference @@ -1 +1 @@ -CREATE TABLE system.rocksdb\n(\n `database` String,\n `table` String,\n `name` String,\n `value` UInt64\n)\nENGINE = SystemRocksDB()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.rocksdb\n(\n `database` String,\n `table` String,\n `name` String,\n `value` UInt64\n)\nENGINE = SystemRocksDB\nCOMMENT \'SYSTEM TABLE is built on the fly.\' diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql index 8fb11ac383c..4dfcbb9bf80 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql @@ -1,4 +1,6 @@ SET max_threads=0; +SET optimize_read_in_order=1; +SET read_in_order_two_level_merge_threshold=100; DROP TABLE IF EXISTS t_read_in_order; diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql index e82c78b5e42..9846c1208a1 100644 --- a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -7,6 +7,7 @@ SETTINGS index_granularity = 4; INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); SET max_threads = 1; +SET optimize_read_in_order = 1; SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; diff --git a/tests/queries/0_stateless/02226_s3_with_cache.sql b/tests/queries/0_stateless/02226_s3_with_cache.sql index e62e63b7f97..0d0653d4089 100644 --- a/tests/queries/0_stateless/02226_s3_with_cache.sql +++ b/tests/queries/0_stateless/02226_s3_with_cache.sql @@ -23,7 +23,8 @@ AND current_database = currentDatabase() ORDER BY query_start_time DESC LIMIT 1; -SET remote_filesystem_read_method='read'; +set remote_filesystem_read_method = 'read'; +set local_filesystem_read_method = 'pread'; SELECT 2, * FROM test LIMIT 10 FORMAT Null; diff --git a/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.reference b/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.reference new file mode 100644 index 
00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.sh b/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.sh new file mode 100755 index 00000000000..a8dce5cb516 --- /dev/null +++ b/tests/queries/0_stateless/02232_dist_insert_send_logs_level_hung.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +# Tags: disabled +# Tag: no-parallel - too heavy +# Tag: long - too heavy + +# This is the regression test for the case when the remote peer sends some logs for INSERT, +# it is easy to achieve using materialized views with a small block size. + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=trace + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# NOTE: since we use CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL we need to apply +# --server_logs_file for every clickhouse-client invocation. +client_opts=( + # For --send_logs_level see $CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL + --server_logs_file /dev/null + # we need lots of blocks to get a log entry for each of them + --min_insert_block_size_rows 1 + # we need to terminate ASAP + --max_block_size 1 +) + +$CLICKHOUSE_CLIENT "${client_opts[@]}" -nm -q " + drop table if exists mv_02232; + drop table if exists in_02232; + drop table if exists out_02232; + + create table out_02232 (key Int) engine=Null(); + create table in_02232 (key Int) engine=Null(); + create materialized view mv_02232 to out_02232 as select * from in_02232; +" + +insert_client_opts=( + # Increase timeouts to avoid a timeout while trying to send the Log packet to + # the remote side, when the socket is full. + --send_timeout 86400 + --receive_timeout 86400 +) +# 250 seconds is enough to trigger the query hang (even in a debug build) +# +# NOTE: proper termination (via SIGINT) takes too long, +# hence timeout+KILL QUERY. +timeout 250s $CLICKHOUSE_CLIENT "${client_opts[@]}" "${insert_client_opts[@]}" -q "insert into function remote('127.2', currentDatabase(), in_02232) select * from numbers(1e6)" + +# Kill the underlying query of remote() to make KILL faster +timeout 30s $CLICKHOUSE_CLIENT "${client_opts[@]}" -q "KILL QUERY WHERE Settings['log_comment'] = '$CLICKHOUSE_LOG_COMMENT' SYNC" --format Null +echo $?
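+# Editorial note (not part of the original script, added for clarity): the 'echo $?' above is
+# presumably what the accompanying .reference file (containing a single "0") checks, i.e. the
+# KILL QUERY issued under the 30 second timeout is expected to succeed.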
+ +$CLICKHOUSE_CLIENT "${client_opts[@]}" -nm -q " + drop table in_02232; + drop table mv_02232; + drop table out_02232; +" diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference index 9d252c9f396..f98effbec67 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference +++ b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.reference @@ -2,7 +2,7 @@ 0 0 0 -- { echoOn } insert into data_02233 select number%10, number%3, number from numbers(100); -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, read_in_order_two_level_merge_threshold=1; (Expression) ExpressionTransform × 2 (Sorting) @@ -20,7 +20,7 @@ ExpressionTransform × 2 ExpressionTransform (ReadFromMergeTree) MergeTreeInOrder 0 → 1 -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0, read_in_order_two_level_merge_threshold=1; (Expression) ExpressionTransform × 2 (Sorting) @@ -103,7 +103,7 @@ select parent_key, child_key, count() from data_02233 group by parent_key, child 9 2 3 0 0 100 -select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; 0 0 4 0 1 3 0 2 3 diff --git a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql index cf1e825b03d..233599feb65 100644 --- a/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql +++ b/tests/queries/0_stateless/02233_optimize_aggregation_in_order_prefix.sql @@ -7,11 +7,11 @@ SELECT child_key, parent_key, child_key FROM data_02233 GROUP BY parent_key, chi -- { echoOn } insert into data_02233 select number%10, number%3, number from numbers(100); -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; -explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, read_in_order_two_level_merge_threshold=1; +explain pipeline select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0, 
read_in_order_two_level_merge_threshold=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1; select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=1, max_block_size=1; -select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1; +select parent_key, child_key, count() from data_02233 group by parent_key, child_key with totals order by parent_key, child_key settings max_threads=1, optimize_aggregation_in_order=0; -- fuzzer SELECT child_key, parent_key, child_key FROM data_02233 GROUP BY parent_key, child_key, child_key ORDER BY child_key, parent_key ASC NULLS LAST SETTINGS max_threads = 1, optimize_aggregation_in_order = 1; diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference b/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference new file mode 100644 index 00000000000..98ca7fb2d29 --- /dev/null +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.reference @@ -0,0 +1,2 @@ +1 +40 diff --git a/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh new file mode 100755 index 00000000000..fd5144c5c39 --- /dev/null +++ b/tests/queries/0_stateless/02293_test_zstd_window_log_max.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# reuse the test data in 01946_test_zstd_decompression_with_escape_sequence_at_the_end_of_buffer.sh +$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max = 20" 2>&1 | grep -c "ZSTD_DECODER_FAILED" +$CLICKHOUSE_LOCAL --query "SELECT count() FROM file('$CUR_DIR/data_zstd/test_01946.zstd', JSONEachRow, 'foo String') SETTINGS zstd_window_log_max = 21" diff --git a/tests/queries/0_stateless/02305_schema_inference_with_globs.sh b/tests/queries/0_stateless/02305_schema_inference_with_globs.sh index 19506c84645..b9bf0096f37 100755 --- a/tests/queries/0_stateless/02305_schema_inference_with_globs.sh +++ b/tests/queries/0_stateless/02305_schema_inference_with_globs.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest +# Tags: no-fasttest, no-cpu-aarch64 CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.reference b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.reference new file mode 100644 index 00000000000..64ab61e6765 --- /dev/null +++ b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.reference @@ -0,0 +1,2 @@ +0 +--progress produce some rows diff --git a/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.sh b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.sh new file mode 100755 index 00000000000..6c37d870652 --- /dev/null +++ b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Tags: long + +# This is the regression for the concurrent 
access in ProgressIndication, +# so it is important to read enough rows here (10e6). +# +# Initially there was 100e6, but under thread fuzzer 10min may not be enough sometimes, +# but I believe that CI will catch possible issues even with fewer rows anyway. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +tmp_file_progress="$(mktemp "$CUR_DIR/$CLICKHOUSE_TEST_UNIQUE_NAME.XXXXXX.progress")" +trap 'rm $tmp_file_progress' EXIT + +yes | head -n10000000 | $CLICKHOUSE_CLIENT -q "insert into function null('foo String') format TSV" --progress 2> "$tmp_file_progress" +echo $? +test -s "$tmp_file_progress" && echo "--progress produce some rows" || echo "FAIL: no rows with --progress" diff --git a/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.reference b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.reference new file mode 100644 index 00000000000..64ab61e6765 --- /dev/null +++ b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.reference @@ -0,0 +1,2 @@ +0 +--progress produce some rows diff --git a/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.sh b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.sh new file mode 100755 index 00000000000..00a8b7a2a90 --- /dev/null +++ b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Tags: long + +# This is the regression for the concurrent access in ProgressIndication, +# so it is important to read enough rows here (10e6). +# +# Initially there was 100e6, but under thread fuzzer 10min may not be enough sometimes, +# but I believe that CI will catch possible issues even with fewer rows anyway. + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +tmp_file_progress="$(mktemp "$CUR_DIR/$CLICKHOUSE_TEST_UNIQUE_NAME.XXXXXX.progress")" +trap 'rm $tmp_file_progress' EXIT + +yes | head -n10000000 | $CLICKHOUSE_LOCAL -q "insert into function null('foo String') format TSV" --progress 2> "$tmp_file_progress" +echo $? +test -s "$tmp_file_progress" && echo "--progress produce some rows" || echo "FAIL: no rows with --progress" diff --git a/tests/queries/0_stateless/02310_profile_events_insert.reference b/tests/queries/0_stateless/02310_profile_events_insert.reference new file mode 100644 index 00000000000..7308b2da5b1 --- /dev/null +++ b/tests/queries/0_stateless/02310_profile_events_insert.reference @@ -0,0 +1,4 @@ +client +InsertedRows: 1 (increment) +local +InsertedRows: 1 (increment) diff --git a/tests/queries/0_stateless/02310_profile_events_insert.sh b/tests/queries/0_stateless/02310_profile_events_insert.sh new file mode 100755 index 00000000000..e51297ea7c9 --- /dev/null +++ b/tests/queries/0_stateless/02310_profile_events_insert.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo client +$CLICKHOUSE_CLIENT --print-profile-events --profile-events-delay-ms=-1 -q "insert into function null('foo Int') values (1)" |& grep -o 'InsertedRows: .*' + +echo local +$CLICKHOUSE_LOCAL --print-profile-events --profile-events-delay-ms=-1 -q "insert into function null('foo Int') values (1)" |& grep -o 'InsertedRows: .*' + +exit 0 diff --git a/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.reference b/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.reference deleted file mode 100644 index fd3c81a4d76..00000000000 --- a/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.reference +++ /dev/null @@ -1,2 +0,0 @@ -5 -5 diff --git a/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.sql b/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.sql deleted file mode 100644 index f6e5678e612..00000000000 --- a/tests/queries/0_stateless/02315_executable_user_defined_function_parameter.sql +++ /dev/null @@ -1,6 +0,0 @@ -SELECT test_function_with_parameter('test')(1, 2); --{serverError 53} -SELECT test_function_with_parameter(2, 2)(1, 2); --{serverError 36} -SELECT test_function_with_parameter(1, 2); --{serverError 36} - -SELECT test_function_with_parameter(2)(1, 2); -SELECT test_function_with_parameter('2')(1, 2); diff --git a/tests/queries/0_stateless/02316_cast_to_ip_address_default_column.sql b/tests/queries/0_stateless/02316_cast_to_ip_address_default_column.sql index e8071877379..200cec8fed9 100644 --- a/tests/queries/0_stateless/02316_cast_to_ip_address_default_column.sql +++ b/tests/queries/0_stateless/02316_cast_to_ip_address_default_column.sql @@ -1,3 +1,5 @@ +-- Tags: no-backward-compatibility-check + SET cast_ipv4_ipv6_default_on_conversion_error = 1; DROP TABLE IF EXISTS ipv4_test; diff --git a/tests/queries/0_stateless/02316_expressions_with_window_functions.reference b/tests/queries/0_stateless/02316_expressions_with_window_functions.reference new file mode 100644 index 00000000000..2560e90408c --- /dev/null +++ b/tests/queries/0_stateless/02316_expressions_with_window_functions.reference @@ -0,0 +1,254 @@ +-- { echoOn } +-- SELECT number, sum(number) + 1 OVER (PARTITION BY (number % 10)) +-- FROM numbers(100) +-- ORDER BY number; -- { clientError SYNTAX_ERROR } + +SELECT number, 1 + sum(number) OVER (PARTITION BY number % 10) +FROM numbers(100) +ORDER BY number; +0 451 +1 461 +2 471 +3 481 +4 491 +5 501 +6 511 +7 521 +8 531 +9 541 +10 451 +11 461 +12 471 +13 481 +14 491 +15 501 +16 511 +17 521 +18 531 +19 541 +20 451 +21 461 +22 471 +23 481 +24 491 +25 501 +26 511 +27 521 +28 531 +29 541 +30 451 +31 461 +32 471 +33 481 +34 491 +35 501 +36 511 +37 521 +38 531 +39 541 +40 451 +41 461 +42 471 +43 481 +44 491 +45 501 +46 511 +47 521 +48 531 +49 541 +50 451 +51 461 +52 471 +53 481 +54 491 +55 501 +56 511 +57 521 +58 531 +59 541 +60 451 +61 461 +62 471 +63 481 +64 491 +65 501 +66 511 +67 521 +68 531 +69 541 +70 451 +71 461 +72 471 +73 481 +74 491 +75 501 +76 511 +77 521 +78 531 +79 541 +80 451 +81 461 +82 471 +83 481 +84 491 +85 501 +86 511 +87 521 +88 531 +89 541 +90 451 +91 461 +92 471 +93 481 +94 491 +95 501 +96 511 +97 521 +98 531 +99 541 +SELECT sum(number) + 1 AS x +FROM numbers(100) +GROUP BY number % 10 +ORDER BY x; +451 +461 +471 +481 +491 +501 +511 +521 +531 +541 +SELECT + number, + sum(number) OVER (PARTITION BY number % 10) / count() OVER (PARTITION BY number % 10), + avg(number) OVER (PARTITION BY number % 10) 
+FROM numbers(100) +ORDER BY number ASC; +0 45 45 +1 46 46 +2 47 47 +3 48 48 +4 49 49 +5 50 50 +6 51 51 +7 52 52 +8 53 53 +9 54 54 +10 45 45 +11 46 46 +12 47 47 +13 48 48 +14 49 49 +15 50 50 +16 51 51 +17 52 52 +18 53 53 +19 54 54 +20 45 45 +21 46 46 +22 47 47 +23 48 48 +24 49 49 +25 50 50 +26 51 51 +27 52 52 +28 53 53 +29 54 54 +30 45 45 +31 46 46 +32 47 47 +33 48 48 +34 49 49 +35 50 50 +36 51 51 +37 52 52 +38 53 53 +39 54 54 +40 45 45 +41 46 46 +42 47 47 +43 48 48 +44 49 49 +45 50 50 +46 51 51 +47 52 52 +48 53 53 +49 54 54 +50 45 45 +51 46 46 +52 47 47 +53 48 48 +54 49 49 +55 50 50 +56 51 51 +57 52 52 +58 53 53 +59 54 54 +60 45 45 +61 46 46 +62 47 47 +63 48 48 +64 49 49 +65 50 50 +66 51 51 +67 52 52 +68 53 53 +69 54 54 +70 45 45 +71 46 46 +72 47 47 +73 48 48 +74 49 49 +75 50 50 +76 51 51 +77 52 52 +78 53 53 +79 54 54 +80 45 45 +81 46 46 +82 47 47 +83 48 48 +84 49 49 +85 50 50 +86 51 51 +87 52 52 +88 53 53 +89 54 54 +90 45 45 +91 46 46 +92 47 47 +93 48 48 +94 49 49 +95 50 50 +96 51 51 +97 52 52 +98 53 53 +99 54 54 +SELECT sum(number) / sum(sum(number)) OVER (PARTITION BY (number % 10)) +FROM numbers(10000) +GROUP BY number % 10; +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT 1 + sum(number) / sum(sum(number)) OVER (PARTITION BY (number % 10)) +FROM numbers(10000) +GROUP BY number % 10; +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 diff --git a/tests/queries/0_stateless/02316_expressions_with_window_functions.sql b/tests/queries/0_stateless/02316_expressions_with_window_functions.sql new file mode 100644 index 00000000000..c3137ef3746 --- /dev/null +++ b/tests/queries/0_stateless/02316_expressions_with_window_functions.sql @@ -0,0 +1,28 @@ +-- { echoOn } +-- SELECT number, sum(number) + 1 OVER (PARTITION BY (number % 10)) +-- FROM numbers(100) +-- ORDER BY number; -- { clientError SYNTAX_ERROR } + +SELECT number, 1 + sum(number) OVER (PARTITION BY number % 10) +FROM numbers(100) +ORDER BY number; + +SELECT sum(number) + 1 AS x +FROM numbers(100) +GROUP BY number % 10 +ORDER BY x; + +SELECT + number, + sum(number) OVER (PARTITION BY number % 10) / count() OVER (PARTITION BY number % 10), + avg(number) OVER (PARTITION BY number % 10) +FROM numbers(100) +ORDER BY number ASC; + +SELECT sum(number) / sum(sum(number)) OVER (PARTITION BY (number % 10)) +FROM numbers(10000) +GROUP BY number % 10; + +SELECT 1 + sum(number) / sum(sum(number)) OVER (PARTITION BY (number % 10)) +FROM numbers(10000) +GROUP BY number % 10; diff --git a/tests/queries/0_stateless/02317_functions_with_nothing.reference b/tests/queries/0_stateless/02317_functions_with_nothing.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02317_functions_with_nothing.sql b/tests/queries/0_stateless/02317_functions_with_nothing.sql new file mode 100644 index 00000000000..3bfda3bb6c3 --- /dev/null +++ b/tests/queries/0_stateless/02317_functions_with_nothing.sql @@ -0,0 +1,7 @@ +SELECT JSONExtractKeysAndValuesRaw(arrayJoin([])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT JSONHas(arrayJoin([])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT isValidJSON(arrayJoin([])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT concat(arrayJoin([]), arrayJoin([NULL, ''])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT plus(arrayJoin([]), arrayJoin([NULL, 1])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT sipHash64(arrayJoin([]), [NULL], arrayJoin(['', NULL, '', NULL])); -- {serverError ILLEGAL_TYPE_OF_ARGUMENT} +SELECT [concat(NULL, arrayJoin([]))]; diff --git 
a/tests/queries/0_stateless/02320_alter_columns_with_dots.reference b/tests/queries/0_stateless/02320_alter_columns_with_dots.reference new file mode 100644 index 00000000000..3c3e828924e --- /dev/null +++ b/tests/queries/0_stateless/02320_alter_columns_with_dots.reference @@ -0,0 +1,27 @@ +id String +abc.1 String +abc.2 String +abc String +id String +abc.2 String +abc String +abc.1 String +id String +abc String +abc.2 String +abc.1 String +id String +abc.2 String +abc String +abc.1 String +id String +abc String +abc.2 String +abc.1 String +id String +abc.2 String +abc.1 String +abc String +id String +abc.2 String +abc.1 String diff --git a/tests/queries/0_stateless/02320_alter_columns_with_dots.sql b/tests/queries/0_stateless/02320_alter_columns_with_dots.sql new file mode 100644 index 00000000000..1b48538f987 --- /dev/null +++ b/tests/queries/0_stateless/02320_alter_columns_with_dots.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS test; +CREATE TABLE test (id String, `abc.1` String, `abc.2` String, `abc` String) ENGINE MergeTree order by id; +DESC TABLE test; +ALTER TABLE test MODIFY COLUMN `abc.1` String AFTER `abc`; +DESC TABLE test; +ALTER TABLE test MODIFY COLUMN `abc.2` String AFTER `abc`; +DESC TABLE test; +ALTER TABLE test MODIFY COLUMN `abc` String AFTER `abc.2`; +DESC TABLE test; +ALTER TABLE test MODIFY COLUMN `abc` String AFTER `id`; +DESC TABLE test; +ALTER TABLE test MODIFY COLUMN `abc` String AFTER `abc.1`; +DESC TABLE test; +ALTER TABLE test DROP COLUMN `abc`; +DESC TABLE test; diff --git a/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.reference b/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.reference new file mode 100644 index 00000000000..dc31f044eef --- /dev/null +++ b/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.reference @@ -0,0 +1,18 @@ +[] +[1] +[1,1] +[] +[0] +[0,1] +[] +[0] +[0,0] +[] +[] +[] +[] +[NULL] +[NULL,NULL] +[] +[] +[] diff --git a/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.sql b/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.sql new file mode 100644 index 00000000000..08651590c76 --- /dev/null +++ b/tests/queries/0_stateless/02320_mapped_array_witn_const_nullable.sql @@ -0,0 +1,9 @@ +-- Tags: no-backward-compatibility-check + +select arrayMap(x -> toNullable(1), range(number)) from numbers(3); +select arrayFilter(x -> toNullable(1), range(number)) from numbers(3); +select arrayMap(x -> toNullable(0), range(number)) from numbers(3); +select arrayFilter(x -> toNullable(0), range(number)) from numbers(3); +select arrayMap(x -> NULL::Nullable(UInt8), range(number)) from numbers(3); +select arrayFilter(x -> NULL::Nullable(UInt8), range(number)) from numbers(3); + diff --git a/tests/queries/0_stateless/02332_dist_insert_send_logs_level.reference b/tests/queries/0_stateless/02332_dist_insert_send_logs_level.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02332_dist_insert_send_logs_level.sh b/tests/queries/0_stateless/02332_dist_insert_send_logs_level.sh new file mode 100755 index 00000000000..653cb25172a --- /dev/null +++ b/tests/queries/0_stateless/02332_dist_insert_send_logs_level.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# Tags: no-backward-compatibility-check + +CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=trace +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --server_logs_file /dev/null -q "CREATE TABLE data_02332 (key Int) Engine=Null()" +# If ClickHouse server will forward logs from the remote nodes, than it will definitely will have the following message in the log: +# +# executeQuery: (from 127.0.0.1:53440, initial_query_id: fc1f7dbd-845b-4142-9306-158ddd564e61) INSERT INTO default.data (key) VALUES (stage: Complete) +# +# And if the server will forward logs, then the query may hung. +$CLICKHOUSE_CLIENT -q "INSERT INTO FUNCTION remote('127.2', currentDatabase(), data_02332) SELECT * FROM numbers(10)" |& grep 'executeQuery.*initial_query_id.*INSERT INTO' +exit 0 diff --git a/tests/queries/0_stateless/02337_multiple_joins_original_names.reference b/tests/queries/0_stateless/02337_multiple_joins_original_names.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02337_multiple_joins_original_names.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02337_multiple_joins_original_names.sql b/tests/queries/0_stateless/02337_multiple_joins_original_names.sql new file mode 100644 index 00000000000..afafee9f8eb --- /dev/null +++ b/tests/queries/0_stateless/02337_multiple_joins_original_names.sql @@ -0,0 +1,22 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/34697 + +SELECT table1_id FROM ( + SELECT first.table1_id + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +); -- { serverError UNKNOWN_IDENTIFIER } + +SELECT table1_id FROM ( + SELECT first.table1_id + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +) SETTINGS multiple_joins_try_to_keep_original_names = 1; + +SELECT aaa FROM ( + SELECT first.table1_id as aaa + FROM (SELECT number+1 as table1_id FROM numbers(1)) as first + JOIN (SELECT number+1 as table2_id FROM numbers(1)) as second ON first.table1_id = second.table2_id + JOIN (SELECT number+1 as table3_id FROM numbers(1)) as third ON first.table1_id = third.table3_id +) SETTINGS multiple_joins_try_to_keep_original_names = 1; diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference new file mode 100644 index 00000000000..e225ce389cb --- /dev/null +++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.reference @@ -0,0 +1,2 @@ +data_02340 1_2_2_0 5 +data_02340_rep 1_0_0_0 5 diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh new file mode 100755 index 00000000000..29b3b7b3d9d --- /dev/null +++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +function random_str() +{ + local n=$1 && shift + tr -cd '[:lower:]' < /dev/urandom | head -c"$n" +} + +function check_refcnt_for_table() +{ + local table=$1 && shift + + $CLICKHOUSE_CLIENT -q "system stop merges $table" + $CLICKHOUSE_CLIENT -q "insert into $table select number, number%4 from numbers(200)" + + local query_id + query_id="$table-$(random_str 10)" + + # Notes: + # - query may sleep 0.1*(200/4)=5 seconds, it is enough to check system.parts + # - "part = 1" condition should prune all parts except first + $CLICKHOUSE_CLIENT --format Null --max_block_size 1 --query_id "$query_id" -q "select sleepEachRow(0.1) from $table where part = 1" & + PID=$! + + # wait for query to be started + while [ "$($CLICKHOUSE_CLIENT -q "select count() from system.processes where query_id = '$query_id'")" -ne 1 ]; do + sleep 0.1 + done + + # When the query only starts it execution it holds reference for each part, + # however when it starts reading, partition pruning takes place, + # and it should hold only parts that are required for SELECT + # + # So 2 seconds delay to ensure that it goes the reading stage. + sleep 2 + + # NOTE: parts that are used in query will have refcount increased for each range + $CLICKHOUSE_CLIENT -q "select table, name, refcount from system.parts where database = '$CLICKHOUSE_DATABASE' and table = '$table' and refcount > 1" + + kill -INT $PID + wait $PID +} + +$CLICKHOUSE_CLIENT -nmq " + drop table if exists data_02340; + create table data_02340 (key Int, part Int) engine=MergeTree() partition by part order by key; +" +check_refcnt_for_table data_02340 + +$CLICKHOUSE_CLIENT -nmq " + drop table if exists data_02340_rep; + create table data_02340_rep (key Int, part Int) engine=ReplicatedMergeTree('/clickhouse/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX', '1') partition by part order by key; +" +check_refcnt_for_table data_02340_rep + +exit 0 diff --git a/tests/queries/0_stateless/02342_window_view_different_struct.reference b/tests/queries/0_stateless/02342_window_view_different_struct.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02342_window_view_different_struct.sql b/tests/queries/0_stateless/02342_window_view_different_struct.sql new file mode 100644 index 00000000000..0b21e39d3c9 --- /dev/null +++ b/tests/queries/0_stateless/02342_window_view_different_struct.sql @@ -0,0 +1,25 @@ +-- Tags: no-backward-compatibility-check:22.6 + +SET allow_experimental_window_view = 1; + +DROP TABLE IF EXISTS data_02342; +DROP TABLE IF EXISTS window_view_02342; + +-- ALTER +CREATE TABLE data_02342 (a UInt8) ENGINE=MergeTree ORDER BY a; +CREATE WINDOW VIEW window_view_02342 ENGINE=Memory AS SELECT count(a), tumbleStart(wid) AS w_start, tumbleEnd(tumble(now(), INTERVAL '3' SECOND)) AS w_end FROM data_02342 GROUP BY tumble(now(), INTERVAL '3' SECOND) AS wid; +INSERT INTO data_02342 VALUES (42); +ALTER TABLE data_02342 ADD COLUMN s String; +INSERT INTO data_02342 VALUES (42, 'data_02342'); +DROP TABLE data_02342; +DROP TABLE window_view_02342; + +-- DROP/CREATE +CREATE TABLE data_02342 (a UInt8) ENGINE=MergeTree ORDER BY a; +CREATE WINDOW VIEW window_view_02342 ENGINE=Memory AS SELECT count(a), tumbleStart(wid) AS w_start, tumbleEnd(tumble(now(), INTERVAL '3' SECOND)) AS w_end FROM data_02342 GROUP BY tumble(now(), INTERVAL '3' SECOND) AS wid; +INSERT INTO data_02342 VALUES (42); +DROP TABLE data_02342; +CREATE TABLE data_02342 (a UInt8, s String) ENGINE=MergeTree ORDER BY a; +INSERT INTO data_02342 VALUES (42, 
'data_02342'); +DROP TABLE data_02342; +DROP TABLE window_view_02342; diff --git a/tests/queries/0_stateless/02343_create_empty_as_select.reference b/tests/queries/0_stateless/02343_create_empty_as_select.reference new file mode 100644 index 00000000000..3b0d34c5863 --- /dev/null +++ b/tests/queries/0_stateless/02343_create_empty_as_select.reference @@ -0,0 +1,4 @@ +CREATE TABLE default.t\n(\n `1` UInt8\n)\nENGINE = Memory +0 +CREATE MATERIALIZED VIEW default.mv\n(\n `1` UInt8\n)\nENGINE = Memory AS\nSELECT 1 +0 diff --git a/tests/queries/0_stateless/02343_create_empty_as_select.sql b/tests/queries/0_stateless/02343_create_empty_as_select.sql new file mode 100644 index 00000000000..54f383b553f --- /dev/null +++ b/tests/queries/0_stateless/02343_create_empty_as_select.sql @@ -0,0 +1,18 @@ + +drop table if exists t; +drop table if exists mv; + +create table t engine=Memory empty; -- { clientError SYNTAX_ERROR } +create table t engine=Memory empty as; -- { clientError SYNTAX_ERROR } +create table t engine=Memory as; -- { clientError SYNTAX_ERROR } +create table t engine=Memory empty as select 1; + +show create table t; +select count() from t; + +create materialized view mv engine=Memory empty as select 1; +show create mv; +select count() from mv; + +drop table t; +drop table mv; diff --git a/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.reference b/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.reference new file mode 100644 index 00000000000..bd188af4707 --- /dev/null +++ b/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.reference @@ -0,0 +1 @@ +57344 diff --git a/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.sql b/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.sql new file mode 100644 index 00000000000..03e32d32497 --- /dev/null +++ b/tests/queries/0_stateless/02343_read_from_s3_compressed_blocks.sql @@ -0,0 +1,15 @@ +-- Tags: no-parallel, no-fasttest, no-s3-storage + +DROP TABLE IF EXISTS t_s3_compressed_blocks; + +CREATE TABLE t_s3_compressed_blocks (id UInt64, s String CODEC(NONE)) +ENGINE = MergeTree ORDER BY id +SETTINGS storage_policy = 's3_cache', +min_bytes_for_wide_part = 0; + +INSERT INTO t_s3_compressed_blocks SELECT number, randomPrintableASCII(128) from numbers(57344); + +SET max_threads = 1; +SELECT count() FROM t_s3_compressed_blocks WHERE NOT ignore(s); + +DROP TABLE t_s3_compressed_blocks; diff --git a/tests/queries/0_stateless/replication.lib b/tests/queries/0_stateless/replication.lib index 61491630f46..6bf3c35f344 100755 --- a/tests/queries/0_stateless/replication.lib +++ b/tests/queries/0_stateless/replication.lib @@ -44,7 +44,7 @@ function check_replication_consistency() num_tries=0 while [[ $($CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query LIKE '%$table_name_prefix%'") -ne 1 ]]; do sleep 0.5; - num_tries=$((num_tries-1)) + num_tries=$((num_tries+1)) if [ $num_tries -eq 100 ]; then $CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query LIKE '%$table_name_prefix%' FORMAT Vertical" break diff --git a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index ea4a4d12867..7b949cf23e6 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +# Tags: no-tsan +# FIXME It became flaky after upgrading 
to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/1_stateful/00172_early_constant_folding.reference b/tests/queries/1_stateful/00172_early_constant_folding.reference index 27cd6b545e0..da564dc694e 100644 --- a/tests/queries/1_stateful/00172_early_constant_folding.reference +++ b/tests/queries/1_stateful/00172_early_constant_folding.reference @@ -2,6 +2,5 @@ ExpressionTransform (ReadFromStorage) AggregatingTransform - StrictResize - ExpressionTransform - SourceFromSingleChunk 0 → 1 + ExpressionTransform + SourceFromSingleChunk 0 → 1 diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index ce5947d95ed..87c999c2032 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -19,9 +19,9 @@ export CLICKHOUSE_TEST_UNIQUE_NAME="${CLICKHOUSE_TEST_NAME}_${CLICKHOUSE_DATABAS [ -v CLICKHOUSE_PORT_TCP ] && CLICKHOUSE_BENCHMARK_OPT0+=" --port=${CLICKHOUSE_PORT_TCP} " [ -v CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL ] && CLICKHOUSE_CLIENT_OPT0+=" --send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL} " [ -v CLICKHOUSE_DATABASE ] && CLICKHOUSE_CLIENT_OPT0+=" --database=${CLICKHOUSE_DATABASE} " -[ -v CLICKHOUSE_LOG_COMMENT ] && CLICKHOUSE_CLIENT_OPT0+=" --log_comment='${CLICKHOUSE_LOG_COMMENT}' " +[ -v CLICKHOUSE_LOG_COMMENT ] && CLICKHOUSE_CLIENT_OPT0+=" --log_comment $(printf '%q' ${CLICKHOUSE_LOG_COMMENT}) " [ -v CLICKHOUSE_DATABASE ] && CLICKHOUSE_BENCHMARK_OPT0+=" --database=${CLICKHOUSE_DATABASE} " -[ -v CLICKHOUSE_LOG_COMMENT ] && CLICKHOUSE_BENCHMARK_OPT0+=" --log_comment='${CLICKHOUSE_LOG_COMMENT}' " +[ -v CLICKHOUSE_LOG_COMMENT ] && CLICKHOUSE_BENCHMARK_OPT0+=" --log_comment $(printf '%q' ${CLICKHOUSE_LOG_COMMENT}) " export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="clickhouse"} # client @@ -129,3 +129,17 @@ function clickhouse_client_removed_host_parameter() # bash regex magic is arcane, but version dependant and weak; sed or awk are not really portable. 
$(echo "$CLICKHOUSE_CLIENT" | python3 -c "import sys, re; print(re.sub('--host(\s+|=)[^\s]+', '', sys.stdin.read()))") "$@" } + +function wait_for_queries_to_finish() +{ + # Wait for all queries to finish (query may still be running if thread is killed by timeout) + num_tries=0 + while [[ $($CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query NOT LIKE '%system.processes%'") -ne 0 ]]; do + sleep 0.5; + num_tries=$((num_tries+1)) + if [ $num_tries -eq 20 ]; then + $CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE current_database=currentDatabase() AND query NOT LIKE '%system.processes%' FORMAT Vertical" + break + fi + done +} diff --git a/tools/clickhouse-diagnostics/internal/platform/database/native_test.go b/tools/clickhouse-diagnostics/internal/platform/database/native_test.go index d1d4711cddb..1e936fe2449 100644 --- a/tools/clickhouse-diagnostics/internal/platform/database/native_test.go +++ b/tools/clickhouse-diagnostics/internal/platform/database/native_test.go @@ -67,7 +67,7 @@ func TestReadTableNamesForDatabase(t *testing.T) { t.Run("client can read tables for a database", func(t *testing.T) { tables, err := clickhouseClient.ReadTableNamesForDatabase("system") require.Nil(t, err) - require.Equal(t, 70, len(tables)) + require.GreaterOrEqual(t, len(tables), 70) require.Contains(t, tables, "merge_tree_settings") }) } diff --git a/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.err.log b/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.err.log new file mode 100644 index 00000000000..1a1768fe87e --- /dev/null +++ b/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.err.log @@ -0,0 +1,10 @@ +2021.12.13 10:12:26.940169 [ 38398 ] {} Access(local directory): File /var/lib/clickhouse/access/users.list doesn't exist +2021.12.13 10:12:26.940204 [ 38398 ] {} Access(local directory): Recovering lists in directory /var/lib/clickhouse/access/ +2021.12.13 10:12:40.649453 [ 38445 ] {} Access(user directories): from: 127.0.0.1, user: default: Authentication failed: Code: 193. DB::Exception: Invalid credentials. (WRONG_PASSWORD), Stack trace (when copying this message, always include the lines below): + +0. DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x9b722d4 in /usr/bin/clickhouse +1. DB::IAccessStorage::throwInvalidCredentials() @ 0x119d9b27 in /usr/bin/clickhouse +2. DB::IAccessStorage::loginImpl(DB::Credentials const&, Poco::Net::IPAddress const&, DB::ExternalAuthenticators const&) const @ 0x119d98d7 in /usr/bin/clickhouse +3. DB::IAccessStorage::login(DB::Credentials const&, Poco::Net::IPAddress const&, DB::ExternalAuthenticators const&, bool) const @ 0x119d9084 in /usr/bin/clickhouse +4. DB::MultipleAccessStorage::loginImpl(DB::Credentials const&, Poco::Net::IPAddress const&, DB::ExternalAuthenticators const&) const @ 0x119ff93c in /usr/bin/clickhouse +5. 
DB::IAccessStorage::login(DB::Credentials const&, Poco::Net::IPAddress const&, DB::ExternalAuthenticators const&, bool) const @ 0x119d9084 in /usr/bin/clickhouse diff --git a/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.log b/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.log new file mode 100644 index 00000000000..f6abe7764ba --- /dev/null +++ b/tools/clickhouse-diagnostics/testdata/logs/var/logs/clickhouse-server.log @@ -0,0 +1,10 @@ +2022.02.02 14:49:32.458680 [ 200404 ] {} DiskLocal: Reserving 2.47 MiB on disk `default`, having unreserved 1.56 TiB. +2022.02.02 14:49:32.459086 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} MergeTask::PrepareStage: Merging 2 parts: from 202202_147058_147549_343 to 202202_147550_147550_0 into Wide +2022.02.02 14:49:32.459201 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} MergeTask::PrepareStage: Selected MergeAlgorithm: Horizontal +2022.02.02 14:49:32.459262 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} MergeTreeSequentialSource: Reading 159 marks from part 202202_147058_147549_343, total 1289014 rows starting from the beginning of the part +2022.02.02 14:49:32.459614 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} MergeTreeSequentialSource: Reading 2 marks from part 202202_147550_147550_0, total 2618 rows starting from the beginning of the part +2022.02.02 14:49:32.507755 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} MergeTask::MergeProjectionsStage: Merge sorted 1291632 rows, containing 5 columns (5 merged, 0 gathered) in 0.048711404 sec., 26516008.448452853 rows/sec., 639.52 MiB/sec. +2022.02.02 14:49:32.508332 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} system.asynchronous_metric_log (de87df8b-2250-439c-9e87-df8b2250339c): Renaming temporary part tmp_merge_202202_147058_147550_344 to 202202_147058_147550_344. +2022.02.02 14:49:32.508406 [ 200359 ] {de87df8b-2250-439c-9e87-df8b2250339c::202202_147058_147550_344} system.asynchronous_metric_log (de87df8b-2250-439c-9e87-df8b2250339c) (MergerMutator): Merged 2 parts: from 202202_147058_147549_343 to 202202_147550_147550_0 +2022.02.02 14:49:32.508440 [ 200359 ] {} MemoryTracker: Peak memory usage Mutate/Merge: 16.31 MiB. 
+2022.02.02 14:49:33.000148 [ 200388 ] {} AsynchronousMetrics: MemoryTracking: was 774.16 MiB, peak 2.51 GiB, will set to 772.30 MiB (RSS), difference: -1.86 MiB diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index b3a113ad2db..059566d83eb 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -462,6 +462,7 @@ unencrypted unixodbc url userspace +userver utils uuid variadic diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 2047d46251e..b33cbcebdb7 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -4,6 +4,7 @@ v22.4.5.9-stable 2022-05-06 v22.4.4.7-stable 2022-04-29 v22.4.3.3-stable 2022-04-26 v22.4.2.1-stable 2022-04-22 +v22.3.7.28-lts 2022-06-20 v22.3.6.5-lts 2022-05-06 v22.3.5.5-lts 2022-04-29 v22.3.4.20-lts 2022-04-26 diff --git a/utils/trace-visualizer/README.md b/utils/trace-visualizer/README.md index 63a6a737e3c..daef02d411e 100644 --- a/utils/trace-visualizer/README.md +++ b/utils/trace-visualizer/README.md @@ -1,17 +1,12 @@ Trace visualizer is a tool for representation of a tracing data as a Gantt diagram. # Quick start -For now this tool is not integrated into ClickHouse and requires a lot of manual adjustments. -```bash -cd utils/trace-visualizer -python3 -m http.server -``` -Open [localhost](http://localhost:8000). It will show an example of data. To show your tracing data you have to put it in JSON format near `index.html` and change call to `fetchData()` function at the bottom of `index.html`. (Do not forget to disable browser caching while changing it). +For now this tool is not integrated into ClickHouse and requires manual steps. Open `trace-visualizer/index.html` in your browser; it will show example data. To visualize your own data, click the `Load` button and select your trace data JSON file. # Visualizing query trace First of all [opentelemetry_span_log](https://clickhouse.com/docs/en/operations/opentelemetry/) system table must be enabled to save query traces. 
Then run a query you want to trace with a setting: ```sql -set opentelemetry_start_trace_probability=1; +SET opentelemetry_start_trace_probability=1; SELECT 1; ``` @@ -22,10 +17,9 @@ SELECT DISTINCT trace_id FROM system.opentelemetry_span_log ORDER BY query_start To obtain JSON data suitable for visualizing run: ```sql -SELECT tuple (parent_span_id, attribute['clickhouse.thread_id'] || attribute['thread_number'] as thread_id)::Tuple(parent_span_id UInt64, thread_id String) as group, operation_name, start_time_us, finish_time_us, sipHash64(operation_name) as color, attribute -from system.opentelemetry_span_log +SELECT tuple (leftPad(attribute['clickhouse.thread_id'] || attribute['thread_number'], 10, '0') as thread_id, parent_span_id)::Tuple(thread_id String, parent_span_id UInt64) as group, operation_name, start_time_us, finish_time_us, sipHash64(operation_name) as color, attribute +FROM system.opentelemetry_span_log WHERE trace_id = 'your-trace-id' -ORDER BY group ASC FORMAT JSON SETTINGS output_format_json_named_tuples_as_objects = 1; ``` diff --git a/utils/trace-visualizer/css/d3-gantt.css b/utils/trace-visualizer/css/d3-gantt.css index 31da093dd2d..3e3150a02f0 100644 --- a/utils/trace-visualizer/css/d3-gantt.css +++ b/utils/trace-visualizer/css/d3-gantt.css @@ -8,7 +8,6 @@ } rect.zoom-panel { - /*cursor: ew-resize;*/ fill: none; pointer-events: all; } @@ -20,18 +19,18 @@ rect.zoom-panel { } .axis.y { - font-size: 16px; + font-size: 9px; cursor: ns-resize; } .axis.x { - font-size: 16px; + font-size: 9px; } #ruler { text-anchor: middle; alignment-baseline: before-edge; - font-size: 16px; + font-size: 9px; font-family: sans-serif; pointer-events: none; } diff --git a/utils/trace-visualizer/index.html b/utils/trace-visualizer/index.html index ea02b3141ad..cec88e587d3 100644 --- a/utils/trace-visualizer/index.html +++ b/utils/trace-visualizer/index.html @@ -14,26 +14,77 @@ + + +
+ + diff --git a/utils/trace-visualizer/js/d3-gantt.js b/utils/trace-visualizer/js/d3-gantt.js index 21a9dab6133..e3a51a33c21 100644 --- a/utils/trace-visualizer/js/d3-gantt.js +++ b/utils/trace-visualizer/js/d3-gantt.js @@ -37,6 +37,7 @@ .on("zoom", function() { if (tipShown != null) { tip.hide(tipShown); + tipShown = null; } var tr = d3.event.transform; xZoomed = tr.rescaleX(x); @@ -50,8 +51,6 @@ zoomContainer1.attr("transform", "translate(" + tr.x + ",0) scale(" + tr.k + ",1)"); zoomContainer2.attr("transform", "translate(" + tr.x + ",0) scale(" + tr.k + ",1)"); - - render(); }) .on("start", function() { zoom.startScreenY = d3.event.sourceEvent.screenY; @@ -116,6 +115,12 @@ .call(xAxis) ; + // ruler should be drawn above x axis and under y axis + ruler = fixedContainer.append("g") + .attr("id", "ruler") + .attr("transform", "translate(0, 0)") + ; + // create y axis fixedContainer.append("g") .attr("class", "y axis") @@ -186,10 +191,6 @@ ; // ruler - ruler = fixedContainer.append("g") - .attr("id", "ruler") - .attr("transform", "translate(0, 0)") - ; ruler.append("rect") .attr("id", "ruler-line") .attr("x", 0) @@ -218,10 +219,9 @@ // scroll handling window.onscroll = function myFunction() { - documentBodyScrollLeft(document.body.scrollLeft); - documentBodyScrollTop(document.body.scrollTop); + documentBodyScrollLeft(window.scrollX); + documentBodyScrollTop(window.scrollY); var scroll = scrollParams(); - svgChartContainer .attr("transform", "translate(" + margin.left + ", " + (margin.top + scroll.y1) + ")"); @@ -262,10 +262,6 @@ return "translate(" + x(d.t1) + "," + y(d.band) + ")"; } - var xPixel = function(d) { - return xZoomed.invert(1) - xZoomed.invert(0); - } - var render = function(t0, smooth) { // Save/restore last t0 value if (!arguments.length || t0 == -1) { @@ -282,7 +278,7 @@ .attr("class", "bar") .attr("vector-effect", "non-scaling-stroke") .style("fill", d => d.color) - .on('click', function(d) { + .on('mouseover', function(d) { if (tipShown != d) { tipShown = d; tip.show(d); @@ -296,7 +292,7 @@ .attr("y", 0) .attr("transform", bandTransform) .attr("height", y.bandwidth()) - .attr("width", d => Math.max(1*xPixel(), x(d.t2) - x(d.t1))) + .attr("width", d => x(d.t2) - x(d.t1)) ; var emptyMarker = bandsSvg.selectAll("text") @@ -314,12 +310,11 @@ //.clamp(true); // dosn't work with zoom/pan xZoomed = x; y = d3.scaleBand() - .domain(Object.values(data).map(d => d.band).sort()) - .rangeRound([0, height - margin.top - margin.bottom]) - .padding(0.5); + .domain([...data.bands]) + .range([1, height - margin.top - margin.bottom]) + .padding(1/8); xAxis = d3.axisBottom() .scale(x) - //.tickSubdivide(true) .tickSize(8) .tickPadding(8); yAxis = d3.axisLeft() @@ -331,7 +326,7 @@ var documentBodyScrollLeft = function(value) { if (!arguments.length) { if (documentBodyScrollLeft.value === undefined) { - documentBodyScrollLeft.value = document.body.scrollLeft; + documentBodyScrollLeft.value = window.scrollX; } return documentBodyScrollLeft.value; } else { @@ -343,7 +338,7 @@ var documentBodyScrollTop = function(value) { if (!arguments.length) { if (!documentBodyScrollTop.value === undefined) { - documentBodyScrollTop.value = document.body.scrollTop; + documentBodyScrollTop.value = window.scrollY; } return documentBodyScrollTop.value; } else { @@ -353,7 +348,7 @@ var scrollParams = function() { var y1 = documentBodyScrollTop(); - var y2 = y1 + window.innerHeight - margin.footer; + var y2 = y1 + view_height; y2 = Math.min(y2, height - margin.top - margin.bottom); var h = y2 - y1; return { @@ 
-401,7 +396,7 @@ var textWidth = 10 * posText.length; ruler.select("#bgrect") .attr("x", -textWidth/2 - xpadding) - .attr("y", positionRuler.bbox.y - ypadding) + .attr("y", positionRuler.bbox.y - ypadding + window.scrollY) .attr("width", textWidth + (xpadding*2)) .attr("height", positionRuler.bbox.height + (ypadding*2)) ; @@ -425,6 +420,13 @@ return gantt; } + gantt.view_height = function(value) { + if (!arguments.length) + return view_height; + view_height = +value; + return gantt; + } + gantt.selector = function(value) { if (!arguments.length) return selector; @@ -444,12 +446,18 @@ return data; } + gantt.destroy = function() { + tip.destroy(); + d3.select(selector).selectAll("svg").remove(); + } + // constructor // Config - var margin = { top: 20, right: 40, bottom: 20, left: 200, footer: 100 }, + var margin = { top: 0, right: 30, bottom: 20, left: 150 }, height = document.body.clientHeight - margin.top - margin.bottom - 5, width = document.body.clientWidth - margin.right - margin.left - 5, + view_height = window.innerHeight, selector = 'body', timeDomainStart = 0, timeDomainEnd = 1000,