Merge branch 'master' into keeper-reduce-memory

2024-11-25 00:52:02 +00:00 · 2024-01-22 08:04:17 +00:00 · 2024-01-22 08:04:17 +00:00 · 59f9abcf58
commit 59f9abcf58
parent 0132455b39 ecaef80ea0
426 changed files with 7661 additions and 3340 deletions
--- a/.github/workflows/jepsen.yml
+++ b/.github/workflows/jepsen.yml
@ -8,13 +8,13 @@ on: # yamllint disable-line rule:truthy
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch:
-  workflow_call:
 jobs:
  KeeperJepsenRelease:
    uses: ./.github/workflows/reusable_simple_job.yml
    with:
      test_name: Jepsen keeper check
      runner_type: style-checker
+      report_required: true
      run_command: |
        python3 jepsen_check.py keeper
  # ServerJepsenRelease:
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -15,6 +15,8 @@ jobs:
    outputs:
      data: ${{ steps.runconfig.outputs.CI_DATA }}
    steps:
+      - name: DebugInfo
+        uses: hmarr/debug-action@a701ed95a46e6f2fb0df25e1a558c16356fae35a
      - name: Check out repository code
        uses: ClickHouse/checkout@v1
        with:
@ -33,11 +35,9 @@ jobs:
      - name: PrepareRunConfig
        id: runconfig
        run: |
-            echo "::group::configure CI run"
            python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --rebuild-all-binaries --outfile ${{ runner.temp }}/ci_run_data.json
-            echo "::endgroup::"

-            echo "::group::CI run configure results"
+            echo "::group::CI configuration"
            python3 -m json.tool ${{ runner.temp }}/ci_run_data.json
            echo "::endgroup::"

@ -255,9 +255,9 @@ jobs:
      run_command: |
        cd "$GITHUB_WORKSPACE/tests/ci"
        python3 docker_server.py --release-type head \
-          --image-repo clickhouse/clickhouse-server --image-path docker/server
+          --image-repo clickhouse/clickhouse-server --image-path docker/server --allow-build-reuse
        python3 docker_server.py --release-type head \
-          --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
+          --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper --allow-build-reuse
 ############################################################################################
 ##################################### BUILD REPORTER #######################################
 ############################################################################################
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -22,6 +22,8 @@ jobs:
    outputs:
      data: ${{ steps.runconfig.outputs.CI_DATA }}
    steps:
+      - name: DebugInfo
+        uses: hmarr/debug-action@a701ed95a46e6f2fb0df25e1a558c16356fae35a
      - name: Check out repository code
        uses: ClickHouse/checkout@v1
        with:
@ -44,11 +46,9 @@ jobs:
      - name: PrepareRunConfig
        id: runconfig
        run: |
-            echo "::group::configure CI run"
            python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --configure --outfile ${{ runner.temp }}/ci_run_data.json
-            echo "::endgroup::"

-            echo "::group::CI run configure results"
+            echo "::group::CI configuration"
            python3 -m json.tool ${{ runner.temp }}/ci_run_data.json
            echo "::endgroup::"

@ -67,6 +67,7 @@ jobs:
          DOCKER_TAG=$(echo '${{ toJson(fromJson(steps.runconfig.outputs.CI_DATA).docker_data.images) }}' | tr -d '\n')
          export DOCKER_TAG=$DOCKER_TAG
          python3 ./tests/ci/style_check.py --no-push
+          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ runner.temp }}/ci_run_data.json --post --job-name 'Style check'
  BuildDockers:
    needs: [RunConfig]
    if: ${{ !failure() && !cancelled() }}
@ -796,7 +797,7 @@ jobs:
      test_name: Unit tests (asan)
      runner_type: fuzzer-unit-tester
      data: ${{ needs.RunConfig.outputs.data }}
-  UnitTestsReleaseClang:
+  UnitTestsRelease:
    needs: [RunConfig, BuilderBinRelease]
    if: ${{ !failure() && !cancelled() }}
    uses: ./.github/workflows/reusable_test.yml
@ -923,7 +924,7 @@ jobs:
      - UnitTestsTsan
      - UnitTestsMsan
      - UnitTestsUBsan
-      - UnitTestsReleaseClang
+      - UnitTestsRelease
      - CompatibilityCheckX86
      - CompatibilityCheckAarch64
      - SQLancerTestRelease
@ -966,13 +967,20 @@ jobs:
 #############################################################################################
 ###################################### JEPSEN TESTS #########################################
 #############################################################################################
+  # This is a special test NOT INCLUDED in FinishCheck
+  # When it's skipped, all dependent tasks will be skipped too.
+  # DO NOT add it there
  Jepsen:
-    # This is special test NOT INCLUDED in FinishCheck
-    # When it's skipped, all dependent tasks will be skipped too.
-    # DO NOT add it there
-    if: ${{ !failure() && !cancelled() && contains(github.event.pull_request.labels.*.name, 'jepsen-test') }}
+    # we need concurrency as the job uses dedicated instances in the cloud
+    concurrency:
+      group: jepsen
+    if: ${{ !failure() && !cancelled() }}
    needs: [RunConfig, BuilderBinRelease]
-    uses: ./.github/workflows/jepsen.yml
+    uses: ./.github/workflows/reusable_test.yml
+    with:
+      test_name: ClickHouse Keeper Jepsen
+      runner_type: style-checker
+      data: ${{ needs.RunConfig.outputs.data }}
 #############################################################################################
 ####################################### libFuzzer ###########################################
 #############################################################################################
--- a/.github/workflows/reusable_build.yml
+++ b/.github/workflows/reusable_build.yml
@ -58,6 +58,7 @@ jobs:
      - name: Apply sparse checkout for contrib # in order to check that it doesn't break build
        # This step is done in GITHUB_WORKSPACE,
        # because it's broken in REPO_COPY for some reason
+        # See also update-submodules.sh
        if: ${{ env.BUILD_SPARSE_CHECKOUT == 'true' }}
        run: |
          rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -72,12 +73,15 @@ jobs:
      - name: Pre
        run: |
          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --pre --job-name '${{inputs.build_name}}'
-      - name: Build
+      - name: Run
        run: |
-          python3 "$GITHUB_WORKSPACE/tests/ci/build_check.py" "$BUILD_NAME"
+          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" \
+            --infile ${{ toJson(inputs.data) }} \
+            --job-name "$BUILD_NAME" \
+            --run
      - name: Post
        # it still be build report to upload for failed build job
-        if: always()
+        if: ${{ !cancelled() }}
        run: |
          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --post --job-name '${{inputs.build_name}}'
      - name: Mark as done
--- a/.github/workflows/reusable_simple_job.yml
+++ b/.github/workflows/reusable_simple_job.yml
@ -34,12 +34,16 @@ name: Simple job
      working-directory:
        description: sets custom working directory
        type: string
-        default: ""
+        default: "$GITHUB_WORKSPACE/tests/ci"
      git_ref:
        description: commit to use, merge commit for pr or head
        required: false
        type: string
        default: ${{ github.event.after }} # no merge commit
+      report_required:
+        description: set to true if job report with the commit status required
+        type: boolean
+        default: false
    secrets:
      secret_envs:
        description: if given, it's passed to the environments
@ -58,6 +62,8 @@ jobs:
    env:
      GITHUB_JOB_OVERRIDDEN: ${{inputs.test_name}}
    steps:
+      - name: DebugInfo
+        uses: hmarr/debug-action@a701ed95a46e6f2fb0df25e1a558c16356fae35a
      - name: Check out repository code
        uses: ClickHouse/checkout@v1
        with:
@ -79,12 +85,12 @@ jobs:
          job_type: test
      - name: Run
        run: |
-          if [ -n '${{ inputs.working-directory }}' ]; then
-            cd "${{ inputs.working-directory }}"
-          else
-            cd "$GITHUB_WORKSPACE/tests/ci"
-          fi
+          cd "${{ inputs.working-directory }}"
          ${{ inputs.run_command }}
+      - name: Post
+        if: ${{ inputs.report_required }}
+        run: |
+          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --post --job-name '${{inputs.test_name}}'
      - name: Clean
        if: always()
        uses: ./.github/actions/clean
--- a/.github/workflows/reusable_test.yml
+++ b/.github/workflows/reusable_test.yml
@ -38,7 +38,7 @@ name: Testing workflow
      working-directory:
        description: sets custom working directory
        type: string
-        default: ""
+        default: "$GITHUB_WORKSPACE/tests/ci"
    secrets:
      secret_envs:
        description: if given, it's passed to the environments
@ -96,19 +96,14 @@ jobs:
          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --pre --job-name '${{inputs.test_name}}'
      - name: Run
        run: |
-          if [ -n "${{ inputs.working-directory }}" ]; then
-            cd "${{ inputs.working-directory }}"
-          else
-            cd "$GITHUB_WORKSPACE/tests/ci"
-          fi
-          if [ -n "$(echo '${{ inputs.run_command }}' | tr -d '\n')" ]; then
-            echo "Running command from workflow input"
-            ${{ inputs.run_command }}
-          else
-            echo "Running command from job config"
-            python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --run --job-name '${{inputs.test_name}}'
-          fi
+          cd "${{ inputs.working-directory }}"
+          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" \
+            --infile ${{ toJson(inputs.data) }} \
+            --job-name '${{inputs.test_name}}' \
+            --run \
+            --run-command '''${{inputs.run_command}}'''
      - name: Post run
+        if: ${{ !cancelled() }}
        run: |
          python3 "$GITHUB_WORKSPACE/tests/ci/ci.py" --infile ${{ toJson(inputs.data) }} --post --job-name '${{inputs.test_name}}'
      - name: Mark as done
--- a/.gitmessage
+++ b/.gitmessage
@ -1,9 +1,18 @@


-## To avoid merge commit in CI run (add a leading space to apply):
-#no-merge-commit
+### CI modifiers (add a leading space to apply):

-## Running specified job (add a leading space to apply):
+## To avoid a merge commit in CI:
+#no_merge_commit
+
+## To discard CI cache:
+#no_ci_cache
+
+## To run specified set of tests in CI:
+#ci_set_<SET_NAME>
+#ci_set_reduced
+
+## To run specified job in CI:
 #job_<JOB NAME>
 #job_stateless_tests_release
 #job_package_debug
--- a/contrib/avro
+++ b/contrib/avro
@ -1 +1 @@
-Subproject commit 2fb8a8a6ec0eab9109b68abf3b4857e8c476b918
+Subproject commit d43acc84d3d455b016f847d6666fbc3cd27f16a9
--- a/contrib/boost-cmake/CMakeLists.txt
+++ b/contrib/boost-cmake/CMakeLists.txt
@ -44,12 +44,14 @@ set (SRCS_IOSTREAMS
    "${LIBRARY_DIR}/libs/iostreams/src/gzip.cpp"
    "${LIBRARY_DIR}/libs/iostreams/src/mapped_file.cpp"
    "${LIBRARY_DIR}/libs/iostreams/src/zlib.cpp"
+    "${LIBRARY_DIR}/libs/iostreams/src/zstd.cpp"
 )

 add_library (_boost_iostreams ${SRCS_IOSTREAMS})
 add_library (boost::iostreams ALIAS _boost_iostreams)
 target_include_directories (_boost_iostreams PRIVATE ${LIBRARY_DIR})
 target_link_libraries (_boost_iostreams PRIVATE ch_contrib::zlib)
+target_link_libraries (_boost_iostreams PRIVATE ch_contrib::zstd)

 # program_options

--- a/contrib/jemalloc-cmake/CMakeLists.txt
+++ b/contrib/jemalloc-cmake/CMakeLists.txt
@ -34,9 +34,9 @@ if (OS_LINUX)
    # avoid spurious latencies and additional work associated with
    # MADV_DONTNEED. See
    # https://github.com/ClickHouse/ClickHouse/issues/11121 for motivation.
-    set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000")
+    set (JEMALLOC_CONFIG_MALLOC_CONF "percpu_arena:percpu,oversize_threshold:0,muzzy_decay_ms:0,dirty_decay_ms:5000")
 else()
-    set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:5000,dirty_decay_ms:5000")
+    set (JEMALLOC_CONFIG_MALLOC_CONF "oversize_threshold:0,muzzy_decay_ms:0,dirty_decay_ms:5000")
 endif()
 # CACHE variable is empty to allow changing defaults without the necessity
 # to purge cache
--- a/contrib/update-submodules.sh
+++ b/contrib/update-submodules.sh
@ -6,9 +6,15 @@ SCRIPT_DIR=$(dirname "${SCRIPT_PATH}")
 GIT_DIR=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)
 cd $GIT_DIR

+# Exclude from contribs some garbage subdirs that we don't need.
+# It reduces the size of the checked-out files by about 3 times and therefore speeds up indexing in IDEs and searching.
+# NOTE .git/ still contains everything that we don't check out (although, it's compressed)
+# See also https://git-scm.com/docs/git-sparse-checkout
 contrib/sparse-checkout/setup-sparse-checkout.sh
+
 git submodule init
 git submodule sync
+
 # NOTE: do not use --remote for `git submodule update`[1] command, since the submodule references to the specific commit SHA1 in the subproject.
 #       It may cause unexpected behavior. Instead you need to commit a new SHA1 for a submodule.
 #
--- a/docker/server/entrypoint.sh
+++ b/docker/server/entrypoint.sh
@ -49,17 +49,10 @@ CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
 CLICKHOUSE_DB="${CLICKHOUSE_DB:-}"
 CLICKHOUSE_ACCESS_MANAGEMENT="${CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT:-0}"

-for dir in "$DATA_DIR" \
-  "$ERROR_LOG_DIR" \
-  "$LOG_DIR" \
-  "$TMP_DIR" \
-  "$USER_PATH" \
-  "$FORMAT_SCHEMA_PATH" \
-  "${DISKS_PATHS[@]}" \
-  "${DISKS_METADATA_PATHS[@]}"
-do
+function create_directory_and_do_chown() {
+    local dir=$1
    # check if variable not empty
-    [ -z "$dir" ] && continue
+    [ -z "$dir" ] && return
    # ensure directories exist
    if [ "$DO_CHOWN" = "1" ]; then
        mkdir="mkdir"
@ -81,6 +74,23 @@ do
            chown -R "$USER:$GROUP" "$dir"
        fi
    fi
+}
+
+create_directory_and_do_chown "$DATA_DIR"
+
+# Change working directory to $DATA_DIR in case there're paths relative to $DATA_DIR, also avoids running
+# clickhouse-server at root directory.
+cd "$DATA_DIR"
+
+for dir in "$ERROR_LOG_DIR" \
+  "$LOG_DIR" \
+  "$TMP_DIR" \
+  "$USER_PATH" \
+  "$FORMAT_SCHEMA_PATH" \
+  "${DISKS_PATHS[@]}" \
+  "${DISKS_METADATA_PATHS[@]}"
+do
+    create_directory_and_do_chown "$dir"
 done

 # if clickhouse user is defined - create it (user "default" already exists out of box)
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -99,6 +99,16 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
    > /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp
    mv /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server2/config.d/filesystem_caches_path.xml

+    sudo cat /etc/clickhouse-server1/config.d/filesystem_caches_path.xml \
+    | sed "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_1/</custom_cached_disks_base_directory>|" \
+    > /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp
+    mv /etc/clickhouse-server1/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server1/config.d/filesystem_caches_path.xml
+
+    sudo cat /etc/clickhouse-server2/config.d/filesystem_caches_path.xml \
+    | sed "s|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches/</custom_cached_disks_base_directory>|<custom_cached_disks_base_directory replace=\"replace\">/var/lib/clickhouse/filesystem_caches_2/</custom_cached_disks_base_directory>|" \
+    > /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp
+    mv /etc/clickhouse-server2/config.d/filesystem_caches_path.xml.tmp /etc/clickhouse-server2/config.d/filesystem_caches_path.xml
+
    mkdir -p /var/run/clickhouse-server1
    sudo chown clickhouse:clickhouse /var/run/clickhouse-server1
    sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \
@ -235,6 +245,17 @@ clickhouse-client -q "system flush logs" ||:
 # stop logs replication to make it possible to dump logs tables via clickhouse-local
 stop_logs_replication

+# Try to get logs while server is running
+successfuly_saved=0
+for table in query_log zookeeper_log trace_log transactions_info_log
+do
+    clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst || successfuly_saved=$((successfuly_saved+$?))
+    if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
+        clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst || successfuly_saved=$((successfuly_saved+$?))
+        clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst || successfuly_saved=$((successfuly_saved+$?))
+    fi
+done
+
 # Stop server so we can safely read data with clickhouse-local.
 # Why do we read data with clickhouse-local?
 # Because it's the simplest way to read it when server has crashed.
@ -254,21 +275,25 @@ if [[ -n "$USE_S3_STORAGE_FOR_MERGE_TREE" ]] && [[ "$USE_S3_STORAGE_FOR_MERGE_TR
    data_path_config="--config-file=/etc/clickhouse-server/config.xml"
 fi

-# Compress tables.
-#
-# NOTE:
-# - that due to tests with s3 storage we cannot use /var/lib/clickhouse/data
-#   directly
-# - even though ci auto-compress some files (but not *.tsv) it does this only
-#   for files >64MB, we want this files to be compressed explicitly
-for table in query_log zookeeper_log trace_log transactions_info_log
-do
-    clickhouse-local "$data_path_config" --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
-    if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
-        clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
-        clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
-    fi
-done
+
+# If server crashed dump system logs with clickhouse-local
+if [ $successfuly_saved -ne 0 ]; then
+    # Compress tables.
+    #
+    # NOTE:
+    # - that due to tests with s3 storage we cannot use /var/lib/clickhouse/data
+    #   directly
+    # - even though ci auto-compress some files (but not *.tsv) it does this only
+    #   for files >64MB, we want this files to be compressed explicitly
+    for table in query_log zookeeper_log trace_log transactions_info_log
+    do
+        clickhouse-local "$data_path_config" --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
+        if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
+            clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
+            clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
+        fi
+    done
+fi

 # Also export trace log in flamegraph-friendly format.
 for trace_type in CPU Memory Real
--- a/docker/test/upgrade/run.sh
+++ b/docker/test/upgrade/run.sh
@ -56,6 +56,9 @@ echo "ATTACH DATABASE system ENGINE=Ordinary" > /var/lib/clickhouse/metadata/sys
 # Install previous release packages
 install_packages previous_release_package_folder

+# Save old settings from system table for settings changes check
+clickhouse-local -q "select * from system.settings format Native" > old_settings.native
+
 # Initial run without S3 to create system.*_log on local file system to make it
 # available for dump via clickhouse-local
 configure
@ -152,6 +155,63 @@ install_packages package_folder
 export ZOOKEEPER_FAULT_INJECTION=1
 configure

+# Check that all new/changed settings were added in settings changes history.
+# Some settings can be different for builds with sanitizers, so we check
+# settings changes only for non-sanitizer builds.
+IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
+if [ "${IS_SANITIZED}" -eq "0" ]
+then
+  clickhouse-local -q "select * from system.settings format Native" > new_settings.native
+  clickhouse-local -nmq "
+  CREATE TABLE old_settings AS file('old_settings.native');
+  CREATE TABLE new_settings AS file('new_settings.native');
+
+  SELECT
+      name,
+      new_settings.value AS new_value,
+      old_settings.value AS old_value
+  FROM new_settings
+  LEFT JOIN old_settings ON new_settings.name = old_settings.name
+  WHERE (new_settings.value != old_settings.value) AND (name NOT IN (
+      SELECT arrayJoin(tupleElement(changes, 'name'))
+      FROM system.settings_changes
+      WHERE version = extract(version(), '^(?:\\d+\\.\\d+)')
+  ))
+  SETTINGS join_use_nulls = 1
+  INTO OUTFILE 'changed_settings.txt'
+  FORMAT PrettyCompactNoEscapes;
+
+  SELECT name
+  FROM new_settings
+  WHERE (name NOT IN (
+      SELECT name
+      FROM old_settings
+  )) AND (name NOT IN (
+      SELECT arrayJoin(tupleElement(changes, 'name'))
+      FROM system.settings_changes
+      WHERE version = extract(version(), '^(?:\\d+\\.\\d+)')
+  ))
+  INTO OUTFILE 'new_settings.txt'
+  FORMAT PrettyCompactNoEscapes;
+  "
+
+  if [ -s changed_settings.txt ]
+  then
+      mv changed_settings.txt /test_output/
+      echo -e "Changed settings are not reflected in settings changes history (see changed_settings.txt)$FAIL$(head_escaped /test_output/changed_settings.txt)" >> /test_output/test_results.tsv
+  else
+      echo -e "There are no changed settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
+  fi
+
+  if [ -s new_settings.txt ]
+  then
+      mv new_settings.txt /test_output/
+      echo -e "New settings are not reflected in settings changes history (see new_settings.txt)$FAIL$(head_escaped /test_output/new_settings.txt)" >> /test_output/test_results.tsv
+  else
+      echo -e "There are no new settings or they are reflected in settings changes history$OK" >> /test_output/test_results.tsv
+  fi
+fi
+
 # Just in case previous version left some garbage in zk
 sudo cat /etc/clickhouse-server/config.d/lost_forever_check.xml \
  | sed "s|>1<|>0<|g" \
@ -257,6 +317,8 @@ clickhouse-local --structure "test String, res String, time Nullable(Float32), d
 (test like '%Fatal message%') DESC,
 (test like '%Error message%') DESC,
 (test like '%previous release%') DESC,
+(test like '%Changed settings%') DESC,
+(test like '%New settings%') DESC,
 rowNumberInAllBlocks()
 LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv || echo "failure\tCannot parse test_results.tsv" > /test_output/check_status.tsv
 [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv
--- a/docs/en/operations/allocation-profiling.md
+++ b/docs/en/operations/allocation-profiling.md
@ -0,0 +1,207 @@
+---
+slug: /en/operations/allocation-profiling
+sidebar_label: "Allocation profiling"
+title: "Allocation profiling"
+---
+
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Allocation profiling
+
+ClickHouse uses [jemalloc](https://github.com/jemalloc/jemalloc) as its global allocator that comes with some tools for allocation sampling and profiling.  
+To make allocation profiling more convenient, `SYSTEM` commands are provided alongside 4LW commands in Keeper.
+
+## Sampling allocations and flushing heap profiles
+
+If we want to sample and profile allocations in `jemalloc`, we need to start ClickHouse/Keeper with profiling enabled using environment variable `MALLOC_CONF`.
+
+```sh
+MALLOC_CONF=background_thread:true,prof:true
+```
+
+`jemalloc` will sample allocations and store the information internally.
+
+We can tell `jemalloc` to flush current profile by running:
+
+<Tabs groupId="binary">
+<TabItem value="clickhouse" label="ClickHouse">
+
+    SYSTEM JEMALLOC FLUSH PROFILE
+
+</TabItem>
+<TabItem value="keeper" label="Keeper">
+
+    echo jmfp | nc localhost 9181
+
+</TabItem>
+</Tabs>
+
+By default, the heap profile file will be generated in `/tmp/jemalloc_clickhouse._pid_._seqnum_.heap` where `_pid_` is the PID of ClickHouse and `_seqnum_` is the global sequence number for the current heap profile.  
+For Keeper, the default file is `/tmp/jemalloc_keeper._pid_._seqnum_.heap` following the same rules.
+
+A different location can be defined by appending the `MALLOC_CONF` environment variable with `prof_prefix` option.  
+For example, if we want to generate profiles in the `/data` folder with the filename prefix `my_current_profile`, we can run ClickHouse/Keeper with the following environment variable:
+```sh
+MALLOC_CONF=background_thread:true,prof:true,prof_prefix:/data/my_current_profile
+```
+The name of the generated file will consist of the prefix followed by the PID and the sequence number.
+
+## Analyzing heap profiles
+
+After we generated heap profiles, we need to analyze them.  
+For that, we need to use `jemalloc`'s tool called [jeprof](https://github.com/jemalloc/jemalloc/blob/dev/bin/jeprof.in) which can be installed in multiple ways:
+- installing `jemalloc` using system's package manager
+- cloning [jemalloc repo](https://github.com/jemalloc/jemalloc) and running autogen.sh from the root folder that will provide you with `jeprof` script inside the `bin` folder
+
+:::note
+`jeprof` uses `addr2line` to generate stacktraces which can be really slow.  
+If that’s the case, we recommend installing an [alternative implementation](https://github.com/gimli-rs/addr2line) of the tool.
+
+```
+git clone https://github.com/gimli-rs/addr2line
+cd addr2line
+cargo b --examples -r
+cp ./target/release/examples/addr2line path/to/current/addr2line
+```
+:::
+
+There are many different formats to generate from the heap profile using `jeprof`.
+We recommend running `jeprof --help` to check the usage and the many different options the tool provides.
+
+In general, `jeprof` command will look like this:
+
+```sh
+jeprof path/to/binary path/to/heap/profile --output_format [ > output_file]
+```
+
+If we want to compare which allocations happened between 2 profiles we can set the base argument:
+
+```sh
+jeprof path/to/binary --base path/to/first/heap/profile path/to/second/heap/profile --output_format [ > output_file]
+```
+
+For example:
+
+- if we want to generate a text file with each procedure written per line:
+
+```sh
+jeprof path/to/binary path/to/heap/profile --text > result.txt
+```
+
+- if we want to generate a PDF file with call-graph:
+
+```sh
+jeprof path/to/binary path/to/heap/profile --pdf > result.pdf
+```
+
+### Generating flame graph
+
+`jeprof` allows us to generate collapsed stacks for building flame graphs.
+
+We need to use `--collapsed` argument:
+
+```sh
+jeprof path/to/binary path/to/heap/profile --collapsed > result.collapsed
+```
+
+After that, we can use many different tools to visualize collapsed stacks.
+
+Most popular would be [FlameGraph](https://github.com/brendangregg/FlameGraph) which contains a script called `flamegraph.pl`:
+
+```sh
+cat result.collapsed | /path/to/FlameGraph/flamegraph.pl --color=mem --title="Allocation Flame Graph" --width 2400 > result.svg
+```
+
+Another interesting tool is [speedscope](https://www.speedscope.app/) that allows you to analyze collected stacks in a more interactive way.
+
+## Controlling allocation profiler during runtime
+
+If ClickHouse/Keeper were started with the profiler enabled, they support additional commands for disabling/enabling allocation profiling during runtime.
+Using those commands, it's easier to profile only specific intervals.
+
+Disable profiler:
+
+<Tabs groupId="binary">
+<TabItem value="clickhouse" label="ClickHouse">
+
+    SYSTEM JEMALLOC DISABLE PROFILE
+
+</TabItem>
+<TabItem value="keeper" label="Keeper">
+
+    echo jmdp | nc localhost 9181
+
+</TabItem>
+</Tabs>
+
+Enable profiler:
+
+<Tabs groupId="binary">
+<TabItem value="clickhouse" label="ClickHouse">
+
+    SYSTEM JEMALLOC ENABLE PROFILE
+
+</TabItem>
+<TabItem value="keeper" label="Keeper">
+
+    echo jmep | nc localhost 9181
+
+</TabItem>
+</Tabs>
+
+It's also possible to control the initial state of the profiler by setting `prof_active` option which is enabled by default.  
+For example, if we don't want to sample allocations during startup but only after we enable the profiler, we can start ClickHouse/Keeper with following environment variable:
+```sh
+MALLOC_CONF=background_thread:true,prof:true,prof_active:false
+```
+
+and enable profiler at a later point.
+
+## Additional options for profiler
+
+`jemalloc` has many different options available related to profiler which can be controlled by modifying `MALLOC_CONF` environment variable.
+For example, interval between allocation samples can be controlled with `lg_prof_sample`.  
+If you want to dump heap profile every N bytes you can enable it using `lg_prof_interval`.  
+
+We recommend checking `jemalloc`'s [reference page](https://jemalloc.net/jemalloc.3.html) for such options.
+
+## Other resources
+
+ClickHouse/Keeper expose `jemalloc` related metrics in many different ways.
+
+:::warning Warning
+It's important to be aware that none of these metrics are synchronized with each other and values may drift.
+:::
+
+### System table `asynchronous_metrics`
+
+```sql
+SELECT *
+FROM system.asynchronous_metrics
+WHERE metric ILIKE '%jemalloc%'
+FORMAT Vertical
+```
+
+[Reference](/en/operations/system-tables/asynchronous_metrics)
+
+### System table `jemalloc_bins`
+
+Contains information about memory allocations done via jemalloc allocator in different size classes (bins) aggregated from all arenas.
+
+[Reference](/en/operations/system-tables/jemalloc_bins)
+
+### Prometheus
+
+All `jemalloc` related metrics from `asynchronous_metrics` are also exposed using Prometheus endpoint in both ClickHouse and Keeper.
+
+[Reference](/en/operations/server-configuration-parameters/settings#prometheus)
+
+### `jmst` 4LW command in Keeper
+
+Keeper supports `jmst` 4LW command which returns [basic allocator statistics](https://github.com/jemalloc/jemalloc/wiki/Use-Case%3A-Basic-Allocator-Statistics).
+
+Example:
+```sh
+echo jmst | nc localhost 9181
+```
--- a/docs/en/operations/settings/query-complexity.md
+++ b/docs/en/operations/settings/query-complexity.md
@ -172,7 +172,7 @@ If you set `timeout_before_checking_execution_speed `to 0, ClickHouse will use c

 ## timeout_overflow_mode {#timeout-overflow-mode}

-What to do if the query is run longer than `max_execution_time`: `throw` or `break`. By default, `throw`.
+What to do if the query is run longer than `max_execution_time` or the estimated running time is longer than `max_estimated_execution_time`: `throw` or `break`. By default, `throw`.

 # max_execution_time_leaf

@ -214,6 +214,10 @@ A maximum number of execution bytes per second. Checked on every data block when

 Checks that execution speed is not too slow (no less than ‘min_execution_speed’), after the specified time in seconds has expired.

+## max_estimated_execution_time {#max_estimated_execution_time}
+
+Maximum estimated query execution time in seconds. Checked on every data block when ‘timeout_before_checking_execution_speed’ expires.
+
 ## max_columns_to_read {#max-columns-to-read}

 A maximum number of columns that can be read from a table in a single query. If a query requires reading a greater number of columns, it throws an exception.
--- a/docs/en/operations/settings/settings-formats.md
+++ b/docs/en/operations/settings/settings-formats.md
@ -1597,7 +1597,13 @@ Result:

 Use ANSI escape sequences to paint colors in Pretty formats.

-Enabled by default.
+Possible values:
+
+-   `0` — Disabled. Pretty formats do not use ANSI escape sequences.
+-   `1` — Enabled. Pretty formats will use ANSI escape sequences except for `NoEscapes` formats.
+-   `auto` — Enabled if `stdout` is a terminal except for `NoEscapes` formats.
+
+Default value is `auto`.

 ### output_format_pretty_grid_charset {#output_format_pretty_grid_charset}

--- a/docs/en/sql-reference/aggregate-functions/reference/index.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/index.md
@ -88,6 +88,7 @@ ClickHouse-specific aggregate functions:
 - [quantileTDigestWeighted](/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md)
 - [quantileBFloat16](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16)
 - [quantileBFloat16Weighted](/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16weighted)
+- [quantileDDSketch](/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch)
 - [simpleLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md)
 - [stochasticLinearRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md)
 - [stochasticLogisticRegression](/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md)
--- a/docs/en/sql-reference/aggregate-functions/reference/median.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/median.md
@ -18,6 +18,7 @@ Functions:
 - `medianTDigest` — Alias for [quantileTDigest](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md#quantiletdigest).
 - `medianTDigestWeighted` — Alias for [quantileTDigestWeighted](../../../sql-reference/aggregate-functions/reference/quantiletdigestweighted.md#quantiletdigestweighted).
 - `medianBFloat16` — Alias for [quantileBFloat16](../../../sql-reference/aggregate-functions/reference/quantilebfloat16.md#quantilebfloat16).
+- `medianDDSketch` — Alias for [quantileDDSketch](../../../sql-reference/aggregate-functions/reference/quantileddsketch.md#quantileddsketch).

 **Example**

--- a/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantileddsketch.md
@ -0,0 +1,61 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/quantileddsketch
+sidebar_position: 211
+title: quantileDDSketch
+---
+
+Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a sample with relative-error guarantees. It works by building a [DDSketch](https://www.vldb.org/pvldb/vol12/p2195-masson.pdf).
+
+**Syntax**
+
+``` sql
+quantileDDSketch(relative_accuracy[, level])(expr)
+```
+
+**Arguments**
+
+- `expr` — Column with numeric data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md).
+
+**Parameters**
+
+- `relative_accuracy` — Relative accuracy of the quantile. Possible values are in the range from 0 to 1. [Float](../../../sql-reference/data-types/float.md). The size of the sketch depends on the range of the data and the relative accuracy. The larger the range and the smaller the relative accuracy, the larger the sketch. The rough memory size of the sketch is `log(max_value/min_value)/relative_accuracy`. The recommended value is 0.001 or higher.
+
+- `level` — Level of quantile. Optional. Possible values are in the range from 0 to 1. Default value: 0.5. [Float](../../../sql-reference/data-types/float.md).
+
+**Returned value**
+
+- Approximate quantile of the specified level.
+
+Type: [Float64](../../../sql-reference/data-types/float.md#float32-float64).
+
+**Example**
+
+The input table has an integer column and a float column:
+
+``` text
+┌─a─┬─────b─┐
+│ 1 │ 1.001 │
+│ 2 │ 1.002 │
+│ 3 │ 1.003 │
+│ 4 │ 1.004 │
+└───┴───────┘
+```
+
+Query to calculate 0.75-quantile (third quartile):
+
+``` sql
+SELECT quantileDDSketch(0.01, 0.75)(a), quantileDDSketch(0.01, 0.75)(b) FROM example_table;
+```
+
+Result:
+
+``` text
+┌─quantileDDSketch(0.01, 0.75)(a)─┬─quantileDDSketch(0.01, 0.75)(b)─┐
+│               2.974233423476717 │                            1.01 │
+└─────────────────────────────────┴─────────────────────────────────┘
+```
+
+**See Also**
+
+- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
+- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
@ -9,7 +9,7 @@ sidebar_position: 201

 Syntax: `quantiles(level1, level2, …)(x)`

-All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
+All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`, `quantilesDDSketch`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.

 ## quantilesExactExclusive

--- a/docs/en/sql-reference/data-types/nullable.md
+++ b/docs/en/sql-reference/data-types/nullable.md
@ -4,11 +4,11 @@ sidebar_position: 55
 sidebar_label: Nullable
 ---

-# Nullable(typename)
+# Nullable(T)

-Allows to store special marker ([NULL](../../sql-reference/syntax.md)) that denotes “missing value” alongside normal values allowed by `TypeName`. For example, a `Nullable(Int8)` type column can store `Int8` type values, and the rows that do not have a value will store `NULL`.
+Allows to store special marker ([NULL](../../sql-reference/syntax.md)) that denotes “missing value” alongside normal values allowed by `T`. For example, a `Nullable(Int8)` type column can store `Int8` type values, and the rows that do not have a value will store `NULL`.

-For a `TypeName`, you can’t use composite data types [Array](../../sql-reference/data-types/array.md), [Map](../../sql-reference/data-types/map.md) and [Tuple](../../sql-reference/data-types/tuple.md). Composite data types can contain `Nullable` type values, such as `Array(Nullable(Int8))`.
+`T` can’t be any of the composite data types [Array](../../sql-reference/data-types/array.md), [Map](../../sql-reference/data-types/map.md) and [Tuple](../../sql-reference/data-types/tuple.md) but composite data types can contain `Nullable` type values, e.g. `Array(Nullable(Int8))`.

 A `Nullable` type field can’t be included in table indexes.

--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -657,6 +657,43 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res;

 Array elements set to `NULL` are handled as normal values.

+## arrayShingles
+
+Generates an array of "shingles", i.e. consecutive sub-arrays with specified length of the input array.
+
+**Syntax**
+
+``` sql
+arrayShingles(array, length)
+```
+
+**Arguments**
+
+- `array` — Input array [Array](../../sql-reference/data-types/array.md).
+- `length` — The length of each shingle.
+
+**Returned value**
+
+- An array of generated shingles.
+
+Type: [Array](../../sql-reference/data-types/array.md).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT arrayShingles([1,2,3,4], 3) as res;
+```
+
+Result:
+
+``` text
+┌─res───────────────┐
+│ [[1,2,3],[2,3,4]] │
+└───────────────────┘
+```
+
 ## arraySort(\[func,\] arr, …) {#sort}

 Sorts the elements of the `arr` array in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the elements of the array. If `func` accepts multiple arguments, the `arraySort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arraySort` description.
--- a/docs/en/sql-reference/table-functions/executable.md
+++ b/docs/en/sql-reference/table-functions/executable.md
@ -63,7 +63,7 @@ if __name__ == "__main__":
 Let's invoke the script and have it generate 10 random strings:

 ```sql
-SELECT * FROM executable('my_script.py', TabSeparated, 'id UInt32, random String', (SELECT 10))
+SELECT * FROM executable('generate_random.py', TabSeparated, 'id UInt32, random String', (SELECT 10))
 ```

 The response looks like:
--- a/docs/ru/getting-started/tutorial.md
+++ b/docs/ru/getting-started/tutorial.md
@ -670,4 +670,4 @@ ENGINE = ReplicatedMergeTree(
 INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local;
 ```

-Репликация работает в режиме мультимастера. Это означает, что данные могут быть загружены на любую из реплик и система автоматически синхронизирует данные между остальными репликами. Репликация асинхронна, то есть в конкретный момент времнени не все реплики могут содержать недавно добавленные данные. Как минимум одна реплика должна быть в строю для приёма данных. Прочие реплики синхронизируются и восстановят согласованное состояния как только снова станут активными. Заметим, что при таком подходе есть вероятность утраты недавно добавленных данных.
+Репликация работает в режиме мультимастера. Это означает, что данные могут быть загружены на любую из реплик и система автоматически синхронизирует данные между остальными репликами. Репликация асинхронна, то есть в конкретный момент времени не все реплики могут содержать недавно добавленные данные. Как минимум одна реплика должна быть в строю для приёма данных. Прочие реплики синхронизируются и восстановят согласованное состояния как только снова станут активными. Заметим, что при таком подходе есть вероятность утраты недавно добавленных данных.
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -2796,6 +2796,17 @@ SELECT TOP 3 name, value FROM system.settings;
 3. │ max_block_size          │ 65505   │
   └─────────────────────────┴─────────┘
 ```
+### output_format_pretty_color {#output_format_pretty_color}
+
+Включает/выключает управляющие последовательности ANSI в форматах Pretty. 
+
+Возможные значения:
+
+-   `0` — выключена. Не использует ANSI-последовательности в форматах Pretty.
+-   `1` — включена. Использует ANSI-последовательности, за исключением форматов `NoEscapes`.
+-   `auto` — включена, если `stdout` является терминалом, за исключением форматов `NoEscapes`.
+
+Значение по умолчанию: `auto`

 ## system_events_show_zero_values {#system_events_show_zero_values}

--- a/docs/zh/faq/general/ne-tormozit.md
+++ b/docs/zh/faq/general/ne-tormozit.md
@ -1,27 +1,27 @@
 ---
 slug: /zh/faq/general/ne-tormozit
-title: "What does \u201C\u043D\u0435 \u0442\u043E\u0440\u043C\u043E\u0437\u0438\u0442\
-  \u201D mean?"
+title: "\u201C\u043D\u0435 \u0442\u043E\u0440\u043C\u043E\u0437\u0438\u0442\
+  \u201D 是什么意思？"
 toc_hidden: true
 sidebar_position: 11
 ---

-# What Does “Не тормозит” Mean? {#what-does-ne-tormozit-mean}
+# “Не тормозит” 是什么意思？ {#what-does-ne-tormozit-mean}

-This question usually arises when people see official ClickHouse t-shirts. They have large words **“ClickHouse не тормозит”** on the front.
+这个问题通常出现在人们看到官方 ClickHouse T恤时。它们的正面印有大字**“ClickHouse не тормозит”**。

-Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, [Yandex](https://yandex.com/company/). That’s why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is.
+在 ClickHouse 开源之前，它作为俄罗斯最大的 IT 公司 [Yandex](https://yandex.com/company/) 的内部存储系统而开发。这就是为什么它最初获得了俄文口号“не тормозит”（发音为“ne tormozit”）。在开源发布后，我们首先为俄罗斯的活动制作了一些这样的T恤，使用原汁原味的口号是理所当然的。

-One of the following batches of those t-shirts was supposed to be given away on events outside of Russia and we tried to make the English version of the slogan. Unfortunately, the Russian language is kind of elegant in terms of expressing stuff and there was a restriction of limited space on a t-shirt, so we failed to come up with good enough translation (most options appeared to be either long or inaccurate) and decided to keep the slogan in Russian even on t-shirts produced for international events. It appeared to be a great decision because people all over the world get positively surprised and curious when they see it.
+其中一批这样的T恤原本打算在俄罗斯之外的活动中赠送，我们尝试制作口号的英文版本。不幸的是，俄语在表达方面有些优雅，而且T恤上的空间有限，所以我们未能提出足够好的翻译（大多数选项要么太长，要么不够准确），并决定即使在为国际活动制作的T恤上也保留俄文口号。这被证明是一个绝妙的决定，因为全世界的人们看到它时都会感到惊喜和好奇。

-So, what does it mean? Here are some ways to translate *“не тормозит”*:
+那么，它是什么意思呢？以下是翻译“не тормозит”的一些方式：

-   If you translate it literally, it’d be something like *“ClickHouse does not press the brake pedal”*.
-   If you’d want to express it as close to how it sounds to a Russian person with IT background, it’d be something like *“If your larger system lags, it’s not because it uses ClickHouse”*.
-   Shorter, but not so precise versions could be *“ClickHouse is not slow”*, *“ClickHouse does not lag”* or just *“ClickHouse is fast”*.
+-   如果你直译，那就是“ClickHouse 不踩刹车”。
+-   如果你想尽可能接近一个有 IT 背景的俄罗斯人的听觉感受，那就是“如果你的大型系统延迟，不是因为它使用了 ClickHouse”。
+-   更短，但不那么精确的版本可能是“ClickHouse 不慢”，“ClickHouse 不卡顿”或仅仅“ClickHouse 很快”。

-If you haven’t seen one of those t-shirts in person, you can check them out online in many ClickHouse-related videos. For example, this one:
+如果您还没有亲眼见过这些 T恤，可以在许多与 ClickHouse 相关的视频中在线查看。例如，这个：

 ![iframe](https://www.youtube.com/embed/bSyQahMVZ7w)

-P.S. These t-shirts are not for sale, they are given away for free on most [ClickHouse Meetups](https://clickhouse.com/#meet), usually for best questions or other forms of active participation.
+附言：这些 T恤不出售，它们在大多数 [ClickHouse 聚会](https://clickhouse.com/#meet)上免费赠送，通常是给出最佳问题或其他形式的积极参与者。
--- a/docs/zh/faq/general/why-clickhouse-is-so-fast.md
+++ b/docs/zh/faq/general/why-clickhouse-is-so-fast.md
@ -1,63 +1,63 @@
 ---
 slug: /zh/faq/general/why-clickhouse-is-so-fast
-title: Why is ClickHouse so fast?
+title: 为什么 ClickHouse 如此快速？
 toc_hidden: true
 sidebar_position: 8
 ---

-# Why ClickHouse Is So Fast? {#why-clickhouse-is-so-fast}
+# 为什么 ClickHouse 如此快速？ {#why-clickhouse-is-so-fast}

-It was designed to be fast. Query execution performance has always been a top priority during the development process, but other important characteristics like user-friendliness, scalability, and security were also considered so ClickHouse could become a real production system.
+它被设计成一个快速的系统。在开发过程中，查询执行性能一直是首要考虑的优先级，但也考虑了其他重要特性，如用户友好性、可扩展性和安全性，使 ClickHouse 成为一个真正的生产系统。

-ClickHouse was initially built as a prototype to do just a single task well: to filter and aggregate data as fast as possible. That’s what needs to be done to build a typical analytical report and that’s what a typical [GROUP BY](../../sql-reference/statements/select/group-by.md) query does. ClickHouse team has made several high-level decisions that combined made achieving this task possible:
+ClickHouse 最初是作为一个原型构建的，它的单一任务就是尽可能快速地过滤和聚合数据。这正是构建典型分析报告所需做的，也是典型 [GROUP BY](../../sql-reference/statements/select/group-by.md) 查询所做的。ClickHouse 团队做出了几个高层次的决策，这些决策组合在一起使得实现这一任务成为可能：

-Column-oriented storage
-:   Source data often contain hundreds or even thousands of columns, while a report can use just a few of them. The system needs to avoid reading unnecessary columns, or most expensive disk read operations would be wasted.
+列式存储
+:   源数据通常包含数百甚至数千列，而报告可能只使用其中的几列。系统需要避免读取不必要的列，否则大部分昂贵的磁盘读取操作将被浪费。

-Indexes
-:   ClickHouse keeps data structures in memory that allows reading not only used columns but only necessary row ranges of those columns.
+索引
+:   ClickHouse 在内存中保留数据结构，允许不仅读取使用的列，而且只读取这些列的必要行范围。

-Data compression
-:   Storing different values of the same column together often leads to better compression ratios (compared to row-oriented systems) because in real data column often has the same or not so many different values for neighboring rows. In addition to general-purpose compression, ClickHouse supports [specialized codecs](../../sql-reference/statements/create/table.mdx/#create-query-specialized-codecs) that can make data even more compact.
+数据压缩
+:   将同一列的不同值存储在一起通常会导致更好的压缩比（与行式系统相比），因为在实际数据中列通常对相邻行有相同或不太多的不同值。除了通用压缩之外，ClickHouse 还支持 [专用编解码器](../../sql-reference/statements/create/table.mdx/#create-query-specialized-codecs)，可以使数据更加紧凑。

-Vectorized query execution
-:   ClickHouse not only stores data in columns but also processes data in columns. It leads to better CPU cache utilization and allows for [SIMD](https://en.wikipedia.org/wiki/SIMD) CPU instructions usage.
+向量化查询执行
+:   ClickHouse 不仅以列的形式存储数据，而且以列的形式处理数据。这导致更好的 CPU 缓存利用率，并允许使用 [SIMD](https://en.wikipedia.org/wiki/SIMD) CPU 指令。

-Scalability
-:   ClickHouse can leverage all available CPU cores and disks to execute even a single query. Not only on a single server but all CPU cores and disks of a cluster as well.
+可扩展性
+:   ClickHouse 可以利用所有可用的 CPU 核心和磁盘来执行甚至是单个查询。不仅在单个服务器上，而且在集群的所有 CPU 核心和磁盘上。

-But many other database management systems use similar techniques. What really makes ClickHouse stand out is **attention to low-level details**. Most programming languages provide implementations for most common algorithms and data structures, but they tend to be too generic to be effective. Every task can be considered as a landscape with various characteristics, instead of just throwing in random implementation. For example, if you need a hash table, here are some key questions to consider:
+但许多其他数据库管理系统也使用类似的技术。真正使 ClickHouse 脱颖而出的是 **对底层细节的关注**。大多数编程语言为最常见的算法和数据结构提供了实现，但它们往往过于通用而无法高效。每个任务都可以被视为具有各种特征的景观，而不是仅仅随意投入某个实现。例如，如果您需要一个哈希表，这里有一些关键问题需要考虑：

-   Which hash function to choose?
-   Collision resolution algorithm: [open addressing](https://en.wikipedia.org/wiki/Open_addressing) vs [chaining](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)?
-   Memory layout: one array for keys and values or separate arrays? Will it store small or large values?
-   Fill factor: when and how to resize? How to move values around on resize?
-   Will values be removed and which algorithm will work better if they will?
-   Will we need fast probing with bitmaps, inline placement of string keys, support for non-movable values, prefetch, and batching?
+-   选择哪种哈希函数？
+-   冲突解决算法：[开放寻址](https://en.wikipedia.org/wiki/Open_addressing)还是[链接](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)？
+-   内存布局：一个数组用于键和值还是分开的数组？它会存储小值还是大值？
+-   填充因子：何时以及如何调整大小？在调整大小时如何移动值？
+-   是否会移除值，如果会，哪种算法会更好？
+-   我们是否需要使用位图进行快速探测，字符串键的内联放置，对不可移动值的支持，预取和批处理？

-Hash table is a key data structure for `GROUP BY` implementation and ClickHouse automatically chooses one of [30+ variations](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) for each specific query.
+哈希表是 `GROUP BY` 实现的关键数据结构，ClickHouse 会根据每个特定查询自动选择 [30 多种变体](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) 中的一种。

-The same goes for algorithms, for example, in sorting you might consider:
+算法也是如此，例如，在排序中，您可能会考虑：

-   What will be sorted: an array of numbers, tuples, strings, or structures?
-   Is all data available completely in RAM?
-   Do we need a stable sort?
-   Do we need a full sort? Maybe partial sort or n-th element will suffice?
-   How to implement comparisons?
-   Are we sorting data that has already been partially sorted?
+-   将要排序的是数字数组、元组、字符串还是结构？
+-   所有数据是否都完整地位于 RAM 中？
+-   我们需要稳定排序吗？
+-   我们需要完全排序吗？也许部分排序或第 n 个元素就足够了？
+-   如何实现比较？
+-   我们正在对已经部分排序的数据进行排序吗？

-Algorithms that they rely on characteristics of data they are working with can often do better than their generic counterparts. If it is not really known in advance, the system can try various implementations and choose the one that works best in runtime. For example, see an [article on how LZ4 decompression is implemented in ClickHouse](https://habr.com/en/company/yandex/blog/457612/).
+依赖于所处理数据特性的算法，往往可以比通用算法做得更好。如果事先无法确知这些特性，系统可以尝试各种实现，并在运行时选择效果最好的一种。例如，参见一篇关于 [ClickHouse 中 LZ4 解压缩是如何实现的文章](https://habr.com/en/company/yandex/blog/457612/)。

-Last but not least, the ClickHouse team always monitors the Internet on people claiming that they came up with the best implementation, algorithm, or data structure to do something and tries it out. Those claims mostly appear to be false, but from time to time you’ll indeed find a gem.
+最后但同样重要的是，ClickHouse 团队始终关注互联网上人们声称他们提出了最佳的实现、算法或数据结构来做某事，并尝试它。这些声称大多是虚假的，但有时你确实会找到一颗宝石。

-:::info Tips for building your own high-performance software
-   Keep in mind low-level details when designing your system.
-   Design based on hardware capabilities.
-   Choose data structures and abstractions based on the needs of the task.
-   Provide specializations for special cases.
-   Try new, “best” algorithms, that you read about yesterday.
-   Choose an algorithm in runtime based on statistics.
-   Benchmark on real datasets.
-   Test for performance regressions in CI.
-   Measure and observe everything.
+:::info 构建高性能软件的提示
+-   设计系统时要考虑到底层细节。
+-   基于硬件能力进行设计。
+-   根据任务的需求选择数据结构和抽象。
+-   为特殊情况提供专门化。
+-   尝试您昨天刚读到的新的“最佳”算法。
+-   根据统计数据在运行时选择算法。
+-   在真实数据集上进行基准测试。
+-   在 CI 中测试性能回归。
+-   测量并观察一切。
 :::
--- a/docs/zh/faq/integration/json-import.md
+++ b/docs/zh/faq/integration/json-import.md
@ -1,35 +1,35 @@
 ---
 slug: /zh/faq/integration/json-import
-title: How to import JSON into ClickHouse?
+title: 如何将 JSON 导入到 ClickHouse？
 toc_hidden: true
 sidebar_position: 11
 ---

-# How to Import JSON Into ClickHouse? {#how-to-import-json-into-clickhouse}
+# 如何将 JSON 导入到 ClickHouse？ {#how-to-import-json-into-clickhouse}

-ClickHouse supports a wide range of [data formats for input and output](../../interfaces/formats.md). There are multiple JSON variations among them, but the most commonly used for data ingestion is [JSONEachRow](../../interfaces/formats.md#jsoneachrow). It expects one JSON object per row, each object separated by a newline.
+ClickHouse 支持多种[输入和输出的数据格式](../../interfaces/formats.md)。其中包括多种 JSON 变体，但最常用于数据导入的是 [JSONEachRow](../../interfaces/formats.md#jsoneachrow)。它期望每行一个 JSON 对象，每个对象由一个新行分隔。

-## Examples {#examples}
+## 示例 {#examples}

-Using [HTTP interface](../../interfaces/http.md):
+使用 [HTTP 接口](../../interfaces/http.md)：

 ``` bash
 $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @-
 ```

-Using [CLI interface](../../interfaces/cli.md):
+使用 [CLI 接口](../../interfaces/cli.md)：

 ``` bash
 $ echo '{"foo":"bar"}'  | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow"
 ```

-Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead.
+除了手动插入数据外，您可能会考虑使用 [客户端库](../../interfaces/index.md) 之一。

-## Useful Settings {#useful-settings}
+## 实用设置 {#useful-settings}

-   `input_format_skip_unknown_fields` allows to insert JSON even if there were additional fields not present in table schema (by discarding them).
-   `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) type.
+-   `input_format_skip_unknown_fields` 允许插入 JSON，即使存在表格架构中未出现的额外字段（通过丢弃它们）。
+-   `input_format_import_nested_json` 允许将嵌套 JSON 对象插入到 [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) 类型的列中。

 :::note
-Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface.
+对于 HTTP 接口，设置作为 `GET` 参数指定；对于 `CLI` 接口，则作为前缀为 `--` 的附加命令行参数。
 :::
--- a/docs/zh/faq/integration/oracle-odbc.md
+++ b/docs/zh/faq/integration/oracle-odbc.md
@ -1,16 +1,16 @@
 ---
 slug: /zh/faq/integration/oracle-odbc
-title: What if I have a problem with encodings when using Oracle via ODBC?
+title: 使用 Oracle ODBC 时遇到编码问题怎么办？
 toc_hidden: true
 sidebar_position: 20
 ---

-# What If I Have a Problem with Encodings When Using Oracle Via ODBC? {#oracle-odbc-encodings}
+# 使用 Oracle ODBC 时遇到编码问题怎么办？ {#oracle-odbc-encodings}

-If you use Oracle as a source of ClickHouse external dictionaries via Oracle ODBC driver, you need to set the correct value for the `NLS_LANG` environment variable in `/etc/default/clickhouse`. For more information, see the [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html).
+如果您使用 Oracle 作为 ClickHouse 外部字典的数据源，并通过 Oracle ODBC 驱动程序，您需要在 `/etc/default/clickhouse` 中为 `NLS_LANG` 环境变量设置正确的值。更多信息，请参阅 [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html)。

-**Example**
+**示例**

 ``` sql
 NLS_LANG=RUSSIAN_RUSSIA.UTF8
-```
+```
--- a/docs/zh/faq/operations/delete-old-data.md
+++ b/docs/zh/faq/operations/delete-old-data.md
@ -1,44 +1,44 @@
 ---
 slug: /zh/faq/operations/delete-old-data
-title: Is it possible to delete old records from a ClickHouse table?
+title: 是否可以从ClickHouse表中删除旧记录？
 toc_hidden: true
 sidebar_position: 20
 ---

-# Is It Possible to Delete Old Records from a ClickHouse Table? {#is-it-possible-to-delete-old-records-from-a-clickhouse-table}
+# 是否可以从ClickHouse表中删除旧记录？ {#is-it-possible-to-delete-old-records-from-a-clickhouse-table}

-The short answer is “yes”. ClickHouse has multiple mechanisms that allow freeing up disk space by removing old data. Each mechanism is aimed for different scenarios.
+简短的答案是“可以”。ClickHouse具有多种机制，允许通过删除旧数据来释放磁盘空间。每种机制都针对不同的场景。

 ## TTL {#ttl}

-ClickHouse allows to automatically drop values when some condition happens. This condition is configured as an expression based on any columns, usually just static offset for any timestamp column.
+ClickHouse 允许在某些条件发生时自动删除值。这个条件被配置为基于任何列的表达式，通常只是针对任何时间戳列的静态偏移量。

-The key advantage of this approach is that it does not need any external system to trigger, once TTL is configured, data removal happens automatically in background.
+这种方法的主要优势是它不需要任何外部系统来触发，一旦配置了 TTL，数据删除就会自动在后台发生。

 :::note
-TTL can also be used to move data not only to [/dev/null](https://en.wikipedia.org/wiki/Null_device), but also between different storage systems, like from SSD to HDD.
+TTL 不仅可以用来将数据移动到 [/dev/null](https://en.wikipedia.org/wiki/Null_device)，还可以在不同的存储系统之间移动数据，例如从 SSD 到 HDD。
 :::

-More details on [configuring TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl).
+有关 [配置 TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) 的更多详细信息。

 ## ALTER DELETE {#alter-delete}

-ClickHouse does not have real-time point deletes like in [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing) databases. The closest thing to them are mutations. They are issued as `ALTER ... DELETE` or `ALTER ... UPDATE` queries to distinguish from normal `DELETE` or `UPDATE` as they are asynchronous batch operations, not immediate modifications. The rest of syntax after `ALTER TABLE` prefix is similar.
+ClickHouse没有像[OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing)数据库那样的实时点删除。最接近的东西是 `Mutation`，执行 `ALTER ... DELETE` 或 `ALTER ... UPDATE` 查询，以区别于普通的`DELETE`或`UPDATE`。因为它们是异步批处理操作，而不是立即修改。`ALTER TABLE`前缀后的其余语法相似。

-`ALTER DELETE` can be issued to flexibly remove old data. If you need to do it regularly, the main downside will be the need to have an external system to submit the query. There are also some performance considerations since mutation rewrite complete parts even there’s only a single row to be deleted.
+`ALTER DELETE`可以灵活地用来删除旧数据。如果你需要定期这样做，主要缺点将是需要有一个外部系统来提交查询。还有一些性能方面的考虑，因为即使只有一行要被删除，mutation 也会重写整个数据片段（part）。

-This is the most common approach to make your system based on ClickHouse [GDPR](https://gdpr-info.eu)-compliant.
+这是使基于ClickHouse的系统符合[GDPR](https://gdpr-info.eu)的最常见方法。

-More details on [mutations](../../sql-reference/statements/alter.md/#alter-mutations).
+有关[mutations](../../sql-reference/statements/alter.md/#alter-mutations)的更多详细信息。

 ## DROP PARTITION {#drop-partition}

-`ALTER TABLE ... DROP PARTITION` provides a cost-efficient way to drop a whole partition. It’s not that flexible and needs proper partitioning scheme configured on table creation, but still covers most common cases. Like mutations need to be executed from an external system for regular use.
+`ALTER TABLE ... DROP PARTITION`提供了一种成本效率高的方式来删除整个分区。它不是那么灵活，需要在创建表时配置适当的分区方案，但仍然涵盖了大多数常见情况。像 mutations 一样，需要从外部系统执行以进行常规使用。

-More details on [manipulating partitions](../../sql-reference/statements/alter/partition.mdx/#alter_drop-partition).
+有关[操作分区](../../sql-reference/statements/alter/partition.mdx/#alter_drop-partition)的更多详细信息。

 ## TRUNCATE {#truncate}

-It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need.
+从表中删除所有数据是相当激进的，但在某些情况下可能正是您所需要的。

-More details on [table truncation](../../sql-reference/statements/truncate.md).
+有关[truncate](../../sql-reference/statements/truncate.md)的更多详细信息。
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@ -290,6 +290,11 @@ void LocalServer::cleanup()
    {
        connection.reset();

+        /// Suggestions are loaded async in a separate thread and it can use global context.
+        /// We should reset it before resetting global_context.
+        if (suggest)
+            suggest.reset();
+
        if (global_context)
        {
            global_context->shutdown();
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -76,8 +76,8 @@
 #include <Databases/registerDatabases.h>
 #include <Dictionaries/registerDictionaries.h>
 #include <Disks/registerDisks.h>
-#include <IO/Resource/registerSchedulerNodes.h>
-#include <IO/Resource/registerResourceManagers.h>
+#include <Common/Scheduler/Nodes/registerSchedulerNodes.h>
+#include <Common/Scheduler/Nodes/registerResourceManagers.h>
 #include <Common/Config/ConfigReloader.h>
 #include <Server/HTTPHandlerFactory.h>
 #include "MetricsTransmitter.h"
@ -1467,6 +1467,8 @@ try

                global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config);

+                global_context->reloadQueryMaskingRulesIfChanged(config);
+
                std::lock_guard lock(servers_lock);
                updateServers(*config, server_pool, async_metrics, servers, servers_to_start_before_tables);
            }
@ -2002,6 +2004,12 @@ try
            else
                LOG_INFO(log, "Closed all listening sockets.");

+            /// Wait for unfinished backups and restores.
+            /// This must be done after closing listening sockets (no more backups/restores) but before ProcessList::killAllQueries
+            /// (because killAllQueries() will cancel all running backups/restores).
+            if (server_settings.shutdown_wait_backups_and_restores)
+                global_context->waitAllBackupsAndRestores();
+
            /// Killing remaining queries.
            if (!server_settings.shutdown_wait_unfinished_queries)
                global_context->getProcessList().killAllQueries();
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -440,6 +440,9 @@
    <!-- Cache size in elements for compiled expressions.-->
    <compiled_expression_cache_elements_size>10000</compiled_expression_cache_elements_size>

+    <!-- Cache path for custom (created from SQL) cached disks -->
+    <custom_cached_disks_base_directory>/var/lib/clickhouse/caches/</custom_cached_disks_base_directory>
+
    <validate_tcp_client_information>false</validate_tcp_client_information>

    <!-- Path to data directory, with trailing slash. -->
--- a/src/AggregateFunctions/AggregateFunctionQuantile.h
+++ b/src/AggregateFunctions/AggregateFunctionQuantile.h
@ -31,7 +31,7 @@ namespace ErrorCodes

 template <typename> class QuantileTiming;
 template <typename> class QuantileGK;
-
+template <typename> class QuantileDDSketch;

 /** Generic aggregate function for calculation of quantiles.
  * It depends on quantile calculation data structure. Look at Quantile*.h for various implementations.
@ -64,6 +64,7 @@ private:
    using ColVecType = ColumnVectorOrDecimal<Value>;

    static constexpr bool returns_float = !(std::is_same_v<FloatReturnType, void>);
+    static constexpr bool is_quantile_ddsketch = std::is_same_v<Data, QuantileDDSketch<Value>>;
    static_assert(!is_decimal<Value> || !returns_float);

    QuantileLevels<Float64> levels;
@ -74,6 +75,9 @@ private:
    /// Used for the approximate version of the algorithm (Greenwald-Khanna)
    ssize_t accuracy = 10000;

+    /// Used for the quantile sketch
+    Float64 relative_accuracy = 0.01;
+
    DataTypePtr & argument_type;

 public:
@ -87,7 +91,36 @@ public:
        if (!returns_many && levels.size() > 1)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires one level parameter or less", getName());

-        if constexpr (has_accuracy_parameter)
+        if constexpr (is_quantile_ddsketch)
+        {
+            if (params.empty())
+                throw Exception(
+                    ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires at least one param", getName());
+
+            const auto & relative_accuracy_field = params[0];
+            if (relative_accuracy_field.getType() != Field::Types::Float64)
+                throw Exception(
+                    ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires relative accuracy parameter with Float64 type", getName());
+
+            relative_accuracy = relative_accuracy_field.get<Float64>();
+
+            if (relative_accuracy <= 0 || relative_accuracy >= 1 || isNaN(relative_accuracy))
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Aggregate function {} requires relative accuracy parameter with value between 0 and 1 but is {}",
+                    getName(),
+                    relative_accuracy);
+            // Throw exception if the relative accuracy is too small.
+            // This is to avoid the case where the user specifies a relative accuracy that is too small
+            // and the sketch is not able to allocate enough memory to satisfy the accuracy requirement.
+            if (relative_accuracy < 1e-6)
+                throw Exception(
+                    ErrorCodes::BAD_ARGUMENTS,
+                    "Aggregate function {} requires relative accuracy parameter with value greater than 1e-6 but is {}",
+                    getName(),
+                    relative_accuracy);
+        }
+        else if constexpr (has_accuracy_parameter)
        {
            if (params.empty())
                throw Exception(
@ -116,7 +149,9 @@ public:

    void create(AggregateDataPtr __restrict place) const override /// NOLINT
    {
-        if constexpr (has_accuracy_parameter)
+        if constexpr (is_quantile_ddsketch)
+            new (place) Data(relative_accuracy);
+        else if constexpr (has_accuracy_parameter)
            new (place) Data(accuracy);
        else
            new (place) Data;
@ -147,6 +182,10 @@ public:
    {
        /// Return normalized state type: quantiles*(1)(...)
        Array params{1};
+        if constexpr (is_quantile_ddsketch)
+            params = {relative_accuracy, 1};
+        else if constexpr (has_accuracy_parameter)
+            params = {accuracy, 1};
        AggregateFunctionProperties properties;
        return std::make_shared<DataTypeAggregateFunction>(
            AggregateFunctionFactory::instance().get(
@ -295,4 +334,7 @@ struct NameQuantilesBFloat16Weighted { static constexpr auto name = "quantilesBF
 struct NameQuantileGK { static constexpr auto name = "quantileGK"; };
 struct NameQuantilesGK { static constexpr auto name = "quantilesGK"; };

+struct NameQuantileDDSketch { static constexpr auto name = "quantileDDSketch"; };
+struct NameQuantilesDDSketch { static constexpr auto name = "quantilesDDSketch"; };
+
 }
--- a/src/AggregateFunctions/AggregateFunctionQuantileDDSketch.cpp
+++ b/src/AggregateFunctions/AggregateFunctionQuantileDDSketch.cpp
@ -0,0 +1,61 @@
+#include <AggregateFunctions/AggregateFunctionQuantile.h>
+#include <AggregateFunctions/QuantileDDSketch.h>
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/Helpers.h>
+#include <DataTypes/DataTypeDate.h>
+#include <DataTypes/DataTypeDateTime.h>
+#include <Core/Field.h>
+
+
+namespace DB
+{
+struct Settings;
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+
+template <typename Value, bool float_return> using FuncQuantileDDSketch = AggregateFunctionQuantile<Value, QuantileDDSketch<Value>, NameQuantileDDSketch, false, std::conditional_t<float_return, Float64, void>, false, true>;
+template <typename Value, bool float_return> using FuncQuantilesDDSketch = AggregateFunctionQuantile<Value, QuantileDDSketch<Value>, NameQuantilesDDSketch, false, std::conditional_t<float_return, Float64, void>, true, true>;
+
+
+/// Builds the DDSketch quantile aggregate function specialised for the type of the first argument.
+/// Basic numeric types are instantiated with `float_return = true`, Date and DateTime with `false`.
+/// Throws ILLEGAL_TYPE_OF_ARGUMENT for every other argument type.
+template <template <typename, bool> class Function>
+AggregateFunctionPtr createAggregateFunctionQuantile(
+    const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
+{
+    /// The check of the second argument is independent of the first argument's type,
+    /// so it is done once up front on an arbitrary instantiation.
+    Function<void, true>::assertSecondArg(argument_types);
+
+    const DataTypePtr & first_argument_type = argument_types[0];
+    WhichDataType type_info(first_argument_type);
+
+#define CREATE_FOR_NUMERIC_TYPE(TYPE) \
+    if (type_info.idx == TypeIndex::TYPE) \
+        return std::make_shared<Function<TYPE, true>>(argument_types, params);
+    FOR_BASIC_NUMERIC_TYPES(CREATE_FOR_NUMERIC_TYPE)
+#undef CREATE_FOR_NUMERIC_TYPE
+
+    if (type_info.idx == TypeIndex::Date)
+        return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
+
+    if (type_info.idx == TypeIndex::DateTime)
+        return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
+
+    throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}",
+                    first_argument_type->getName(), name);
+}
+
+}
+
+/// Registers `quantileDDSketch`, `quantilesDDSketch` and the `medianDDSketch` alias in the factory.
+void registerAggregateFunctionsQuantileDDSketch(AggregateFunctionFactory & factory)
+{
+    /// Single-quantile variant: registered with the default properties.
+    factory.registerFunction(NameQuantileDDSketch::name, createAggregateFunctionQuantile<FuncQuantileDDSketch>);
+
+    /// For aggregate functions returning array we cannot return NULL on empty set.
+    const AggregateFunctionProperties array_result_properties = { .returns_default_when_only_null = true };
+    factory.registerFunction(
+        NameQuantilesDDSketch::name,
+        { createAggregateFunctionQuantile<FuncQuantilesDDSketch>, array_result_properties });
+
+    /// 'median' is an alias for 'quantile'
+    factory.registerAlias("medianDDSketch", NameQuantileDDSketch::name);
+}
+
+}
--- a/src/AggregateFunctions/DDSketch.h
+++ b/src/AggregateFunctions/DDSketch.h
@ -0,0 +1,253 @@
+#pragma once
+
+#include <memory> // for std::unique_ptr
+#include <cmath>
+#include <stdexcept>
+#include <limits>
+#include <iostream>
+#include <base/types.h>
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+
+#include <AggregateFunctions/DDSketch/Mapping.h>
+#include <AggregateFunctions/DDSketch/Store.h>
+#include <AggregateFunctions/DDSketch/DDSketchEncoding.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int INCORRECT_DATA;
+}
+
+/// DDSketch built from a logarithmic index mapping and dense bin stores.
+///
+/// Values greater than the mapping's minimum representable magnitude are counted in `store`,
+/// values smaller than its negation are counted in `negative_store` (keyed by their absolute value),
+/// and everything in between is accumulated in `zero_count`.
+class DDSketchDenseLogarithmic
+{
+public:
+    /// Creates an empty sketch whose mapping is built from `relative_accuracy`.
+    explicit DDSketchDenseLogarithmic(Float64 relative_accuracy = 0.01)
+        : mapping(std::make_unique<DDSketchLogarithmicMapping>(relative_accuracy)),
+          store(std::make_unique<DDSketchDenseStore>()),
+          negative_store(std::make_unique<DDSketchDenseStore>()),
+          zero_count(0.0),
+          count(0.0)
+    {
+    }
+
+    /// Assembles a sketch from already built parts.
+    /// The total count is derived from the two stores and `zero_count_`.
+    DDSketchDenseLogarithmic(std::unique_ptr<DDSketchLogarithmicMapping> mapping_,
+             std::unique_ptr<DDSketchDenseStore> store_,
+             std::unique_ptr<DDSketchDenseStore> negative_store_,
+             Float64 zero_count_)
+        : mapping(std::move(mapping_)),
+          store(std::move(store_)),
+          negative_store(std::move(negative_store_)),
+          zero_count(zero_count_),
+          count(store->count + negative_store->count + zero_count_)
+    {
+    }
+
+    /// Adds `val` with the given `weight`.
+    /// Throws BAD_ARGUMENTS if `weight` is not positive.
+    void add(Float64 val, Float64 weight = 1.0)
+    {
+        if (weight <= 0.0)
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "weight must be a positive Float64");
+        }
+
+        if (val > mapping->getMinPossible())
+        {
+            store->add(mapping->key(val), weight);
+        }
+        else if (val < -mapping->getMinPossible())
+        {
+            negative_store->add(mapping->key(-val), weight);
+        }
+        else
+        {
+            zero_count += weight;
+        }
+
+        count += weight;
+    }
+
+    /// Returns the estimate for `quantile`.
+    /// Returns NaN when `quantile` is outside [0, 1] or the sketch is empty.
+    Float64 get(Float64 quantile) const
+    {
+        if (quantile < 0 || quantile > 1 || count == 0)
+        {
+            return std::numeric_limits<Float64>::quiet_NaN(); // Return NaN if the conditions are not met
+        }
+
+        /// Ranks are laid out as: negative values (largest magnitude first), then zeros, then positive values.
+        Float64 rank = quantile * (count - 1);
+        Float64 quantile_value;
+        if (rank < negative_store->count)
+        {
+            /// The negative store is keyed by absolute value, so the rank has to be reversed.
+            Float64 reversed_rank = negative_store->count - rank - 1;
+            int key = negative_store->keyAtRank(reversed_rank, false);
+            quantile_value = -mapping->value(key);
+        }
+        else if (rank < zero_count + negative_store->count)
+        {
+            quantile_value = 0;
+        }
+        else
+        {
+            int key = store->keyAtRank(rank - zero_count - negative_store->count, true);
+            quantile_value = mapping->value(key);
+        }
+        return quantile_value;
+    }
+
+    /// Replaces the contents of this sketch with a copy of `other`.
+    /// The mapping is rebuilt from the relative accuracy derived from `other`'s gamma.
+    void copy(const DDSketchDenseLogarithmic& other)
+    {
+        Float64 rel_acc = (other.mapping->getGamma() - 1) / (other.mapping->getGamma() + 1);
+        mapping = std::make_unique<DDSketchLogarithmicMapping>(rel_acc);
+        store = std::make_unique<DDSketchDenseStore>();
+        negative_store = std::make_unique<DDSketchDenseStore>();
+        store->copy(other.store.get());
+        negative_store->copy(other.negative_store.get());
+        zero_count = other.zero_count;
+        count = other.count;
+    }
+
+    /// Merges `other` into this sketch.
+    /// If the two sketches use different gammas, the more precise one (smaller gamma)
+    /// is first re-binned onto the mapping of the less precise one.
+    void merge(const DDSketchDenseLogarithmic& other)
+    {
+        if (mapping->getGamma() != other.mapping->getGamma())
+        {
+            // modify the one with higher precision to match the one with lower precision
+            if (mapping->getGamma() > other.mapping->getGamma())
+            {
+                /// The gamma of the re-binned sketch is recomputed through a relative accuracy and can differ
+                /// from ours in the last bits. Calling merge() again would compare the gammas once more and
+                /// could recurse forever, so the re-binned sketch is merged directly.
+                DDSketchDenseLogarithmic new_sketch = other.changeMapping(mapping->getGamma());
+                mergeWithSameMapping(new_sketch);
+                return;
+            }
+            else
+            {
+                DDSketchDenseLogarithmic new_sketch = changeMapping(other.mapping->getGamma());
+                copy(new_sketch);
+            }
+        }
+
+        mergeWithSameMapping(other);
+    }
+
+    /// Writes the mapping, the positive store, the negative store and the zero count,
+    /// each preceded by its one-byte flag.
+    void serialize(WriteBuffer& buf) const
+    {
+        // Write the mapping
+        writeBinary(enc.FlagIndexMappingBaseLogarithmic.byte, buf);
+        mapping->serialize(buf);
+
+        // Write the positive and negative stores
+        writeBinary(enc.FlagTypePositiveStore, buf);
+        store->serialize(buf);
+
+        writeBinary(enc.FlagTypeNegativeStore, buf);
+        negative_store->serialize(buf);
+
+        // Write the zero count
+        writeBinary(enc.FlagZeroCountVarFloat.byte, buf);
+        writeBinary(zero_count, buf);
+    }
+
+    /// Reads the blocks in the order written by serialize().
+    /// Throws INCORRECT_DATA if any block flag is not the expected one.
+    void deserialize(ReadBuffer& buf)
+    {
+        // Read the mapping
+        UInt8 flag = 0;
+        readBinary(flag, buf);
+        if (flag != enc.FlagIndexMappingBaseLogarithmic.byte)
+        {
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid flag for mapping");
+        }
+        mapping->deserialize(buf);
+
+        // Read the positive and negative stores
+        readBinary(flag, buf);
+        if (flag != enc.FlagTypePositiveStore)
+        {
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid flag for positive store");
+        }
+        store->deserialize(buf);
+
+        readBinary(flag, buf);
+        if (flag != enc.FlagTypeNegativeStore)
+        {
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid flag for negative store");
+        }
+        negative_store->deserialize(buf);
+
+        // Read the zero count
+        readBinary(flag, buf);
+        if (flag != enc.FlagZeroCountVarFloat.byte)
+        {
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid flag for zero count");
+        }
+        readBinary(zero_count, buf);
+        count = static_cast<Float64>(negative_store->count + zero_count + store->count);
+    }
+
+private:
+    std::unique_ptr<DDSketchLogarithmicMapping> mapping;
+    std::unique_ptr<DDSketchDenseStore> store;
+    std::unique_ptr<DDSketchDenseStore> negative_store;
+    Float64 zero_count;
+    Float64 count;
+    DDSketchEncoding enc;
+
+    /// Merges `other` into this sketch without comparing the gammas:
+    /// the caller guarantees that both sketches are binned with (numerically) the same mapping.
+    void mergeWithSameMapping(const DDSketchDenseLogarithmic& other)
+    {
+        // If the other sketch is empty, do nothing
+        if (other.count == 0)
+        {
+            return;
+        }
+
+        // If this sketch is empty, copy the other sketch
+        if (count == 0)
+        {
+            copy(other);
+            return;
+        }
+
+        count += other.count;
+        zero_count += other.zero_count;
+
+        store->merge(other.store.get());
+        negative_store->merge(other.negative_store.get());
+    }
+
+    /// Returns a copy of this sketch re-binned onto a mapping with gamma `new_gamma`.
+    /// The weight of every old bin is split between the new bins it overlaps,
+    /// proportionally to the length of the overlap.
+    DDSketchDenseLogarithmic changeMapping(Float64 new_gamma) const
+    {
+        auto new_mapping = std::make_unique<DDSketchLogarithmicMapping>((new_gamma - 1) / (new_gamma + 1));
+
+        auto new_positive_store = std::make_unique<DDSketchDenseStore>();
+        auto new_negative_store = std::make_unique<DDSketchDenseStore>();
+
+        auto remap_store = [this, &new_mapping](DDSketchDenseStore& old_store, std::unique_ptr<DDSketchDenseStore>& target_store)
+        {
+            for (int i = 0; i < old_store.length(); ++i)
+            {
+                int old_index = i + old_store.offset;
+                Float64 old_bin_count = old_store.bins[i];
+
+                /// Empty bins carry no weight; re-binning them would only widen the key range of the target store.
+                if (old_bin_count == 0)
+                    continue;
+
+                Float64 in_lower_bound = this->mapping->lowerBound(old_index);
+                Float64 in_upper_bound = this->mapping->lowerBound(old_index + 1);
+                Float64 in_size = in_upper_bound - in_lower_bound;
+
+                int new_index = new_mapping->key(in_lower_bound);
+                // Distribute counts to new bins
+                for (; new_mapping->lowerBound(new_index) < in_upper_bound; ++new_index)
+                {
+                    Float64 out_lower_bound = new_mapping->lowerBound(new_index);
+                    Float64 out_upper_bound = new_mapping->lowerBound(new_index + 1);
+                    Float64 lower_intersection_bound = std::max(out_lower_bound, in_lower_bound);
+                    Float64 higher_intersection_bound = std::min(out_upper_bound, in_upper_bound);
+                    Float64 intersection_size = higher_intersection_bound - lower_intersection_bound;
+                    Float64 proportion = intersection_size / in_size;
+                    target_store->add(new_index, proportion * old_bin_count);
+                }
+            }
+        };
+
+        remap_store(*store, new_positive_store);
+        remap_store(*negative_store, new_negative_store);
+
+        return DDSketchDenseLogarithmic(std::move(new_mapping), std::move(new_positive_store), std::move(new_negative_store), zero_count);
+    }
+};
+
+}
--- a/src/AggregateFunctions/DDSketch/DDSketchEncoding.h
+++ b/src/AggregateFunctions/DDSketch/DDSketchEncoding.h
@ -0,0 +1,101 @@
+#pragma once
+
+#include <vector>
+#include <stdexcept>
+
+/**
+  * An encoded DDSketch comprises multiple contiguous blocks (sequences of bytes).
+  * Each block is prefixed with a flag that indicates what the block contains and how the data is encoded in the block.
+  * A flag is a single byte, which itself contains two parts:
+  * - the flag type (the 2 least significant bits),
+  * - the subflag (the 6 most significant bits).
+  *
+  * There are four flag types, for:
+  * - sketch features,
+  * - index mapping,
+  * - positive value store,
+  * - negative value store.
+  *
+  * The meaning of the subflag depends on the flag type:
+  * - for the sketch feature flag type, it indicates what feature is encoded,
+  * - for the index mapping flag type, it indicates what mapping is encoded and how,
+  * - for the store flag types, it indicates how bins are encoded.
+  */
+namespace DB
+{
+/// Flag bytes used to tag the blocks of a serialized DDSketch.
+/// A flag byte is the bitwise OR of a flag type (low `numBitsForType` bits) and a subflag (remaining high bits).
+class DDSketchEncoding
+{
+private:
+    /// Number of low bits of a flag byte that hold the flag type.
+    static constexpr UInt8 numBitsForType = 2;
+    /// Mask selecting the flag type bits of a flag byte.
+    static constexpr UInt8 flagTypeMask = (1 << numBitsForType) - 1;
+    /// Mask selecting the subflag bits of a flag byte.
+    static constexpr UInt8 subFlagMask = ~flagTypeMask;
+    /// Flag type of blocks that carry sketch features (see FlagZeroCountVarFloat).
+    static constexpr UInt8 flagTypeSketchFeatures = 0b00;
+
+public:
+    /// A complete flag byte, built by OR-ing a flag type `t` with an already shifted subflag `s`.
+    class Flag
+    {
+    public:
+        UInt8 byte;
+        Flag(UInt8 t, UInt8 s) : byte(t | s) { }
+        /// Flag type part of the byte (low bits).
+        [[maybe_unused]] UInt8 Type() const { return byte & flagTypeMask; }
+        /// Subflag part of the byte (high bits, left in place, not shifted down).
+        [[maybe_unused]] UInt8 SubFlag() const { return byte & subFlagMask; }
+    };
+
+    // FLAG TYPES
+    static constexpr UInt8 FlagTypeIndexMapping = 0b10;
+    static constexpr UInt8 FlagTypePositiveStore = 0b01;
+    static constexpr UInt8 FlagTypeNegativeStore = 0b11;
+
+    // SKETCH FEATURES
+
+    // Encoding format:
+    // - [byte] flag
+    // - [varfloat64] count of the zero bin
+    const Flag FlagZeroCountVarFloat = Flag(flagTypeSketchFeatures, 1 << numBitsForType);
+
+    // INDEX MAPPING
+    // Encoding format:
+    // - [byte] flag
+    // - [float64LE] gamma
+    // - [float64LE] index offset
+    const Flag FlagIndexMappingBaseLogarithmic = Flag(FlagTypeIndexMapping, 0 << numBitsForType);
+
+    // BINS
+    // The three constants below are subflags (already shifted past the flag type bits)
+    // that select how the bins of a store are laid out.
+
+    // Encoding format:
+    // - [byte] flag
+    // - [uvarint64] number of bins N
+    // - [varint64] index of first bin
+    // - [varfloat64] count of first bin
+    // - [varint64] difference between the index of the second bin and the index
+    // of the first bin
+    // - [varfloat64] count of second bin
+    // - ...
+    // - [varint64] difference between the index of the N-th bin and the index
+    // of the (N-1)-th bin
+    // - [varfloat64] count of N-th bin
+    static constexpr UInt8 BinEncodingIndexDeltasAndCounts = 1 << numBitsForType;
+
+    // Encoding format:
+    // - [byte] flag
+    // - [uvarint64] number of bins N
+    // - [varint64] index of first bin
+    // - [varint64] difference between the index of the second bin and the index
+    // of the first bin
+    // - ...
+    // - [varint64] difference between the index of the N-th bin and the index
+    // of the (N-1)-th bin
+    static constexpr UInt8 BinEncodingIndexDeltas = 2 << numBitsForType;
+
+    // Encoding format:
+    // - [byte] flag
+    // - [uvarint64] number of bins N
+    // - [varint64] index of first bin
+    // - [varint64] difference between two successive indexes
+    // - [varfloat64] count of first bin
+    // - [varfloat64] count of second bin
+    // - ...
+    // - [varfloat64] count of N-th bin
+    static constexpr UInt8 BinEncodingContiguousCounts = 3 << numBitsForType;
+};
+
+}
--- a/src/AggregateFunctions/DDSketch/Mapping.h
+++ b/src/AggregateFunctions/DDSketch/Mapping.h
@ -0,0 +1,110 @@
+#pragma once
+
+#include <base/types.h>
+#include <cmath>
+#include <stdexcept>
+#include <limits>
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
+/// Logarithmic index mapping of a DDSketch.
+///
+/// With gamma = (1 + relative_accuracy) / (1 - relative_accuracy), bin `k` covers the values
+/// [gamma^(k - offset), gamma^(k + 1 - offset)) and is represented by value(k), which lies within
+/// the relative accuracy of every value of the bin.
+class DDSketchLogarithmicMapping
+{
+public:
+    /// Throws BAD_ARGUMENTS unless 0 < relative_accuracy_ < 1.
+    explicit DDSketchLogarithmicMapping(Float64 relative_accuracy_, Float64 offset_ = 0.0)
+        : relative_accuracy(relative_accuracy_), offset(offset_)
+    {
+        /// Written as a negated conjunction so that NaN is rejected as well.
+        if (!(relative_accuracy > 0 && relative_accuracy < 1))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Relative accuracy must be between 0 and 1 but is {}", relative_accuracy);
+        }
+
+        gamma = (1 + relative_accuracy) / (1 - relative_accuracy);
+        updateDerivedFromGamma();
+    }
+
+    ~DDSketchLogarithmicMapping() = default;
+
+    /// Returns the index of the bin that contains `value`.
+    /// Throws BAD_ARGUMENTS if `value` is NaN or outside [min_possible, max_possible].
+    int key(Float64 value) const
+    {
+        /// Written as a negated conjunction so that NaN is rejected as well.
+        if (!(value >= min_possible && value <= max_possible))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Value {} is out of range [{}, {}]", value, min_possible, max_possible);
+        }
+        /// The index must be rounded down, not towards zero: lowerBound(k) has to be <= value also for
+        /// values below 1 (negative logarithm), otherwise such values land one bin too high
+        /// and the relative accuracy guarantee is violated.
+        return static_cast<int>(std::floor(logGamma(value) + offset));
+    }
+
+    /// Returns the representative value of bin `key`.
+    Float64 value(int key) const
+    {
+        return lowerBound(key) * (1 + relative_accuracy);
+    }
+
+    /// Logarithm of `value` in base gamma.
+    Float64 logGamma(Float64 value) const
+    {
+        return std::log(value) * multiplier;
+    }
+
+    /// gamma raised to the power `value`.
+    Float64 powGamma(Float64 value) const
+    {
+        return std::exp(value / multiplier);
+    }
+
+    /// Smallest value that belongs to bin `index`.
+    Float64 lowerBound(int index) const
+    {
+        return powGamma(static_cast<Float64>(index) - offset);
+    }
+
+    Float64 getGamma() const
+    {
+        return gamma;
+    }
+
+    Float64 getMinPossible() const
+    {
+        return min_possible;
+    }
+
+    [[maybe_unused]] Float64 getMaxPossible() const
+    {
+        return max_possible;
+    }
+
+    /// Writes gamma and the index offset.
+    void serialize(WriteBuffer& buf) const
+    {
+        writeBinary(gamma, buf);
+        writeBinary(offset, buf);
+    }
+
+    /// Reads gamma and the index offset and recomputes everything derived from gamma.
+    /// Throws BAD_ARGUMENTS if the values read cannot describe a valid mapping;
+    /// in that case the mapping is left unchanged.
+    void deserialize(ReadBuffer& buf)
+    {
+        Float64 new_gamma;
+        Float64 new_offset;
+        readBinary(new_gamma, buf);
+        readBinary(new_offset, buf);
+
+        /// `!(x > 1.0)` also rejects NaN; infinity is rejected explicitly.
+        if (!(new_gamma > 1.0) || !std::isfinite(new_gamma))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid gamma value after deserialization: {}", new_gamma);
+        }
+        if (!std::isfinite(new_offset))
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid offset value after deserialization: {}", new_offset);
+        }
+
+        gamma = new_gamma;
+        offset = new_offset;
+        /// value() depends on the relative accuracy, so it has to follow the gamma that was read
+        /// (inverse of gamma = (1 + accuracy) / (1 - accuracy)).
+        relative_accuracy = (gamma - 1) / (gamma + 1);
+        updateDerivedFromGamma();
+    }
+
+protected:
+    Float64 relative_accuracy;
+    Float64 gamma;
+    Float64 min_possible;
+    Float64 max_possible;
+    Float64 multiplier;
+    Float64 offset;
+
+private:
+    /// Recomputes the members that are functions of gamma only.
+    void updateDerivedFromGamma()
+    {
+        multiplier = 1 / std::log(gamma);
+        min_possible = std::numeric_limits<Float64>::min() * gamma;
+        max_possible = std::numeric_limits<Float64>::max() / gamma;
+    }
+};
+
+}
--- a/src/AggregateFunctions/DDSketch/Store.h
+++ b/src/AggregateFunctions/DDSketch/Store.h
@ -0,0 +1,260 @@
+#pragma once
+
+#include <base/types.h>
+#include <vector>
+#include <cmath>
+#include <limits>
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+#include <AggregateFunctions/DDSketch/DDSketchEncoding.h>
+
+
+// We start with 128 bins and grow the number of bins by 128
+// each time we need to extend the range of the bins.
+// This is done to avoid reallocating the bins vector too often.
+constexpr UInt32 CHUNK_SIZE = 128;
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int INCORRECT_DATA;
+}
+
+/// Dense bin store of a DDSketch: a contiguous vector of bin weights addressed by integer key.
+/// The vector grows in multiples of `chunk_size` and is re-centered around the used key range when it grows.
+class DDSketchDenseStore
+{
+public:
+    /// Sum of the weights of all bins.
+    Float64 count = 0;
+    /// Smallest and largest key of the used range; the initial values denote an empty range.
+    int min_key = std::numeric_limits<int>::max();
+    int max_key = std::numeric_limits<int>::min();
+    /// Key stored at bins[0]: the weight of key `k` is bins[k - offset].
+    int offset = 0;
+    std::vector<Float64> bins;
+
+    explicit DDSketchDenseStore(UInt32 chunk_size_ = CHUNK_SIZE) : chunk_size(chunk_size_) {}
+
+    /// Replaces bins, count, key range and offset with those of `other` (chunk_size is kept).
+    void copy(const DDSketchDenseStore* other)
+    {
+        bins = other->bins;
+        count = other->count;
+        min_key = other->min_key;
+        max_key = other->max_key;
+        offset = other->offset;
+    }
+
+    /// Number of allocated bins (not the number of used keys).
+    int length() const
+    {
+        return static_cast<int>(bins.size());
+    }
+
+    /// Adds `weight` to the bin of `key`, growing the store if the key is outside the current range.
+    void add(int key, Float64 weight)
+    {
+        int idx = getIndex(key);
+        bins[idx] += weight;
+        count += weight;
+    }
+
+    /// Returns the key of the first bin at which the running sum of weights exceeds `rank` (if `lower`)
+    /// or reaches `rank + 1` (otherwise). Returns max_key if no bin satisfies the condition.
+    int keyAtRank(Float64 rank, bool lower) const
+    {
+        Float64 running_ct = 0.0;
+        for (size_t i = 0; i < bins.size(); ++i)
+        {
+            running_ct += bins[i];
+            if ((lower && running_ct > rank) || (!lower && running_ct >= rank + 1))
+            {
+                return static_cast<int>(i) + offset;
+            }
+        }
+        return max_key;
+    }
+
+    /// Adds the bins of `other` to this store. Both stores must be keyed by the same mapping.
+    void merge(const DDSketchDenseStore* other)
+    {
+        if (other->count == 0) return;
+
+        if (count == 0)
+        {
+            copy(other);
+            return;
+        }
+
+        if (other->min_key < min_key || other->max_key > max_key)
+        {
+            extendRange(other->min_key, other->max_key);
+        }
+
+        for (int key = other->min_key; key <= other->max_key; ++key)
+        {
+            bins[key - offset] += other->bins[key - other->offset];
+        }
+
+        count += other->count;
+    }
+
+    /// Writes the bins of the used key range, choosing between the contiguous (dense)
+    /// and the index-delta (sparse) layout, whichever is estimated to be smaller.
+    void serialize(WriteBuffer& buf) const
+    {
+
+        // Calculate the size of the dense and sparse encodings to choose the smallest one
+        UInt64 num_bins = 0, num_non_empty_bins = 0;
+        /// Derived from the key range (not from `count`) so that the bin count written in the header
+        /// always equals the number of bins written by the loops below.
+        if (min_key <= max_key)
+        {
+            num_bins = static_cast<UInt64>(static_cast<Int64>(max_key) - min_key + 1);
+        }
+
+        size_t sparse_encoding_overhead = 0;
+        for (int index = min_key; index <= max_key; ++index)
+        {
+            if (bins[index - offset] != 0)
+            {
+                num_non_empty_bins++;
+                sparse_encoding_overhead += 2; // 2 bytes for index delta
+            }
+        }
+
+        size_t dense_encoding_overhead = (num_bins - num_non_empty_bins) * estimatedFloatSize(0.0);
+
+        // Choose the smallest encoding and write to buffer
+        if (dense_encoding_overhead <= sparse_encoding_overhead)
+        {
+            // Write the dense encoding
+            writeBinary(enc.BinEncodingContiguousCounts, buf); // Flag for dense encoding
+            writeVarUInt(num_bins, buf);
+            writeVarInt(min_key, buf);
+            writeVarInt(1, buf); // indexDelta in dense encoding
+            for (int index = min_key; index <= max_key; ++index)
+            {
+                writeFloatBinary(bins[index - offset], buf);
+            }
+        }
+        else
+        {
+            // Write the sparse encoding
+            writeBinary(enc.BinEncodingIndexDeltasAndCounts, buf); // Flag for sparse encoding
+            writeVarUInt(num_non_empty_bins, buf);
+            int previous_index = 0;
+            for (int index = min_key; index <= max_key; ++index)
+            {
+                Float64 bin_count = bins[index - offset];
+                if (bin_count != 0)
+                {
+                    writeVarInt(index - previous_index, buf);
+                    writeFloatBinary(bin_count, buf);
+                    previous_index = index;
+                }
+            }
+        }
+    }
+
+    /// Reads bins written by serialize() and adds them to this store.
+    /// Throws INCORRECT_DATA if the encoding flag is not one of the two layouts serialize() produces.
+    void deserialize(ReadBuffer& buf)
+    {
+        UInt8 encoding_mode;
+        readBinary(encoding_mode, buf);
+        if (encoding_mode == enc.BinEncodingContiguousCounts)
+        {
+            UInt64 num_bins;
+            readVarUInt(num_bins, buf);
+            int start_key;
+            readVarInt(start_key, buf);
+            int index_delta;
+            readVarInt(index_delta, buf);
+
+            for (UInt64 i = 0; i < num_bins; ++i)
+            {
+                Float64 bin_count;
+                readFloatBinary(bin_count, buf);
+                add(start_key, bin_count);
+                start_key += index_delta;
+            }
+        }
+        else if (encoding_mode == enc.BinEncodingIndexDeltasAndCounts)
+        {
+            UInt64 num_non_empty_bins;
+            readVarUInt(num_non_empty_bins, buf);
+            int previous_index = 0;
+            for (UInt64 i = 0; i < num_non_empty_bins; ++i)
+            {
+                int index_delta;
+                readVarInt(index_delta, buf);
+                Float64 bin_count;
+                readFloatBinary(bin_count, buf);
+                previous_index += index_delta;
+                add(previous_index, bin_count);
+            }
+        }
+        else
+        {
+            /// Any other byte used to be parsed as the sparse layout, which silently produced garbage bins.
+            throw Exception(
+                ErrorCodes::INCORRECT_DATA,
+                "Unsupported bin encoding mode {} in DDSketch store",
+                static_cast<UInt32>(encoding_mode));
+        }
+    }
+
+private:
+    UInt32 chunk_size;
+    DDSketchEncoding enc;
+
+    /// Returns the position of `key` in `bins`, extending the store first if the key is outside the used range.
+    int getIndex(int key)
+    {
+        if (key < min_key || key > max_key)
+        {
+            extendRange(key, key);
+        }
+        return key - offset;
+    }
+
+    /// Length needed to hold [new_min_key, new_max_key], rounded up to a multiple of chunk_size.
+    UInt32 getNewLength(int new_min_key, int new_max_key) const
+    {
+        int desired_length = new_max_key - new_min_key + 1;
+        return static_cast<UInt32>(chunk_size * std::ceil(static_cast<Float64>(desired_length) / chunk_size));
+    }
+
+    /// Widens the used key range so that it includes both `key` and `second_key`,
+    /// allocating or growing `bins` when the new range does not fit the current allocation.
+    void extendRange(int key, int second_key)
+    {
+        int new_min_key = std::min({key, min_key});
+        int new_max_key = std::max({second_key, max_key});
+
+        if (length() == 0)
+        {
+            /// First allocation.
+            bins = std::vector<Float64>(getNewLength(new_min_key, new_max_key), 0.0);
+            offset = new_min_key;
+            adjust(new_min_key, new_max_key);
+        }
+        else if (new_min_key >= offset && new_max_key < offset + length())
+        {
+            /// The new range already fits into the allocated bins: only the bounds change.
+            min_key = new_min_key;
+            max_key = new_max_key;
+        }
+        else
+        {
+            UInt32 new_length = getNewLength(new_min_key, new_max_key);
+            if (new_length > bins.size())
+            {
+                bins.resize(new_length);
+                /// Use whatever extra capacity the vector allocated.
+                bins.resize(bins.capacity());
+            }
+            adjust(new_min_key, new_max_key);
+        }
+    }
+
+    /// Re-centers the bins around the new key range and records the new bounds.
+    void adjust(int new_min_key, int new_max_key)
+    {
+        centerBins(new_min_key, new_max_key);
+        min_key = new_min_key;
+        max_key = new_max_key;
+    }
+
+    /// Rotates `bins` so that the key stored at bins[0] becomes `offset - shift`.
+    void shiftBins(int shift)
+    {
+        int new_offset = offset - shift;
+        if (new_offset > offset)
+            std::rotate(bins.begin(), bins.begin() + (new_offset - offset) % bins.size(), bins.end());
+        else
+            std::rotate(bins.begin(), bins.end() - (offset - new_offset) % bins.size(), bins.end());
+        offset = new_offset;
+    }
+
+    /// Moves the bins so that the unused margin is split evenly on both sides of [new_min_key, new_max_key].
+    void centerBins(int new_min_key, int new_max_key)
+    {
+        int margins = length() - (new_max_key - new_min_key + 1);
+        int new_offset = new_min_key - margins / 2;
+        shiftBins(offset - new_offset);
+    }
+
+    size_t estimatedFloatSize(Float64 value) const
+    {
+        // Assuming IEEE 754 double-precision binary floating-point format: binary64
+        return sizeof(value);
+    }
+};
+
+}
--- a/src/AggregateFunctions/QuantileDDSketch.h
+++ b/src/AggregateFunctions/QuantileDDSketch.h
@ -0,0 +1,108 @@
+#pragma once
+
+#include <base/types.h>
+#include <base/sort.h>
+#include <AggregateFunctions/DDSketch.h>
+
+#include <IO/ReadBuffer.h>
+#include <IO/WriteBuffer.h>
+
+
+namespace DB
+{
+
+/**
+ * A DDSketch is a fully-mergeable quantile sketch with relative-error guarantees. That is, for any value x,
+ * the value returned by the sketch is guaranteed to be in the (1 +- epsilon) * x range. The sketch is
+ * parameterized by a relative accuracy epsilon, which is the maximum relative error of any quantile estimate.
+ *
+ * The sketch is implemented as a set of logarithmically-spaced bins. Each bin is a pair of a value and a count.
+ *
+ * The sketch is fully mergeable, meaning that the merge of two sketches is equivalent to the sketch of the
+ * union of the input datasets. The memory size of the sketch depends on the range that is covered by
+ * the input values: the larger that range, the more bins are needed to keep track of the input values.
+ * As a rough estimate, if working on durations using DDSketches.unboundedDense(0.02) (relative accuracy of 2%),
+ * about 2kB (275 bins) are needed to cover values between 1 millisecond and 1 minute, and about 6kB (802 bins)
+ * to cover values between 1 nanosecond and 1 day.
+ *
+ * This implementation maintains binary compatibility with the DDSketch ProtoBuf format
+ * https://github.com/DataDog/sketches-java/blob/master/src/protobuf/proto/DDSketch.proto,
+ * which makes it possible to send pre-aggregated sketches to the ClickHouse server and calculate the quantiles
+ * at query time. See DDSketchEncoding.h for byte-level details.
+ *
+*/
+
+/// Quantile state backed by a DDSketchDenseLogarithmic; every operation is forwarded to it.
+template <typename Value>
+class QuantileDDSketch
+{
+public:
+    using Weight = UInt64;
+
+    QuantileDDSketch() = default;
+
+    /// Builds the underlying sketch with the given relative accuracy.
+    explicit QuantileDDSketch(Float64 relative_accuracy) : data(relative_accuracy) { }
+
+    /// Adds `x` with weight 1.
+    void add(const Value & x)
+    {
+        add(x, 1);
+    }
+
+    /// Adds `x` with weight `w`. NaN values are ignored.
+    void add(const Value & x, Weight w)
+    {
+        if (isNaN(x))
+            return;
+
+        data.add(x, w);
+    }
+
+    /// Merges the sketch of `other` into this one.
+    void merge(const QuantileDDSketch & other)
+    {
+        data.merge(other.data);
+    }
+
+    /// Writes the underlying sketch to `buf`.
+    void serialize(WriteBuffer & buf) const
+    {
+        data.serialize(buf);
+    }
+
+    /// Reads the underlying sketch from `buf`.
+    void deserialize(ReadBuffer & buf)
+    {
+        data.deserialize(buf);
+    }
+
+    /// Quantile at `level`, converted to Value.
+    Value get(Float64 level) const
+    {
+        return static_cast<Value>(data.get(level));
+    }
+
+    /// Stores the quantile at levels[i] into result[i] for every i in [0, size). `indices` is not used.
+    void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
+    {
+        getManyImpl(levels, indices, size, result);
+    }
+
+    /// Quantile at `level` as Float64.
+    Float64 getFloat(Float64 level) const
+    {
+        return static_cast<Float64>(data.get(level));
+    }
+
+    /// Same as getMany(), but the results are written as Float64.
+    void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result) const
+    {
+        getManyImpl(levels, indices, size, result);
+    }
+
+private:
+    DDSketchDenseLogarithmic data;
+
+    /// Shared implementation of getMany() / getManyFloat(); the indices argument is ignored.
+    template <typename T>
+    void getManyImpl(const Float64 * levels, const size_t * /* indices */, size_t num_levels, T * result) const
+    {
+        for (size_t pos = 0; pos != num_levels; ++pos)
+            result[pos] = static_cast<T>(data.get(levels[pos]));
+    }
+};
+
+}
--- a/src/AggregateFunctions/registerAggregateFunctions.cpp
+++ b/src/AggregateFunctions/registerAggregateFunctions.cpp
@ -31,6 +31,7 @@ void registerAggregateFunctionsQuantileTimingWeighted(AggregateFunctionFactory &
 void registerAggregateFunctionsQuantileTDigest(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileTDigestWeighted(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileBFloat16(AggregateFunctionFactory &);
+void registerAggregateFunctionsQuantileDDSketch(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileBFloat16Weighted(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileApprox(AggregateFunctionFactory &);
 void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory &);
@ -127,6 +128,7 @@ void registerAggregateFunctions()
        registerAggregateFunctionsQuantileTDigest(factory);
        registerAggregateFunctionsQuantileTDigestWeighted(factory);
        registerAggregateFunctionsQuantileBFloat16(factory);
+        registerAggregateFunctionsQuantileDDSketch(factory);
        registerAggregateFunctionsQuantileBFloat16Weighted(factory);
        registerAggregateFunctionsQuantileApprox(factory);
        registerAggregateFunctionsSequenceMatch(factory);
--- a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp
+++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp
@ -1,4 +1,4 @@
-#include "AutoFinalOnQueryPass.h"
+#include <Analyzer/Passes/AutoFinalOnQueryPass.h>

 #include <Storages/IStorage.h>

--- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp
+++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp
@ -8,14 +8,12 @@
 #include <Analyzer/ConstantNode.h>
 #include <Analyzer/Passes/CNF.h>
 #include <Analyzer/Utils.h>
+#include <Analyzer/HashUtils.h>

 #include <Storages/IStorage.h>

 #include <Functions/FunctionFactory.h>
-#include "Analyzer/HashUtils.h"
-#include "Analyzer/IQueryTreeNode.h"
-#include "Interpreters/ComparisonGraph.h"
-#include "base/types.h"
+#include <Interpreters/ComparisonGraph.h>

 namespace DB
 {
--- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp
+++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp
@ -75,6 +75,7 @@ public:
                {
                    /// Replace `length(array_argument)` with `array_argument.size0`
                    column.name += ".size0";
+                    column.type = std::make_shared<DataTypeUInt64>();

                    node = std::make_shared<ColumnNode>(column, column_source);
                }
@ -109,6 +110,7 @@ public:
                {
                    /// Replace `isNull(nullable_argument)` with `nullable_argument.null`
                    column.name += ".null";
+                    column.type = std::make_shared<DataTypeUInt8>();

                    node = std::make_shared<ColumnNode>(column, column_source);
                }
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@ -2980,6 +2980,8 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromStorage(

    if (!result_expression)
    {
+        if (can_be_not_found)
+            return {};
        std::unordered_set<Identifier> valid_identifiers;
        collectTableExpressionValidIdentifiersForTypoCorrection(identifier,
            table_expression_node,
--- a/src/Analyzer/Passes/UniqToCountPass.cpp
+++ b/src/Analyzer/Passes/UniqToCountPass.cpp
@ -1,4 +1,4 @@
-#include "UniqToCountPass.h"
+#include <Analyzer/Passes/UniqToCountPass.h>

 #include <AggregateFunctions/AggregateFunctionFactory.h>
 #include <AggregateFunctions/IAggregateFunction.h>
--- a/src/Backups/BackupCoordinationRemote.cpp
+++ b/src/Backups/BackupCoordinationRemote.cpp
@ -162,7 +162,8 @@ BackupCoordinationRemote::BackupCoordinationRemote(
    const Strings & all_hosts_,
    const String & current_host_,
    bool plain_backup_,
-    bool is_internal_)
+    bool is_internal_,
+    QueryStatusPtr process_list_element_)
    : root_zookeeper_path(root_zookeeper_path_)
    , zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
    , keeper_settings(keeper_settings_)
@ -177,6 +178,7 @@ BackupCoordinationRemote::BackupCoordinationRemote(
        log,
        get_zookeeper_,
        keeper_settings,
+        process_list_element_,
        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
        (WithRetries::FaultyKeeper & zk)
        {
--- a/src/Backups/BackupCoordinationRemote.h
+++ b/src/Backups/BackupCoordinationRemote.h
@ -30,7 +30,8 @@ public:
        const Strings & all_hosts_,
        const String & current_host_,
        bool plain_backup_,
-        bool is_internal_);
+        bool is_internal_,
+        QueryStatusPtr process_list_element_);

    ~BackupCoordinationRemote() override;

--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -87,6 +87,7 @@ BackupEntriesCollector::BackupEntriesCollector(
    , backup_coordination(backup_coordination_)
    , read_settings(read_settings_)
    , context(context_)
+    , process_list_element(context->getProcessListElement())
    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , collect_metadata_timeout(context->getConfigRef().getUInt64(
          "backups.collect_metadata_timeout", context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)))
@ -158,8 +159,9 @@ BackupEntries BackupEntriesCollector::run()
 Strings BackupEntriesCollector::setStage(const String & new_stage, const String & message)
 {
    LOG_TRACE(log, "Setting stage: {}", new_stage);
-    current_stage = new_stage;
+    checkIsQueryCancelled();

+    current_stage = new_stage;
    backup_coordination->setStage(new_stage, message);

    if (new_stage == Stage::formatGatheringMetadata(0))
@ -179,6 +181,12 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String
    }
 }

+/// Calls checkTimeLimit() on the query's process list element, if there is one.
+/// NOTE(review): checkTimeLimit() is expected to throw when the query was cancelled - confirm in QueryStatus.
+void BackupEntriesCollector::checkIsQueryCancelled() const
+{
+    if (!process_list_element)
+        return;
+
+    process_list_element->checkTimeLimit();
+}
+
 /// Calculates the root path for collecting backup entries,
 /// it's either empty or has the format "shards/<shard_num>/replicas/<replica_num>/".
 void BackupEntriesCollector::calculateRootPathInBackup()
@ -413,6 +421,8 @@ void BackupEntriesCollector::gatherDatabaseMetadata(
    bool all_tables,
    const std::set<DatabaseAndTableName> & except_table_names)
 {
+    checkIsQueryCancelled();
+
    auto it = database_infos.find(database_name);
    if (it == database_infos.end())
    {
@ -491,6 +501,8 @@ void BackupEntriesCollector::gatherDatabaseMetadata(

 void BackupEntriesCollector::gatherTablesMetadata()
 {
+    checkIsQueryCancelled();
+
    table_infos.clear();
    for (const auto & [database_name, database_info] : database_infos)
    {
@ -552,6 +564,8 @@ std::vector<std::pair<ASTPtr, StoragePtr>> BackupEntriesCollector::findTablesInD
    const auto & database_info = database_infos.at(database_name);
    const auto & database = database_info.database;

+    checkIsQueryCancelled();
+
    auto filter_by_table_name = [my_database_info = &database_info](const String & table_name)
    {
        /// We skip inner tables of materialized views.
@ -629,8 +643,12 @@ void BackupEntriesCollector::lockTablesForReading()
    for (auto & [table_name, table_info] : table_infos)
    {
        auto storage = table_info.storage;
-        if (storage)
-            table_info.table_lock = storage->tryLockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
+        if (!storage)
+            continue;
+
+        checkIsQueryCancelled();
+
+        table_info.table_lock = storage->tryLockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
    }

    std::erase_if(
@ -734,6 +752,7 @@ void BackupEntriesCollector::makeBackupEntriesForDatabasesDefs()
            continue; /// We store CREATE DATABASE queries only if there was BACKUP DATABASE specified.

        LOG_TRACE(log, "Adding the definition of database {} to backup", backQuoteIfNeed(database_name));
+        checkIsQueryCancelled();

        ASTPtr new_create_query = database_info.create_database_query;
        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), nullptr);
@ -750,6 +769,7 @@ void BackupEntriesCollector::makeBackupEntriesForTablesDefs()
    for (auto & [table_name, table_info] : table_infos)
    {
        LOG_TRACE(log, "Adding the definition of {} to backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
+        checkIsQueryCancelled();

        ASTPtr new_create_query = table_info.create_table_query;
        adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), &table_info.replicated_table_shared_id);
@ -802,6 +822,7 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN
    }

    LOG_TRACE(log, "Collecting data of {} for backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
+    checkIsQueryCancelled();

    try
    {
@ -861,13 +882,17 @@ void BackupEntriesCollector::addPostTask(std::function<void()> task)
 void BackupEntriesCollector::runPostTasks()
 {
    LOG_TRACE(log, "Will run {} post tasks", post_tasks.size());
+
    /// Post collecting tasks can add other post collecting tasks, our code is fine with that.
    while (!post_tasks.empty())
    {
+        checkIsQueryCancelled();
+
        auto task = std::move(post_tasks.front());
        post_tasks.pop();
        std::move(task)();
    }
+
    LOG_TRACE(log, "All post tasks successfully executed");
 }

--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@ -22,6 +22,9 @@ class IDatabase;
 using DatabasePtr = std::shared_ptr<IDatabase>;
 struct StorageID;
 enum class AccessEntityType;
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;
+

 /// Collects backup entries for all databases and tables which should be put to a backup.
 class BackupEntriesCollector : private boost::noncopyable
@ -97,11 +100,15 @@ private:

    Strings setStage(const String & new_stage, const String & message = "");

+    /// Throws an exception if the BACKUP query was cancelled.
+    void checkIsQueryCancelled() const;
+
    const ASTBackupQuery::Elements backup_query_elements;
    const BackupSettings backup_settings;
    std::shared_ptr<IBackupCoordination> backup_coordination;
    const ReadSettings read_settings;
    ContextPtr context;
+    QueryStatusPtr process_list_element;

    /// The time a BACKUP ON CLUSTER or RESTORE ON CLUSTER command will wait until all the nodes receive the BACKUP (or RESTORE) query and start working.
    /// This setting is similar to `distributed_ddl_task_timeout`.
--- a/src/Backups/BackupEntryFromMemory.h
+++ b/src/Backups/BackupEntryFromMemory.h
@ -17,7 +17,12 @@ public:
    std::unique_ptr<SeekableReadBuffer> getReadBuffer(const ReadSettings &) const override;
    UInt64 getSize() const override { return data.size(); }

-    DataSourceDescription getDataSourceDescription() const override { return DataSourceDescription{DataSourceType::RAM, "", false, false}; }
+    /// Describes the data source of this entry: a default-constructed description with the type set to RAM.
+    DataSourceDescription getDataSourceDescription() const override
+    {
+        DataSourceDescription description;
+        description.type = DataSourceType::RAM;
+        return description;
+    }

 private:
    const String data;
--- a/src/Backups/BackupFileInfo.cpp
+++ b/src/Backups/BackupFileInfo.cpp
@ -7,6 +7,8 @@
 #include <Common/scope_guard_safe.h>
 #include <Common/setThreadName.h>
 #include <Common/ThreadPool.h>
+#include <Interpreters/ProcessList.h>
+
 #include <base/hex.h>


@ -203,7 +205,7 @@ BackupFileInfo buildFileInfoForBackupEntry(
    return info;
 }

-BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entries, const BackupPtr & base_backup, const ReadSettings & read_settings, ThreadPool & thread_pool)
+BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entries, const BackupPtr & base_backup, const ReadSettings & read_settings, ThreadPool & thread_pool, QueryStatusPtr process_list_element)
 {
    BackupFileInfos infos;
    infos.resize(backup_entries.size());
@ -225,7 +227,7 @@ BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entr
            ++num_active_jobs;
        }

-        auto job = [&mutex, &num_active_jobs, &event, &exception, &infos, &backup_entries, &read_settings, &base_backup, &thread_group, i, log]()
+        auto job = [&mutex, &num_active_jobs, &event, &exception, &infos, &backup_entries, &read_settings, &base_backup, &thread_group, &process_list_element, i, log]()
        {
            SCOPE_EXIT_SAFE({
                std::lock_guard lock{mutex};
@ -250,6 +252,9 @@ BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entr
                        return;
                }

+                if (process_list_element)
+                    process_list_element->checkTimeLimit();
+
                infos[i] = buildFileInfoForBackupEntry(name, entry, base_backup, read_settings, log);
            }
            catch (...)
--- a/src/Backups/BackupFileInfo.h
+++ b/src/Backups/BackupFileInfo.h
@ -14,6 +14,8 @@ using BackupPtr = std::shared_ptr<const IBackup>;
 using BackupEntryPtr = std::shared_ptr<const IBackupEntry>;
 using BackupEntries = std::vector<std::pair<String, BackupEntryPtr>>;
 struct ReadSettings;
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;


 /// Information about a file stored in a backup.
@ -78,6 +80,6 @@ using BackupFileInfos = std::vector<BackupFileInfo>;
 BackupFileInfo buildFileInfoForBackupEntry(const String & file_name, const BackupEntryPtr & backup_entry, const BackupPtr & base_backup, const ReadSettings & read_settings, Poco::Logger * log);

 /// Builds a vector of BackupFileInfos for specified backup entries.
-BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entries, const BackupPtr & base_backup, const ReadSettings & read_settings, ThreadPool & thread_pool);
+BackupFileInfos buildFileInfosForBackupEntries(const BackupEntries & backup_entries, const BackupPtr & base_backup, const ReadSettings & read_settings, ThreadPool & thread_pool, QueryStatusPtr process_list_element);

 }
--- a/src/Backups/BackupIO_S3.cpp
+++ b/src/Backups/BackupIO_S3.cpp
@ -126,7 +126,7 @@ BackupReaderS3::BackupReaderS3(
    const ContextPtr & context_)
    : BackupReaderDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupReaderS3"))
    , s3_uri(s3_uri_)
-    , data_source_description{DataSourceType::S3, s3_uri.endpoint, false, false}
+    , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
    , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
 {
    auto & request_settings = s3_settings.request_settings;
@ -216,7 +216,7 @@ BackupWriterS3::BackupWriterS3(
    const ContextPtr & context_)
    : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterS3"))
    , s3_uri(s3_uri_)
-    , data_source_description{DataSourceType::S3, s3_uri.endpoint, false, false}
+    , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
    , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
 {
    auto & request_settings = s3_settings.request_settings;
--- a/src/Backups/BackupStatus.cpp
+++ b/src/Backups/BackupStatus.cpp
@ -21,12 +21,16 @@ std::string_view toString(BackupStatus backup_status)
            return "BACKUP_CREATED";
        case BackupStatus::BACKUP_FAILED:
            return "BACKUP_FAILED";
+        case BackupStatus::BACKUP_CANCELLED:
+            return "BACKUP_CANCELLED";
        case BackupStatus::RESTORING:
            return "RESTORING";
        case BackupStatus::RESTORED:
            return "RESTORED";
        case BackupStatus::RESTORE_FAILED:
            return "RESTORE_FAILED";
+        case BackupStatus::RESTORE_CANCELLED:
+            return "RESTORE_CANCELLED";
        default:
            break;
    }
--- a/src/Backups/BackupStatus.h
+++ b/src/Backups/BackupStatus.h
@ -18,6 +18,10 @@ enum class BackupStatus
    RESTORED,
    RESTORE_FAILED,

+    /// Statuses used after a BACKUP or RESTORE operation was cancelled.
+    BACKUP_CANCELLED,
+    RESTORE_CANCELLED,
+
    MAX,
 };

--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@ -45,6 +45,7 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
    extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
+    extern const int QUERY_WAS_CANCELLED;
 }

 using OperationID = BackupOperationID;
@ -73,7 +74,8 @@ namespace
                all_hosts,
                backup_settings.host_id,
                !backup_settings.deduplicate_files,
-                backup_settings.internal);
+                backup_settings.internal,
+                context->getProcessListElement());
        }
        else
        {
@ -110,7 +112,8 @@ namespace
                toString(*restore_settings.restore_uuid),
                all_hosts,
                restore_settings.host_id,
-                restore_settings.internal);
+                restore_settings.internal,
+                context->getProcessListElement());
        }
        else
        {
@ -150,17 +153,52 @@ namespace
        }
    }

-    bool isFinalStatus(BackupStatus status)
+    bool isFinishedSuccessfully(BackupStatus status)
    {
-        return (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORED)
-            || (status == BackupStatus::RESTORE_FAILED);
+        return (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::RESTORED);
    }

-    bool isErrorStatus(BackupStatus status)
+    bool isFailed(BackupStatus status)
    {
        return (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORE_FAILED);
    }

+    /// Returns true for BACKUP_CANCELLED and RESTORE_CANCELLED.
+    bool isCancelled(BackupStatus status)
+    {
+        const bool backup_cancelled = (status == BackupStatus::BACKUP_CANCELLED);
+        const bool restore_cancelled = (status == BackupStatus::RESTORE_CANCELLED);
+        return backup_cancelled || restore_cancelled;
+    }
+
+    /// Returns true if the status is either a failed or a cancelled one.
+    bool isFailedOrCancelled(BackupStatus status)
+    {
+        if (isFailed(status))
+            return true;
+        return isCancelled(status);
+    }
+
+    /// Returns true if the status is a successful, failed or cancelled one.
+    bool isFinalStatus(BackupStatus status)
+    {
+        if (isFinishedSuccessfully(status))
+            return true;
+        return isFailedOrCancelled(status);
+    }
+
+    /// Returns true for the statuses that belong to a BACKUP operation.
+    bool isBackupStatus(BackupStatus status)
+    {
+        switch (status)
+        {
+            case BackupStatus::CREATING_BACKUP:
+            case BackupStatus::BACKUP_CREATED:
+            case BackupStatus::BACKUP_FAILED:
+            case BackupStatus::BACKUP_CANCELLED:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /// Maps the current exception to a final BACKUP status:
+    /// BACKUP_CANCELLED for QUERY_WAS_CANCELLED, BACKUP_FAILED for anything else.
+    BackupStatus getBackupStatusFromCurrentException()
+    {
+        const bool cancelled = (getCurrentExceptionCode() == ErrorCodes::QUERY_WAS_CANCELLED);
+        return cancelled ? BackupStatus::BACKUP_CANCELLED : BackupStatus::BACKUP_FAILED;
+    }
+
+    /// Maps the current exception to a final RESTORE status:
+    /// RESTORE_CANCELLED for QUERY_WAS_CANCELLED, RESTORE_FAILED for anything else.
+    BackupStatus getRestoreStatusFromCurrentException()
+    {
+        const bool cancelled = (getCurrentExceptionCode() == ErrorCodes::QUERY_WAS_CANCELLED);
+        return cancelled ? BackupStatus::RESTORE_CANCELLED : BackupStatus::RESTORE_FAILED;
+    }
+
    /// Used to change num_active_backups.
    size_t getNumActiveBackupsChange(BackupStatus status)
    {
@ -337,13 +375,15 @@ private:
 };


-BackupsWorker::BackupsWorker(ContextPtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_)
+BackupsWorker::BackupsWorker(ContextMutablePtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_, bool test_inject_sleep_)
    : thread_pools(std::make_unique<ThreadPools>(num_backup_threads, num_restore_threads))
    , allow_concurrent_backups(allow_concurrent_backups_)
    , allow_concurrent_restores(allow_concurrent_restores_)
+    , test_inject_sleep(test_inject_sleep_)
    , log(&Poco::Logger::get("BackupsWorker"))
+    , backup_log(global_context->getBackupLog())
+    , process_list(global_context->getProcessList())
 {
-    backup_log = global_context->getBackupLog();
 }


@ -400,7 +440,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context

    try
    {
-        addInfo(backup_id, backup_name_for_logging, base_backup_name, backup_settings.internal, BackupStatus::CREATING_BACKUP);
+        addInfo(backup_id, backup_name_for_logging, base_backup_name, backup_settings.internal, context->getProcessListElement(), BackupStatus::CREATING_BACKUP);

        /// Prepare context to use.
        ContextPtr context_in_use = context;
@ -408,8 +448,9 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
        bool on_cluster = !backup_query->cluster.empty();
        if (on_cluster || backup_settings.async)
        {
-            /// For ON CLUSTER queries we will need to change some settings.
-            /// For ASYNC queries we have to clone the context anyway.
+            /// We have to clone the query context here because:
+            /// if this is an "ON CLUSTER" query we need to change some settings, and
+            /// if this is an "ASYNC" query it's going to be executed in another thread.
            context_in_use = mutable_context = Context::createCopy(context);
            mutable_context->makeQueryContext();
        }
@ -417,8 +458,22 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
        if (backup_settings.async)
        {
            auto & thread_pool = getThreadPool(on_cluster ? ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER : ThreadPoolId::BACKUP_ASYNC);
+
+            /// process_list_element_holder keeps the element in ProcessList alive while BACKUP is working asynchronously.
+            auto process_list_element = context_in_use->getProcessListElement();
+
            thread_pool.scheduleOrThrowOnError(
-                [this, backup_query, backup_id, backup_name_for_logging, backup_info, backup_settings, backup_coordination, context_in_use, mutable_context]
+                [this,
+                 backup_query,
+                 backup_id,
+                 backup_name_for_logging,
+                 backup_info,
+                 backup_settings,
+                 backup_coordination,
+                 context_in_use,
+                 mutable_context,
+                 thread_group = CurrentThread::getGroup(),
+                 process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr]
                {
                    doBackup(
                        backup_query,
@ -429,6 +484,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
                        backup_coordination,
                        context_in_use,
                        mutable_context,
+                        thread_group,
                        /* called_async= */ true);
                });
        }
@ -443,6 +499,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
                backup_coordination,
                context_in_use,
                mutable_context,
+                nullptr,
                /* called_async= */ false);
        }

@ -452,7 +509,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
    {
        tryLogCurrentException(log, fmt::format("Failed to start {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
        /// Something bad happened, the backup has not built.
-        setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
+        setStatusSafe(backup_id, getBackupStatusFromCurrentException());
        sendCurrentExceptionToCoordination(backup_coordination);
        throw;
    }
@ -468,19 +525,22 @@ void BackupsWorker::doBackup(
    std::shared_ptr<IBackupCoordination> backup_coordination,
    const ContextPtr & context,
    ContextMutablePtr mutable_context,
+    ThreadGroupPtr thread_group,
    bool called_async)
 {
-    std::optional<CurrentThread::QueryScope> query_scope;
+    SCOPE_EXIT_SAFE(
+        if (called_async && thread_group)
+            CurrentThread::detachFromGroupIfNotDetached();
+    );
+
    try
    {
+        if (called_async && thread_group)
+            CurrentThread::attachToGroup(thread_group);
        if (called_async)
-        {
-            query_scope.emplace(mutable_context);
            setThreadName("BackupWorker");
-        }

        bool on_cluster = !backup_query->cluster.empty();
-
        assert(mutable_context || (!on_cluster && !called_async));

        /// Checks access rights if this is not ON CLUSTER query.
@ -557,8 +617,8 @@ void BackupsWorker::doBackup(
            }

            /// Write the backup entries to the backup.
-            buildFileInfosForBackupEntries(backup, backup_entries, backup_create_params.read_settings, backup_coordination);
-            writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal);
+            buildFileInfosForBackupEntries(backup, backup_entries, backup_create_params.read_settings, backup_coordination, context->getProcessListElement());
+            writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal, context->getProcessListElement());

            /// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
            backup_coordination->setStage(Stage::COMPLETED,"");
@ -596,7 +656,7 @@ void BackupsWorker::doBackup(
        if (called_async)
        {
            tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
-            setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
+            setStatusSafe(backup_id, getBackupStatusFromCurrentException());
            sendCurrentExceptionToCoordination(backup_coordination);
        }
        else
@ -608,15 +668,21 @@ void BackupsWorker::doBackup(
 }


-void BackupsWorker::buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination)
+void BackupsWorker::buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element)
 {
    backup_coordination->setStage(Stage::BUILDING_FILE_INFOS, "");
    backup_coordination->waitForStage(Stage::BUILDING_FILE_INFOS);
-    backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST)));
+    backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST), process_list_element));
 }


-void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const OperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal)
+void BackupsWorker::writeBackupEntries(
+    BackupMutablePtr backup,
+    BackupEntries && backup_entries,
+    const OperationID & backup_id,
+    std::shared_ptr<IBackupCoordination> backup_coordination,
+    bool internal,
+    QueryStatusPtr process_list_element)
 {
    LOG_TRACE(log, "{}, num backup entries={}", Stage::WRITING_BACKUP, backup_entries.size());
    backup_coordination->setStage(Stage::WRITING_BACKUP, "");
@ -677,7 +743,13 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries &&
                        return;
                }

+                if (process_list_element)
+                    process_list_element->checkTimeLimit();
+
                backup->writeFile(file_info, std::move(entry));
+
+                maybeSleepForTesting();
+
                // Update metadata
                if (!internal)
                {
@ -690,7 +762,6 @@ void BackupsWorker::writeBackupEntries(BackupMutablePtr backup, BackupEntries &&
                            backup->getCompressedSize(),
                            0, 0);
                }
-
            }
            catch (...)
            {
@ -752,15 +823,16 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
        if (restore_settings.base_backup_info)
            base_backup_name = restore_settings.base_backup_info->toStringForLogging();

-        addInfo(restore_id, backup_name_for_logging, base_backup_name, restore_settings.internal, BackupStatus::RESTORING);
+        addInfo(restore_id, backup_name_for_logging, base_backup_name, restore_settings.internal, context->getProcessListElement(), BackupStatus::RESTORING);

        /// Prepare context to use.
        ContextMutablePtr context_in_use = context;
        bool on_cluster = !restore_query->cluster.empty();
        if (restore_settings.async || on_cluster)
        {
-            /// For ON CLUSTER queries we will need to change some settings.
-            /// For ASYNC queries we have to clone the context anyway.
+            /// We have to clone the query context here because:
+            /// if this is an "ON CLUSTER" query we need to change some settings, and
+            /// if this is an "ASYNC" query it's going to be executed in another thread.
            context_in_use = Context::createCopy(context);
            context_in_use->makeQueryContext();
        }
@ -768,8 +840,21 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
        if (restore_settings.async)
        {
            auto & thread_pool = getThreadPool(on_cluster ? ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER : ThreadPoolId::RESTORE_ASYNC);
+
+            /// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
+            auto process_list_element = context_in_use->getProcessListElement();
+
            thread_pool.scheduleOrThrowOnError(
-                [this, restore_query, restore_id, backup_name_for_logging, backup_info, restore_settings, restore_coordination, context_in_use]
+                [this,
+                 restore_query,
+                 restore_id,
+                 backup_name_for_logging,
+                 backup_info,
+                 restore_settings,
+                 restore_coordination,
+                 context_in_use,
+                 thread_group = CurrentThread::getGroup(),
+                 process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr]
                {
                    doRestore(
                        restore_query,
@ -779,6 +864,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
                        restore_settings,
                        restore_coordination,
                        context_in_use,
+                        thread_group,
                        /* called_async= */ true);
                });
        }
@ -792,6 +878,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
                restore_settings,
                restore_coordination,
                context_in_use,
+                nullptr,
                /* called_async= */ false);
        }

@ -800,7 +887,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
    catch (...)
    {
        /// Something bad happened, the backup has not built.
-        setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
+        setStatusSafe(restore_id, getRestoreStatusFromCurrentException());
        sendCurrentExceptionToCoordination(restore_coordination);
        throw;
    }
@ -815,16 +902,20 @@ void BackupsWorker::doRestore(
    RestoreSettings restore_settings,
    std::shared_ptr<IRestoreCoordination> restore_coordination,
    ContextMutablePtr context,
+    ThreadGroupPtr thread_group,
    bool called_async)
 {
-    std::optional<CurrentThread::QueryScope> query_scope;
+    SCOPE_EXIT_SAFE(
+        if (called_async && thread_group)
+            CurrentThread::detachFromGroupIfNotDetached();
+    );
+
    try
    {
+        if (called_async && thread_group)
+            CurrentThread::attachToGroup(thread_group);
        if (called_async)
-        {
-            query_scope.emplace(context);
            setThreadName("RestoreWorker");
-        }

        /// Open the backup for reading.
        BackupFactory::CreateParams backup_open_params;
@ -913,7 +1004,7 @@ void BackupsWorker::doRestore(
            }

            /// Execute the data restoring tasks.
-            restoreTablesData(restore_id, backup, std::move(data_restore_tasks), getThreadPool(ThreadPoolId::RESTORE_TABLES_DATA));
+            restoreTablesData(restore_id, backup, std::move(data_restore_tasks), getThreadPool(ThreadPoolId::RESTORE_TABLES_DATA), context->getProcessListElement());

            /// We have restored everything, we need to tell other hosts (they could be waiting for it).
            restore_coordination->setStage(Stage::COMPLETED, "");
@ -928,7 +1019,7 @@ void BackupsWorker::doRestore(
        if (called_async)
        {
            tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
-            setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
+            setStatusSafe(restore_id, getRestoreStatusFromCurrentException());
            sendCurrentExceptionToCoordination(restore_coordination);
        }
        else
@ -940,7 +1031,7 @@ void BackupsWorker::doRestore(
 }


-void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool)
+void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element)
 {
    size_t num_active_jobs = 0;
    std::mutex mutex;
@ -980,7 +1071,13 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr
                        return;
                }

+                if (process_list_element)
+                    process_list_element->checkTimeLimit();
+
                std::move(task)();
+
+                maybeSleepForTesting();
+
                setNumFilesAndSize(
                    restore_id,
                    backup->getNumFiles(),
@ -1011,9 +1108,10 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr
 }


-void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, bool internal, BackupStatus status)
+void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status)
 {
-    BackupOperationInfo info;
+    ExtendedOperationInfo extended_info;
+    auto & info = extended_info.info;
    info.id = id;
    info.name = name;
    info.base_backup_name = base_backup_name;
@ -1021,7 +1119,16 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, const S
    info.status = status;
    info.start_time = std::chrono::system_clock::now();

-    if (isFinalStatus(status))
+    bool is_final_status = isFinalStatus(status);
+
+    if (process_list_element)
+    {
+        info.profile_counters = process_list_element->getInfo(/* get_thread_list= */ false, /* get_profile_events= */ true, /* get_settings= */ false).profile_counters;
+        if (!is_final_status)
+            extended_info.process_list_element = process_list_element;
+    }
+
+    if (is_final_status)
        info.end_time = info.start_time;

    std::lock_guard lock{infos_mutex};
@ -1030,7 +1137,7 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, const S
    if (it != infos.end())
    {
        /// It's better not allow to overwrite the current status if it's in progress.
-        auto current_status = it->second.status;
+        auto current_status = it->second.info.status;
        if (!isFinalStatus(current_status))
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: ID {} is already in use", id);
    }
@ -1038,7 +1145,7 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, const S
    if (backup_log)
        backup_log->add(BackupLogElement{info});

-    infos[id] = std::move(info);
+    infos[id] = std::move(extended_info);

    num_active_backups += getNumActiveBackupsChange(status);
    num_active_restores += getNumActiveRestoresChange(status);
@ -1057,16 +1164,24 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw
            return;
    }

-    auto & info = it->second;
+    auto & extended_info = it->second;
+    auto & info = extended_info.info;
+
    auto old_status = info.status;
-
    info.status = status;
-    info.profile_counters = std::make_shared<ProfileEvents::Counters::Snapshot>(CurrentThread::getProfileEvents().getPartiallyAtomicSnapshot());
+    bool is_final_status = isFinalStatus(status);

-    if (isFinalStatus(status))
+    if (extended_info.process_list_element)
+    {
+        info.profile_counters = extended_info.process_list_element->getInfo(/* get_thread_list= */ false, /* get_profile_events= */ true, /* get_settings= */ false).profile_counters;
+        if (is_final_status)
+            extended_info.process_list_element = nullptr;
+    }
+
+    if (is_final_status)
        info.end_time = std::chrono::system_clock::now();

-    if (isErrorStatus(status))
+    if (isFailedOrCancelled(status))
    {
        info.error_message = getCurrentExceptionMessage(false);
        info.exception = std::current_exception();
@ -1077,6 +1192,9 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw

    num_active_backups += getNumActiveBackupsChange(status) - getNumActiveBackupsChange(old_status);
    num_active_restores += getNumActiveRestoresChange(status) - getNumActiveRestoresChange(old_status);
+
+    if (status != old_status)
+        status_changed.notify_all();
 }


@ -1090,7 +1208,7 @@ void BackupsWorker::setNumFilesAndSize(const OperationID & id, size_t num_files,
    if (it == infos.end())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);

-    auto & info = it->second;
+    auto & info = it->second.info;
    info.num_files = num_files;
    info.total_size = total_size;
    info.num_entries = num_entries;
@ -1101,37 +1219,120 @@ void BackupsWorker::setNumFilesAndSize(const OperationID & id, size_t num_files,
 }


-void BackupsWorker::wait(const OperationID & id, bool rethrow_exception)
+void BackupsWorker::maybeSleepForTesting() const
+{
+    /// The delay is injected only if `test_inject_sleep` was enabled.
+    if (!test_inject_sleep)
+        return;
+
+    sleepForSeconds(1);
+}
+
+
+/// Blocks on `status_changed` until the specified operation reaches a final status.
+/// Throws LOGICAL_ERROR if the ID is unknown. If `rethrow_exception` is set and the operation
+/// has failed or was cancelled, the exception stored in its info is rethrown.
+/// The predicate is evaluated with `infos_mutex` locked.
+void BackupsWorker::wait(const OperationID & backup_or_restore_id, bool rethrow_exception)
 {
    std::unique_lock lock{infos_mutex};
    status_changed.wait(lock, [&]
    {
-        auto it = infos.find(id);
+        auto it = infos.find(backup_or_restore_id);
        if (it == infos.end())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
-        const auto & info = it->second;
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", backup_or_restore_id);
+        const auto & info = it->second.info;
        auto current_status = info.status;
-        if (rethrow_exception && isErrorStatus(current_status))
+        if (rethrow_exception && isFailedOrCancelled(current_status))
            std::rethrow_exception(info.exception);
-        return isFinalStatus(current_status);
+        if (isFinalStatus(current_status))
+            return true;
+        LOG_INFO(log, "Waiting {} {}", isBackupStatus(info.status) ? "backup" : "restore", info.name);
+        return false;
    });
 }

+void BackupsWorker::waitAll()
+{
+    /// Collect the IDs of all unfinished operations while holding the mutex.
+    /// The waiting is done after the mutex is released because wait() locks it itself.
+    std::vector<OperationID> unfinished_ids;
+    {
+        std::lock_guard lock{infos_mutex};
+        for (const auto & [operation_id, extended_info] : infos)
+        {
+            if (isFinalStatus(extended_info.info.status))
+                continue;
+            unfinished_ids.push_back(operation_id);
+        }
+    }
+
+    if (!unfinished_ids.empty())
+    {
+        LOG_INFO(log, "Waiting for running backups and restores to finish");
+
+        for (const auto & operation_id : unfinished_ids)
+            wait(operation_id, /* rethrow_exception= */ false);
+
+        LOG_INFO(log, "Backups and restores finished");
+    }
+}
+
+void BackupsWorker::cancel(const BackupOperationID & backup_or_restore_id, bool wait_)
+{
+    /// The process list element is copied out under the mutex;
+    /// the cancel request itself is sent after the mutex is released.
+    QueryStatusPtr element_to_cancel;
+    {
+        std::lock_guard lock{infos_mutex};
+
+        auto found = infos.find(backup_or_restore_id);
+        if (found == infos.end())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", backup_or_restore_id);
+
+        const auto & operation = found->second;
+
+        /// Nothing to cancel if the operation is already in a final status or has no process list element.
+        if (isFinalStatus(operation.info.status))
+            return;
+        if (!operation.process_list_element)
+            return;
+
+        LOG_INFO(log, "Cancelling {} {}", isBackupStatus(operation.info.status) ? "backup" : "restore", operation.info.name);
+        element_to_cancel = operation.process_list_element;
+    }
+
+    process_list.sendCancelToQuery(element_to_cancel);
+
+    if (!wait_)
+        return;
+
+    wait(backup_or_restore_id, /* rethrow_exception= */ false);
+}
+
+
+/// Sends cancel requests to all backups and restores which are not in a final status.
+/// If `wait_` is true the function also waits until each of those operations reaches a final status.
+void BackupsWorker::cancelAll(bool wait_)
+{
+    /// Collect the IDs under the mutex; cancel() and wait() lock it themselves.
+    std::vector<OperationID> current_operations;
+    {
+        std::lock_guard lock{infos_mutex};
+        for (const auto & [id, extended_info] : infos)
+            if (!isFinalStatus(extended_info.info.status))
+                current_operations.push_back(id);
+    }
+
+    if (current_operations.empty())
+        return;
+
+    LOG_INFO(log, "Cancelling running backups and restores");
+
+    /// Send all the cancel requests first without waiting, so that no operation
+    /// has to wait for another one to stop before it receives its own request.
+    for (const auto & id : current_operations)
+        cancel(id, /* wait= */ false);
+
+    /// Without waiting we cannot claim that the operations have finished or stopped,
+    /// so the final message is logged only after the waiting is done.
+    if (!wait_)
+        return;
+
+    for (const auto & id : current_operations)
+        wait(id, /* rethrow_exception= */ false);
+
+    LOG_INFO(log, "Backups and restores finished or stopped");
+}
+
+
+/// Returns a copy of the info of the operation with the specified ID (read under `infos_mutex`).
+/// Throws LOGICAL_ERROR if the ID is unknown.
 BackupOperationInfo BackupsWorker::getInfo(const OperationID & id) const
 {
    std::lock_guard lock{infos_mutex};
    auto it = infos.find(id);
    if (it == infos.end())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
-    return it->second;
+    return it->second.info;
 }

 std::vector<BackupOperationInfo> BackupsWorker::getAllInfos() const
 {
    std::vector<BackupOperationInfo> res_infos;
    std::lock_guard lock{infos_mutex};
-    for (const auto & info : infos | boost::adaptors::map_values)
+    for (const auto & extended_info : infos | boost::adaptors::map_values)
    {
+        const auto & info = extended_info.info;
        if (!info.internal)
            res_infos.push_back(info);
    }
@ -1140,14 +1341,11 @@ std::vector<BackupOperationInfo> BackupsWorker::getAllInfos() const

+/// Cancels all running backups and restores, waits for them to stop, and then waits for the thread pools.
 void BackupsWorker::shutdown()
 {
-    bool has_active_backups_and_restores = (num_active_backups || num_active_restores);
-    if (has_active_backups_and_restores)
-        LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores);
+    /// Cancel running backups and restores.
+    cancelAll(/* wait= */ true);

+    /// Wait for our thread pools (it must be done before destroying them).
    thread_pools->wait();
-
-    if (has_active_backups_and_restores)
-        LOG_INFO(log, "All backup and restore tasks have finished");
 }

 }
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -26,13 +26,26 @@ using BackupEntries = std::vector<std::pair<String, std::shared_ptr<const IBacku
 using DataRestoreTasks = std::vector<std::function<void()>>;
 struct ReadSettings;
 class BackupLog;
+class ThreadGroup;
+using ThreadGroupPtr = std::shared_ptr<ThreadGroup>;
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;
+class ProcessList;
+

 /// Manager of backups and restores: executes backups and restores' threads in the background.
 /// Keeps information about backups and restores started in this session.
 class BackupsWorker
 {
 public:
-    BackupsWorker(ContextPtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_);
+    BackupsWorker(
+        ContextMutablePtr global_context,
+        size_t num_backup_threads,
+        size_t num_restore_threads,
+        bool allow_concurrent_backups_,
+        bool allow_concurrent_restores_,
+        bool test_inject_sleep_);
+
    ~BackupsWorker();

    /// Waits until all tasks have been completed.
@ -41,10 +54,20 @@ public:
    /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
    BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);

-    /// Waits until a BACKUP or RESTORE query started by start() is finished.
+    /// Waits until the specified backup or restore operation finishes or stops.
    /// The function returns immediately if the operation is already finished.
    void wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);

+    /// Waits until all running backup and restore operations finish or stop.
+    void waitAll();
+
+    /// Cancels the specified backup or restore operation.
+    /// The function does nothing if this operation has already finished.
+    void cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);
+
+    /// Cancels all running backup and restore operations.
+    void cancelAll(bool wait_ = true);
+
    BackupOperationInfo getInfo(const BackupOperationID & id) const;
    std::vector<BackupOperationInfo> getAllInfos() const;

@ -60,13 +83,14 @@ private:
        std::shared_ptr<IBackupCoordination> backup_coordination,
        const ContextPtr & context,
        ContextMutablePtr mutable_context,
+        ThreadGroupPtr thread_group,
        bool called_async);

    /// Builds file infos for specified backup entries.
-    void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination);
+    void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element);

    /// Write backup entries to an opened backup.
-    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal);
+    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal, QueryStatusPtr process_list_element);

    BackupOperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);

@ -78,12 +102,13 @@ private:
        RestoreSettings restore_settings,
        std::shared_ptr<IRestoreCoordination> restore_coordination,
        ContextMutablePtr context,
+        ThreadGroupPtr thread_group,
        bool called_async);

    /// Run data restoring tasks which insert data to tables.
-    void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool);
+    void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);

-    void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, bool internal, BackupStatus status);
+    void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status);
    void setStatus(const BackupOperationID & id, BackupStatus status, bool throw_if_error = true);
    void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
    void setNumFilesAndSize(const BackupOperationID & id, size_t num_files, UInt64 total_size, size_t num_entries,
@ -92,19 +117,33 @@ private:
    enum class ThreadPoolId;
    ThreadPool & getThreadPool(ThreadPoolId thread_pool_id);

+    /// Waits for some time if `test_inject_sleep` is true.
+    void maybeSleepForTesting() const;
+
    class ThreadPools;
    std::unique_ptr<ThreadPools> thread_pools;

    const bool allow_concurrent_backups;
    const bool allow_concurrent_restores;
+    const bool test_inject_sleep;
+
    Poco::Logger * log;

-    std::unordered_map<BackupOperationID, BackupOperationInfo> infos;
-    std::shared_ptr<BackupLog> backup_log;
+    /// Per-operation record stored in `infos`: the public info plus the process list element of the query.
+    /// Only `info` is returned from getInfo() and getAllInfos().
+    struct ExtendedOperationInfo
+    {
+        BackupOperationInfo info;
+        QueryStatusPtr process_list_element; /// to cancel this operation if we want to
+    };
+
+    std::unordered_map<BackupOperationID, ExtendedOperationInfo> infos;
+
    std::condition_variable status_changed;
    std::atomic<size_t> num_active_backups = 0;
    std::atomic<size_t> num_active_restores = 0;
    mutable std::mutex infos_mutex;
+
+    std::shared_ptr<BackupLog> backup_log;
+    ProcessList & process_list;
 };

 }
--- a/src/Backups/RestoreCoordinationRemote.cpp
+++ b/src/Backups/RestoreCoordinationRemote.cpp
@ -21,7 +21,8 @@ RestoreCoordinationRemote::RestoreCoordinationRemote(
    const String & restore_uuid_,
    const Strings & all_hosts_,
    const String & current_host_,
-    bool is_internal_)
+    bool is_internal_,
+    QueryStatusPtr process_list_element_)
    : get_zookeeper(get_zookeeper_)
    , root_zookeeper_path(root_zookeeper_path_)
    , keeper_settings(keeper_settings_)
@ -36,6 +37,7 @@ RestoreCoordinationRemote::RestoreCoordinationRemote(
        log,
        get_zookeeper_,
        keeper_settings,
+        process_list_element_,
        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
        (WithRetries::FaultyKeeper & zk)
        {
--- a/src/Backups/RestoreCoordinationRemote.h
+++ b/src/Backups/RestoreCoordinationRemote.h
@ -21,7 +21,8 @@ public:
        const String & restore_uuid_,
        const Strings & all_hosts_,
        const String & current_host_,
-        bool is_internal_);
+        bool is_internal_,
+        QueryStatusPtr process_list_element_);

    ~RestoreCoordinationRemote() override;

--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -16,6 +16,7 @@
 #include <Interpreters/DatabaseCatalog.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/InterpreterCreateQuery.h>
+#include <Interpreters/ProcessList.h>
 #include <Databases/IDatabase.h>
 #include <Databases/DDLDependencyVisitor.h>
 #include <Storages/IStorage.h>
@ -85,6 +86,7 @@ RestorerFromBackup::RestorerFromBackup(
    , restore_coordination(restore_coordination_)
    , backup(backup_)
    , context(context_)
+    , process_list_element(context->getProcessListElement())
    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(&Poco::Logger::get("RestorerFromBackup"))
@ -138,6 +140,8 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode)
 void RestorerFromBackup::setStage(const String & new_stage, const String & message)
 {
    LOG_TRACE(log, "Setting stage: {}", new_stage);
+    checkIsQueryCancelled();
+
    current_stage = new_stage;

    if (restore_coordination)
@ -150,6 +154,12 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa
    }
 }

+void RestorerFromBackup::checkIsQueryCancelled() const
+{
+    /// Without a process list element there is nothing to check.
+    if (!process_list_element)
+        return;
+
+    /// The check is delegated to the process list element.
+    process_list_element->checkTimeLimit();
+}
+
 void RestorerFromBackup::findRootPathsInBackup()
 {
    size_t shard_num = 1;
@ -563,6 +573,8 @@ void RestorerFromBackup::createDatabase(const String & database_name) const
    if (database_info.is_predefined_database)
        return;

+    checkIsQueryCancelled();
+
    auto create_database_query = typeid_cast<std::shared_ptr<ASTCreateQuery>>(database_info.create_database_query->clone());

    /// Generate a new UUID for a database.
@ -709,6 +721,8 @@ void RestorerFromBackup::createTable(const QualifiedTableName & table_name)
    if (table_info.is_predefined_table)
        return;

+    checkIsQueryCancelled();
+
    auto create_table_query = typeid_cast<std::shared_ptr<ASTCreateQuery>>(table_info.create_table_query->clone());

    /// Generate a new UUID for a table (the same table on different hosts must use the same UUID, `restore_coordination` will make it so).
@ -790,6 +804,8 @@ void RestorerFromBackup::insertDataToTable(const QualifiedTableName & table_name
    auto & table_info = table_infos.at(table_name);
    auto storage = table_info.storage;

+    checkIsQueryCancelled();
+
    try
    {
        const auto & data_path_in_backup = table_info.data_path_in_backup;
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -21,6 +21,8 @@ using DatabasePtr = std::shared_ptr<IDatabase>;
 class AccessRestorerFromBackup;
 struct IAccessEntity;
 using AccessEntityPtr = std::shared_ptr<const IAccessEntity>;
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;


 /// Restores the definition of databases and tables and prepares tasks to restore the data of the tables.
@ -74,6 +76,7 @@ private:
    std::shared_ptr<IRestoreCoordination> restore_coordination;
    BackupPtr backup;
    ContextMutablePtr context;
+    QueryStatusPtr process_list_element;
    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds create_table_timeout;
    Poco::Logger * log;
@ -107,6 +110,9 @@ private:

    void setStage(const String & new_stage, const String & message = "");

+    /// Throws an exception if the RESTORE query was cancelled.
+    void checkIsQueryCancelled() const;
+
    struct DatabaseInfo
    {
        ASTPtr create_database_query;
--- a/src/Backups/WithRetries.cpp
+++ b/src/Backups/WithRetries.cpp
@ -21,10 +21,11 @@ WithRetries::KeeperSettings WithRetries::KeeperSettings::fromContext(ContextPtr
 }

 WithRetries::WithRetries(
-    Poco::Logger * log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, RenewerCallback callback_)
+    Poco::Logger * log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_)
    : log(log_)
    , get_zookeeper(get_zookeeper_)
    , settings(settings_)
+    , process_list_element(process_list_element_)
    , callback(callback_)
    , global_zookeeper_retries_info(
          settings.keeper_max_retries, settings.keeper_retry_initial_backoff_ms, settings.keeper_retry_max_backoff_ms)
@ -32,7 +33,7 @@ WithRetries::WithRetries(

+/// Copies the parent's retries info and creates a retries control which uses the parent's logger
+/// and process list element, together with a faulty keeper obtained from the parent.
 WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name)
    : info(parent->global_zookeeper_retries_info)
-    , retries_ctl(name, parent->log, info, nullptr)
+    , retries_ctl(name, parent->log, info, parent->process_list_element)
    , faulty_zookeeper(parent->getFaultyZooKeeper())
 {}

--- a/src/Backups/WithRetries.h
+++ b/src/Backups/WithRetries.h
@ -52,7 +52,7 @@ public:
    };

    RetriesControlHolder createRetriesControlHolder(const String & name);
-    WithRetries(Poco::Logger * log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, RenewerCallback callback);
+    WithRetries(Poco::Logger * log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback);

    /// Used to re-establish new connection inside a retry loop.
    void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const;
@ -65,6 +65,8 @@ private:
    Poco::Logger * log;
    zkutil::GetZooKeeper get_zookeeper;
    KeeperSettings settings;
+    QueryStatusPtr process_list_element;
+
    /// This callback is called each time when a new [Zoo]Keeper session is created.
    /// In backups it is primarily used to re-create an ephemeral node to signal the coordinator
    /// that the host is alive and able to continue writing the backup.
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -83,10 +83,11 @@ add_subdirectory (Formats)

 add_headers_and_sources(clickhouse_common_io Common)
 add_headers_and_sources(clickhouse_common_io Common/HashTable)
+add_headers_and_sources(clickhouse_common_io Common/Scheduler)
+add_headers_and_sources(clickhouse_common_io Common/Scheduler/Nodes)
 add_headers_and_sources(clickhouse_common_io Common/SSH)
 add_headers_and_sources(clickhouse_common_io IO)
 add_headers_and_sources(clickhouse_common_io IO/Archives)
-add_headers_and_sources(clickhouse_common_io IO/Resource)
 add_headers_and_sources(clickhouse_common_io IO/S3)
 list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)

--- a/src/Common/AsyncLoader.cpp
+++ b/src/Common/AsyncLoader.cpp
@ -36,13 +36,36 @@ static constexpr size_t PRINT_MESSAGE_EACH_N_SECONDS = 5;

+// Logs the progress as a percentage of `processed` out of `total` and restarts `watch`.
+// Nothing is logged when `total` is zero, so the percentage is never computed with a zero divisor.
 void logAboutProgress(Poco::Logger * log, size_t processed, size_t total, AtomicStopwatch & watch)
 {
-    if (processed % PRINT_MESSAGE_EACH_N_OBJECTS == 0 || watch.compareAndRestart(PRINT_MESSAGE_EACH_N_SECONDS))
+    if (total && (processed % PRINT_MESSAGE_EACH_N_OBJECTS == 0 || watch.compareAndRestart(PRINT_MESSAGE_EACH_N_SECONDS)))
    {
        LOG_INFO(log, "Processed: {}%", processed * 100.0 / total);
        watch.restart();
    }
 }

+// Creates a pool from its initializer.
+// If `init.max_threads` is not positive, the number of physical CPU cores is used as `max_threads`.
+AsyncLoader::Pool::Pool(const AsyncLoader::PoolInitializer & init)
+    : name(init.name)
+    , priority(init.priority)
+    , thread_pool(std::make_unique<ThreadPool>(
+        init.metric_threads,
+        init.metric_active_threads,
+        init.metric_scheduled_threads,
+        /* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
+        /* max_free_threads = */ 0, // We do not require free threads
+        /* queue_size = */0)) // Unlimited queue to avoid blocking during worker spawning
+    , max_threads(init.max_threads > 0 ? init.max_threads : getNumberOfPhysicalCPUCores())
+{}
+
+// All these constructors are needed because std::atomic is neither copy-constructible, nor move-constructible.
+// We never move pools after init, so it is safe.
+// The value of `suspended_workers` is read with load() and used to initialize the new atomic.
+AsyncLoader::Pool::Pool(Pool&& o) noexcept
+    : name(o.name)
+    , priority(o.priority)
+    , thread_pool(std::move(o.thread_pool))
+    , ready_queue(std::move(o.ready_queue))
+    , max_threads(o.max_threads)
+    , workers(o.workers)
+    , suspended_workers(o.suspended_workers.load())
+{}
+
 void cancelOnDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & dependency, std::exception_ptr & cancel)
 {
    cancel = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED,
@ -84,39 +107,38 @@ size_t LoadJob::waitersCount() const
    return waiters;
 }

-size_t LoadJob::ok()
+// Sets the status to OK under the job's mutex and finishes the job.
+void LoadJob::ok()
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::OK;
-    return finish();
+    finish();
 }

-size_t LoadJob::failed(const std::exception_ptr & ptr)
+// Sets the status to FAILED under the job's mutex, stores the exception, and finishes the job.
+void LoadJob::failed(const std::exception_ptr & ptr)
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::FAILED;
    load_exception = ptr;
-    return finish();
+    finish();
 }

-size_t LoadJob::canceled(const std::exception_ptr & ptr)
+// Sets the status to CANCELED under the job's mutex, stores the exception, and finishes the job.
+void LoadJob::canceled(const std::exception_ptr & ptr)
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::CANCELED;
    load_exception = ptr;
-    return finish();
+    finish();
 }

-size_t LoadJob::finish()
+// Releases the job's functions, records the finish time, and notifies waiters if there are any.
+// Called from ok(), failed() and canceled() while they hold the job's mutex.
+void LoadJob::finish()
 {
-    // To ensure functions are destructed before `AsyncLoader::wait()` return
+    // To ensure functions are destructed before `AsyncLoader::wait()` returns
    func = {};
    dependency_failure = {};

    finish_time = std::chrono::system_clock::now();
    if (waiters > 0)
        finished.notify_all();
-    return std::exchange(suspended_waiters, 0);
 }

 void LoadJob::scheduled(UInt64 job_id_)
@ -134,7 +156,7 @@ void LoadJob::enqueued()

 void LoadJob::execute(AsyncLoader & loader, size_t pool, const LoadJobPtr & self)
 {
-    execution_pool_id = pool;
+    execution_pool_id.store(pool);
    start_time = std::chrono::system_clock::now();
    func(loader, self);
 }
@ -187,19 +209,7 @@ AsyncLoader::AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool lo
 {
    pools.reserve(pool_initializers.size());
    for (auto && init : pool_initializers)
-        pools.push_back({
-            .name = init.name,
-            .priority = init.priority,
-            .thread_pool = std::make_unique<ThreadPool>(
-                init.metric_threads,
-                init.metric_active_threads,
-                init.metric_scheduled_threads,
-                /* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
-                /* max_free_threads = */ 0, // We do not require free threads
-                /* queue_size = */0), // Unlimited queue to avoid blocking during worker spawning
-            .ready_queue = {},
-            .max_threads = init.max_threads > 0 ? init.max_threads : getNumberOfPhysicalCPUCores()
-        });
+        pools.push_back(Pool(init));
 }

 AsyncLoader::~AsyncLoader()
@ -498,6 +508,11 @@ std::vector<AsyncLoader::JobState> AsyncLoader::getJobStates() const
    return result;
 }

+size_t AsyncLoader::suspendedWorkersCount(size_t pool_id)
+{
+    return pools[pool_id].suspended_workers.load();
+}
+
 void AsyncLoader::checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock)
 {
    LoadJobSet left = jobs;
@ -538,20 +553,12 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
    chassert(scheduled_jobs.contains(job)); // Job was pending

    // Notify waiters
-    size_t resumed_workers = 0; // Number of workers resumed in the execution pool of the job
    if (status == LoadStatus::OK)
-        resumed_workers = job->ok();
+        job->ok();
    else if (status == LoadStatus::FAILED)
-        resumed_workers = job->failed(reason);
+        job->failed(reason);
    else if (status == LoadStatus::CANCELED)
-        resumed_workers = job->canceled(reason);
-
-    // Adjust suspended workers count
-    if (resumed_workers)
-    {
-        Pool & pool = pools[job->executionPool()];
-        pool.suspended_workers -= resumed_workers;
-    }
+        job->canceled(reason);

    Info & info = scheduled_jobs[job];
    if (info.isReady())
@ -568,30 +575,24 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
    // Update dependent jobs
    for (const auto & dpt : dependent)
    {
-        if (auto dpt_info = scheduled_jobs.find(dpt); dpt_info != scheduled_jobs.end())
-        {
-            dpt_info->second.dependencies_left--;
-            if (!dpt_info->second.isBlocked())
-                enqueue(dpt_info->second, dpt, lock);
+        auto dpt_info = scheduled_jobs.find(dpt);
+        if (dpt_info == scheduled_jobs.end())
+            continue;
+        dpt_info->second.dependencies_left--;
+        if (!dpt_info->second.isBlocked())
+            enqueue(dpt_info->second, dpt, lock);

-            if (status != LoadStatus::OK)
-            {
-                std::exception_ptr cancel;
-                NOEXCEPT_SCOPE({
-                    ALLOW_ALLOCATIONS_IN_SCOPE;
-                    if (dpt->dependency_failure)
-                        dpt->dependency_failure(dpt, job, cancel);
-                });
-                // Recurse into dependent job if it should be canceled
-                if (cancel)
-                    finish(dpt, LoadStatus::CANCELED, cancel, lock);
-            }
-        }
-        else
+        if (status != LoadStatus::OK)
        {
-            // Job has already been canceled. Do not enter twice into the same job during finish recursion.
-            // This happens in {A<-B; A<-C; B<-D; C<-D} graph for D if A is failed or canceled.
-            chassert(status == LoadStatus::CANCELED);
+            std::exception_ptr cancel;
+            NOEXCEPT_SCOPE({
+                ALLOW_ALLOCATIONS_IN_SCOPE;
+                if (dpt->dependency_failure)
+                    dpt->dependency_failure(dpt, job, cancel);
+            });
+            // Recurse into dependent job if it should be canceled
+            if (cancel)
+                finish(dpt, LoadStatus::CANCELED, cancel, lock);
        }
    }

@ -637,9 +638,6 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un
    }

    job->pool_id.store(new_pool_id);
-    // TODO(serxa): we should adjust suspended_workers and suspended_waiters here.
-    // Otherwise suspended_workers we be left inconsistent. Fix it and add a test.
-    // Scenario: schedule a job A, wait for it from a job B in the same pool, prioritize A

    // Recurse into dependencies
    for (const auto & dep : job->dependencies)
@ -697,6 +695,8 @@ void AsyncLoader::wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr
    if (job->job_id == 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Load job '{}' waits for not scheduled load job '{}'", current_load_job->name, job->name);

+    scope_guard suspended_lock;
+
    // Deadlock detection and resolution
    if (current_load_job && job->load_status == LoadStatus::PENDING)
    {
@ -719,11 +719,30 @@ void AsyncLoader::wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr
        if (worker_pool == job->pool_id)
        {
            job_lock.unlock(); // Avoid reverse locking order
-            workerIsSuspendedByWait(worker_pool, job);
+            std::unique_lock lock{mutex};
            job_lock.lock();
+
+            // Rechecks are required because we have reacquired mutexes
+            if (job->load_status != LoadStatus::PENDING)
+                return; // Job is already done, no wait required
+
+            if (worker_pool == job->pool_id)
+            {
+                // To resolve "blocked pool" deadlocks we spawn a new worker for every suspended worker, if required
+                // This can lead to a visible excess of `max_threads` specified for a pool,
+                // but actual number of NOT suspended workers may exceed `max_threads` ONLY in intermittent state.
+                Pool & pool = pools[worker_pool];
+                pool.suspended_workers.fetch_add(1);
+                suspended_lock = [&pool] { chassert(pool.suspended_workers.load()); pool.suspended_workers.fetch_sub(1); };
+                if (canSpawnWorker(pool, lock))
+                    spawn(pool, lock);
+            }
        }
    }

+    if (job->load_status != LoadStatus::PENDING) // Shortcut just to avoid incrementing ProfileEvents
+        return;
+
    Stopwatch watch;
    job->waiters++;
    job->finished.wait(job_lock, [&] { return job->load_status != LoadStatus::PENDING; });
@ -731,34 +750,12 @@ void AsyncLoader::wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr
    ProfileEvents::increment(ProfileEvents::AsyncLoaderWaitMicroseconds, watch.elapsedMicroseconds());
 }

-void AsyncLoader::workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job)
-{
-    std::unique_lock lock{mutex};
-    std::unique_lock job_lock{job->mutex};
-
-    if (job->load_status != LoadStatus::PENDING)
-        return; // Job is already done, worker can continue execution
-
-    // To resolve "blocked pool" deadlocks we spawn a new worker for every suspended worker, if required
-    // This can lead to a visible excess of `max_threads` specified for a pool,
-    // but actual number of NOT suspended workers may exceed `max_threads` ONLY in intermittent state.
-    Pool & pool = pools[pool_id];
-    pool.suspended_workers++;
-    job->suspended_waiters++;
-    if (canSpawnWorker(pool, lock))
-        spawn(pool, lock);
-
-    // TODO(serxa): it is a good idea to propagate `job` and all its dependencies in `pool.ready_queue` by introducing
-    // key {suspended_waiters, ready_seqno} instead of plain `ready_seqno`, to force newly spawn workers to work on jobs
-    // that are being waited. But it doesn't affect correctness. So let's not complicate it for time being.
-}
-
 bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
 {
    // TODO(serxa): optimization: we should not spawn new worker on the first enqueue during `finish()` because current worker will take this job.
    return is_running
        && !pool.ready_queue.empty()
-        && pool.workers < pool.max_threads + pool.suspended_workers
+        && pool.workers < pool.max_threads + pool.suspended_workers.load()
        && (!current_priority || *current_priority >= pool.priority);
 }

@ -766,7 +763,7 @@ bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
 {
    return is_running
        && !pool.ready_queue.empty()
-        && pool.workers <= pool.max_threads + pool.suspended_workers
+        && pool.workers <= pool.max_threads + pool.suspended_workers.load()
        && (!current_priority || *current_priority >= pool.priority);
 }

--- a/src/Common/AsyncLoader.h
+++ b/src/Common/AsyncLoader.h
@ -98,10 +98,10 @@ public:
 private:
    friend class AsyncLoader;

-    [[nodiscard]] size_t ok();
-    [[nodiscard]] size_t failed(const std::exception_ptr & ptr);
-    [[nodiscard]] size_t canceled(const std::exception_ptr & ptr);
-    [[nodiscard]] size_t finish();
+    void ok();
+    void failed(const std::exception_ptr & ptr);
+    void canceled(const std::exception_ptr & ptr);
+    void finish();

    void scheduled(UInt64 job_id_);
    void enqueued();
@ -122,8 +122,7 @@ private:

    mutable std::mutex mutex;
    mutable std::condition_variable finished;
-    mutable size_t waiters = 0; // All waiters, including suspended
-    mutable size_t suspended_waiters = 0;
+    mutable size_t waiters = 0;
    LoadStatus load_status{LoadStatus::PENDING};
    std::exception_ptr load_exception;

@ -282,6 +281,20 @@ inline LoadTaskPtr makeLoadTask(AsyncLoader & loader, LoadJobSet && jobs, LoadJo
 // 8)  The job is destructed.
 class AsyncLoader : private boost::noncopyable
 {
+public:
+    using Metric = CurrentMetrics::Metric;
+
+    // Helper struct for AsyncLoader construction
+    struct PoolInitializer
+    {
+        String name;
+        Metric metric_threads;
+        Metric metric_active_threads;
+        Metric metric_scheduled_threads;
+        size_t max_threads; // Zero means use all CPU cores
+        Priority priority;
+    };
+
 private:
    // Thread pool for job execution.
    // Pools control the following aspects of job execution:
@ -296,8 +309,10 @@ private:
        std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
        size_t max_threads; // Max number of workers to be spawn
        size_t workers = 0; // Number of currently executing workers
-        size_t suspended_workers = 0; // Number of workers that are blocked by `wait()` call on a job executing in the same pool (for deadlock resolution)
+        std::atomic<size_t> suspended_workers{0}; // Number of workers that are blocked by `wait()` call on a job executing in the same pool (for deadlock resolution)

+        explicit Pool(const PoolInitializer & init);
+        Pool(Pool&& o) noexcept;
        bool isActive() const { return workers > 0 || !ready_queue.empty(); }
    };

@ -315,19 +330,6 @@ private:
    };

 public:
-    using Metric = CurrentMetrics::Metric;
-
-    // Helper struct for AsyncLoader construction
-    struct PoolInitializer
-    {
-        String name;
-        Metric metric_threads;
-        Metric metric_active_threads;
-        Metric metric_scheduled_threads;
-        size_t max_threads; // Zero means use all CPU cores
-        Priority priority;
-    };
-
    AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_);

    // Stops AsyncLoader before destruction
@ -360,12 +362,16 @@ public:
    void schedule(const LoadTaskPtrs & tasks);

    // Increase priority of a job and all its dependencies recursively.
-    // Jobs from higher (than `new_pool`) priority pools are not changed.
+    // Jobs from pools with priority higher than `new_pool` are not changed.
    void prioritize(const LoadJobPtr & job, size_t new_pool);

    // Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
    // Throws if job is FAILED or CANCELED unless `no_throw` is set. Returns or throws immediately if called on non-pending job.
-    // If job was not scheduled, it will be implicitly scheduled before the wait (deadlock auto-resolution).
+    // Waiting for a job that is not scheduled is considered to be LOGICAL_ERROR; use the waitLoad() helper instead to make sure the job is scheduled.
+    // There are more rules if `wait()` is called from another job:
+    //  1) waiting on a dependent job is considered to be LOGICAL_ERROR;
+    //  2) waiting on a job in the same pool might lead to more workers spawned in that pool to resolve "blocked pool" deadlock;
+    //  3) waiting on a job with lower priority leads to priority inheritance to avoid priority inversion.
    void wait(const LoadJobPtr & job, bool no_throw = false);

    // Remove finished jobs, cancel scheduled jobs, wait for executing jobs to finish and remove them.
@ -393,9 +399,7 @@ public:

    // For introspection and debug only, see `system.asynchronous_loader` table.
    std::vector<JobState> getJobStates() const;
-
-    // For deadlock resolution. Should not be used directly.
-    void workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job);
+    size_t suspendedWorkersCount(size_t pool_id);

 private:
    void checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -590,6 +590,7 @@
    M(708, ILLEGAL_STATISTIC) \
    M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \
    M(710, FAULT_INJECTED) \
+    M(711, FILECACHE_ACCESS_DENIED) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@ -69,14 +69,14 @@ void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool
 Exception::MessageMasked::MessageMasked(const std::string & msg_)
    : msg(msg_)
 {
-    if (auto * masker = SensitiveDataMasker::getInstance())
+    if (auto masker = SensitiveDataMasker::getInstance())
        masker->wipeSensitiveData(msg);
 }

 Exception::MessageMasked::MessageMasked(std::string && msg_)
    : msg(std::move(msg_))
 {
-    if (auto * masker = SensitiveDataMasker::getInstance())
+    if (auto masker = SensitiveDataMasker::getInstance())
        masker->wipeSensitiveData(msg);
 }

--- a/src/Common/Scheduler/IResourceManager.h
+++ b/src/Common/Scheduler/IResourceManager.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ResourceLink.h>
+#include <Common/Scheduler/ResourceLink.h>

 #include <Poco/Util/AbstractConfiguration.h>

--- a/src/Common/Scheduler/ISchedulerConstraint.h
+++ b/src/Common/Scheduler/ISchedulerConstraint.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ISchedulerNode.h>
+#include <Common/Scheduler/ISchedulerNode.h>

 namespace DB
 {
--- a/src/Common/Scheduler/ISchedulerNode.h
+++ b/src/Common/Scheduler/ISchedulerNode.h
@ -6,7 +6,7 @@
 #include <base/defines.h>
 #include <base/types.h>

-#include <IO/ResourceRequest.h>
+#include <Common/Scheduler/ResourceRequest.h>
 #include <Poco/Util/AbstractConfiguration.h>
 #include <Poco/Util/XMLConfiguration.h>

--- a/src/Common/Scheduler/ISchedulerQueue.h
+++ b/src/Common/Scheduler/ISchedulerQueue.h
@ -1,8 +1,8 @@
 #pragma once

-#include <IO/ISchedulerNode.h>
-#include <IO/ResourceBudget.h>
-#include <IO/ResourceRequest.h>
+#include <Common/Scheduler/ISchedulerNode.h>
+#include <Common/Scheduler/ResourceBudget.h>
+#include <Common/Scheduler/ResourceRequest.h>

 #include <memory>

--- a/src/Common/Scheduler/Nodes/ClassifiersConfig.cpp
+++ b/src/Common/Scheduler/Nodes/ClassifiersConfig.cpp
@ -1,4 +1,4 @@
-#include <IO/Resource/ClassifiersConfig.h>
+#include <Common/Scheduler/Nodes/ClassifiersConfig.h>

 #include <Common/Exception.h>

--- a/src/Common/Scheduler/Nodes/ClassifiersConfig.h
+++ b/src/Common/Scheduler/Nodes/ClassifiersConfig.h
--- a/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp
+++ b/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp
@ -1,8 +1,8 @@
-#include <IO/Resource/DynamicResourceManager.h>
+#include <Common/Scheduler/Nodes/DynamicResourceManager.h>

-#include <IO/SchedulerNodeFactory.h>
-#include <IO/ResourceManagerFactory.h>
-#include <IO/ISchedulerQueue.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>
+#include <Common/Scheduler/ResourceManagerFactory.h>
+#include <Common/Scheduler/ISchedulerQueue.h>

 #include <Common/Exception.h>
 #include <Common/StringUtils/StringUtils.h>
--- a/src/Common/Scheduler/Nodes/DynamicResourceManager.h
+++ b/src/Common/Scheduler/Nodes/DynamicResourceManager.h
@ -1,8 +1,8 @@
 #pragma once

-#include <IO/IResourceManager.h>
-#include <IO/SchedulerRoot.h>
-#include <IO/Resource/ClassifiersConfig.h>
+#include <Common/Scheduler/IResourceManager.h>
+#include <Common/Scheduler/SchedulerRoot.h>
+#include <Common/Scheduler/Nodes/ClassifiersConfig.h>

 #include <mutex>

--- a/src/Common/Scheduler/Nodes/FairPolicy.cpp
+++ b/src/Common/Scheduler/Nodes/FairPolicy.cpp
@ -1,6 +1,6 @@
-#include <IO/Resource/FairPolicy.h>
+#include <Common/Scheduler/Nodes/FairPolicy.h>

-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/FairPolicy.h
+++ b/src/Common/Scheduler/Nodes/FairPolicy.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ISchedulerNode.h>
+#include <Common/Scheduler/ISchedulerNode.h>

 #include <Common/Stopwatch.h>

--- a/src/Common/Scheduler/Nodes/FifoQueue.cpp
+++ b/src/Common/Scheduler/Nodes/FifoQueue.cpp
@ -1,6 +1,6 @@
-#include <IO/Resource/FifoQueue.h>
+#include <Common/Scheduler/Nodes/FifoQueue.h>

-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/FifoQueue.h
+++ b/src/Common/Scheduler/Nodes/FifoQueue.h
@ -2,7 +2,7 @@

 #include <Common/Stopwatch.h>

-#include <IO/ISchedulerQueue.h>
+#include <Common/Scheduler/ISchedulerQueue.h>

 #include <Poco/Util/AbstractConfiguration.h>

--- a/src/Common/Scheduler/Nodes/PriorityPolicy.cpp
+++ b/src/Common/Scheduler/Nodes/PriorityPolicy.cpp
@ -1,6 +1,6 @@
-#include <IO/Resource/PriorityPolicy.h>
+#include <Common/Scheduler/Nodes/PriorityPolicy.h>

-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/PriorityPolicy.h
+++ b/src/Common/Scheduler/Nodes/PriorityPolicy.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ISchedulerNode.h>
+#include <Common/Scheduler/ISchedulerNode.h>

 #include <algorithm>
 #include <unordered_map>
--- a/src/Common/Scheduler/Nodes/SchedulerNodeFactory.h
+++ b/src/Common/Scheduler/Nodes/SchedulerNodeFactory.h
@ -3,7 +3,7 @@
 #include <Common/ErrorCodes.h>
 #include <Common/Exception.h>

-#include <IO/ISchedulerNode.h>
+#include <Common/Scheduler/ISchedulerNode.h>

 #include <Poco/Util/AbstractConfiguration.h>

--- a/src/Common/Scheduler/Nodes/SemaphoreConstraint.cpp
+++ b/src/Common/Scheduler/Nodes/SemaphoreConstraint.cpp
@ -1,6 +1,6 @@
-#include <IO/Resource/SemaphoreConstraint.h>
+#include <Common/Scheduler/Nodes/SemaphoreConstraint.h>

-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/SemaphoreConstraint.h
+++ b/src/Common/Scheduler/Nodes/SemaphoreConstraint.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ISchedulerConstraint.h>
+#include <Common/Scheduler/ISchedulerConstraint.h>

 #include <mutex>
 #include <limits>
--- a/src/Common/Scheduler/Nodes/StaticResourceManager.cpp
+++ b/src/Common/Scheduler/Nodes/StaticResourceManager.cpp
@ -1,8 +1,8 @@
-#include <IO/Resource/StaticResourceManager.h>
+#include <Common/Scheduler/Nodes/StaticResourceManager.h>

-#include <IO/SchedulerNodeFactory.h>
-#include <IO/ResourceManagerFactory.h>
-#include <IO/ISchedulerQueue.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>
+#include <Common/Scheduler/ResourceManagerFactory.h>
+#include <Common/Scheduler/ISchedulerQueue.h>

 #include <Common/Exception.h>
 #include <Common/StringUtils/StringUtils.h>
--- a/src/Common/Scheduler/Nodes/StaticResourceManager.h
+++ b/src/Common/Scheduler/Nodes/StaticResourceManager.h
@ -1,8 +1,8 @@
 #pragma once

-#include <IO/IResourceManager.h>
-#include <IO/SchedulerRoot.h>
-#include <IO/Resource/ClassifiersConfig.h>
+#include <Common/Scheduler/IResourceManager.h>
+#include <Common/Scheduler/SchedulerRoot.h>
+#include <Common/Scheduler/Nodes/ClassifiersConfig.h>

 #include <mutex>

--- a/src/Common/Scheduler/Nodes/ThrottlerConstraint.cpp
+++ b/src/Common/Scheduler/Nodes/ThrottlerConstraint.cpp
@ -1,6 +1,6 @@
-#include <IO/Resource/ThrottlerConstraint.h>
+#include <Common/Scheduler/Nodes/ThrottlerConstraint.h>

-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/ThrottlerConstraint.h
+++ b/src/Common/Scheduler/Nodes/ThrottlerConstraint.h
@ -1,6 +1,6 @@
 #pragma once

-#include <IO/ISchedulerConstraint.h>
+#include <Common/Scheduler/ISchedulerConstraint.h>

 #include <chrono>
 #include <mutex>
--- a/src/Common/Scheduler/Nodes/registerResourceManagers.cpp
+++ b/src/Common/Scheduler/Nodes/registerResourceManagers.cpp
@ -1,5 +1,5 @@
-#include <IO/Resource/registerResourceManagers.h>
-#include <IO/ResourceManagerFactory.h>
+#include <Common/Scheduler/Nodes/registerResourceManagers.h>
+#include <Common/Scheduler/ResourceManagerFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/registerResourceManagers.h
+++ b/src/Common/Scheduler/Nodes/registerResourceManagers.h
--- a/src/Common/Scheduler/Nodes/registerSchedulerNodes.cpp
+++ b/src/Common/Scheduler/Nodes/registerSchedulerNodes.cpp
@ -1,8 +1,8 @@
-#include <IO/Resource/registerSchedulerNodes.h>
+#include <Common/Scheduler/Nodes/registerSchedulerNodes.h>

-#include <IO/ISchedulerNode.h>
-#include <IO/ISchedulerConstraint.h>
-#include <IO/SchedulerNodeFactory.h>
+#include <Common/Scheduler/ISchedulerNode.h>
+#include <Common/Scheduler/ISchedulerConstraint.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>

 namespace DB
 {
--- a/src/Common/Scheduler/Nodes/registerSchedulerNodes.h
+++ b/src/Common/Scheduler/Nodes/registerSchedulerNodes.h
--- a/src/Common/Scheduler/Nodes/tests/ResourceTest.h
+++ b/src/Common/Scheduler/Nodes/tests/ResourceTest.h
@ -1,14 +1,14 @@
 #pragma once

-#include <IO/IResourceManager.h>
-#include <IO/SchedulerRoot.h>
-#include <IO/ResourceGuard.h>
-#include <IO/SchedulerNodeFactory.h>
-#include <IO/Resource/PriorityPolicy.h>
-#include <IO/Resource/FifoQueue.h>
-#include <IO/Resource/SemaphoreConstraint.h>
-#include <IO/Resource/registerSchedulerNodes.h>
-#include <IO/Resource/registerResourceManagers.h>
+#include <Common/Scheduler/IResourceManager.h>
+#include <Common/Scheduler/SchedulerRoot.h>
+#include <Common/Scheduler/ResourceGuard.h>
+#include <Common/Scheduler/Nodes/SchedulerNodeFactory.h>
+#include <Common/Scheduler/Nodes/PriorityPolicy.h>
+#include <Common/Scheduler/Nodes/FifoQueue.h>
+#include <Common/Scheduler/Nodes/SemaphoreConstraint.h>
+#include <Common/Scheduler/Nodes/registerSchedulerNodes.h>
+#include <Common/Scheduler/Nodes/registerResourceManagers.h>

 #include <Poco/Util/XMLConfiguration.h>

--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_class_fair.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_class_fair.cpp
@ -1,14 +1,14 @@
 #include <gtest/gtest.h>

-#include <IO/Resource/tests/ResourceTest.h>
+#include <Common/Scheduler/Nodes/tests/ResourceTest.h>

-#include <IO/Resource/FairPolicy.h>
+#include <Common/Scheduler/Nodes/FairPolicy.h>

 using namespace DB;

 using ResourceTest = ResourceTestClass;

-TEST(IOResourceFairPolicy, Factory)
+TEST(SchedulerFairPolicy, Factory)
 {
    ResourceTest t;

@ -17,7 +17,7 @@ TEST(IOResourceFairPolicy, Factory)
    EXPECT_TRUE(dynamic_cast<FairPolicy *>(fair.get()) != nullptr);
 }

-TEST(IOResourceFairPolicy, FairnessWeights)
+TEST(SchedulerFairPolicy, FairnessWeights)
 {
    ResourceTest t;

@ -41,7 +41,7 @@ TEST(IOResourceFairPolicy, FairnessWeights)
    t.consumed("B", 20);
 }

-TEST(IOResourceFairPolicy, Activation)
+TEST(SchedulerFairPolicy, Activation)
 {
    ResourceTest t;

@ -77,7 +77,7 @@ TEST(IOResourceFairPolicy, Activation)
    t.consumed("B", 10);
 }

-TEST(IOResourceFairPolicy, FairnessMaxMin)
+TEST(SchedulerFairPolicy, FairnessMaxMin)
 {
    ResourceTest t;

@ -101,7 +101,7 @@ TEST(IOResourceFairPolicy, FairnessMaxMin)
    t.consumed("A", 20);
 }

-TEST(IOResourceFairPolicy, HierarchicalFairness)
+TEST(SchedulerFairPolicy, HierarchicalFairness)
 {
    ResourceTest t;

--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_class_priority.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_class_priority.cpp
@ -1,14 +1,14 @@
 #include <gtest/gtest.h>

-#include <IO/Resource/tests/ResourceTest.h>
+#include <Common/Scheduler/Nodes/tests/ResourceTest.h>

-#include <IO/Resource/PriorityPolicy.h>
+#include <Common/Scheduler/Nodes/PriorityPolicy.h>

 using namespace DB;

 using ResourceTest = ResourceTestClass;

-TEST(IOResourcePriorityPolicy, Factory)
+TEST(SchedulerPriorityPolicy, Factory)
 {
    ResourceTest t;

@ -17,7 +17,7 @@ TEST(IOResourcePriorityPolicy, Factory)
    EXPECT_TRUE(dynamic_cast<PriorityPolicy *>(prio.get()) != nullptr);
 }

-TEST(IOResourcePriorityPolicy, Priorities)
+TEST(SchedulerPriorityPolicy, Priorities)
 {
    ResourceTest t;

@ -51,7 +51,7 @@ TEST(IOResourcePriorityPolicy, Priorities)
    t.consumed("C", 0);
 }

-TEST(IOResourcePriorityPolicy, Activation)
+TEST(SchedulerPriorityPolicy, Activation)
 {
    ResourceTest t;

@ -92,7 +92,7 @@ TEST(IOResourcePriorityPolicy, Activation)
    t.consumed("C", 0);
 }

-TEST(IOResourcePriorityPolicy, SinglePriority)
+TEST(SchedulerPriorityPolicy, SinglePriority)
 {
    ResourceTest t;

--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp
@ -1,8 +1,8 @@
 #include <gtest/gtest.h>

-#include <IO/Resource/tests/ResourceTest.h>
+#include <Common/Scheduler/Nodes/tests/ResourceTest.h>

-#include <IO/Resource/DynamicResourceManager.h>
+#include <Common/Scheduler/Nodes/DynamicResourceManager.h>
 #include <Poco/Util/XMLConfiguration.h>

 using namespace DB;
@ -10,7 +10,7 @@ using namespace DB;
 using ResourceTest = ResourceTestManager<DynamicResourceManager>;
 using TestGuard = ResourceTest::Guard;

-TEST(IOResourceDynamicResourceManager, Smoke)
+TEST(SchedulerDynamicResourceManager, Smoke)
 {
    ResourceTest t;

@ -45,7 +45,7 @@ TEST(IOResourceDynamicResourceManager, Smoke)
    }
 }

-TEST(IOResourceDynamicResourceManager, Fairness)
+TEST(SchedulerDynamicResourceManager, Fairness)
 {
    // Total cost for A and B cannot differ for more than 1 (every request has cost equal to 1).
    // Requests from A use `value = 1` and from B `value = -1` is used.
--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_static.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_static.cpp
@ -1,8 +1,8 @@
 #include <gtest/gtest.h>

-#include <IO/Resource/tests/ResourceTest.h>
+#include <Common/Scheduler/Nodes/tests/ResourceTest.h>

-#include <IO/Resource/StaticResourceManager.h>
+#include <Common/Scheduler/Nodes/StaticResourceManager.h>
 #include <Poco/Util/XMLConfiguration.h>

 using namespace DB;
@ -10,7 +10,7 @@ using namespace DB;
 using ResourceTest = ResourceTestManager<StaticResourceManager>;
 using TestGuard = ResourceTest::Guard;

-TEST(IOResourceStaticResourceManager, Smoke)
+TEST(SchedulerStaticResourceManager, Smoke)
 {
    ResourceTest t;

@ -42,7 +42,7 @@ TEST(IOResourceStaticResourceManager, Smoke)
    }
 }

-TEST(IOResourceStaticResourceManager, Prioritization)
+TEST(SchedulerStaticResourceManager, Prioritization)
 {
    std::optional<Priority> last_priority;
    auto check = [&] (Priority priority)
--- a/Show More
+++ b/Show More