Merge remote-tracking branch 'origin/master' into pr-custom-key-failover

This commit is contained in:
Igor Nikonov 2023-12-05 12:00:45 +00:00
commit 71204c2a82
396 changed files with 7575 additions and 2875 deletions

View File

@ -532,6 +532,11 @@ jobs:
run_command: |
cd "$REPO_COPY/tests/ci"
mkdir -p "${REPORTS_PATH}/integration"
mkdir -p "${REPORTS_PATH}/stateless"
cp -r ${REPORTS_PATH}/changed_images* ${REPORTS_PATH}/integration
cp -r ${REPORTS_PATH}/changed_images* ${REPORTS_PATH}/stateless
TEMP_PATH="${TEMP_PATH}/integration" \
REPORTS_PATH="${REPORTS_PATH}/integration" \
python3 integration_test_check.py "Integration $CHECK_NAME" \

View File

@ -35,6 +35,7 @@ curl https://clickhouse.com/ | sh
* [**ClickHouse Meetup in Berlin**](https://www.meetup.com/clickhouse-berlin-user-group/events/296488501/) - Nov 30
* [**ClickHouse Meetup in NYC**](https://www.meetup.com/clickhouse-new-york-user-group/events/296488779/) - Dec 11
* [**ClickHouse Meetup in Sydney**](https://www.meetup.com/clickhouse-sydney-user-group/events/297638812/) - Dec 12
* [**ClickHouse Meetup in Boston**](https://www.meetup.com/clickhouse-boston-user-group/events/296488840/) - Dec 12
Also, keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler <at> clickhouse <dot> com.

View File

@ -33,7 +33,7 @@ target_include_directories(cxxabi SYSTEM BEFORE
PRIVATE $<BUILD_INTERFACE:${LIBCXXABI_SOURCE_DIR}/../libcxx/include>
PRIVATE $<BUILD_INTERFACE:${LIBCXXABI_SOURCE_DIR}/../libcxx/src>
)
target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY)
target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DHAS_THREAD_LOCAL)
target_compile_options(cxxabi PRIVATE -nostdinc++ -fno-sanitize=undefined -Wno-macro-redefined) # If we don't disable UBSan, infinite recursion happens in dynamic_cast.
target_link_libraries(cxxabi PUBLIC unwind)

2
contrib/qpl vendored

@ -1 +1 @@
Subproject commit faaf19350459c076e66bb5df11743c3fade59b73
Subproject commit a61bdd845fd7ca363b2bcc55454aa520dfcd8298

View File

@ -20,7 +20,8 @@ RUN apt-get update --yes \
RUN pip3 install \
numpy \
pyodbc \
deepdiff
deepdiff \
sqlglot
ARG odbc_repo="https://github.com/ClickHouse/clickhouse-odbc.git"
@ -35,7 +36,7 @@ RUN git clone --recursive ${odbc_repo} \
&& odbcinst -i -s -l -f /clickhouse-odbc/packaging/odbc.ini.sample
ENV TZ=Europe/Amsterdam
ENV MAX_RUN_TIME=900
ENV MAX_RUN_TIME=9000
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"

View File

@ -75,6 +75,20 @@ function run_tests()
cat /test_output/statements-test/check_status.tsv >> /test_output/check_status.tsv
cat /test_output/statements-test/test_results.tsv >> /test_output/test_results.tsv
tar -zcvf statements-check.tar.gz statements-test 1>/dev/null
mkdir -p /test_output/complete-test
/clickhouse-tests/sqllogic/runner.py \
--log-file /test_output/runner-complete-test.log \
--log-level info \
complete-test \
--input-dir /sqllogictest \
--out-dir /test_output/complete-test \
2>&1 \
| ts '%Y-%m-%d %H:%M:%S'
cat /test_output/complete-test/check_status.tsv >> /test_output/check_status.tsv
cat /test_output/complete-test/test_results.tsv >> /test_output/test_results.tsv
tar -zcvf complete-check.tar.gz complete-test 1>/dev/null
fi
}

View File

@ -19,10 +19,14 @@ dpkg -i package_folder/clickhouse-common-static-dbg_*.deb
dpkg -i package_folder/clickhouse-server_*.deb
dpkg -i package_folder/clickhouse-client_*.deb
echo "$BUGFIX_VALIDATE_CHECK"
# Check that the tools are available under short names
ch --query "SELECT 1" || exit 1
chl --query "SELECT 1" || exit 1
chc --version || exit 1
if [[ -z "$BUGFIX_VALIDATE_CHECK" ]]; then
ch --query "SELECT 1" || exit 1
chl --query "SELECT 1" || exit 1
chc --version || exit 1
fi
ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
@ -46,6 +50,16 @@ fi
config_logs_export_cluster /etc/clickhouse-server/config.d/system_logs_export.yaml
if [[ -n "$BUGFIX_VALIDATE_CHECK" ]] && [[ "$BUGFIX_VALIDATE_CHECK" -eq 1 ]]; then
sudo cat /etc/clickhouse-server/config.d/zookeeper.xml \
| sed "/<use_compression>1<\/use_compression>/d" \
> /etc/clickhouse-server/config.d/zookeeper.xml.tmp
sudo mv /etc/clickhouse-server/config.d/zookeeper.xml.tmp /etc/clickhouse-server/config.d/zookeeper.xml
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/users.d/s3_cache_new.xml
fi
# For flaky check we also enable thread fuzzer
if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000

View File

@ -191,6 +191,12 @@ sudo cat /etc/clickhouse-server/config.d/logger_trace.xml \
> /etc/clickhouse-server/config.d/logger_trace.xml.tmp
mv /etc/clickhouse-server/config.d/logger_trace.xml.tmp /etc/clickhouse-server/config.d/logger_trace.xml
# Randomize async_load_databases
if [ $(( $(date +%-d) % 2 )) -eq 1 ]; then
sudo echo "<clickhouse><async_load_databases>true</async_load_databases></clickhouse>" \
> /etc/clickhouse-server/config.d/enable_async_load_databases.xml
fi
start
stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" --global-time-limit 1200 \

View File

@ -79,6 +79,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
rm /etc/clickhouse-server/users.d/s3_cache_new.xml
rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml
start
stop
@ -116,6 +117,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
rm /etc/clickhouse-server/users.d/s3_cache_new.xml
rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml
start

View File

@ -56,7 +56,7 @@ On Linux, macOS and FreeBSD:
./clickhouse client
ClickHouse client version 23.2.1.1501 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 23.2.1 revision 54461.
Connected to ClickHouse server version 23.2.1.
local-host :)
```

View File

@ -16,7 +16,7 @@ ClickHouse provides a native command-line client: `clickhouse-client`. The clien
$ clickhouse-client
ClickHouse client version 20.13.1.5273 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 20.13.1 revision 54442.
Connected to ClickHouse server version 20.13.1.
:)
```

View File

@ -15,6 +15,27 @@ You can monitor:
- Utilization of hardware resources.
- ClickHouse server metrics.
## Built-in observability dashboard
<img width="400" alt="Screenshot 2023-11-12 at 6 08 58 PM" src="https://github.com/ClickHouse/ClickHouse/assets/3936029/2bd10011-4a47-4b94-b836-d44557c7fdc1" />
ClickHouse comes with a built-in observability dashboard feature which can be accessed by `$HOST:$PORT/dashboard` (requires user and password) that shows the following metrics:
- Queries/second
- CPU usage (cores)
- Queries running
- Merges running
- Selected bytes/second
- IO wait
- CPU wait
- OS CPU Usage (userspace)
- OS CPU Usage (kernel)
- Read from disk
- Read from filesystem
- Memory (tracked)
- Inserted rows/second
- Total MergeTree parts
- Max parts for partition
## Resource Utilization {#resource-utilization}
ClickHouse also monitors the state of hardware resources by itself such as:

View File

@ -1646,6 +1646,45 @@ Default value: `0.5`.
## async_load_databases {#async_load_databases}
Asynchronous loading of databases and tables.
If `true` all non-system databases with `Ordinary`, `Atomic` and `Replicated` engine will be loaded asynchronously after the ClickHouse server start up. See `system.async_loader` table, `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a table, that is not yet loaded, will wait for exactly this table to be started up. If load job fails, query will rethrow an error (instead of shutting down the whole server in case of `async_load_databases = false`). The table that is waited for by at least one query will be loaded with higher priority. DDL queries on a database will wait for exactly that database to be started up.
If `false`, all databases are loaded when the server starts.
The default is `false`.
**Example**
``` xml
<async_load_databases>true</async_load_databases>
```
## tables_loader_foreground_pool_size {#tables_loader_foreground_pool_size}
Sets the number of threads performing load jobs in foreground pool. The foreground pool is used for loading table synchronously before server start listening on a port and for loading tables that are waited for. Foreground pool has higher priority than background pool. It means that no job starts in background pool while there are jobs running in foreground pool.
Possible values:
- Any positive integer.
- Zero. Use all available CPUs.
Default value: 0.
## tables_loader_background_pool_size {#tables_loader_background_pool_size}
Sets the number of threads performing asynchronous load jobs in background pool. The background pool is used for loading tables asynchronously after server start in case there are no queries waiting for the table. It could be beneficial to keep low number of threads in background pool if there are a lot of tables. It will reserve CPU resources for concurrent query execution.
Possible values:
- Any positive integer.
- Zero. Use all available CPUs.
Default value: 0.
## merge_tree {#merge_tree}

View File

@ -149,7 +149,7 @@ Possible values:
- Any positive integer.
- 0 (disable deduplication)
Default value: 100.
Default value: 1000.
The `Insert` command creates one or more blocks (parts). For [insert deduplication](../../engines/table-engines/mergetree-family/replication.md), when writing into replicated tables, ClickHouse writes the hash sums of the created parts into ClickHouse Keeper. Hash sums are stored only for the most recent `replicated_deduplication_window` blocks. The oldest hash sums are removed from ClickHouse Keeper.
A large number of `replicated_deduplication_window` slows down `Inserts` because it needs to compare more entries.

View File

@ -0,0 +1,54 @@
---
slug: /en/operations/system-tables/async_loader
---
# async_loader
Contains information and status for recent asynchronous jobs (e.g. for tables loading). The table contains a row for every job. There is a tool for visualizing information from this table `utils/async_loader_graph`.
Example:
``` sql
SELECT *
FROM system.async_loader
FORMAT Vertical
LIMIT 1
```
``` text
```
Columns:
- `job` (`String`) - Job name (may be not unique).
- `job_id` (`UInt64`) - Unique ID of the job.
- `dependencies` (`Array(UInt64)`) - List of IDs of jobs that should be done before this job.
- `dependencies_left` (`UInt64`) - Current number of dependencies left to be done.
- `status` (`Enum`) - Current load status of a job:
`PENDING`: Load job is not started yet.
`OK`: Load job executed and was successful.
`FAILED`: Load job executed and failed.
`CANCELED`: Load job is not going to be executed due to removal or dependency failure.
A pending job might be in one of the following states:
- `is_executing` (`UInt8`) - The job is currently being executed by a worker.
- `is_blocked` (`UInt8`) - The job waits for its dependencies to be done.
- `is_ready` (`UInt8`) - The job is ready to be executed and waits for a worker.
- `elapsed` (`Float64`) - Seconds elapsed since start of execution. Zero if job is not started. Total execution time if job finished.
Every job has a pool associated with it and is started in this pool. Each pool has a constant priority and a mutable maximum number of workers. Higher priority (lower `priority` value) jobs are run first. No job with lower priority is started while there is at least one higher priority job ready or executing. Job priority can be elevated (but cannot be lowered) by prioritizing it. For example jobs for a table loading and startup will be prioritized if incoming query required this table. It is possible to prioritize a job during its execution, but the job is not moved from its `execution_pool` to the newly assigned `pool`. The job uses `pool` for creating new jobs to avoid priority inversion. Already started jobs are not preempted by higher priority jobs and always run to completion after start.
- `pool_id` (`UInt64`) - ID of a pool currently assigned to the job.
- `pool` (`String`) - Name of `pool_id` pool.
- `priority` (`Int64`) - Priority of `pool_id` pool.
- `execution_pool_id` (`UInt64`) - ID of a pool the job is executed in. Equals initially assigned pool before execution starts.
- `execution_pool` (`String`) - Name of `execution_pool_id` pool.
- `execution_priority` (`Int64`) - Priority of `execution_pool_id` pool.
- `ready_seqno` (`Nullable(UInt64)`) - Not null for ready jobs. Worker pulls the next job to be executed from a ready queue of its pool. If there are multiple ready jobs, then job with the lowest value of `ready_seqno` is picked.
- `waiters` (`UInt64`) - The number of threads waiting on this job.
- `exception` (`Nullable(String)`) - Not null for failed and canceled jobs. Holds error message raised during query execution or error leading to cancelling of this job along with dependency failure chain of job names.
Time instants during job lifetime:
- `schedule_time` (`DateTime64`) - Time when job was created and scheduled to be executed (usually with all its dependencies).
- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of its pool. Null if the job is not ready yet.
- `start_time` (`Nullable(DateTime64)`) - Time when worker dequeues the job from ready queue and starts its execution. Null if the job is not started yet.
- `finish_time` (`Nullable(DateTime64)`) - Time when job execution is finished. Null if the job is not finished yet.

View File

@ -13,6 +13,7 @@ ClickHouse does not delete data from the table automatically. See [Introduction]
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the async insert happened.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the async insert finished execution.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the async insert finished execution with microseconds precision.
@ -42,6 +43,7 @@ SELECT * FROM system.asynchronous_insert_log LIMIT 1 \G;
Result:
``` text
hostname: clickhouse.eu-central1.internal
event_date: 2023-06-08
event_time: 2023-06-08 10:08:53
event_time_microseconds: 2023-06-08 10:08:53.199516

View File

@ -7,6 +7,7 @@ Contains the historical values for `system.asynchronous_metrics`, which are save
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `name` ([String](../../sql-reference/data-types/string.md)) — Metric name.
@ -15,22 +16,33 @@ Columns:
**Example**
``` sql
SELECT * FROM system.asynchronous_metric_log LIMIT 10
SELECT * FROM system.asynchronous_metric_log LIMIT 3 \G
```
``` text
┌─event_date─┬──────────event_time─┬─name─────────────────────────────────────┬─────value─┐
│ 2020-09-05 │ 2020-09-05 15:56:30 │ CPUFrequencyMHz_0 │ 2120.9 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.arenas.all.pmuzzy │ 743 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.arenas.all.pdirty │ 26288 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.background_thread.run_intervals │ 0 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.background_thread.num_runs │ 0 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.retained │ 60694528 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.mapped │ 303161344 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.resident │ 260931584 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.metadata │ 12079488 │
│ 2020-09-05 │ 2020-09-05 15:56:30 │ jemalloc.allocated │ 133756128 │
└────────────┴─────────────────────┴──────────────────────────────────────────┴───────────┘
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-11-14
event_time: 2023-11-14 14:39:07
metric: AsynchronousHeavyMetricsCalculationTimeSpent
value: 0.001
Row 2:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-11-14
event_time: 2023-11-14 14:39:08
metric: AsynchronousHeavyMetricsCalculationTimeSpent
value: 0
Row 3:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-11-14
event_time: 2023-11-14 14:39:09
metric: AsynchronousHeavyMetricsCalculationTimeSpent
value: 0
```
**See Also**

View File

@ -7,6 +7,7 @@ Contains logging entries with the information about `BACKUP` and `RESTORE` opera
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Date of the entry.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Time of the entry with microseconds precision.
- `id` ([String](../../sql-reference/data-types/string.md)) — Identifier of the backup or restore operation.
@ -45,6 +46,7 @@ SELECT * FROM system.backup_log WHERE id = 'e5b74ecb-f6f1-426a-80be-872f90043885
```response
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-08-19
event_time_microseconds: 2023-08-19 11:05:21.998566
id: e5b74ecb-f6f1-426a-80be-872f90043885
@ -63,6 +65,7 @@ bytes_read: 0
Row 2:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-08-19
event_time_microseconds: 2023-08-19 11:08:56.916192
id: e5b74ecb-f6f1-426a-80be-872f90043885
@ -93,6 +96,7 @@ SELECT * FROM system.backup_log WHERE id = 'cdf1f731-52ef-42da-bc65-2e1bfcd4ce90
```response
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-08-19
event_time_microseconds: 2023-08-19 11:09:19.718077
id: cdf1f731-52ef-42da-bc65-2e1bfcd4ce90
@ -111,6 +115,7 @@ bytes_read: 0
Row 2:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2023-08-19
event_time_microseconds: 2023-08-19 11:09:29.334234
id: cdf1f731-52ef-42da-bc65-2e1bfcd4ce90

View File

@ -7,6 +7,7 @@ Contains information about stack traces for fatal errors. The table does not exi
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date of the event.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the event.
- `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the event with nanoseconds.
@ -32,6 +33,7 @@ Result (not full):
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2020-10-14
event_time: 2020-10-14 15:47:40
timestamp_ns: 1602679660271312710

View File

@ -6,6 +6,7 @@ slug: /en/operations/system-tables/metric_log
Contains history of metrics values from tables `system.metrics` and `system.events`, periodically flushed to disk.
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds resolution.
@ -19,6 +20,7 @@ SELECT * FROM system.metric_log LIMIT 1 FORMAT Vertical;
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2020-09-05
event_time: 2020-09-05 16:22:33
event_time_microseconds: 2020-09-05 16:22:33.196807

View File

@ -45,6 +45,22 @@ Number of threads in the Aggregator thread pool.
Number of threads in the Aggregator thread pool running a task.
### TablesLoaderForegroundThreads
Number of threads in the async loader foreground thread pool.
### TablesLoaderForegroundThreadsActive
Number of threads in the async loader foreground thread pool running a task.
### TablesLoaderBackgroundThreads
Number of threads in the async loader background thread pool.
### TablesLoaderBackgroundThreadsActive
Number of threads in the async loader background thread pool running a task.
### AsyncInsertCacheSize
Number of async insert hash ids in the cache
@ -197,14 +213,6 @@ Number of threads in the DatabaseOnDisk thread pool.
Number of threads in the DatabaseOnDisk thread pool running a task.
### DatabaseOrdinaryThreads
Number of threads in the Ordinary database thread pool.
### DatabaseOrdinaryThreadsActive
Number of threads in the Ordinary database thread pool running a task.
### DelayedInserts
Number of INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree table.
@ -625,14 +633,6 @@ Number of connections that are sending data for external tables to remote server
Number of connections that are sending data for scalars to remote servers.
### StartupSystemTablesThreads
Number of threads in the StartupSystemTables thread pool.
### StartupSystemTablesThreadsActive
Number of threads in the StartupSystemTables thread pool running a task.
### StorageBufferBytes
Number of bytes in buffers of Buffer tables
@ -677,14 +677,6 @@ Number of threads in the system.replicas thread pool running a task.
Number of connections to TCP server (clients with native interface), also included server-server distributed query connections
### TablesLoaderThreads
Number of threads in the tables loader thread pool.
### TablesLoaderThreadsActive
Number of threads in the tables loader thread pool running a task.
### TablesToDropQueueSize
Number of dropped tables, that are waiting for background data removal.

View File

@ -31,3 +31,26 @@ SELECT * FROM system.numbers LIMIT 10;
10 rows in set. Elapsed: 0.001 sec.
```
You can also limit the output by predicates.
```sql
SELECT * FROM system.numbers WHERE number < 10;
```
```response
┌─number─┐
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└────────┘
10 rows in set. Elapsed: 0.001 sec.
```

View File

@ -8,28 +8,19 @@ Contains information about [trace spans](https://opentracing.io/docs/overview/sp
Columns:
- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for executed query.
- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.
- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`.
- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation.
- `kind` ([Enum8](../../sql-reference/data-types/enum.md)) — The [SpanKind](https://opentelemetry.io/docs/reference/specification/trace/api/#spankind) of the span.
- `INTERNAL` — Indicates that the span represents an internal operation within an application.
- `SERVER` — Indicates that the span covers server-side handling of a synchronous RPC or other remote request.
- `CLIENT` — Indicates that the span describes a request to some remote service.
- `PRODUCER` — Indicates that the span describes the initiators of an asynchronous request. This parent span will often end before the corresponding child CONSUMER span, possibly even before the child span starts.
- `CONSUMER` - Indicates that the span describes a child of an asynchronous PRODUCER request.
- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds).
- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds).
- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — The finish date of the `trace span`.
- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [Attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations in the [OpenTelemetry](https://opentelemetry.io/) standard.
- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Attribute values depending on the `trace span`. They are filled in according to the recommendations in the `OpenTelemetry` standard.
**Example**

View File

@ -9,6 +9,7 @@ This table contains information about events that occurred with [data parts](../
The `system.part_log` table contains the following columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `query_id` ([String](../../sql-reference/data-types/string.md)) — Identifier of the `INSERT` query that created this data part.
- `event_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the event that occurred with the data part. Can have one of the following values:
- `NewPart` — Inserting of a new data part.
@ -56,13 +57,14 @@ SELECT * FROM system.part_log LIMIT 1 FORMAT Vertical;
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31
event_type: NewPart
merge_reason: NotAMerge
merge_algorithm: Undecided
event_date: 2021-02-02
event_time: 2021-02-02 11:14:28
event_time_microseconds: 2021-02-02 11:14:28.861919
event_time_microseconds: 2021-02-02 11:14:28.861919
duration_ms: 35
database: default
table: log_mt_2

View File

@ -4,6 +4,7 @@ This table contains profiling on processors level (that you can find in [`EXPLAI
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the event happened.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time with microseconds precision when the event happened.

View File

@ -34,6 +34,7 @@ You can use the [log_formatted_queries](../../operations/settings/settings.md#se
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of an event that occurred when executing the query. Values:
- `'QueryStart' = 1` — Successful start of query execution.
- `'QueryFinish' = 2` — Successful end of query execution.
@ -127,6 +128,7 @@ SELECT * FROM system.query_log WHERE type = 'QueryFinish' ORDER BY query_start_t
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
type: QueryFinish
event_date: 2021-11-03
event_time: 2021-11-03 16:13:54
@ -167,7 +169,7 @@ initial_query_start_time: 2021-11-03 16:13:54
initial_query_start_time_microseconds: 2021-11-03 16:13:54.952325
interface: 1
os_user: sevirov
client_hostname: clickhouse.ru-central1.internal
client_hostname: clickhouse.eu-central1.internal
client_name: ClickHouse
client_revision: 54449
client_version_major: 21

View File

@ -18,6 +18,7 @@ You can use the [log_queries_probability](../../operations/settings/settings.md#
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the thread has finished execution of the query.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query.
- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query with microseconds precision.
@ -74,6 +75,7 @@ Columns:
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2020-09-11
event_time: 2020-09-11 10:08:17
event_time_microseconds: 2020-09-11 10:08:17.134042

View File

@ -18,6 +18,7 @@ You can use the [log_queries_probability](../../operations/settings/settings.md#
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the last event of the view happened.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the view finished execution.
- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the view finished execution with microseconds precision.
@ -59,6 +60,7 @@ Result:
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2021-06-22
event_time: 2021-06-22 13:23:07
event_time_microseconds: 2021-06-22 13:23:07.738221

View File

@ -7,6 +7,7 @@ Contains information about all successful and failed login and logout events.
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `type` ([Enum8](../../sql-reference/data-types/enum.md)) — Login/logout result. Possible values:
- `LoginFailure` — Login error.
- `LoginSuccess` — Successful login.
@ -57,6 +58,7 @@ Result:
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
type: LoginSuccess
auth_id: 45e6bd83-b4aa-4a23-85e6-bd83b4aa1a23
session_id:

View File

@ -7,6 +7,7 @@ Contains logging entries. The logging level which goes to this table can be limi
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` (Date) — Date of the entry.
- `event_time` (DateTime) — Time of the entry.
- `event_time_microseconds` (DateTime) — Time of the entry with microseconds precision.
@ -39,6 +40,7 @@ SELECT * FROM system.text_log LIMIT 1 \G
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2020-09-10
event_time: 2020-09-10 11:23:07
event_time_microseconds: 2020-09-10 11:23:07.871397

View File

@ -12,37 +12,27 @@ To analyze logs, use the `addressToLine`, `addressToLineWithInlines`, `addressTo
Columns:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Date of sampling moment.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Timestamp of the sampling moment.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Timestamp of the sampling moment with microseconds precision.
- `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the sampling moment in nanoseconds.
- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision.
When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server.
When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1.`. This field contains the `revision`, but not the `version` of a server.
- `trace_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Trace type:
- `Real` represents collecting stack traces by wall-clock time.
- `CPU` represents collecting stack traces by CPU time.
- `Memory` represents collecting allocations and deallocations when memory allocation exceeds the subsequent watermark.
- `MemorySample` represents collecting random allocations and deallocations.
- `MemoryPeak` represents collecting updates of peak memory usage.
- `ProfileEvent` represents collecting of increments of profile events.
- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread identifier.
- `query_id` ([String](../../sql-reference/data-types/string.md)) — Query identifier that can be used to get details about a query that was running from the [query_log](#system_tables-query_log) system table.
- `trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Stack trace at the moment of sampling. Each element is a virtual memory address inside ClickHouse server process.
- `size` ([Int64](../../sql-reference/data-types/int-uint.md)) - For trace types `Memory`, `MemorySample` or `MemoryPeak` is the amount of memory allocated, for other trace types is 0.
- `event` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) - For trace type `ProfileEvent` is the name of updated profile event, for other trace types is an empty string.
- `increment` ([UInt64](../../sql-reference/data-types/int-uint.md)) - For trace type `ProfileEvent` is the amount of increment of profile event, for other trace types is 0.
**Example**
@ -54,6 +44,7 @@ SELECT * FROM system.trace_log LIMIT 1 \G
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
event_date: 2020-09-10
event_time: 2020-09-10 11:23:09
event_time_microseconds: 2020-09-10 11:23:09.872924

View File

@ -9,6 +9,7 @@ For requests, only columns with request parameters are filled in, and the remain
Columns with request parameters:
- `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
- `type` ([Enum](../../sql-reference/data-types/enum.md)) — Event type in the ZooKeeper client. Can have one of the following values:
- `Request` — The request has been sent.
- `Response` — The response was received.
@ -63,6 +64,7 @@ Result:
``` text
Row 1:
──────
hostname: clickhouse.eu-central1.internal
type: Request
event_date: 2021-08-09
event_time: 2021-08-09 21:38:30.291792

View File

@ -487,24 +487,23 @@ Where:
## uniqUpTo(N)(x)
Calculates the number of different argument values if it is less than or equal to N. If the number of different argument values is greater than N, it returns N + 1.
Calculates the number of different values of the argument up to a specified limit, `N`. If the number of different argument values is greater than `N`, this function returns `N` + 1, otherwise it calculates the exact value.
Recommended for use with small Ns, up to 10. The maximum value of N is 100.
Recommended for use with small `N`s, up to 10. The maximum value of `N` is 100.
For the state of an aggregate function, it uses the amount of memory equal to 1 + N \* the size of one value of bytes.
For strings, it stores a non-cryptographic hash of 8 bytes. That is, the calculation is approximated for strings.
For the state of an aggregate function, this function uses the amount of memory equal to 1 + `N` \* the size of one value of bytes.
When dealing with strings, this function stores a non-cryptographic hash of 8 bytes; the calculation is approximated for strings.
The function also works for several arguments.
For example, if you had a table that logs every search query made by users on your website. Each row in the table represents a single search query, with columns for the user ID, the search query, and the timestamp of the query. You can use `uniqUpTo` to generate a report that shows only the keywords that produced at least 5 unique users.
It works as fast as possible, except for cases when a large N value is used and the number of unique values is slightly less than N.
Usage example:
``` text
Problem: Generate a report that shows only keywords that produced at least 5 unique users.
Solution: Write in the GROUP BY query SearchPhrase HAVING uniqUpTo(4)(UserID) >= 5
```sql
SELECT SearchPhrase
FROM SearchLog
GROUP BY SearchPhrase
HAVING uniqUpTo(4)(UserID) >= 5
```
`uniqUpTo(4)(UserID)` calculates the number of unique `UserID` values for each `SearchPhrase`, but it only counts up to 4 unique values. If there are more than 4 unique `UserID` values for a `SearchPhrase`, the function returns 5 (4 + 1). The `HAVING` clause then filters out the `SearchPhrase` values for which the number of unique `UserID` values is less than 5. This will give you a list of search keywords that were used by at least 5 unique users.
## sumMapFiltered(keys_to_keep)(keys, values)

View File

@ -5,7 +5,12 @@ sidebar_position: 6
# any
Selects the first encountered (non-NULL) value, unless all rows have NULL values in that column.
Selects the first encountered value of a column.
By default, it ignores NULL values and returns the first NOT NULL value found in the column. Like [`first_value`](../../../sql-reference/aggregate-functions/reference/first_value.md), it supports `RESPECT NULLS`, in which case it will select the first value passed, regardless of whether it is NULL or not.
The return type of the function is the same as the input, except for LowCardinality, which is discarded. This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) to modify this behaviour.
The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate.
To get a determinate result, you can use the min or max function instead of any.
@ -13,4 +18,4 @@ In some cases, you can rely on the order of execution. This applies to cases whe
When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the `any` aggregate function.
- Alias: `any_value`
- Alias: `any_value`, `first_value`.

View File

@ -5,9 +5,12 @@ sidebar_position: 7
# first_value
Selects the first encountered value, similar to `any`, but could accept NULL.
Mostly it should be used with [Window Functions](../../window-functions/index.md).
Without Window Functions the result will be random if the source stream is not ordered.
It is an alias for [`any`](../../../sql-reference/aggregate-functions/reference/any.md) but it was introduced for compatibility with [Window Functions](../../window-functions/index.md), where sometimes it's necessary to process `NULL` values (by default all ClickHouse aggregate functions ignore NULL values).
It supports declaring a modifier to respect nulls (`RESPECT NULLS`), both under [Window Functions](../../window-functions/index.md) and in normal aggregations.
As with `any`, without Window Functions the result will be random if the source stream is not ordered and the return type
matches the input type (Null is only returned if the input is Nullable or -OrNull combinator is added).
## examples
@ -23,15 +26,15 @@ INSERT INTO test_data (a, b) Values (1,null), (2,3), (4, 5), (6,null);
```
### example1
The NULL value is ignored at default.
By default, the NULL value is ignored.
```sql
select first_value(b) from test_data;
```
```text
┌─first_value_ignore_nulls(b)─┐
3 │
└─────────────────────────────
┌─any(b)─┐
│ 3 │
└────────┘
```
### example2
@ -41,9 +44,9 @@ select first_value(b) ignore nulls from test_data
```
```text
┌─first_value_ignore_nulls(b)─┐
3 │
└─────────────────────────────
┌─any(b) IGNORE NULLS ─┐
│ 3 │
└──────────────────────┘
```
### example3
@ -53,9 +56,9 @@ select first_value(b) respect nulls from test_data
```
```text
┌─first_value_respect_nulls(b)─┐
ᴺᵁᴸᴸ │
└──────────────────────────────
┌─any(b) RESPECT NULLS ─┐
│ ᴺᵁᴸᴸ │
└───────────────────────┘
```
### example4
@ -73,8 +76,8 @@ FROM
```
```text
┌─first_value_respect_nulls(b)─┬─first_value(b)─┐
ᴺᵁᴸᴸ │ 3 │
└──────────────────────────────────────────────┘
┌─any_respect_nulls(b)─┬─any(b)─┐
│ ᴺᵁᴸᴸ │ 3 │
└──────────────────────┴────────┘
```

View File

@ -1083,7 +1083,7 @@ Result:
**See also**
- [arrayFold](#arrayFold)
- [arrayFold](#arrayfold)
## arrayReduceInRanges
@ -1175,7 +1175,7 @@ FROM numbers(1,10);
**See also**
- [arrayReduce](#arrayReduce)
- [arrayReduce](#arrayreduce)
## arrayReverse(arr)

View File

@ -67,45 +67,7 @@ WHERE macro = 'test';
│ test │ Value │
└───────┴──────────────┘
```
## getClientHTTPHeader
Returns the value of the specified HTTP header. If there is no such header or the request method is not HTTP, it will throw an exception.
**Syntax**
```sql
getClientHTTPHeader(name);
```
**Arguments**
- `name` — HTTP header name. [String](../../sql-reference/data-types/string.md#string)
**Returned value**
Value of the specified header.
Type: [String](../../sql-reference/data-types/string.md#string).
When we use `clickhouse-client` to execute this function, we'll always get an empty string, because the client doesn't use the HTTP protocol.
```sql
SELECT getClientHTTPHeader('test')
```
result:
```text
┌─getClientHTTPHeader('test')─┐
│ │
└─────────────────────────────┘
```
Try to use http request:
```shell
echo "select getClientHTTPHeader('X-Clickhouse-User')" | curl -H 'X-ClickHouse-User: default' -H 'X-ClickHouse-Key: ' 'http://localhost:8123/' -d @-
#result
default
```
## FQDN
Returns the fully qualified domain name of the ClickHouse server.

View File

@ -5,7 +5,7 @@ slug: /en/sql-reference/operators/exists
The `EXISTS` operator checks how many records are in the result of a subquery. If it is empty, then the operator returns `0`. Otherwise, it returns `1`.
`EXISTS` can be used in a [WHERE](../../sql-reference/statements/select/where.md) clause.
`EXISTS` can also be used in a [WHERE](../../sql-reference/statements/select/where.md) clause.
:::tip
References to main query tables and columns are not supported in a subquery.
@ -13,12 +13,26 @@ References to main query tables and columns are not supported in a subquery.
**Syntax**
```sql
WHERE EXISTS(subquery)
``` sql
EXISTS(subquery)
```
**Example**
Query checking existence of values in a subquery:
``` sql
SELECT EXISTS(SELECT * FROM numbers(10) WHERE number > 8), EXISTS(SELECT * FROM numbers(10) WHERE number > 11)
```
Result:
``` text
┌─in(1, _subquery1)─┬─in(1, _subquery2)─┐
│ 1 │ 0 │
└───────────────────┴───────────────────┘
```
Query with a subquery returning several rows:
``` sql

View File

@ -10,7 +10,7 @@ A set of queries that allow changing the table structure.
Syntax:
``` sql
ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
ALTER [TEMPORARY] TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
```
In the query, specify a list of one or more comma-separated actions.

View File

@ -415,7 +415,7 @@ ExpressionTransform
ExpressionTransform × 2
(SettingQuotaAndLimits)
(ReadFromStorage)
NumbersMt × 2 0 → 1
NumbersRange × 2 0 → 1
```
### EXPLAIN ESTIMATE

View File

@ -1,4 +1,4 @@
--
---
slug: /en/sql-reference/table-functions/file
sidebar_position: 60
sidebar_label: file

View File

@ -17,6 +17,8 @@ The following queries are equivalent:
SELECT * FROM numbers(10);
SELECT * FROM numbers(0, 10);
SELECT * FROM system.numbers LIMIT 10;
SELECT * FROM system.numbers WHERE number BETWEEN 0 AND 9;
SELECT * FROM system.numbers WHERE number IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9);
```
Examples:

View File

@ -14,7 +14,7 @@ ClickHouse предоставляет собственный клиент ком
$ clickhouse-client
ClickHouse client version 20.13.1.5273 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 20.13.1 revision 54442.
Connected to ClickHouse server version 20.13.1.
:)
```

View File

@ -119,7 +119,7 @@ Eсли суммарное число активных кусков во все
- Положительное целое число.
- 0 (без ограничений).
Значение по умолчанию: 100.
Значение по умолчанию: 1000.
Команда `Insert` создает один или несколько блоков (кусков). При вставке в Replicated таблицы ClickHouse для [дедупликации вставок](../../engines/table-engines/mergetree-family/replication.md) записывает в Zookeeper хеш-суммы созданных кусков. Но хранятся только последние `replicated_deduplication_window` хеш-сумм. Самые старые хеш-суммы удаляются из Zookeeper.
Большое значение `replicated_deduplication_window` замедляет `Insert`, так как приходится сравнивать большее количество хеш-сумм.

View File

@ -19,7 +19,7 @@ ClickHouse создает эту таблицу когда установлен
- `revision`([UInt32](../../sql-reference/data-types/int-uint.md)) — ревизия сборки сервера ClickHouse.
Во время соединения с сервером через `clickhouse-client`, вы видите строку похожую на `Connected to ClickHouse server version 19.18.1 revision 54429.`. Это поле содержит номер после `revision`, но не содержит строку после `version`.
Во время соединения с сервером через `clickhouse-client`, вы видите строку похожую на `Connected to ClickHouse server version 19.18.1.`. Это поле содержит номер после `revision`, но не содержит строку после `version`.
- `trace_type`([Enum8](../../sql-reference/data-types/enum.md)) — тип трассировки:

View File

@ -11,7 +11,7 @@ sidebar_label: "Манипуляции со столбцами"
Синтаксис:
``` sql
ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
ALTER [TEMPORARY] TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
```
В запросе можно указать сразу несколько действий над одной таблицей через запятую.

View File

@ -371,7 +371,7 @@ ExpressionTransform
ExpressionTransform × 2
(SettingQuotaAndLimits)
(ReadFromStorage)
NumbersMt × 2 0 → 1
NumbersRange × 2 0 → 1
```
### EXPLAIN ESTIMATE {#explain-estimate}

View File

@ -14,7 +14,7 @@ ClickHouse提供了一个原生命令行客户端`clickhouse-client`客户端支
$ clickhouse-client
ClickHouse client version 19.17.1.1579 (official build).
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 19.17.1 revision 54428.
Connected to ClickHouse server version 19.17.1.
:)
```

View File

@ -22,7 +22,7 @@ ClickHouse创建此表时 [trace_log](../../operations/server-configuration-para
- `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision.
通过以下方式连接到服务器 `clickhouse-client`,你看到的字符串类似于 `Connected to ClickHouse server version 19.18.1 revision 54429.`. 该字段包含 `revision`,但不是 `version` 的服务器。
通过以下方式连接到服务器 `clickhouse-client`,你看到的字符串类似于 `Connected to ClickHouse server version 19.18.1.`. 该字段包含 `revision`,但不是 `version` 的服务器。
- `timer_type` ([枚举8](../../sql-reference/data-types/enum.md)) — Timer type:

View File

@ -493,8 +493,7 @@ void Client::connect()
if (is_interactive)
{
std::cout << "Connected to " << server_name << " server version " << server_version << " revision " << server_revision << "."
<< std::endl << std::endl;
std::cout << "Connected to " << server_name << " server version " << server_version << "." << std::endl << std::endl;
auto client_version_tuple = std::make_tuple(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH);
auto server_version_tuple = std::make_tuple(server_version_major, server_version_minor, server_version_patch);

View File

@ -23,6 +23,7 @@
#include <Common/scope_guard_safe.h>
#include <Interpreters/Session.h>
#include <Access/AccessControl.h>
#include <Common/PoolId.h>
#include <Common/Exception.h>
#include <Common/Macros.h>
#include <Common/Config/ConfigProcessor.h>
@ -742,16 +743,16 @@ void LocalServer::processConfig()
status.emplace(fs::path(path) / "status", StatusFile::write_full_info);
LOG_DEBUG(log, "Loading metadata from {}", path);
loadMetadataSystem(global_context);
auto startup_system_tasks = loadMetadataSystem(global_context);
attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE));
attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA));
attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE));
startupSystemTables();
waitLoad(TablesLoaderForegroundPoolId, startup_system_tasks);
if (!config().has("only-system-tables"))
{
DatabaseCatalog::instance().createBackgroundTasks();
loadMetadata(global_context);
waitLoad(loadMetadata(global_context));
DatabaseCatalog::instance().startupBackgroundTasks();
}

View File

@ -20,6 +20,7 @@
#include <base/coverage.h>
#include <base/getFQDNOrHostName.h>
#include <base/safeExit.h>
#include <Common/PoolId.h>
#include <Common/MemoryTracker.h>
#include <Common/ClickHouseRevision.h>
#include <Common/DNSResolver.h>
@ -1279,8 +1280,6 @@ try
global_context->setHTTPHeaderFilter(*config);
global_context->setMaxTableSizeToDrop(server_settings_.max_table_size_to_drop);
global_context->setClientHTTPHeaderForbiddenHeaders(server_settings_.get_client_http_header_forbidden_headers);
global_context->setAllowGetHTTPHeaderFunction(server_settings_.allow_get_client_http_header);
global_context->setMaxPartitionSizeToDrop(server_settings_.max_partition_size_to_drop);
ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited;
@ -1336,6 +1335,10 @@ try
global_context->getMessageBrokerSchedulePool().increaseThreadsCount(server_settings_.background_message_broker_schedule_pool_size);
global_context->getDistributedSchedulePool().increaseThreadsCount(server_settings_.background_distributed_schedule_pool_size);
global_context->getAsyncLoader().setMaxThreads(TablesLoaderForegroundPoolId, server_settings_.tables_loader_foreground_pool_size);
global_context->getAsyncLoader().setMaxThreads(TablesLoaderBackgroundLoadPoolId, server_settings_.tables_loader_background_pool_size);
global_context->getAsyncLoader().setMaxThreads(TablesLoaderBackgroundStartupPoolId, server_settings_.tables_loader_background_pool_size);
getIOThreadPool().reloadConfiguration(
server_settings.max_io_thread_pool_size,
server_settings.max_io_thread_pool_free_size,
@ -1676,17 +1679,18 @@ try
LOG_INFO(log, "Loading metadata from {}", path_str);
LoadTaskPtrs load_metadata_tasks;
try
{
auto & database_catalog = DatabaseCatalog::instance();
/// We load temporary database first, because projections need it.
database_catalog.initializeAndLoadTemporaryDatabase();
loadMetadataSystem(global_context);
maybeConvertSystemDatabase(global_context);
auto system_startup_tasks = loadMetadataSystem(global_context);
maybeConvertSystemDatabase(global_context, system_startup_tasks);
/// This has to be done before the initialization of system logs,
/// otherwise there is a race condition between the system database initialization
/// and creation of new tables in the database.
startupSystemTables();
waitLoad(TablesLoaderForegroundPoolId, system_startup_tasks);
/// After attaching system databases we can initialize system log.
global_context->initializeSystemLogs();
global_context->setSystemZooKeeperLogAfterInitializationIfNeeded();
@ -1702,9 +1706,10 @@ try
/// and so loadMarkedAsDroppedTables() will find it and try to add, and UUID will overlap.
database_catalog.loadMarkedAsDroppedTables();
database_catalog.createBackgroundTasks();
/// Then, load remaining databases
loadMetadata(global_context, default_database);
convertDatabasesEnginesIfNeed(global_context);
/// Then, load remaining databases (some of them maybe be loaded asynchronously)
load_metadata_tasks = loadMetadata(global_context, default_database, server_settings.async_load_databases);
/// If we need to convert database engines, disable async tables loading
convertDatabasesEnginesIfNeed(load_metadata_tasks, global_context);
database_catalog.startupBackgroundTasks();
/// After loading validate that default database exists
database_catalog.assertDatabaseExists(default_database);
@ -1716,6 +1721,7 @@ try
tryLogCurrentException(log, "Caught exception while loading metadata");
throw;
}
LOG_DEBUG(log, "Loaded metadata.");
/// Init trace collector only after trace_log system table was created
@ -1871,9 +1877,14 @@ try
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "distributed_ddl.pool_size should be greater then 0");
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, global_context, &config(),
"distributed_ddl", "DDLWorker",
&CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID));
&CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID),
load_metadata_tasks);
}
/// Do not keep tasks in server, they should be kept inside databases. Used here to make dependent tasks only.
load_metadata_tasks.clear();
load_metadata_tasks.shrink_to_fit();
{
std::lock_guard lock(servers_lock);
for (auto & server : servers)

View File

@ -364,8 +364,15 @@
<background_schedule_pool_size>128</background_schedule_pool_size>
<background_message_broker_schedule_pool_size>16</background_message_broker_schedule_pool_size>
<background_distributed_schedule_pool_size>16</background_distributed_schedule_pool_size>
<tables_loader_foreground_pool_size>0</tables_loader_foreground_pool_size>
<tables_loader_background_pool_size>0</tables_loader_background_pool_size>
-->
<!-- Enables asynchronous loading of databases and tables to speedup server startup.
Queries to not yet loaded entity will be blocked until load is finished.
-->
<!-- <async_load_databases>true</async_load_databases> -->
<!-- On memory constrained environments you may have to set this to value larger than 1.
-->
<max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>

View File

@ -108,7 +108,7 @@
filter: blur(1px);
}
.chart div { position: absolute; }
.chart > div { position: absolute; }
.inputs {
height: auto;
@ -215,8 +215,6 @@
color: var(--text-color);
}
.u-legend th { display: none; }
.themes {
float: right;
font-size: 20pt;
@ -433,6 +431,16 @@
display: none;
}
.u-series {
line-height: 0.8;
}
.u-series.footer {
font-size: 8px;
padding-top: 0;
margin-top: 0;
}
/* Source: https://cdn.jsdelivr.net/npm/uplot@1.6.21/dist/uPlot.min.css
* It is copy-pasted to lower the number of requests.
*/
@ -478,7 +486,6 @@
* - compress the state for URL's #hash;
* - footer with "about" or a link to source code;
* - allow to configure a table on a server to save the dashboards;
* - multiple lines on chart;
* - if a query returned one value, display this value instead of a diagram;
* - if a query returned something unusual, display the table;
*/
@ -520,10 +527,54 @@ let queries = [];
/// Query parameters with predefined default values.
/// All other parameters will be automatically found in the queries.
let params = {
"rounding": "60",
"seconds": "86400"
'rounding': '60',
'seconds': '86400'
};
/// Palette generation for charts
/// Build `numColors` evenly-spaced hues around the color wheel, starting from
/// the hue of `baseColor` (a '#rrggbb' string) while keeping its saturation
/// and lightness. Returns an array of 'hsl(h, s%, l%)' color strings.
function generatePalette(baseColor, numColors) {
    const { h, s, l } = hexToHsl(baseColor);
    const step = 360 / numColors;
    return Array.from({ length: numColors }, (_, index) =>
        `hsl(${Math.round((h + index * step) % 360)}, ${s}%, ${l}%)`);
}
/// Helper function to convert hex color to HSL
/// Convert a '#rrggbb' hex color into HSL components.
/// Returns { h: 0-359, s: 0-100, l: 0-100 } with s and l rounded to integer percents.
function hexToHsl(hex) {
    const packed = parseInt(hex.replace(/^#/, ''), 16);
    const r = ((packed >> 16) & 255) / 255;
    const g = ((packed >> 8) & 255) / 255;
    const b = (packed & 255) / 255;

    const max = Math.max(r, g, b);
    const min = Math.min(r, g, b);
    const l = (max + min) / 2;

    let h = 0;
    let s = 0;
    if (max !== min) {
        const d = max - min;
        s = l > 0.5 ? d / (2 - max - min) : d / (max + min);
        if (max === r) {
            h = (g - b) / d + (g < b ? 6 : 0);
        } else if (max === g) {
            h = (b - r) / d + 2;
        } else {
            h = (r - g) / d + 4;
        }
    }

    return { h: Math.round(h * 60), s: Math.round(s * 100), l: Math.round(l * 100) };
}
let theme = 'light';
function setTheme(new_theme) {
@ -913,6 +964,8 @@ document.getElementById('mass-editor-textarea').addEventListener('input', e => {
function legendAsTooltipPlugin({ className, style = { background: "var(--legend-background)" } } = {}) {
let legendEl;
let showTop = false;
const showLimit = 5;
function init(u, opts) {
legendEl = u.root.querySelector(".u-legend");
@ -932,13 +985,28 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
...style
});
// hide series color markers
const idents = legendEl.querySelectorAll(".u-marker");
if (opts.series.length == 2) {
const nodes = legendEl.querySelectorAll("th");
for (let i = 0; i < nodes.length; i++)
nodes[i].style.display = "none";
} else {
legendEl.querySelector("th").remove();
legendEl.querySelector("td").setAttribute('colspan', '2');
legendEl.querySelector("td").style.textAlign = 'center';
}
for (let i = 0; i < idents.length; i++)
idents[i].style.display = "none";
if (opts.series.length - 1 > showLimit) {
showTop = true;
let footer = legendEl.insertRow().insertCell();
footer.setAttribute('colspan', '2');
footer.style.textAlign = 'center';
footer.classList.add('u-value');
footer.parentNode.classList.add('u-series','footer');
footer.textContent = ". . .";
}
const overEl = u.over;
overEl.style.overflow = "visible";
overEl.appendChild(legendEl);
@ -946,11 +1014,28 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
overEl.addEventListener("mouseleave", () => {legendEl.style.display = "none";});
}
/// Copy an array-like NodeList into a real Array so Array methods
/// (shift/pop/sort) can be used on it.
function nodeListToArray(nodeList) {
    const result = [];
    for (let i = 0; i < nodeList.length; i++) {
        result.push(nodeList[i]);
    }
    return result;
}
// Position the tooltip-style legend at the cursor and, when there are more
// series than `showLimit`, keep only the highest-valued series rows visible.
// Reads closure state: `legendEl`, `showTop`, `showLimit`.
function update(u) {
let { left, top } = u.cursor;
// Center the legend box on the cursor position.
left -= legendEl.clientWidth / 2;
top -= legendEl.clientHeight / 2;
legendEl.style.transform = "translate(" + left + "px, " + top + "px)";
if (showTop) {
let nodes = nodeListToArray(legendEl.querySelectorAll("tr"));
// First row is the header, last row is the ". . ." footer; exclude both
// from sorting.
let header = nodes.shift();
let footer = nodes.pop();
// Sort the series rows by their current numeric value, descending.
nodes.forEach(function (node) { node._sort_key = +node.querySelector("td").textContent; });
nodes.sort((a, b) => +b._sort_key - +a._sort_key);
// Re-append each row in sorted order (appendChild moves existing nodes).
nodes.forEach(function (node) { node.parentNode.appendChild(node); });
// Show only the top `showLimit` rows; clean up the temporary sort key.
for (let i = 0; i < nodes.length; i++) {
nodes[i].style.display = i < showLimit ? null : "none";
delete nodes[i]._sort_key;
}
// Move the footer back to the bottom, after the re-ordered rows.
footer.parentNode.appendChild(footer);
}
}
return {
@ -961,12 +1046,13 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
};
}
async function doFetch(query, url_params = '') {
host = document.getElementById('url').value || host;
user = document.getElementById('user').value;
password = document.getElementById('password').value;
let url = `${host}?default_format=JSONCompactColumns&enable_http_compression=1`
let url = `${host}?default_format=JSONColumnsWithMetadata&enable_http_compression=1`
if (add_http_cors_header) {
// For debug purposes, you may set add_http_cors_header from a browser console
@ -980,14 +1066,17 @@ async function doFetch(query, url_params = '') {
url += `&password=${encodeURIComponent(password)}`;
}
let response, data, error;
let response, reply, error;
try {
response = await fetch(url + url_params, { method: "POST", body: query });
data = await response.text();
reply = await response.text();
if (response.ok) {
data = JSON.parse(data);
reply = JSON.parse(reply);
if (reply.exception) {
error = reply.exception;
}
} else {
error = data;
error = reply;
}
} catch (e) {
console.log(e);
@ -1006,7 +1095,7 @@ async function doFetch(query, url_params = '') {
}
}
return {data, error};
return {reply, error};
}
async function draw(idx, chart, url_params, query) {
@ -1015,17 +1104,76 @@ async function draw(idx, chart, url_params, query) {
plots[idx] = null;
}
let {data, error} = await doFetch(query, url_params);
let {reply, error} = await doFetch(query, url_params);
if (!error) {
if (reply.rows.length == 0) {
error = "Query returned empty result.";
} else if (reply.meta.length < 2) {
error = "Query should return at least two columns: unix timestamp and value.";
} else {
for (let i = 0; i < reply.meta.length; i++) {
let label = reply.meta[i].name;
let column = reply.data[label];
if (!Array.isArray(column) || column.length != reply.data[reply.meta[0].name].length) {
error = "Wrong data format of the query.";
break;
}
}
}
}
// Transform string-labeled data to multi-column data
// Pivot long-format query results (time, label, value) into wide-format
// columns: one '__time__' column plus one value column per distinct label.
// Rewrites the closure variables `reply` (meta/data/rows) in place, or sets
// `error` and returns early when a label collides with the reserved name.
// Assumes rows are ordered by time and each (time, label) pair is unique —
// TODO confirm against the queries that feed this.
function transformToColumns() {
    const x = reply.meta[0].name; // time; must be ordered
    const l = reply.meta[1].name; // string label column to distinguish series; must be ordered
    const y = reply.meta[2].name; // values; must have single value for (x, l) pair

    // Distinct labels become the new column names.
    const labels = [...new Set(reply.data[l])].sort((a, b) => a - b);
    // '__time__' is reserved for the pivoted time column, so a label with
    // that name would collide with it.
    if (labels.includes('__time__')) {
        error = "The second column is not allowed to contain '__time__' values.";
        return;
    }

    let new_meta = [{ name: '__time__', type: reply.meta[0].type }];
    let new_data = { __time__: [] };
    for (let label of labels) {
        new_meta.push({ name: label, type: reply.meta[2].type });
        new_data[label] = [];
    }

    let new_rows = 0;
    // Finish the output row for `row_time`: record the timestamp and pad
    // every label column that received no value for this timestamp with null.
    function row_done(row_time) {
        new_rows++;
        new_data.__time__.push(row_time);
        for (let label of labels) {
            if (new_data[label].length < new_rows) {
                new_data[label].push(null);
            }
        }
    }

    let prev_time = reply.data[x][0];
    const old_rows = reply.data[x].length;
    for (let i = 0; i < old_rows; i++) {
        const time = reply.data[x][i];
        const label = reply.data[l][i];
        const value = reply.data[y][i];
        // A change in the time column closes the current output row.
        if (prev_time != time) {
            row_done(prev_time);
            prev_time = time;
        }
        new_data[label].push(value);
    }
    row_done(prev_time);

    reply.meta = new_meta;
    reply.data = new_data;
    reply.rows = new_rows;
}
/// True for the ClickHouse column types treated as string labels here.
function isStringColumn(type) {
    return ['String', 'LowCardinality(String)'].includes(type);
}
if (!error) {
if (!Array.isArray(data)) {
error = "Query should return an array.";
} else if (data.length == 0) {
error = "Query returned empty result.";
} else if (data.length != 2) {
error = "Query should return exactly two columns: unix timestamp and value.";
} else if (!Array.isArray(data[0]) || !Array.isArray(data[1]) || data[0].length != data[1].length) {
error = "Wrong data format of the query.";
if (reply.meta.length == 3 && isStringColumn(reply.meta[1].type)) {
transformToColumns();
}
}
@ -1043,24 +1191,38 @@ async function draw(idx, chart, url_params, query) {
}
const [line_color, fill_color, grid_color, axes_color] = theme != 'dark'
? ["#F88", "#FEE", "#EED", "#2c3235"]
: ["#864", "#045", "#2c3235", "#c7d0d9"];
? ["#ff8888", "#ffeeee", "#eeeedd", "#2c3235"]
: ["#886644", "#004455", "#2c3235", "#c7d0d9"];
let sync = uPlot.sync("sync");
const max_value = Math.max(...data[1]);
let axis = {
stroke: axes_color,
grid: { width: 1 / devicePixelRatio, stroke: grid_color },
ticks: { width: 1 / devicePixelRatio, stroke: grid_color }
};
let axes = [axis, axis];
let series = [{ label: "x" }];
let data = [reply.data[reply.meta[0].name]];
// Treat every column as series
const series_count = reply.meta.length;
const fill = series_count == 2 ? fill_color : undefined;
const palette = generatePalette(line_color, series_count);
let max_value = Number.NEGATIVE_INFINITY;
for (let i = 1; i < series_count; i++) {
let label = reply.meta[i].name;
series.push({ label, stroke: palette[i - 1], fill });
data.push(reply.data[label]);
max_value = Math.max(max_value, ...reply.data[label]);
}
const opts = {
width: chart.clientWidth,
height: chart.clientHeight,
axes: [ { stroke: axes_color,
grid: { width: 1 / devicePixelRatio, stroke: grid_color },
ticks: { width: 1 / devicePixelRatio, stroke: grid_color } },
{ stroke: axes_color,
grid: { width: 1 / devicePixelRatio, stroke: grid_color },
ticks: { width: 1 / devicePixelRatio, stroke: grid_color } } ],
series: [ { label: "x" },
{ label: "y", stroke: line_color, fill: fill_color } ],
axes,
series,
padding: [ null, null, null, (Math.round(max_value * 100) / 100).toString().length * 6 - 10 ],
plugins: [ legendAsTooltipPlugin() ],
cursor: {
@ -1216,22 +1378,21 @@ function saveState() {
}
async function searchQueries() {
let {data, error} = await doFetch(search_query);
let {reply, error} = await doFetch(search_query);
if (error) {
throw new Error(error);
}
if (!Array.isArray(data)) {
throw new Error("Search query should return an array.");
} else if (data.length == 0) {
let data = reply.data;
if (reply.rows == 0) {
throw new Error("Search query returned empty result.");
} else if (data.length != 2) {
} else if (reply.meta.length != 2 || reply.meta[0].name != "title" || reply.meta[1].name != "query") {
throw new Error("Search query should return exactly two columns: title and query.");
} else if (!Array.isArray(data[0]) || !Array.isArray(data[1]) || data[0].length != data[1].length) {
} else if (!Array.isArray(data.title) || !Array.isArray(data.query) || data.title.length != data.query.length) {
throw new Error("Wrong data format of the search query.");
}
for (let i = 0; i < data[0].length; i++) {
queries.push({title: data[0][i], query: data[1][i]});
for (let i = 0; i < data.title.length; i++) {
queries.push({title: data.title[i], query: data.query[i]});
}
regenerate();

View File

@ -1,26 +1,213 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/HelpersMinMaxAny.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <base/defines.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
}
namespace
{
/// State for any/anyLast with RESPECT NULLS: remembers whether anything was seen,
/// and distinguishes "saw a NULL" from "saw a concrete value" (stored in `value`).
struct AggregateFunctionAnyRespectNullsData
{
/// Explicit UInt8 values: this enum is serialized as-is in serialize()/deserialize().
enum Status : UInt8
{
NotSet = 1,
SetNull = 2,
SetOther = 3
};
Status status = Status::NotSet;
/// Holds the captured value; only meaningful when status == SetOther.
Field value;
/// True once any row (NULL or not) has been recorded.
bool isSet() const { return status != Status::NotSet; }
void setNull() { status = Status::SetNull; }
void setOther() { status = Status::SetOther; }
};
/// Implementation of any_respect_nulls (First = true) and anyLast_respect_nulls (First = false).
/// Unlike the plain any/anyLast, NULL inputs count as observed values and can be the result.
template <bool First>
class AggregateFunctionAnyRespectNulls final
: public IAggregateFunctionDataHelper<AggregateFunctionAnyRespectNullsData, AggregateFunctionAnyRespectNulls<First>>
{
public:
using Data = AggregateFunctionAnyRespectNullsData;
/// Serialization of the argument type; used for the stored Field in (de)serialize.
SerializationPtr serialization;
/// Whether the argument (and hence result) type is Nullable; NULLs can only be recorded then.
const bool returns_nullable_type = false;
explicit AggregateFunctionAnyRespectNulls(const DataTypePtr & type)
: IAggregateFunctionDataHelper<Data, AggregateFunctionAnyRespectNulls<First>>({type}, {}, type)
, serialization(type->getDefaultSerialization())
, returns_nullable_type(type->isNullable())
{
}
String getName() const override
{
if constexpr (First)
return "any_respect_nulls";
else
return "anyLast_respect_nulls";
}
bool allocatesMemoryInArena() const override { return false; }
/// Record a NULL observation. For `any` (First) the first observation wins, so later calls are ignored.
void addNull(AggregateDataPtr __restrict place) const
{
chassert(returns_nullable_type);
auto & d = this->data(place);
if (First && d.isSet())
return;
d.setNull();
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if (columns[0]->isNullable())
{
if (columns[0]->isNullAt(row_num))
return addNull(place);
}
auto & d = this->data(place);
/// `any` keeps the first observed value; `anyLast` overwrites with the latest.
if (First && d.isSet())
return;
d.setOther();
columns[0]->get(row_num, d.value);
}
void addManyDefaults(AggregateDataPtr __restrict place, const IColumn ** columns, size_t, Arena * arena) const override
{
/// The default of a Nullable column is NULL; otherwise take the value at row 0.
if (columns[0]->isNullable())
addNull(place);
else
add(place, columns, 0, arena);
}
void addBatchSinglePlace(
size_t row_begin, size_t row_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos)
const override
{
if (if_argument_pos >= 0)
{
/// With an -If condition, scan from the relevant end and take the first row whose flag is set.
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
size_t size = row_end - row_begin;
for (size_t i = 0; i < size; ++i)
{
size_t pos = First ? row_begin + i : row_end - 1 - i;
if (flags[pos])
{
add(place, columns, pos, arena);
break;
}
}
}
else
{
/// Without a condition only one row of the batch can matter: the first or the last.
size_t pos = First ? row_begin : row_end - 1;
add(place, columns, pos, arena);
}
}
void addBatchSinglePlaceNotNull(
size_t, size_t, AggregateDataPtr __restrict, const IColumn **, const UInt8 *, Arena *, ssize_t) const override
{
/// This should not happen since it means somebody else has preprocessed the data (NULLs or IFs) and might
/// have discarded values that we need (NULLs)
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionAnyRespectNulls::addBatchSinglePlaceNotNull called");
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto & d = this->data(place);
/// For `any`, an already-set state wins over anything merged in later.
if (First && d.isSet())
return;
auto & other = this->data(rhs);
if (other.isSet())
{
d.status = other.status;
d.value = other.value;
}
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
auto & d = this->data(place);
/// Layout: 1 byte status, followed by the serialized value only when a concrete value was set.
UInt8 k = d.status;
writeBinaryLittleEndian<UInt8>(k, buf);
if (k == Data::Status::SetOther)
serialization->serializeBinary(d.value, buf, {});
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
auto & d = this->data(place);
UInt8 k = Data::Status::NotSet;
readBinaryLittleEndian<UInt8>(k, buf);
d.status = static_cast<Data::Status>(k);
if (d.status == Data::Status::NotSet)
return;
else if (d.status == Data::Status::SetNull)
{
/// A serialized NULL state is only valid for Nullable argument types.
if (!returns_nullable_type)
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type (NULL) in non-nullable {}State", getName());
return;
}
else if (d.status == Data::Status::SetOther)
serialization->deserializeBinary(d.value, buf, {});
else
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type ({}) in {}State", static_cast<Int8>(k), getName());
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & d = this->data(place);
/// SetNull and NotSet both produce the default (NULL for Nullable result types).
if (d.status == Data::Status::SetOther)
to.insert(d.value);
else
to.insertDefault();
}
AggregateFunctionPtr getOwnNullAdapter(
const AggregateFunctionPtr & original_function,
const DataTypes & /*arguments*/,
const Array & /*params*/,
const AggregateFunctionProperties & /*properties*/) const override
{
/// No Null combinator wrapping: this function handles NULLs itself.
return original_function;
}
};
/// Factory helper for any_respect_nulls / anyLast_respect_nulls.
/// Validates that the function is called with exactly one argument and no parameters.
template <bool First>
IAggregateFunction * createAggregateFunctionSingleValueRespectNulls(
const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
assertUnary(name, argument_types);
return new AggregateFunctionAnyRespectNulls<First>(argument_types[0]);
}
/// Creates the plain `any` aggregate function (NULLs are ignored by default).
AggregateFunctionPtr createAggregateFunctionAny(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData>(name, argument_types, parameters, settings));
}
template <bool RespectNulls = false>
AggregateFunctionPtr createAggregateFunctionNullableAny(
AggregateFunctionPtr createAggregateFunctionAnyRespectNulls(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
{
return AggregateFunctionPtr(
createAggregateFunctionSingleNullableValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData, RespectNulls>(
name, argument_types, parameters, settings));
return AggregateFunctionPtr(createAggregateFunctionSingleValueRespectNulls<true>(name, argument_types, parameters, settings));
}
AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
@ -28,13 +215,10 @@ AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, co
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyLastData>(name, argument_types, parameters, settings));
}
template <bool RespectNulls = false>
AggregateFunctionPtr createAggregateFunctionNullableAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
AggregateFunctionPtr createAggregateFunctionAnyLastRespectNulls(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
{
return AggregateFunctionPtr(createAggregateFunctionSingleNullableValue<
AggregateFunctionsSingleValue,
AggregateFunctionAnyLastData,
RespectNulls>(name, argument_types, parameters, settings));
return AggregateFunctionPtr(createAggregateFunctionSingleValueRespectNulls<false>(name, argument_types, parameters, settings));
}
AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
@ -46,26 +230,28 @@ AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, c
void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
{
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
AggregateFunctionProperties default_properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
AggregateFunctionProperties default_properties_for_respect_nulls
= {.returns_default_when_only_null = false, .is_order_dependent = true, .is_window_function = true};
factory.registerFunction("any", { createAggregateFunctionAny, properties });
factory.registerFunction("any", {createAggregateFunctionAny, default_properties});
factory.registerAlias("any_value", "any", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });
factory.registerAlias("first_value", "any", AggregateFunctionFactory::CaseInsensitive);
// Synonyms for use as window functions.
factory.registerFunction("first_value",
{ createAggregateFunctionAny, properties },
AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("first_value_respect_nulls",
{ createAggregateFunctionNullableAny<true>, properties },
AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("last_value",
{ createAggregateFunctionAnyLast, properties },
AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("last_value_respect_nulls",
{ createAggregateFunctionNullableAnyLast<true>, properties },
AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("any_respect_nulls", {createAggregateFunctionAnyRespectNulls, default_properties_for_respect_nulls});
factory.registerAlias("any_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("first_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties});
factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("anyLast_respect_nulls", {createAggregateFunctionAnyLastRespectNulls, default_properties_for_respect_nulls});
factory.registerAlias("last_value_respect_nulls", "anyLast_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("anyHeavy", {createAggregateFunctionAnyHeavy, default_properties});
factory.registerNullsActionTransformation("any", "any_respect_nulls");
factory.registerNullsActionTransformation("anyLast", "anyLast_respect_nulls");
}
}

View File

@ -116,7 +116,7 @@ public:
/// Return normalized state type: count()
AggregateFunctionProperties properties;
return std::make_shared<DataTypeAggregateFunction>(
AggregateFunctionFactory::instance().get(getName(), {}, {}, properties), DataTypes{}, Array{});
AggregateFunctionFactory::instance().get(getName(), NullsAction::EMPTY, {}, {}, properties), DataTypes{}, Array{});
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
@ -267,7 +267,7 @@ public:
/// Return normalized state type: count()
AggregateFunctionProperties properties;
return std::make_shared<DataTypeAggregateFunction>(
AggregateFunctionFactory::instance().get(getName(), {}, {}, properties), DataTypes{}, Array{});
AggregateFunctionFactory::instance().get(getName(), NullsAction::EMPTY, {}, {}, properties), DataTypes{}, Array{});
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override

View File

@ -1,23 +1,11 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.h>
#include <DataTypes/DataTypeAggregateFunction.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/typeid_cast.h>
#include <Common/CurrentThread.h>
#include <Poco/String.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
static constexpr size_t MAX_AGGREGATE_FUNCTION_NAME_LENGTH = 1000;
@ -28,10 +16,11 @@ struct Settings;
namespace ErrorCodes
{
extern const int UNKNOWN_AGGREGATE_FUNCTION;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_AGGREGATION;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int TOO_LARGE_STRING_SIZE;
extern const int UNKNOWN_AGGREGATE_FUNCTION;
}
const String & getAggregateFunctionCanonicalNameIfAny(const String & name)
@ -59,6 +48,23 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat
}
}
/// Registers a two-way mapping between an aggregate function that ignores NULLs (the default)
/// and its RESPECT NULLS counterpart, e.g. "any" <-> "any_respect_nulls":
///   SOURCE + RESPECT NULLS resolves to TARGET; TARGET + IGNORE NULLS resolves to SOURCE.
/// Both names must already be registered, and each side may participate in at most one mapping.
/// Throws LOGICAL_ERROR on any violation (programming error at registration time).
void AggregateFunctionFactory::registerNullsActionTransformation(const String & source_ignores_nulls, const String & target_respect_nulls)
{
    /// Bug fix: the '{}' placeholders previously had no corresponding argument,
    /// so the function names were missing from (or broke) the error messages.
    if (!aggregate_functions.contains(source_ignores_nulls))
        throw Exception(
            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Source aggregation '{}' not found", source_ignores_nulls);

    if (!aggregate_functions.contains(target_respect_nulls))
        throw Exception(
            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Target aggregation '{}' not found", target_respect_nulls);

    /// RESPECT NULLS direction: source -> target. emplace() fails if the key is already mapped.
    if (!respect_nulls.emplace(source_ignores_nulls, target_respect_nulls).second)
        throw Exception(
            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Assignment from '{}' is not unique", source_ignores_nulls);

    /// IGNORE NULLS direction: target -> source.
    if (!ignore_nulls.emplace(target_respect_nulls, source_ignores_nulls).second)
        throw Exception(
            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Assignment from '{}' is not unique", target_respect_nulls);
}
static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types)
{
DataTypes res_types;
@ -70,7 +76,11 @@ static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types)
}
AggregateFunctionPtr AggregateFunctionFactory::get(
const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const
const String & name,
NullsAction action,
const DataTypes & argument_types,
const Array & parameters,
AggregateFunctionProperties & out_properties) const
{
/// This to prevent costly string manipulation in parsing the aggregate function combinators.
/// Example: avgArrayArrayArrayArray...(1000 times)...Array
@ -81,8 +91,9 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
/// If one of the types is Nullable, we apply aggregate function combinator "Null" if it's not window function.
/// Window functions are not real aggregate functions. Applying combinators doesn't make sense for them,
/// they must handle the nullability themselves
auto properties = tryGetProperties(name);
/// they must handle the nullability themselves.
/// Aggregate functions such as any_value_respect_nulls are considered window functions in that sense
auto properties = tryGetProperties(name, action);
bool is_window_function = properties.has_value() && properties->is_window_function;
if (!is_window_function && std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
[](const auto & type) { return type->isNullable(); }))
@ -98,8 +109,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
bool has_null_arguments = std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
[](const auto & type) { return type->onlyNull(); });
AggregateFunctionPtr nested_function = getImpl(
name, nested_types, nested_parameters, out_properties, has_null_arguments);
AggregateFunctionPtr nested_function = getImpl(name, action, nested_types, nested_parameters, out_properties, has_null_arguments);
// Pure window functions are not real aggregate functions. Applying
// combinators doesn't make sense for them, they must handle the
@ -110,22 +120,54 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
return combinator->transformAggregateFunction(nested_function, out_properties, types_without_low_cardinality, parameters);
}
auto with_original_arguments = getImpl(name, types_without_low_cardinality, parameters, out_properties, false);
auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false);
if (!with_original_arguments)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: AggregateFunctionFactory returned nullptr");
return with_original_arguments;
}
/// Resolves `name` combined with a RESPECT NULLS / IGNORE NULLS modifier to the actual
/// registered function that implements that behavior.
/// Returns the associated function, or an empty optional when no substitution is needed
/// (NullsAction::EMPTY, or IGNORE NULLS on a function with no registered counterpart).
/// Throws NOT_IMPLEMENTED when RESPECT NULLS is requested for a function without a mapping.
std::optional<AggregateFunctionWithProperties>
AggregateFunctionFactory::getAssociatedFunctionByNullsAction(const String & name, NullsAction action) const
{
if (action == NullsAction::RESPECT_NULLS)
{
if (auto it = respect_nulls.find(name); it == respect_nulls.end())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Function {} does not support RESPECT NULLS", name);
else if (auto associated_it = aggregate_functions.find(it->second); associated_it != aggregate_functions.end())
return {associated_it->second};
else
/// Mapping exists but the target was never registered — registration-time invariant broken.
throw Exception(
ErrorCodes::LOGICAL_ERROR, "Unable to find the function {} (equivalent to '{} RESPECT NULLS')", it->second, name);
}
if (action == NullsAction::IGNORE_NULLS)
{
if (auto it = ignore_nulls.find(name); it != ignore_nulls.end())
{
if (auto associated_it = aggregate_functions.find(it->second); associated_it != aggregate_functions.end())
return {associated_it->second};
else
throw Exception(
ErrorCodes::LOGICAL_ERROR, "Unable to find the function {} (equivalent to '{} IGNORE NULLS')", it->second, name);
}
/// We don't throw for IGNORE NULLS of other functions because that's the default in CH
}
return {};
}
AggregateFunctionPtr AggregateFunctionFactory::getImpl(
const String & name_param,
NullsAction action,
const DataTypes & argument_types,
const Array & parameters,
AggregateFunctionProperties & out_properties,
bool has_null_arguments) const
{
String name = getAliasToOrName(name_param);
String case_insensitive_name;
bool is_case_insensitive = false;
Value found;
@ -135,10 +177,14 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
found = it->second;
}
if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end())
if (!found.creator)
{
found = jt->second;
is_case_insensitive = true;
case_insensitive_name = Poco::toLower(name);
if (auto jt = case_insensitive_aggregate_functions.find(case_insensitive_name); jt != case_insensitive_aggregate_functions.end())
{
found = jt->second;
is_case_insensitive = true;
}
}
ContextPtr query_context;
@ -147,11 +193,14 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
if (found.creator)
{
out_properties = found.properties;
auto opt = getAssociatedFunctionByNullsAction(is_case_insensitive ? case_insensitive_name : name, action);
if (opt)
found = *opt;
out_properties = found.properties;
if (query_context && query_context->getSettingsRef().log_queries)
query_context->addQueryFactoriesInfo(
Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? Poco::toLower(name) : name);
Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? case_insensitive_name : name);
/// The case when aggregate function should return NULL on NULL arguments. This case is handled in "get" method.
if (!out_properties.returns_default_when_only_null && has_null_arguments)
@ -196,7 +245,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
DataTypes nested_types = combinator->transformArguments(argument_types);
Array nested_parameters = combinator->transformParameters(parameters);
AggregateFunctionPtr nested_function = get(nested_name, nested_types, nested_parameters, out_properties);
AggregateFunctionPtr nested_function = get(nested_name, action, nested_types, nested_parameters, out_properties);
return combinator->transformAggregateFunction(nested_function, out_properties, argument_types, parameters);
}
@ -213,16 +262,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
throw Exception(ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION, "Unknown aggregate function {}{}", name, extra_info);
}
AggregateFunctionPtr AggregateFunctionFactory::tryGet(
const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const
{
return isAggregateFunctionName(name)
? get(name, argument_types, parameters, out_properties)
: nullptr;
}
std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetProperties(String name) const
std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetProperties(String name, NullsAction action) const
{
if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH);
@ -231,6 +271,8 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
{
name = getAliasToOrName(name);
Value found;
String lower_case_name;
bool is_case_insensitive = false;
/// Find by exact match.
if (auto it = aggregate_functions.find(name); it != aggregate_functions.end())
@ -238,11 +280,23 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
found = it->second;
}
if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end())
found = jt->second;
if (!found.creator)
{
lower_case_name = Poco::toLower(name);
if (auto jt = case_insensitive_aggregate_functions.find(lower_case_name); jt != case_insensitive_aggregate_functions.end())
{
is_case_insensitive = true;
found = jt->second;
}
}
if (found.creator)
{
auto opt = getAssociatedFunctionByNullsAction(is_case_insensitive ? lower_case_name : name, action);
if (opt)
return opt->properties;
return found.properties;
}
/// Combinators of aggregate functions.
/// For every aggregate function 'agg' and combiner '-Comb' there is a combined aggregate function with the name 'aggComb',
@ -262,27 +316,29 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
}
bool AggregateFunctionFactory::isAggregateFunctionName(String name) const
bool AggregateFunctionFactory::isAggregateFunctionName(const String & name_) const
{
if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
if (name_.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH);
while (true)
if (aggregate_functions.contains(name_) || isAlias(name_))
return true;
String name_lowercase = Poco::toLower(name_);
if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase))
return true;
String name = name_;
while (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name))
{
if (aggregate_functions.contains(name) || isAlias(name))
return true;
name = name.substr(0, name.size() - combinator->getName().size());
name_lowercase = name_lowercase.substr(0, name_lowercase.size() - combinator->getName().size());
String name_lowercase = Poco::toLower(name);
if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase))
if (aggregate_functions.contains(name) || isAlias(name) || case_insensitive_aggregate_functions.contains(name_lowercase)
|| isAlias(name_lowercase))
return true;
if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name))
{
name = name.substr(0, name.size() - combinator->getName().size());
}
else
return false;
}
return false;
}
AggregateFunctionFactory & AggregateFunctionFactory::instance()

View File

@ -1,9 +1,9 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/IFactoryWithAliases.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/NullsAction.h>
#include <Common/IFactoryWithAliases.h>
#include <functional>
#include <memory>
@ -62,36 +62,44 @@ public:
Value creator,
CaseSensitiveness case_sensitiveness = CaseSensitive);
/// Register how to transform from one aggregate function to other based on NullsAction
/// Registers them both ways:
/// SOURCE + RESPECT NULLS will be transformed to TARGET
/// TARGET + IGNORE NULLS will be transformed to SOURCE
void registerNullsActionTransformation(const String & source_ignores_nulls, const String & target_respect_nulls);
/// Throws an exception if not found.
AggregateFunctionPtr
get(const String & name,
const DataTypes & argument_types,
const Array & parameters,
AggregateFunctionProperties & out_properties) const;
/// Returns nullptr if not found.
AggregateFunctionPtr tryGet(
const String & name,
NullsAction action,
const DataTypes & argument_types,
const Array & parameters,
AggregateFunctionProperties & out_properties) const;
/// Get properties if the aggregate function exists.
std::optional<AggregateFunctionProperties> tryGetProperties(String name) const;
std::optional<AggregateFunctionProperties> tryGetProperties(String name, NullsAction action) const;
bool isAggregateFunctionName(String name) const;
bool isAggregateFunctionName(const String & name) const;
private:
AggregateFunctionPtr getImpl(
const String & name,
NullsAction action,
const DataTypes & argument_types,
const Array & parameters,
AggregateFunctionProperties & out_properties,
bool has_null_arguments) const;
using AggregateFunctions = std::unordered_map<String, Value>;
using ActionMap = std::unordered_map<String, String>;
AggregateFunctions aggregate_functions;
/// Mapping from functions with `RESPECT NULLS` modifier to actual aggregate function names
/// Example: `any(x) RESPECT NULLS` should be executed as function `any_respect_nulls`
ActionMap respect_nulls;
/// Same as above for `IGNORE NULLS` modifier
ActionMap ignore_nulls;
std::optional<AggregateFunctionWithProperties> getAssociatedFunctionByNullsAction(const String & name, NullsAction action) const;
/// Case insensitive aggregate functions will be additionally added here with lowercased name.
AggregateFunctions case_insensitive_aggregate_functions;

View File

@ -771,26 +771,18 @@ static_assert(
/// For any other value types.
template <bool RESULT_IS_NULLABLE = false>
struct SingleValueDataGeneric
{
private:
using Self = SingleValueDataGeneric;
Field value;
bool has_value = false;
public:
static constexpr bool result_is_nullable = RESULT_IS_NULLABLE;
static constexpr bool should_skip_null_arguments = !RESULT_IS_NULLABLE;
static constexpr bool result_is_nullable = false;
static constexpr bool should_skip_null_arguments = true;
static constexpr bool is_any = false;
bool has() const
{
if constexpr (result_is_nullable)
return has_value;
return !value.isNull();
}
bool has() const { return !value.isNull(); }
void insertResultInto(IColumn & to) const
{
@ -820,19 +812,9 @@ public:
serialization.deserializeBinary(value, buf, {});
}
void change(const IColumn & column, size_t row_num, Arena *)
{
column.get(row_num, value);
if constexpr (result_is_nullable)
has_value = true;
}
void change(const IColumn & column, size_t row_num, Arena *) { column.get(row_num, value); }
void change(const Self & to, Arena *)
{
value = to.value;
if constexpr (result_is_nullable)
has_value = true;
}
void change(const Self & to, Arena *) { value = to.value; }
bool changeFirstTime(const IColumn & column, size_t row_num, Arena * arena)
{
@ -847,7 +829,7 @@ public:
bool changeFirstTime(const Self & to, Arena * arena)
{
if (!has() && (result_is_nullable || to.has()))
if (!has() && to.has())
{
change(to, arena);
return true;
@ -882,30 +864,15 @@ public:
}
else
{
if constexpr (result_is_nullable)
Field new_value;
column.get(row_num, new_value);
if (new_value < value)
{
Field new_value;
column.get(row_num, new_value);
if (!value.isNull() && (new_value.isNull() || new_value < value))
{
value = new_value;
return true;
}
else
return false;
value = new_value;
return true;
}
else
{
Field new_value;
column.get(row_num, new_value);
if (new_value < value)
{
value = new_value;
return true;
}
else
return false;
}
return false;
}
}
@ -913,30 +880,13 @@ public:
{
if (!to.has())
return false;
if constexpr (result_is_nullable)
if (!has() || to.value < value)
{
if (!has())
{
change(to, arena);
return true;
}
if (to.value.isNull() || (!value.isNull() && to.value < value))
{
value = to.value;
return true;
}
return false;
change(to, arena);
return true;
}
else
{
if (!has() || to.value < value)
{
change(to, arena);
return true;
}
else
return false;
}
return false;
}
bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
@ -948,29 +898,15 @@ public:
}
else
{
if constexpr (result_is_nullable)
Field new_value;
column.get(row_num, new_value);
if (new_value > value)
{
Field new_value;
column.get(row_num, new_value);
if (!value.isNull() && (new_value.isNull() || value < new_value))
{
value = new_value;
return true;
}
return false;
value = new_value;
return true;
}
else
{
Field new_value;
column.get(row_num, new_value);
if (new_value > value)
{
value = new_value;
return true;
}
else
return false;
}
return false;
}
}
@ -978,36 +914,18 @@ public:
{
if (!to.has())
return false;
if constexpr (result_is_nullable)
if (!has() || to.value > value)
{
if (!value.isNull() && (to.value.isNull() || value < to.value))
{
value = to.value;
return true;
}
return false;
change(to, arena);
return true;
}
else
{
if (!has() || to.value > value)
{
change(to, arena);
return true;
}
else
return false;
}
return false;
}
bool isEqualTo(const IColumn & column, size_t row_num) const
{
return has() && value == column[row_num];
}
bool isEqualTo(const IColumn & column, size_t row_num) const { return has() && value == column[row_num]; }
bool isEqualTo(const Self & to) const
{
return has() && to.value == value;
}
bool isEqualTo(const Self & to) const { return has() && to.value == value; }
static bool allocatesMemoryInArena()
{

View File

@ -150,7 +150,7 @@ public:
AggregateFunctionProperties properties;
return std::make_shared<DataTypeAggregateFunction>(
AggregateFunctionFactory::instance().get(
GatherFunctionQuantileData::toFusedNameOrSelf(getName()), this->argument_types, params, properties),
GatherFunctionQuantileData::toFusedNameOrSelf(getName()), NullsAction::EMPTY, this->argument_types, params, properties),
this->argument_types,
params);
}

View File

@ -20,7 +20,7 @@ template <template <typename> class Data>
class AggregateFunctionCombinatorArgMinMax final : public IAggregateFunctionCombinator
{
public:
String getName() const override { return Data<SingleValueDataGeneric<>>::name(); }
String getName() const override { return Data<SingleValueDataGeneric>::name(); }
DataTypes transformArguments(const DataTypes & arguments) const override
{
@ -66,7 +66,7 @@ public:
if (which.idx == TypeIndex::String)
return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataString>>>(nested_function, arguments, params);
return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataGeneric<>>>>(nested_function, arguments, params);
return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataGeneric>>>(nested_function, arguments, params);
}
};

View File

@ -33,6 +33,8 @@ class AggregateFunctionIf final : public IAggregateFunctionHelper<AggregateFunct
private:
AggregateFunctionPtr nested_func;
size_t num_arguments;
/// We accept Nullable(Nothing) as condition, but callees always expect UInt8 so we need to avoid calling them
bool only_null_condition = false;
public:
AggregateFunctionIf(AggregateFunctionPtr nested, const DataTypes & types, const Array & params_)
@ -42,7 +44,9 @@ public:
if (num_arguments == 0)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require at least one argument", getName());
if (!isUInt8(types.back()) && !types.back()->onlyNull())
only_null_condition = types.back()->onlyNull();
if (!isUInt8(types.back()) && !only_null_condition)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Last argument for aggregate function {} must be UInt8", getName());
}
@ -108,6 +112,8 @@ public:
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
if (only_null_condition)
return;
if (assert_cast<const ColumnUInt8 &>(*columns[num_arguments - 1]).getData()[row_num])
nested_func->add(place, columns, row_num, arena);
}
@ -121,6 +127,8 @@ public:
Arena * arena,
ssize_t) const override
{
if (only_null_condition)
return;
nested_func->addBatch(row_begin, row_end, places, place_offset, columns, arena, num_arguments - 1);
}
@ -132,6 +140,8 @@ public:
Arena * arena,
ssize_t) const override
{
if (only_null_condition)
return;
nested_func->addBatchSinglePlace(row_begin, row_end, place, columns, arena, num_arguments - 1);
}
@ -144,6 +154,8 @@ public:
Arena * arena,
ssize_t) const override
{
if (only_null_condition)
return;
nested_func->addBatchSinglePlaceNotNull(row_begin, row_end, place, columns, null_map, arena, num_arguments - 1);
}

View File

@ -447,7 +447,8 @@ public:
{
AggregateFunctionProperties out_properties;
auto & aggr_func_factory = AggregateFunctionFactory::instance();
return aggr_func_factory.get(nested_func_name + "MappedArrays", arguments, params, out_properties);
auto action = NullsAction::EMPTY;
return aggr_func_factory.get(nested_func_name + "MappedArrays", action, arguments, params, out_properties);
}
else
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregation '{}Map' is not implemented for mapped arrays",

View File

@ -35,8 +35,8 @@ public:
auto storage_type_out = DataTypeFactory::instance().get(nested_->getResultType()->getName());
// Need to make a new function with promoted argument types because SimpleAggregates requires arg_type = return_type.
AggregateFunctionProperties properties;
auto function
= AggregateFunctionFactory::instance().get(nested_->getName(), {storage_type_out}, nested_->getParameters(), properties);
auto function = AggregateFunctionFactory::instance().get(
nested_->getName(), NullsAction::EMPTY, {storage_type_out}, nested_->getParameters(), properties);
// Need to make a clone because it'll be customized.
auto storage_type_arg = DataTypeFactory::instance().get(nested_->getResultType()->getName());

View File

@ -14,8 +14,9 @@ namespace DB
struct Settings;
/// min, max, any, anyLast, anyHeavy, etc...
template <template <typename> class AggregateFunctionTemplate, template <typename> class Data>
static IAggregateFunction * createAggregateFunctionSingleValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
template <template <typename> class AggregateFunctionTemplate, template <typename, bool...> class Data>
static IAggregateFunction *
createAggregateFunctionSingleValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
assertUnary(name, argument_types);
@ -44,31 +45,9 @@ static IAggregateFunction * createAggregateFunctionSingleValue(const String & na
if (which.idx == TypeIndex::String)
return new AggregateFunctionTemplate<Data<SingleValueDataString>>(argument_type);
return new AggregateFunctionTemplate<Data<SingleValueDataGeneric<>>>(argument_type);
return new AggregateFunctionTemplate<Data<SingleValueDataGeneric>>(argument_type);
}
template <template <typename> class AggregateFunctionTemplate, template <typename> class Data, bool RespectNulls = false>
static IAggregateFunction * createAggregateFunctionSingleNullableValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
{
assertNoParameters(name, parameters);
assertUnary(name, argument_types);
const DataTypePtr & argument_type = argument_types[0];
WhichDataType which(argument_type);
// If the result value could be null (excluding the case that no row is matched),
// use SingleValueDataGeneric.
if constexpr (!RespectNulls)
{
return createAggregateFunctionSingleValue<AggregateFunctionTemplate, Data>(name, argument_types, Array(), settings);
}
else
{
return new AggregateFunctionTemplate<Data<SingleValueDataGeneric<true>>>(argument_type);
}
UNREACHABLE();
}
/// argMin, argMax
template <template <typename> class MinMaxData, typename ResData>
static IAggregateFunction * createAggregateFunctionArgMinMaxSecond(const DataTypePtr & res_type, const DataTypePtr & val_type)
@ -98,7 +77,7 @@ static IAggregateFunction * createAggregateFunctionArgMinMaxSecond(const DataTyp
if (which.idx == TypeIndex::String)
return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataString>>>(res_type, val_type);
return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataGeneric<>>>>(res_type, val_type);
return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataGeneric>>>(res_type, val_type);
}
template <template <typename> class MinMaxData>
@ -134,7 +113,7 @@ static IAggregateFunction * createAggregateFunctionArgMinMax(const String & name
if (which.idx == TypeIndex::String)
return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataString>(res_type, val_type);
return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataGeneric<>>(res_type, val_type);
return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataGeneric>(res_type, val_type);
}
}

View File

@ -113,6 +113,11 @@ void FunctionNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state
buffer << ", function_type: " << function_type;
if (nulls_action == NullsAction::RESPECT_NULLS)
buffer << ", nulls_action : RESPECT_NULLS";
else if (nulls_action == NullsAction::IGNORE_NULLS)
buffer << ", nulls_action : IGNORE_NULLS";
if (function)
buffer << ", result_type: " + getResultType()->getName();
@ -140,10 +145,9 @@ void FunctionNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state
bool FunctionNode::isEqualImpl(const IQueryTreeNode & rhs) const
{
const auto & rhs_typed = assert_cast<const FunctionNode &>(rhs);
if (function_name != rhs_typed.function_name ||
isAggregateFunction() != rhs_typed.isAggregateFunction() ||
isOrdinaryFunction() != rhs_typed.isOrdinaryFunction() ||
isWindowFunction() != rhs_typed.isWindowFunction())
if (function_name != rhs_typed.function_name || isAggregateFunction() != rhs_typed.isAggregateFunction()
|| isOrdinaryFunction() != rhs_typed.isOrdinaryFunction() || isWindowFunction() != rhs_typed.isWindowFunction()
|| nulls_action != rhs_typed.nulls_action)
return false;
if (isResolved() != rhs_typed.isResolved())
@ -171,6 +175,7 @@ void FunctionNode::updateTreeHashImpl(HashState & hash_state) const
hash_state.update(isOrdinaryFunction());
hash_state.update(isAggregateFunction());
hash_state.update(isWindowFunction());
hash_state.update(nulls_action);
if (!isResolved())
return;
@ -192,6 +197,7 @@ QueryTreeNodePtr FunctionNode::cloneImpl() const
*/
result_function->function = function;
result_function->kind = kind;
result_function->nulls_action = nulls_action;
result_function->wrap_with_nullable = wrap_with_nullable;
return result_function;
@ -202,6 +208,7 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const
auto function_ast = std::make_shared<ASTFunction>();
function_ast->name = function_name;
function_ast->nulls_action = nulls_action;
if (function_name == "nothing")
{

View File

@ -5,11 +5,12 @@
#include <Analyzer/ConstantValue.h>
#include <Analyzer/IQueryTreeNode.h>
#include <Analyzer/ListNode.h>
#include <Common/typeid_cast.h>
#include <Core/ColumnsWithTypeAndName.h>
#include <Core/IResolvedFunction.h>
#include <DataTypes/DataTypeNullable.h>
#include <Functions/IFunction.h>
#include <Parsers/NullsAction.h>
#include <Common/typeid_cast.h>
namespace DB
{
@ -63,6 +64,10 @@ public:
/// Get function name
const String & getFunctionName() const { return function_name; }
/// Get NullAction modifier
NullsAction getNullsAction() const { return nulls_action; }
void setNullsAction(NullsAction action) { nulls_action = action; }
/// Get parameters
const ListNode & getParameters() const { return children[parameters_child_index]->as<const ListNode &>(); }
@ -214,6 +219,7 @@ protected:
private:
String function_name;
FunctionKind kind = FunctionKind::UNKNOWN;
NullsAction nulls_action = NullsAction::EMPTY;
IResolvedFunctionPtr function;
bool wrap_with_nullable = false;

View File

@ -184,10 +184,9 @@ private:
auto function_aggregate_function = function_node.getAggregateFunction();
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name,
{ argument->getResultType() },
function_aggregate_function->getParameters(),
properties);
auto action = NullsAction::EMPTY;
auto aggregate_function = AggregateFunctionFactory::instance().get(
aggregate_function_name, action, {argument->getResultType()}, function_aggregate_function->getParameters(), properties);
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
}

View File

@ -76,7 +76,8 @@ public:
/// Replace `countDistinct` of initial query into `count`
auto result_type = function_node->getResultType();
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
auto action = NullsAction::EMPTY;
auto aggregate_function = AggregateFunctionFactory::instance().get("count", action, {}, {}, properties);
function_node->resolveAsAggregateFunction(std::move(aggregate_function));
function_node->getArguments().getNodes().clear();
}

View File

@ -78,9 +78,11 @@ QueryTreeNodePtr createResolvedFunction(const ContextPtr & context, const String
return function_node;
}
FunctionNodePtr createResolvedAggregateFunction(const String & name, const QueryTreeNodePtr & argument, const Array & parameters = {})
FunctionNodePtr createResolvedAggregateFunction(
const String & name, const QueryTreeNodePtr & argument, const Array & parameters = {}, NullsAction action = NullsAction::EMPTY)
{
auto function_node = std::make_shared<FunctionNode>(name);
function_node->setNullsAction(action);
if (!parameters.empty())
{
@ -92,11 +94,7 @@ FunctionNodePtr createResolvedAggregateFunction(const String & name, const Query
function_node->getArguments().getNodes() = { argument };
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(
name,
{ argument->getResultType() },
parameters,
properties);
auto aggregate_function = AggregateFunctionFactory::instance().get(name, action, {argument->getResultType()}, parameters, properties);
function_node->resolveAsAggregateFunction(std::move(aggregate_function));
return function_node;

View File

@ -56,7 +56,7 @@ private:
static inline void resolveAsCountAggregateFunction(FunctionNode & function_node)
{
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
auto aggregate_function = AggregateFunctionFactory::instance().get("count", NullsAction::EMPTY, {}, {}, properties);
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
}

View File

@ -118,6 +118,7 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
extern const int SYNTAX_ERROR;
}
/** Query analyzer implementation overview. Please check documentation in QueryAnalysisPass.h first.
@ -1208,7 +1209,8 @@ private:
static void expandGroupByAll(QueryNode & query_tree_node_typed);
static std::string rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, const ContextPtr & context);
static std::string
rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context);
static std::optional<JoinTableSide> getColumnSideFromJoinTree(const QueryTreeNodePtr & resolved_identifier, const JoinNode & join_node)
{
@ -2310,7 +2312,8 @@ void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed)
recursivelyCollectMaxOrdinaryExpressions(node, group_by_nodes);
}
std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, const ContextPtr & context)
std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(
const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context)
{
std::string result_aggregate_function_name = aggregate_function_name;
auto aggregate_function_name_lowercase = Poco::toLower(aggregate_function_name);
@ -2337,7 +2340,7 @@ std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::strin
bool need_add_or_null = settings.aggregate_functions_null_for_empty && !result_aggregate_function_name.ends_with("OrNull");
if (need_add_or_null)
{
auto properties = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name);
auto properties = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name, action);
if (!properties->returns_default_when_only_null)
result_aggregate_function_name += "OrNull";
}
@ -2349,7 +2352,7 @@ std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::strin
*/
if (result_aggregate_function_name.ends_with("OrNull"))
{
auto function_properies = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name);
auto function_properies = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name, action);
if (function_properies && !function_properies->returns_default_when_only_null)
{
size_t function_name_size = result_aggregate_function_name.size();
@ -4591,6 +4594,19 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod
return result_projection_names;
}
namespace
{
void checkFunctionNodeHasEmptyNullsAction(FunctionNode const & node)
{
if (node.getNullsAction() != NullsAction::EMPTY)
throw Exception(
ErrorCodes::SYNTAX_ERROR,
"Function with name '{}' cannot use {} NULLS",
node.getFunctionName(),
node.getNullsAction() == NullsAction::IGNORE_NULLS ? "IGNORE" : "RESPECT");
}
}
/** Resolve function node in scope.
* During function node resolve, function node can be replaced with another expression (if it match lambda or sql user defined function),
* with constant (if it allow constant folding), or with expression list. It is caller responsibility to handle such cases appropriately.
@ -4749,6 +4765,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
if (is_special_function_exists)
{
checkFunctionNodeHasEmptyNullsAction(*function_node_ptr);
/// Rewrite EXISTS (subquery) into 1 IN (SELECT 1 FROM (subquery) LIMIT 1).
auto & exists_subquery_argument = function_node_ptr->getArguments().getNodes().at(0);
@ -4769,6 +4786,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
if (is_special_function_if && !function_node_ptr->getArguments().getNodes().empty())
{
checkFunctionNodeHasEmptyNullsAction(*function_node_ptr);
/** Handle special case with constant If function, even if some of the arguments are invalid.
*
* SELECT if(hasColumnInTable('system', 'numbers', 'not_existing_column'), not_existing_column, 5) FROM system.numbers;
@ -4834,6 +4852,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
/// Replace right IN function argument if it is table or table function with subquery that read ordinary columns
if (is_special_function_in)
{
checkFunctionNodeHasEmptyNullsAction(function_node);
if (scope.context->getSettingsRef().transform_null_in)
{
static constexpr std::array<std::pair<std::string_view, std::string_view>, 4> in_function_to_replace_null_in_function_map =
@ -5012,6 +5031,8 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
lambda_expression_untyped->formatASTForErrorMessage(),
scope.scope_node->formatASTForErrorMessage());
checkFunctionNodeHasEmptyNullsAction(function_node);
if (!parameters.empty())
{
throw Exception(
@ -5041,6 +5062,8 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
"Function 'untuple' must have 1 argument. In scope {}",
scope.scope_node->formatASTForErrorMessage());
checkFunctionNodeHasEmptyNullsAction(function_node);
const auto & untuple_argument = function_arguments[0];
auto result_type = untuple_argument->getResultType();
const auto * tuple_data_type = typeid_cast<const DataTypeTuple *>(result_type.get());
@ -5091,6 +5114,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION,
"Function GROUPING can have up to 64 arguments, but {} provided",
function_arguments_size);
checkFunctionNodeHasEmptyNullsAction(function_node);
bool force_grouping_standard_compatibility = scope.context->getSettingsRef().force_grouping_standard_compatibility;
auto grouping_function = std::make_shared<FunctionGrouping>(force_grouping_standard_compatibility);
@ -5115,10 +5139,12 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
"Window function '{}' does not support lambda arguments",
function_name);
std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, scope.context);
auto action = function_node_ptr->getNullsAction();
std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, action, scope.context);
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name, argument_types, parameters, properties);
auto aggregate_function
= AggregateFunctionFactory::instance().get(aggregate_function_name, action, argument_types, parameters, properties);
function_node.resolveAsWindowFunction(std::move(aggregate_function));
@ -5142,7 +5168,11 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
is_executable_udf = false;
}
if (!function)
if (function)
{
checkFunctionNodeHasEmptyNullsAction(function_node);
}
else
{
if (!AggregateFunctionFactory::instance().isAggregateFunctionName(function_name))
{
@ -5181,10 +5211,12 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
"Aggregate function '{}' does not support lambda arguments",
function_name);
std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, scope.context);
auto action = function_node_ptr->getNullsAction();
std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, action, scope.context);
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name, argument_types, parameters, properties);
auto aggregate_function
= AggregateFunctionFactory::instance().get(aggregate_function_name, action, argument_types, parameters, properties);
function_node.resolveAsAggregateFunction(std::move(aggregate_function));

View File

@ -97,6 +97,7 @@ private:
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(
function_node.getFunctionName() + suffix,
function_node.getNullsAction(),
argument_types,
function_node.getAggregateFunction()->getParameters(),
properties);

View File

@ -157,10 +157,8 @@ private:
static inline void resolveAsCountIfAggregateFunction(FunctionNode & function_node, const DataTypePtr & argument_type)
{
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get("countIf",
{argument_type},
function_node.getAggregateFunction()->getParameters(),
properties);
auto aggregate_function = AggregateFunctionFactory::instance().get(
"countIf", NullsAction::EMPTY, {argument_type}, function_node.getAggregateFunction()->getParameters(), properties);
function_node.resolveAsAggregateFunction(std::move(aggregate_function));
}

View File

@ -76,7 +76,9 @@ public:
argument_types.emplace_back(function_node_argument->getResultType());
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get(function_node->getFunctionName(),
auto aggregate_function = AggregateFunctionFactory::instance().get(
function_node->getFunctionName(),
NullsAction::EMPTY,
argument_types,
function_node->getAggregateFunction()->getParameters(),
properties);

View File

@ -176,7 +176,7 @@ public:
if (match_subquery_with_distinct() || match_subquery_with_group_by())
{
AggregateFunctionProperties properties;
auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
auto aggregate_function = AggregateFunctionFactory::instance().get("count", NullsAction::EMPTY, {}, {}, properties);
function_node->getArguments().getNodes().clear();
function_node->resolveAsAggregateFunction(std::move(aggregate_function));

View File

@ -607,6 +607,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
else
{
auto function_node = std::make_shared<FunctionNode>(function->name);
function_node->setNullsAction(function->nulls_action);
if (function->parameters)
{

View File

@ -544,11 +544,8 @@ inline AggregateFunctionPtr resolveAggregateFunction(FunctionNode * function_nod
argument_types.emplace_back(function_node_argument->getResultType());
AggregateFunctionProperties properties;
return AggregateFunctionFactory::instance().get(
function_node->getFunctionName(),
argument_types,
parameters,
properties);
auto action = NullsAction::EMPTY;
return AggregateFunctionFactory::instance().get(function_node->getFunctionName(), action, argument_types, parameters, properties);
}
}

View File

@ -451,17 +451,25 @@ void BackupEntriesCollector::gatherDatabaseMetadata(
}
catch (...)
{
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Couldn't get a create query for database {}", database_name);
/// Probably the database has been just removed.
if (throw_if_database_not_found)
throw;
LOG_WARNING(log, "Couldn't get a create query for database {}", backQuoteIfNeed(database_name));
return;
}
auto * create = create_database_query->as<ASTCreateQuery>();
if (create->getDatabase() != database_name)
{
/// Probably the database has been just renamed. Use the older name for backup to keep the backup consistent.
LOG_WARNING(log, "Got a create query with unexpected name {} for database {}",
backQuoteIfNeed(create->getDatabase()), backQuoteIfNeed(database_name));
create_database_query = create_database_query->clone();
create = create_database_query->as<ASTCreateQuery>();
create->setDatabase(database_name);
}
database_info.create_database_query = create_database_query;
const auto & create = create_database_query->as<const ASTCreateQuery &>();
if (create.getDatabase() != database_name)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
"Got a create query with unexpected name {} for database {}",
backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(database_name));
String new_database_name = renaming_map.getNewDatabaseName(database_name);
database_info.metadata_path_in_backup = root_path_in_backup / "metadata" / (escapeForFileName(new_database_name) + ".sql");
}
@ -582,26 +590,34 @@ std::vector<std::pair<ASTPtr, StoragePtr>> BackupEntriesCollector::findTablesInD
}
std::unordered_set<String> found_table_names;
for (const auto & db_table : db_tables)
for (auto & db_table : db_tables)
{
const auto & create_table_query = db_table.first;
const auto & create = create_table_query->as<const ASTCreateQuery &>();
found_table_names.emplace(create.getTable());
auto create_table_query = db_table.first;
auto * create = create_table_query->as<ASTCreateQuery>();
found_table_names.emplace(create->getTable());
if (database_name == DatabaseCatalog::TEMPORARY_DATABASE)
{
if (!create.temporary)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
if (!create->temporary)
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Got a non-temporary create query for {}",
tableNameWithTypeToString(database_name, create.getTable(), false));
tableNameWithTypeToString(database_name, create->getTable(), false));
}
}
else
{
if (create.getDatabase() != database_name)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
"Got a create query with unexpected database name {} for {}",
backQuoteIfNeed(create.getDatabase()),
tableNameWithTypeToString(database_name, create.getTable(), false));
if (create->getDatabase() != database_name)
{
/// Probably the table has been just renamed. Use the older name for backup to keep the backup consistent.
LOG_WARNING(log, "Got a create query with unexpected database name {} for {}",
backQuoteIfNeed(create->getDatabase()),
tableNameWithTypeToString(database_name, create->getTable(), false));
create_table_query = create_table_query->clone();
create = create_table_query->as<ASTCreateQuery>();
create->setDatabase(database_name);
db_table.first = create_table_query;
}
}
}

View File

@ -48,20 +48,22 @@ namespace
}
const auto & request_settings = settings.request_settings;
const Settings & global_settings = context->getGlobalContext()->getSettingsRef();
const Settings & local_settings = context->getSettingsRef();
S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
settings.auth_settings.region,
context->getRemoteHostFilter(),
static_cast<unsigned>(context->getGlobalContext()->getSettingsRef().s3_max_redirects),
static_cast<unsigned>(context->getGlobalContext()->getSettingsRef().s3_retry_attempts),
context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging,
static_cast<unsigned>(global_settings.s3_max_redirects),
static_cast<unsigned>(global_settings.s3_retry_attempts),
global_settings.enable_s3_requests_logging,
/* for_disk_s3 = */ false,
request_settings.get_request_throttler,
request_settings.put_request_throttler,
s3_uri.uri.getScheme());
client_configuration.endpointOverride = s3_uri.endpoint;
client_configuration.maxConnections = static_cast<unsigned>(context->getSettingsRef().s3_max_connections);
client_configuration.maxConnections = static_cast<unsigned>(global_settings.s3_max_connections);
/// Increase connect timeout
client_configuration.connectTimeoutMs = 10 * 1000;
/// Requests in backups can be extremely long, set to one hour
@ -71,6 +73,7 @@ namespace
return S3::ClientFactory::instance().create(
client_configuration,
s3_uri.is_virtual_hosted_style,
local_settings.s3_disable_checksum,
credentials.GetAWSAccessKeyId(),
credentials.GetAWSSecretKey(),
settings.auth_settings.server_side_encryption_customer_key_base64,

View File

@ -46,6 +46,7 @@
#include <Common/assert_cast.h>
#include <Common/typeid_cast.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
namespace DB
{
@ -384,6 +385,39 @@ void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
// the generic recursion into IAST.children.
}
void QueryFuzzer::fuzzNullsAction(NullsAction & action)
{
/// If it's not using actions, then it's a high change it doesn't support it to begin with
if ((action == NullsAction::EMPTY) && (fuzz_rand() % 100 == 0))
{
if (fuzz_rand() % 2 == 0)
action = NullsAction::RESPECT_NULLS;
else
action = NullsAction::IGNORE_NULLS;
}
else if (fuzz_rand() % 20 == 0)
{
switch (fuzz_rand() % 3)
{
case 0:
{
action = NullsAction::EMPTY;
break;
}
case 1:
{
action = NullsAction::RESPECT_NULLS;
break;
}
default:
{
action = NullsAction::IGNORE_NULLS;
break;
}
}
}
}
void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
{
switch (fuzz_rand() % 40)
@ -966,6 +1000,9 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
fuzzColumnLikeExpressionList(fn->arguments.get());
fuzzColumnLikeExpressionList(fn->parameters.get());
if (AggregateUtils::isAggregateFunction(*fn))
fuzzNullsAction(fn->nulls_action);
if (fn->is_window_function && fn->window_definition)
{
auto & def = fn->window_definition->as<ASTWindowDefinition &>();

View File

@ -10,6 +10,7 @@
#include <Core/Field.h>
#include <Parsers/ASTExplainQuery.h>
#include <Parsers/IAST.h>
#include <Parsers/NullsAction.h>
#include <Common/randomSeed.h>
#include "Parsers/IAST_fwd.h"
@ -86,6 +87,7 @@ struct QueryFuzzer
void fuzzOrderByElement(ASTOrderByElement * elem);
void fuzzOrderByList(IAST * ast);
void fuzzColumnLikeExpressionList(IAST * ast);
void fuzzNullsAction(NullsAction & action);
void fuzzWindowFrame(ASTWindowDefinition & def);
void fuzzCreateQuery(ASTCreateQuery & create);
void fuzzExplainQuery(ASTExplainQuery & explain);

View File

@ -32,21 +32,23 @@ namespace ErrorCodes
Suggest::Suggest()
{
/// Keywords may be not up to date with ClickHouse parser.
addWords({
"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON", "CLUSTER", "DEFAULT",
"MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE", "SETTINGS", "ATTACH", "DETACH", "DROP",
"RENAME", "TO", "ALTER", "ADD", "MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT",
"PRIMARY", "KEY", "CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
"OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN", "THEN", "ELSE",
"END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE", "FINAL", "DEDUPLICATE", "INSERT", "VALUES",
"SELECT", "DISTINCT", "SAMPLE", "ARRAY", "JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER",
"LEFT", "RIGHT", "FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
"WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND", "OR", "ASC",
"IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST", "BETWEEN", "TRUNCATE", "USER", "ROLE",
"PROFILE", "QUOTA", "POLICY", "ROW", "GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE",
"IDENTIFIED", "HOST", "NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED",
"INTERVAL", "LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE", "CLEANUP", "APPEND"
});
addWords({"CREATE", "DATABASE", "IF", "NOT", "EXISTS", "TEMPORARY", "TABLE", "ON",
"CLUSTER", "DEFAULT", "MATERIALIZED", "ALIAS", "ENGINE", "AS", "VIEW", "POPULATE",
"SETTINGS", "ATTACH", "DETACH", "DROP", "RENAME", "TO", "ALTER", "ADD",
"MODIFY", "CLEAR", "COLUMN", "AFTER", "COPY", "PROJECT", "PRIMARY", "KEY",
"CHECK", "PARTITION", "PART", "FREEZE", "FETCH", "FROM", "SHOW", "INTO",
"OUTFILE", "FORMAT", "TABLES", "DATABASES", "LIKE", "PROCESSLIST", "CASE", "WHEN",
"THEN", "ELSE", "END", "DESCRIBE", "DESC", "USE", "SET", "OPTIMIZE",
"FINAL", "DEDUPLICATE", "INSERT", "VALUES", "SELECT", "DISTINCT", "SAMPLE", "ARRAY",
"JOIN", "GLOBAL", "LOCAL", "ANY", "ALL", "INNER", "LEFT", "RIGHT",
"FULL", "OUTER", "CROSS", "USING", "PREWHERE", "WHERE", "GROUP", "BY",
"WITH", "TOTALS", "HAVING", "ORDER", "COLLATE", "LIMIT", "UNION", "AND",
"OR", "ASC", "IN", "KILL", "QUERY", "SYNC", "ASYNC", "TEST",
"BETWEEN", "TRUNCATE", "USER", "ROLE", "PROFILE", "QUOTA", "POLICY", "ROW",
"GRANT", "REVOKE", "OPTION", "ADMIN", "EXCEPT", "REPLACE", "IDENTIFIED", "HOST",
"NAME", "READONLY", "WRITABLE", "PERMISSIVE", "FOR", "RESTRICTIVE", "RANDOMIZED", "INTERVAL",
"LIMITS", "ONLY", "TRACKING", "IP", "REGEXP", "ILIKE", "CLEANUP", "APPEND",
"IGNORE NULLS", "RESPECT NULLS", "OVER"});
}
static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion)

View File

@ -3,11 +3,11 @@
#include <cstring>
#include <memory>
#include <vector>
#include <boost/noncopyable.hpp>
#include <Core/Defines.h>
#include <Common/memcpySmall.h>
#include <Common/ProfileEvents.h>
#include <boost/noncopyable.hpp>
#include <Common/Allocator.h>
#include <Common/ProfileEvents.h>
#include <Common/memcpySmall.h>
#if __has_include(<sanitizer/asan_interface.h>) && defined(ADDRESS_SANITIZER)
# include <sanitizer/asan_interface.h>
@ -180,7 +180,7 @@ public:
char * alloc(size_t size)
{
used_bytes += size;
if (unlikely(head.empty() || static_cast<std::ptrdiff_t>(size) > head.end - head.pos))
if (unlikely(head.empty() || size > head.remaining()))
addMemoryChunk(size);
char * res = head.pos;
@ -193,6 +193,9 @@ public:
char * alignedAlloc(size_t size, size_t alignment)
{
used_bytes += size;
if (unlikely(head.empty() || size > head.remaining()))
addMemoryChunk(size + alignment);
do
{
void * head_pos = head.pos;

View File

@ -1,12 +1,24 @@
#include <Common/AsyncLoader.h>
#include <limits>
#include <optional>
#include <base/defines.h>
#include <base/scope_guard.h>
#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <Common/noexcept_scope.h>
#include <Common/setThreadName.h>
#include <Common/logger_useful.h>
#include <Common/ThreadPool.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <Common/ProfileEvents.h>
#include <Common/Stopwatch.h>
namespace ProfileEvents
{
extern const Event AsyncLoaderWaitMicroseconds;
}
namespace DB
{
@ -16,6 +28,7 @@ namespace ErrorCodes
extern const int ASYNC_LOAD_CYCLE;
extern const int ASYNC_LOAD_FAILED;
extern const int ASYNC_LOAD_CANCELED;
extern const int LOGICAL_ERROR;
}
static constexpr size_t PRINT_MESSAGE_EACH_N_OBJECTS = 256;
@ -52,63 +65,48 @@ size_t LoadJob::pool() const
return pool_id;
}
void LoadJob::wait() const
{
std::unique_lock lock{mutex};
waiters++;
finished.wait(lock, [this] { return load_status != LoadStatus::PENDING; });
waiters--;
if (load_exception)
std::rethrow_exception(load_exception);
}
void LoadJob::waitNoThrow() const noexcept
{
std::unique_lock lock{mutex};
waiters++;
finished.wait(lock, [this] { return load_status != LoadStatus::PENDING; });
waiters--;
}
size_t LoadJob::waitersCount() const
{
std::unique_lock lock{mutex};
return waiters;
}
void LoadJob::ok()
size_t LoadJob::ok()
{
std::unique_lock lock{mutex};
load_status = LoadStatus::OK;
finish();
return finish();
}
void LoadJob::failed(const std::exception_ptr & ptr)
size_t LoadJob::failed(const std::exception_ptr & ptr)
{
std::unique_lock lock{mutex};
load_status = LoadStatus::FAILED;
load_exception = ptr;
finish();
return finish();
}
void LoadJob::canceled(const std::exception_ptr & ptr)
size_t LoadJob::canceled(const std::exception_ptr & ptr)
{
std::unique_lock lock{mutex};
load_status = LoadStatus::CANCELED;
load_exception = ptr;
finish();
return finish();
}
void LoadJob::finish()
size_t LoadJob::finish()
{
func = {}; // To ensure job function is destructed before `AsyncLoader::wait()` and `LoadJob::wait()` return
func = {}; // To ensure job function is destructed before `AsyncLoader::wait()` return
finish_time = std::chrono::system_clock::now();
if (waiters > 0)
finished.notify_all();
return std::exchange(suspended_waiters, 0);
}
void LoadJob::scheduled()
void LoadJob::scheduled(UInt64 job_id_)
{
chassert(job_id == 0); // Job cannot be scheduled twice
job_id = job_id_;
schedule_time = std::chrono::system_clock::now();
}
@ -118,11 +116,11 @@ void LoadJob::enqueued()
enqueue_time = std::chrono::system_clock::now();
}
void LoadJob::execute(size_t pool, const LoadJobPtr & self)
void LoadJob::execute(AsyncLoader & loader, size_t pool, const LoadJobPtr & self)
{
execution_pool_id = pool;
start_time = std::chrono::system_clock::now();
func(self);
func(loader, self);
}
@ -180,11 +178,11 @@ AsyncLoader::AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool lo
init.metric_threads,
init.metric_active_threads,
init.metric_scheduled_threads,
init.max_threads,
/* max_free_threads = */ 0,
init.max_threads),
/* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
/* max_free_threads = */ 0, // We do not require free threads
/* queue_size = */0), // Unlimited queue to avoid blocking during worker spawning
.ready_queue = {},
.max_threads = init.max_threads
.max_threads = init.max_threads > 0 ? init.max_threads : getNumberOfPhysicalCPUCores()
});
}
@ -228,16 +226,16 @@ void AsyncLoader::stop()
void AsyncLoader::schedule(LoadTask & task)
{
chassert(this == &task.loader);
scheduleImpl(task.jobs);
schedule(task.jobs);
}
void AsyncLoader::schedule(const LoadTaskPtr & task)
{
chassert(this == &task->loader);
scheduleImpl(task->jobs);
schedule(task->jobs);
}
void AsyncLoader::schedule(const std::vector<LoadTaskPtr> & tasks)
void AsyncLoader::schedule(const LoadTaskPtrs & tasks)
{
LoadJobSet all_jobs;
for (const auto & task : tasks)
@ -245,10 +243,10 @@ void AsyncLoader::schedule(const std::vector<LoadTaskPtr> & tasks)
chassert(this == &task->loader);
all_jobs.insert(task->jobs.begin(), task->jobs.end());
}
scheduleImpl(all_jobs);
schedule(all_jobs);
}
void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
void AsyncLoader::schedule(const LoadJobSet & jobs_to_schedule)
{
std::unique_lock lock{mutex};
@ -264,7 +262,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
// 1) exclude already scheduled or finished jobs
// 2) include assigned job dependencies (that are not yet scheduled)
LoadJobSet jobs;
for (const auto & job : input_jobs)
for (const auto & job : jobs_to_schedule)
gatherNotScheduled(job, jobs, lock);
// Ensure scheduled_jobs graph will have no cycles. The only way to get a cycle is to add a cycle, assuming old jobs cannot reference new ones.
@ -280,7 +278,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
scheduled_jobs.try_emplace(job);
job->scheduled();
job->scheduled(++last_job_id);
});
}
@ -365,11 +363,20 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool)
if (!job)
return;
chassert(new_pool < pools.size());
DENY_ALLOCATIONS_IN_SCOPE;
std::unique_lock lock{mutex};
prioritize(job, new_pool, lock);
}
void AsyncLoader::wait(const LoadJobPtr & job, bool no_throw)
{
std::unique_lock job_lock{job->mutex};
wait(job_lock, job);
if (!no_throw && job->load_exception)
std::rethrow_exception(job->load_exception);
}
void AsyncLoader::remove(const LoadJobSet & jobs)
{
DENY_ALLOCATIONS_IN_SCOPE;
@ -397,9 +404,10 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
// Job is currently executing
ALLOW_ALLOCATIONS_IN_SCOPE;
chassert(info->second.isExecuting());
lock.unlock();
job->waitNoThrow(); // Wait for job to finish
wait(job, /* no_throw = */ true); // Wait for job to finish
lock.lock();
}
}
@ -415,10 +423,12 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
void AsyncLoader::setMaxThreads(size_t pool, size_t value)
{
if (value == 0)
value = getNumberOfPhysicalCPUCores();
std::unique_lock lock{mutex};
auto & p = pools[pool];
p.thread_pool->setMaxThreads(value);
p.thread_pool->setQueueSize(value); // Keep queue size equal max threads count to avoid blocking during spawning
// Note that underlying `ThreadPool` always has unlimited `queue_size` and `max_threads`.
// Worker management is done by `AsyncLoader` based on `Pool::max_threads + Pool::suspended_workers` instead.
p.max_threads = value;
if (!is_running)
return;
@ -442,7 +452,6 @@ Priority AsyncLoader::getPoolPriority(size_t pool) const
return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
}
size_t AsyncLoader::getScheduledJobCount() const
{
std::unique_lock lock{mutex};
@ -479,11 +488,11 @@ void AsyncLoader::checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mute
while (!left.empty())
{
LoadJobPtr job = *left.begin();
checkCycleImpl(job, left, visited, lock);
checkCycle(job, left, visited, lock);
}
}
String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock)
String AsyncLoader::checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock)
{
if (!left.contains(job))
return {}; // Do not consider external dependencies and already processed jobs
@ -494,7 +503,7 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
}
for (const auto & dep : job->dependencies)
{
if (auto chain = checkCycleImpl(dep, left, visited, lock); !chain.empty())
if (auto chain = checkCycle(dep, left, visited, lock); !chain.empty())
{
if (!visited.contains(job)) // Check for cycle end
throw Exception(ErrorCodes::ASYNC_LOAD_CYCLE, "Load job dependency cycle detected: {} -> {}", job->name, chain);
@ -509,10 +518,11 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock)
{
chassert(scheduled_jobs.contains(job)); // Job was pending
size_t resumed_workers = 0; // Number of workers resumed in the execution pool of the job
if (status == LoadStatus::OK)
{
// Notify waiters
job->ok();
resumed_workers += job->ok();
// Update dependent jobs and enqueue if ready
for (const auto & dep : scheduled_jobs[job].dependent_jobs)
@ -528,9 +538,9 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
{
// Notify waiters
if (status == LoadStatus::FAILED)
job->failed(exception_from_job);
resumed_workers += job->failed(exception_from_job);
else if (status == LoadStatus::CANCELED)
job->canceled(exception_from_job);
resumed_workers += job->canceled(exception_from_job);
Info & info = scheduled_jobs[job];
if (info.isReady())
@ -572,35 +582,40 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
if (log_progress)
logAboutProgress(log, finished_jobs.size() - old_jobs, finished_jobs.size() + scheduled_jobs.size() - old_jobs, stopwatch);
});
if (resumed_workers)
{
Pool & pool = pools[job->executionPool()];
pool.suspended_workers -= resumed_workers;
}
}
void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock)
{
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Note that there is no point in prioritizing finished jobs, but because we do not lock `job.mutex` here (due to recursion),
// Races are inevitable, so we prioritize all job unconditionally: both finished and pending.
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Update priority and push job forward through ready queue if needed
UInt64 ready_seqno = info->second.ready_seqno;
// Requeue job into the new pool queue without allocations
if (ready_seqno)
if (UInt64 ready_seqno = info->second.ready_seqno)
{
new_pool.ready_queue.insert(old_pool.ready_queue.extract(ready_seqno));
if (canSpawnWorker(new_pool, lock))
spawn(new_pool, lock);
}
// Set user-facing pool (may affect executing jobs)
job->pool_id.store(new_pool_id);
// Recurse into dependencies
for (const auto & dep : job->dependencies)
prioritize(dep, new_pool_id, lock);
}
job->pool_id.store(new_pool_id);
// Recurse into dependencies
for (const auto & dep : job->dependencies)
prioritize(dep, new_pool_id, lock);
}
void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock)
@ -620,11 +635,102 @@ void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<
spawn(pool, lock);
}
// Keep track of currently executing load jobs to be able to:
// 1) Detect "wait dependent" deadlocks -- throw LOGICAL_ERROR
// (when job A function waits for job B that depends on job A)
// 2) Detect "wait not scheduled" deadlocks -- throw LOGICAL_ERROR
// (thread T is waiting on an assigned job A, but job A is not yet scheduled)
// 3) Resolve "priority inversion" deadlocks -- apply priority inheritance
// (when high-priority job A function waits for a lower-priority job B, and B never starts due to its priority)
// 4) Resolve "blocked pool" deadlocks -- spawn more workers
// (when job A in pool P waits for another ready job B in P, but B never starts because there are no free workers in P)
thread_local LoadJob * current_load_job = nullptr;
size_t currentPoolOr(size_t pool)
{
return current_load_job ? current_load_job->executionPool() : pool;
}
bool detectWaitDependentDeadlock(const LoadJobPtr & waited)
{
if (waited.get() == current_load_job)
return true;
for (const auto & dep : waited->dependencies)
{
if (detectWaitDependentDeadlock(dep))
return true;
}
return false;
}
void AsyncLoader::wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr & job)
{
// Ensure job we are going to wait was scheduled to avoid "wait not scheduled" deadlocks
if (job->job_id == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Load job '{}' waits for not scheduled load job '{}'", current_load_job->name, job->name);
// Deadlock detection and resolution
if (current_load_job && job->load_status == LoadStatus::PENDING)
{
if (detectWaitDependentDeadlock(job))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Load job '{}' waits for dependent load job '{}'", current_load_job->name, job->name);
auto worker_pool = current_load_job->executionPool();
auto worker_priority = getPoolPriority(worker_pool);
auto job_priority = getPoolPriority(job->pool_id);
// Waiting for a lower-priority job ("priority inversion" deadlock) is resolved using priority inheritance.
if (worker_priority < job_priority)
{
job_lock.unlock(); // Avoid reverse locking order
prioritize(job, worker_pool);
job_lock.lock();
}
// Spawn more workers to avoid exhaustion of worker pool ("blocked pool" deadlock)
if (worker_pool == job->pool_id)
{
job_lock.unlock(); // Avoid reverse locking order
workerIsSuspendedByWait(worker_pool, job);
job_lock.lock();
}
}
Stopwatch watch;
job->waiters++;
job->finished.wait(job_lock, [&] { return job->load_status != LoadStatus::PENDING; });
job->waiters--;
ProfileEvents::increment(ProfileEvents::AsyncLoaderWaitMicroseconds, watch.elapsedMicroseconds());
}
void AsyncLoader::workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job)
{
std::unique_lock lock{mutex};
std::unique_lock job_lock{job->mutex};
if (job->load_status != LoadStatus::PENDING)
return; // Job is already done, worker can continue execution
// To resolve "blocked pool" deadlocks we spawn a new worker for every suspended worker, if required
// This can lead to a visible excess of `max_threads` specified for a pool,
// but actual number of NOT suspended workers may exceed `max_threads` ONLY in intermittent state.
Pool & pool = pools[pool_id];
pool.suspended_workers++;
job->suspended_waiters++;
if (canSpawnWorker(pool, lock))
spawn(pool, lock);
// TODO(serxa): it is a good idea to propagate `job` and all its dependencies in `pool.ready_queue` by introducing
// key {suspended_waiters, ready_seqno} instead of plain `ready_seqno`, to force newly spawn workers to work on jobs
// that are being waited. But it doesn't affect correctness. So let's not complicate it for time being.
}
bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
{
// TODO(serxa): optimization: we should not spawn new worker on the first enqueue during `finish()` because current worker will take this job.
return is_running
&& !pool.ready_queue.empty()
&& pool.workers < pool.max_threads
&& pool.workers < pool.max_threads + pool.suspended_workers
&& (!current_priority || *current_priority >= pool.priority);
}
@ -632,7 +738,7 @@ bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
{
return is_running
&& !pool.ready_queue.empty()
&& pool.workers <= pool.max_threads
&& pool.workers <= pool.max_threads + pool.suspended_workers
&& (!current_priority || *current_priority >= pool.priority);
}
@ -705,7 +811,9 @@ void AsyncLoader::worker(Pool & pool)
try
{
job->execute(pool_id, job);
current_load_job = job.get();
SCOPE_EXIT({ current_load_job = nullptr; }); // Note that recursive job execution is not supported
job->execute(*this, pool_id, job);
exception_from_job = {};
}
catch (...)

View File

@ -21,6 +21,16 @@ namespace Poco { class Logger; }
namespace DB
{
// TERMINOLOGY:
// Job (`LoadJob`) - The smallest part of loading process, executed by worker. Job can depend on the other jobs. Jobs are grouped in tasks.
// Task (`LoadTask`) - Owning holder of a set of jobs. Should be held during the whole job lifetime. Cancels all jobs on destruction.
// Goal jobs (goals) - a subset of "final" jobs of a task (usually no job in task depend on a goal job).
// By default all jobs in task are included in goal jobs.
// Goals should used if you need to create a job that depends on a task (to avoid placing all jobs of the task in dependencies).
// Pool (worker pool) - A set of workers with specific priority. Every job is assigned to a pool. Job can change its pool dynamically.
// Priority (pool priority) - Constant integer value showing relative priority of a pool. Lower value means higher priority.
// AsyncLoader - scheduling system responsible for job dependency tracking and worker management respecting pool priorities.
class LoadJob;
using LoadJobPtr = std::shared_ptr<LoadJob>;
using LoadJobSet = std::unordered_set<LoadJobPtr>;
@ -43,6 +53,7 @@ enum class LoadStatus
// Smallest indivisible part of a loading process. Load job can have multiple dependencies, thus jobs constitute a direct acyclic graph (DAG).
// Job encapsulates a function to be executed by `AsyncLoader` as soon as job functions of all dependencies are successfully executed.
// Job can be waited for by an arbitrary number of threads. See `AsyncLoader` class description for more details.
// WARNING: jobs are usually held with ownership by tasks (see `LoadTask`). You are encouraged to add jobs into a tasks as soon as the are created.
class LoadJob : private boost::noncopyable
{
public:
@ -50,6 +61,7 @@ public:
LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, Func && func_)
: dependencies(std::forward<LoadJobSetType>(dependencies_))
, name(std::move(name_))
, execution_pool_id(pool_id_)
, pool_id(pool_id_)
, func(std::forward<Func>(func_))
{}
@ -67,18 +79,12 @@ public:
// Value may change during job execution by `prioritize()`.
size_t pool() const;
// Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
// Throws if job is FAILED or CANCELED. Returns or throws immediately if called on non-pending job.
void wait() const;
// Wait for a job to reach any non PENDING status.
void waitNoThrow() const noexcept;
// Returns number of threads blocked by `wait()` or `waitNoThrow()` calls.
// Returns number of threads blocked by `wait()` calls.
size_t waitersCount() const;
// Introspection
using TimePoint = std::chrono::system_clock::time_point;
UInt64 jobId() const { return job_id; }
TimePoint scheduleTime() const { return schedule_time; }
TimePoint enqueueTime() const { return enqueue_time; }
TimePoint startTime() const { return start_time; }
@ -90,22 +96,24 @@ public:
private:
friend class AsyncLoader;
void ok();
void failed(const std::exception_ptr & ptr);
void canceled(const std::exception_ptr & ptr);
void finish();
[[nodiscard]] size_t ok();
[[nodiscard]] size_t failed(const std::exception_ptr & ptr);
[[nodiscard]] size_t canceled(const std::exception_ptr & ptr);
[[nodiscard]] size_t finish();
void scheduled();
void scheduled(UInt64 job_id_);
void enqueued();
void execute(size_t pool, const LoadJobPtr & self);
void execute(AsyncLoader & loader, size_t pool, const LoadJobPtr & self);
std::atomic<UInt64> job_id{0};
std::atomic<size_t> execution_pool_id;
std::atomic<size_t> pool_id;
std::function<void(const LoadJobPtr & self)> func;
std::function<void(AsyncLoader & loader, const LoadJobPtr & self)> func;
mutable std::mutex mutex;
mutable std::condition_variable finished;
mutable size_t waiters = 0;
mutable size_t waiters = 0; // All waiters, including suspended
mutable size_t suspended_waiters = 0;
LoadStatus load_status{LoadStatus::PENDING};
std::exception_ptr load_exception;
@ -117,7 +125,7 @@ private:
struct EmptyJobFunc
{
void operator()(const LoadJobPtr &) {}
void operator()(AsyncLoader &, const LoadJobPtr &) {}
};
template <class Func = EmptyJobFunc>
@ -144,6 +152,7 @@ LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String n
return std::make_shared<LoadJob>(dependencies, std::move(name), pool_id, std::forward<Func>(func));
}
// Represents a logically connected set of LoadJobs required to achieve some goals (final LoadJob in the set).
class LoadTask : private boost::noncopyable
{
@ -168,10 +177,11 @@ public:
// auto load_task = loadSomethingAsync(async_loader, load_after_task.goals(), something);
const LoadJobSet & goals() const { return goal_jobs.empty() ? jobs : goal_jobs; }
AsyncLoader & loader;
private:
friend class AsyncLoader;
AsyncLoader & loader;
LoadJobSet jobs;
LoadJobSet goal_jobs;
};
@ -181,91 +191,6 @@ inline LoadTaskPtr makeLoadTask(AsyncLoader & loader, LoadJobSet && jobs, LoadJo
return std::make_shared<LoadTask>(loader, std::move(jobs), std::move(goals));
}
inline void scheduleLoad(const LoadTaskPtr & task)
{
task->schedule();
}
inline void scheduleLoad(const LoadTaskPtrs & tasks)
{
for (const auto & task : tasks)
task->schedule();
}
template <class... Args>
inline void scheduleLoadAll(Args && ... args)
{
(scheduleLoad(std::forward<Args>(args)), ...);
}
inline void waitLoad(const LoadJobSet & jobs)
{
for (const auto & job : jobs)
job->wait();
}
inline void waitLoad(const LoadTaskPtr & task)
{
waitLoad(task->goals());
}
inline void waitLoad(const LoadTaskPtrs & tasks)
{
for (const auto & task : tasks)
waitLoad(task->goals());
}
template <class... Args>
inline void waitLoadAll(Args && ... args)
{
(waitLoad(std::forward<Args>(args)), ...);
}
template <class... Args>
inline void scheduleAndWaitLoadAll(Args && ... args)
{
scheduleLoadAll(std::forward<Args>(args)...);
waitLoadAll(std::forward<Args>(args)...);
}
inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
{
LoadJobSet result;
for (const auto & task : tasks)
result.insert(task->goals().begin(), task->goals().end());
return result;
}
inline LoadJobSet getGoalsOr(const LoadTaskPtrs & tasks, const LoadJobSet & alternative)
{
LoadJobSet result;
for (const auto & task : tasks)
result.insert(task->goals().begin(), task->goals().end());
return result.empty() ? alternative : result;
}
inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
{
LoadJobSet result;
if (!jobs1.empty())
result.insert(jobs1.begin(), jobs1.end());
if (!jobs2.empty())
result.insert(jobs2.begin(), jobs2.end());
return result;
}
inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs & tasks2)
{
if (tasks1.empty())
return tasks2;
if (tasks2.empty())
return tasks1;
LoadTaskPtrs result;
result.reserve(tasks1.size() + tasks2.size());
result.insert(result.end(), tasks1.begin(), tasks1.end());
result.insert(result.end(), tasks2.begin(), tasks2.end());
return result;
}
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
// Basic usage example:
@ -277,8 +202,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
//
// // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
// // Job2 and job3 depend on job1 and are run only after job1 completion.
// auto job_func = [&] (const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
// auto job_func = [&] (AsyncLoader & loader, const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, loader->getPoolName(self->pool()));
// };
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
@ -287,8 +212,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// task.schedule();
//
// // Another thread may prioritize a job by changing its pool and wait for it:
// async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
// async_loader.prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// async_loader.wait(job3); // Blocks until job completion or cancellation and rethrow an exception (if any)
//
// Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
// Each pool has a constant priority and a mutable maximum number of threads.
@ -341,7 +266,8 @@ private:
std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
size_t max_threads; // Max number of workers to be spawn
size_t workers = 0; // Number of currently execution workers
size_t workers = 0; // Number of currently executing workers
size_t suspended_workers = 0; // Number of workers that are blocked by `wait()` call on a job executing in the same pool (for deadlock resolution)
bool isActive() const { return workers > 0 || !ready_queue.empty(); }
};
@ -369,7 +295,7 @@ public:
Metric metric_threads;
Metric metric_active_threads;
Metric metric_scheduled_threads;
size_t max_threads;
size_t max_threads; // Zero means use all CPU cores
Priority priority;
};
@ -399,6 +325,7 @@ public:
// and are removed from AsyncLoader, so it is thread-safe to destroy them.
void schedule(LoadTask & task);
void schedule(const LoadTaskPtr & task);
void schedule(const LoadJobSet & jobs_to_schedule);
// Schedule all tasks atomically. To ensure only highest priority jobs among all tasks are run first.
void schedule(const LoadTaskPtrs & tasks);
@ -407,6 +334,11 @@ public:
// Jobs from higher (than `new_pool`) priority pools are not changed.
void prioritize(const LoadJobPtr & job, size_t new_pool);
// Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
// Throws if job is FAILED or CANCELED unless `no_throw` is set. Returns or throws immediately if called on non-pending job.
// If job was not scheduled, it will be implicitly scheduled before the wait (deadlock auto-resolution).
void wait(const LoadJobPtr & job, bool no_throw = false);
// Remove finished jobs, cancel scheduled jobs, wait for executing jobs to finish and remove them.
void remove(const LoadJobSet & jobs);
@ -430,23 +362,26 @@ public:
bool is_executing = false;
};
// For introspection and debug only, see `system.async_loader` table
// For introspection and debug only, see `system.async_loader` table.
std::vector<JobState> getJobStates() const;
// For deadlock resolution. Should not be used directly.
void workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job);
private:
void checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
String checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
String checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock);
void scheduleImpl(const LoadJobSet & input_jobs);
void gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
void prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock);
void enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock);
bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &);
bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &);
void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> &);
void spawn(Pool & pool, std::unique_lock<std::mutex> &);
void wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr & job);
bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> & lock);
bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> & lock);
void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock);
void spawn(Pool & pool, std::unique_lock<std::mutex> & lock);
void worker(Pool & pool);
bool hasWorker(std::unique_lock<std::mutex> &) const;
bool hasWorker(std::unique_lock<std::mutex> & lock) const;
// Logging
const bool log_failures; // Worker should log all exceptions caught from job functions.
@ -457,6 +392,7 @@ private:
bool is_running = true;
std::optional<Priority> current_priority; // highest priority among active pools
UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
UInt64 last_job_id = 0; // Increasing counter for job IDs
std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
std::vector<Pool> pools; // Thread pools for job execution and ready queues
LoadJobSet finished_jobs; // Set of finished jobs (for introspection only, until jobs are removed).
@ -465,4 +401,136 @@ private:
std::chrono::system_clock::time_point busy_period_start_time;
};
// === HELPER FUNCTIONS ===
// There are three types of helper functions:
// schedulerLoad([loader], {jobs|task|tasks}):
// Just schedule jobs for async loading.
// Note that normally function `doSomethingAsync()` returns you a task which is NOT scheduled.
// This is done to allow you:
// (1) construct complex dependency graph offline.
// (2) schedule tasks simultaneously to respect their relative priorities.
// (3) do prioritization independently, before scheduling.
// prioritizeLoad([loader], pool_id, {jobs|task|tasks}):
// Prioritize jobs w/o waiting for it.
// Note that prioritization may be done
// (1) before scheduling (to ensure all jobs are started in the correct pools)
// (2) after scheduling (for dynamic prioritization, e.g. when new query arrives)
// waitLoad([loader], pool_id, {jobs|task|tasks}, [no_throw]):
// Prioritize and wait for jobs.
// Note that to avoid deadlocks it implicitly schedules all the jobs before waiting for them.
// Also to avoid priority inversion you should never wait for a job that has lower priority.
// So it prioritizes all jobs, then schedules all jobs and waits every job.
// IMPORTANT: Any load error will be rethrown, unless `no_throw` is set.
// Common usage pattern is:
// waitLoad(currentPoolOr(foreground_pool_id), tasks);
// Returns current execution pool if it is called from load job, or `pool` otherwise
// It should be used for waiting other load jobs in places that can be executed from load jobs
size_t currentPoolOr(size_t pool);
inline void scheduleLoad(AsyncLoader & loader, const LoadJobSet & jobs)
{
loader.schedule(jobs);
}
inline void scheduleLoad(const LoadTaskPtr & task)
{
task->schedule();
}
inline void scheduleLoad(const LoadTaskPtrs & tasks)
{
if (tasks.empty())
return;
// NOTE: it is assumed that all tasks use the same `AsyncLoader`
AsyncLoader & loader = tasks.front()->loader;
loader.schedule(tasks);
}
inline void waitLoad(AsyncLoader & loader, const LoadJobSet & jobs, bool no_throw = false)
{
scheduleLoad(loader, jobs);
for (const auto & job : jobs)
loader.wait(job, no_throw);
}
inline void waitLoad(const LoadTaskPtr & task, bool no_throw = false)
{
scheduleLoad(task);
waitLoad(task->loader, task->goals(), no_throw);
}
inline void waitLoad(const LoadTaskPtrs & tasks, bool no_throw = false)
{
scheduleLoad(tasks);
for (const auto & task : tasks)
waitLoad(task->loader, task->goals(), no_throw);
}
inline void prioritizeLoad(AsyncLoader & loader, size_t pool_id, const LoadJobSet & jobs)
{
for (const auto & job : jobs)
loader.prioritize(job, pool_id);
}
inline void prioritizeLoad(size_t pool_id, const LoadTaskPtr & task)
{
prioritizeLoad(task->loader, pool_id, task->goals());
}
inline void prioritizeLoad(size_t pool_id, const LoadTaskPtrs & tasks)
{
for (const auto & task : tasks)
prioritizeLoad(task->loader, pool_id, task->goals());
}
inline void waitLoad(AsyncLoader & loader, size_t pool_id, const LoadJobSet & jobs, bool no_throw = false)
{
prioritizeLoad(loader, pool_id, jobs);
waitLoad(loader, jobs, no_throw);
}
inline void waitLoad(size_t pool_id, const LoadTaskPtr & task, bool no_throw = false)
{
prioritizeLoad(task->loader, pool_id, task->goals());
waitLoad(task->loader, task->goals(), no_throw);
}
inline void waitLoad(size_t pool_id, const LoadTaskPtrs & tasks, bool no_throw = false)
{
prioritizeLoad(pool_id, tasks);
waitLoad(tasks, no_throw);
}
inline LoadJobSet getGoals(const LoadTaskPtrs & tasks, const LoadJobSet & alternative = {})
{
LoadJobSet result;
for (const auto & task : tasks)
result.insert(task->goals().begin(), task->goals().end());
return result.empty() ? alternative : result;
}
inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
{
LoadJobSet result;
if (!jobs1.empty())
result.insert(jobs1.begin(), jobs1.end());
if (!jobs2.empty())
result.insert(jobs2.begin(), jobs2.end());
return result;
}
inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs & tasks2)
{
if (tasks1.empty())
return tasks2;
if (tasks2.empty())
return tasks1;
LoadTaskPtrs result;
result.reserve(tasks1.size() + tasks2.size());
result.insert(result.end(), tasks1.begin(), tasks1.end());
result.insert(result.end(), tasks2.begin(), tasks2.end());
return result;
}
}

View File

@ -110,12 +110,12 @@
M(StorageHiveThreads, "Number of threads in the StorageHive thread pool.") \
M(StorageHiveThreadsActive, "Number of threads in the StorageHive thread pool running a task.") \
M(StorageHiveThreadsScheduled, "Number of queued or active jobs in the StorageHive thread pool.") \
M(TablesLoaderThreads, "Number of threads in the tables loader thread pool.") \
M(TablesLoaderThreadsActive, "Number of threads in the tables loader thread pool running a task.") \
M(TablesLoaderThreadsScheduled, "Number of queued or active jobs in the tables loader thread pool.") \
M(DatabaseOrdinaryThreads, "Number of threads in the Ordinary database thread pool.") \
M(DatabaseOrdinaryThreadsActive, "Number of threads in the Ordinary database thread pool running a task.") \
M(DatabaseOrdinaryThreadsScheduled, "Number of queued or active jobs in the Ordinary database thread pool.") \
M(TablesLoaderBackgroundThreads, "Number of threads in the tables loader background thread pool.") \
M(TablesLoaderBackgroundThreadsActive, "Number of threads in the tables loader background thread pool running a task.") \
M(TablesLoaderBackgroundThreadsScheduled, "Number of queued or active jobs in the tables loader background thread pool.") \
M(TablesLoaderForegroundThreads, "Number of threads in the tables loader foreground thread pool.") \
M(TablesLoaderForegroundThreadsActive, "Number of threads in the tables loader foreground thread pool running a task.") \
M(TablesLoaderForegroundThreadsScheduled, "Number of queued or active jobs in the tables loader foreground thread pool.") \
M(DatabaseOnDiskThreads, "Number of threads in the DatabaseOnDisk thread pool.") \
M(DatabaseOnDiskThreadsActive, "Number of threads in the DatabaseOnDisk thread pool running a task.") \
M(DatabaseOnDiskThreadsScheduled, "Number of queued or active jobs in the DatabaseOnDisk thread pool.") \

View File

@ -588,6 +588,7 @@
M(706, LIBSSH_ERROR) \
M(707, GCP_ERROR) \
M(708, ILLEGAL_STATISTIC) \
M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

32
src/Common/PoolId.h Normal file
View File

@ -0,0 +1,32 @@
#pragma once

#include <Common/Priority.h>

namespace DB
{

/// Indices and priorities of `AsyncLoader` pools.
/// The most important difference from regular ThreadPools is priorities of pools:
///  * Pools that have different priorities do NOT run jobs simultaneously (with small exception due to dynamic prioritization).
///  * Pools with lower priority wait for all jobs in higher priority pools to be done.
/// Note that pools also have different configurable sizes not listed here. See `Context::getAsyncLoader()` for details.
/// WARNING: `*PoolId` values must be unique and sequential w/o gaps.

/// Used for executing load jobs that are waited for by queries or in case of synchronous table loading.
constexpr size_t TablesLoaderForegroundPoolId = 0;
constexpr Priority TablesLoaderForegroundPriority{0};

/// Has lower priority and is used by table load jobs.
constexpr size_t TablesLoaderBackgroundLoadPoolId = 1;
constexpr Priority TablesLoaderBackgroundLoadPriority{1};

/// Has even lower priority and is used by startup jobs.
/// NOTE: This pool is required to begin table startup only after all tables are loaded.
/// NOTE: Which is needed to prevent heavy merges/mutations from consuming all the resources, slowing table loading down.
constexpr size_t TablesLoaderBackgroundStartupPoolId = 2;
constexpr Priority TablesLoaderBackgroundStartupPriority{2};

}

View File

@ -444,8 +444,13 @@ The server successfully detected this situation and will download merged part fr
M(WaitPrefetchTaskMicroseconds, "Time spend waiting for prefetched reader") \
\
M(ThreadpoolReaderTaskMicroseconds, "Time spent getting the data in asynchronous reading") \
M(ThreadpoolReaderPrepareMicroseconds, "Time spent on preparation (e.g. call to reader seek() method)") \
M(ThreadpoolReaderReadBytes, "Bytes read from a threadpool task in asynchronous reading") \
M(ThreadpoolReaderSubmit, "Bytes read from a threadpool task in asynchronous reading") \
M(ThreadpoolReaderSubmitReadSynchronously, "How many times we haven't scheduled a task on the thread pool and read synchronously instead") \
M(ThreadpoolReaderSubmitReadSynchronouslyBytes, "How many bytes were read synchronously") \
M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds, "How much time we spent reading synchronously") \
M(AsynchronousReaderIgnoredBytes, "Number of bytes ignored during asynchronous reading") \
\
M(FileSegmentWaitReadBufferMicroseconds, "Metric per file segment. Time spend waiting for internal read buffer (includes cache waiting)") \
M(FileSegmentReadMicroseconds, "Metric per file segment. Time spend reading from file") \
@ -569,6 +574,8 @@ The server successfully detected this situation and will download merged part fr
\
M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \
\
M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \
\
M(LogTest, "Number of log messages with level Test") \
M(LogTrace, "Number of log messages with level Trace") \
M(LogDebug, "Number of log messages with level Debug") \

View File

@ -1,8 +1,12 @@
#include <boost/core/noncopyable.hpp>
#include <gtest/gtest.h>
#include <array>
#include <list>
#include <barrier>
#include <chrono>
#include <mutex>
#include <shared_mutex>
#include <stdexcept>
#include <string_view>
#include <vector>
@ -19,9 +23,9 @@ using namespace DB;
namespace CurrentMetrics
{
extern const Metric TablesLoaderThreads;
extern const Metric TablesLoaderThreadsActive;
extern const Metric TablesLoaderThreadsScheduled;
extern const Metric TablesLoaderBackgroundThreads;
extern const Metric TablesLoaderBackgroundThreadsActive;
extern const Metric TablesLoaderBackgroundThreadsScheduled;
}
namespace DB::ErrorCodes
@ -61,9 +65,9 @@ struct AsyncLoaderTest
{
result.push_back({
.name = fmt::format("Pool{}", pool_id),
.metric_threads = CurrentMetrics::TablesLoaderThreads,
.metric_active_threads = CurrentMetrics::TablesLoaderThreadsActive,
.metric_scheduled_threads = CurrentMetrics::TablesLoaderThreadsScheduled,
.metric_threads = CurrentMetrics::TablesLoaderBackgroundThreads,
.metric_active_threads = CurrentMetrics::TablesLoaderBackgroundThreadsActive,
.metric_scheduled_threads = CurrentMetrics::TablesLoaderBackgroundThreadsScheduled,
.max_threads = desc.max_threads,
.priority = desc.priority
});
@ -155,7 +159,7 @@ TEST(AsyncLoader, Smoke)
std::atomic<size_t> jobs_done{0};
std::atomic<size_t> low_priority_jobs_done{0};
auto job_func = [&] (const LoadJobPtr & self) {
auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self) {
jobs_done++;
if (self->pool() == low_priority_pool)
low_priority_jobs_done++;
@ -172,13 +176,13 @@ TEST(AsyncLoader, Smoke)
auto job5 = makeLoadJob({ job3, job4 }, low_priority_pool, "job5", job_func);
task2->merge(t.schedule({ job5 }));
std::thread waiter_thread([=] { job5->wait(); });
std::thread waiter_thread([&t, job5] { t.loader.wait(job5); });
t.loader.start();
job3->wait();
t.loader.wait(job3);
t.loader.wait();
job4->wait();
t.loader.wait(job4);
waiter_thread.join();
@ -196,7 +200,7 @@ TEST(AsyncLoader, CycleDetection)
{
AsyncLoaderTest t;
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
LoadJobPtr cycle_breaker; // To avoid memleak we introduce with a cycle
@ -241,7 +245,7 @@ TEST(AsyncLoader, CancelPendingJob)
{
AsyncLoaderTest t;
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto job = makeLoadJob({}, "job", job_func);
auto task = t.schedule({ job });
@ -251,7 +255,7 @@ TEST(AsyncLoader, CancelPendingJob)
ASSERT_EQ(job->status(), LoadStatus::CANCELED);
try
{
job->wait();
t.loader.wait(job);
FAIL();
}
catch (Exception & e)
@ -264,7 +268,7 @@ TEST(AsyncLoader, CancelPendingTask)
{
AsyncLoaderTest t;
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto job1 = makeLoadJob({}, "job1", job_func);
auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -277,7 +281,7 @@ TEST(AsyncLoader, CancelPendingTask)
try
{
job1->wait();
t.loader.wait(job1);
FAIL();
}
catch (Exception & e)
@ -287,7 +291,7 @@ TEST(AsyncLoader, CancelPendingTask)
try
{
job2->wait();
t.loader.wait(job2);
FAIL();
}
catch (Exception & e)
@ -300,7 +304,7 @@ TEST(AsyncLoader, CancelPendingDependency)
{
AsyncLoaderTest t;
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto job1 = makeLoadJob({}, "job1", job_func);
auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -314,7 +318,7 @@ TEST(AsyncLoader, CancelPendingDependency)
try
{
job1->wait();
t.loader.wait(job1);
FAIL();
}
catch (Exception & e)
@ -324,7 +328,7 @@ TEST(AsyncLoader, CancelPendingDependency)
try
{
job2->wait();
t.loader.wait(job2);
FAIL();
}
catch (Exception & e)
@ -340,7 +344,7 @@ TEST(AsyncLoader, CancelExecutingJob)
std::barrier sync(2);
auto job_func = [&] (const LoadJobPtr &)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
sync.arrive_and_wait(); // (A) sync with main thread
sync.arrive_and_wait(); // (B) wait for waiter
@ -362,7 +366,7 @@ TEST(AsyncLoader, CancelExecutingJob)
canceler.join();
ASSERT_EQ(job->status(), LoadStatus::OK);
job->wait();
t.loader.wait(job);
}
TEST(AsyncLoader, CancelExecutingTask)
@ -371,19 +375,19 @@ TEST(AsyncLoader, CancelExecutingTask)
t.loader.start();
std::barrier sync(2);
auto blocker_job_func = [&] (const LoadJobPtr &)
auto blocker_job_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
sync.arrive_and_wait(); // (A) sync with main thread
sync.arrive_and_wait(); // (B) wait for waiter
// signals (C)
};
auto job_to_cancel_func = [&] (const LoadJobPtr &)
auto job_to_cancel_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
FAIL(); // this job should be canceled
};
auto job_to_succeed_func = [&] (const LoadJobPtr &)
auto job_to_succeed_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
};
@ -430,7 +434,7 @@ TEST(AsyncLoader, DISABLED_JobFailure)
std::string error_message = "test job failure";
auto job_func = [&] (const LoadJobPtr &) {
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {
throw std::runtime_error(error_message);
};
@ -442,7 +446,7 @@ TEST(AsyncLoader, DISABLED_JobFailure)
ASSERT_EQ(job->status(), LoadStatus::FAILED);
try
{
job->wait();
t.loader.wait(job);
FAIL();
}
catch (Exception & e)
@ -459,7 +463,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
std::string_view error_message = "test job failure";
auto failed_job_func = [&] (const LoadJobPtr &) {
auto failed_job_func = [&] (AsyncLoader &, const LoadJobPtr &) {
throw Exception(ErrorCodes::ASYNC_LOAD_FAILED, "{}", error_message);
};
@ -468,7 +472,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
t.loader.wait();
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto job1 = makeLoadJob({ failed_job }, "job1", job_func);
auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -480,7 +484,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
ASSERT_EQ(job2->status(), LoadStatus::CANCELED);
try
{
job1->wait();
t.loader.wait(job1);
FAIL();
}
catch (Exception & e)
@ -490,7 +494,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
}
try
{
job2->wait();
t.loader.wait(job2);
FAIL();
}
catch (Exception & e)
@ -504,14 +508,14 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
{
AsyncLoaderTest t;
auto canceled_job_func = [&] (const LoadJobPtr &) {};
auto canceled_job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto canceled_job = makeLoadJob({}, "canceled_job", canceled_job_func);
auto canceled_task = t.schedule({ canceled_job });
canceled_task->remove();
t.loader.start();
auto job_func = [&] (const LoadJobPtr &) {};
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
auto job1 = makeLoadJob({ canceled_job }, "job1", job_func);
auto job2 = makeLoadJob({ job1 }, "job2", job_func);
auto task = t.schedule({ job1, job2 });
@ -522,7 +526,7 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
ASSERT_EQ(job2->status(), LoadStatus::CANCELED);
try
{
job1->wait();
t.loader.wait(job1);
FAIL();
}
catch (Exception & e)
@ -531,7 +535,7 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
}
try
{
job2->wait();
t.loader.wait(job2);
FAIL();
}
catch (Exception & e)
@ -550,7 +554,7 @@ TEST(AsyncLoader, TestConcurrency)
std::barrier sync(concurrency);
std::atomic<int> executing{0};
auto job_func = [&] (const LoadJobPtr &)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
executing++;
ASSERT_LE(executing, concurrency);
@ -577,7 +581,7 @@ TEST(AsyncLoader, TestOverload)
for (int concurrency = 4; concurrency <= 8; concurrency++)
{
auto job_func = [&] (const LoadJobPtr &)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
executing++;
t.randomSleepUs(100, 200, 100);
@ -613,7 +617,7 @@ TEST(AsyncLoader, StaticPriorities)
std::string schedule;
auto job_func = [&] (const LoadJobPtr & self)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
{
schedule += fmt::format("{}{}", self->name, self->pool());
};
@ -656,18 +660,18 @@ TEST(AsyncLoader, SimplePrioritization)
std::atomic<int> executed{0}; // Number of previously executed jobs (to test execution order)
LoadJobPtr job_to_prioritize;
auto job_func_A_booster = [&] (const LoadJobPtr &)
auto job_func_A_booster = [&] (AsyncLoader &, const LoadJobPtr &)
{
ASSERT_EQ(executed++, 0);
t.loader.prioritize(job_to_prioritize, 2);
};
auto job_func_B_tester = [&] (const LoadJobPtr &)
auto job_func_B_tester = [&] (AsyncLoader &, const LoadJobPtr &)
{
ASSERT_EQ(executed++, 2);
};
auto job_func_C_boosted = [&] (const LoadJobPtr &)
auto job_func_C_boosted = [&] (AsyncLoader &, const LoadJobPtr &)
{
ASSERT_EQ(executed++, 1);
};
@ -680,7 +684,8 @@ TEST(AsyncLoader, SimplePrioritization)
job_to_prioritize = jobs[2]; // C
scheduleAndWaitLoadAll(task);
scheduleLoad(task);
waitLoad(task);
}
TEST(AsyncLoader, DynamicPriorities)
@ -714,7 +719,7 @@ TEST(AsyncLoader, DynamicPriorities)
UInt64 ready_seqno_D = 0;
UInt64 ready_seqno_E = 0;
auto job_func = [&] (const LoadJobPtr & self)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
{
{
std::unique_lock lock{schedule_mutex};
@ -791,7 +796,7 @@ TEST(AsyncLoader, RandomIndependentTasks)
AsyncLoaderTest t(16);
t.loader.start();
auto job_func = [&] (const LoadJobPtr & self)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
{
for (const auto & dep : self->dependencies)
ASSERT_EQ(dep->status(), LoadStatus::OK);
@ -818,7 +823,7 @@ TEST(AsyncLoader, RandomDependentTasks)
std::vector<LoadTaskPtr> tasks;
std::vector<LoadJobPtr> all_jobs;
auto job_func = [&] (const LoadJobPtr & self)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
{
for (const auto & dep : self->dependencies)
ASSERT_EQ(dep->status(), LoadStatus::OK);
@ -860,7 +865,7 @@ TEST(AsyncLoader, SetMaxThreads)
syncs.push_back(std::make_unique<std::barrier<>>(max_threads + 1));
auto job_func = [&] (const LoadJobPtr &)
auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
{
int idx = sync_index;
if (idx < syncs.size())
@ -914,10 +919,11 @@ TEST(AsyncLoader, DynamicPools)
{
std::atomic<bool> boosted{false}; // Visible concurrency was increased
std::atomic<int> left{concurrency * jobs_in_chain / 2}; // Number of jobs to start before `prioritize()` call
std::shared_mutex prioritization_mutex; // To slow down job execution during prioritization to avoid race condition
LoadJobSet jobs_to_prioritize;
auto job_func = [&] (const LoadJobPtr & self)
auto job_func = [&] (AsyncLoader & loader, const LoadJobPtr & self)
{
auto pool_id = self->executionPool();
executing[pool_id]++;
@ -928,10 +934,12 @@ TEST(AsyncLoader, DynamicPools)
// Dynamic prioritization
if (--left == 0)
{
std::unique_lock lock{prioritization_mutex};
for (const auto & job : jobs_to_prioritize)
t.loader.prioritize(job, 1);
loader.prioritize(job, 1);
}
std::shared_lock lock{prioritization_mutex};
t.randomSleepUs(100, 200, 100);
ASSERT_LE(executing[pool_id], max_threads[pool_id]);
@ -941,9 +949,10 @@ TEST(AsyncLoader, DynamicPools)
std::vector<LoadTaskPtr> tasks;
tasks.reserve(concurrency);
for (int i = 0; i < concurrency; i++)
tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func)));
tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func, fmt::format("c{}-j", i))));
jobs_to_prioritize = getGoals(tasks); // All jobs
scheduleAndWaitLoadAll(tasks);
scheduleLoad(tasks);
waitLoad(tasks);
ASSERT_EQ(executing[0], 0);
ASSERT_EQ(executing[1], 0);
@ -952,3 +961,136 @@ TEST(AsyncLoader, DynamicPools)
}
}
/// Verifies that a running job may dynamically schedule a nested task of sub jobs
/// and wait for it from inside its own job function, across varying thread counts.
TEST(AsyncLoader, SubJobs)
{
    AsyncLoaderTest t(1);
    t.loader.start();

    // An example of component with an asynchronous loading interface
    class MyComponent : boost::noncopyable {
    public:
        MyComponent(AsyncLoader & loader_, int jobs)
            : loader(loader_)
            , jobs_left(jobs)
        {}

        [[nodiscard]] LoadTaskPtr loadAsync()
        {
            // The main job spawns `jobs_left` sub jobs as a nested task and blocks
            // on them from within its own job function.
            auto job_func = [this] (AsyncLoader &, const LoadJobPtr &) {
                auto sub_job_func = [this] (AsyncLoader &, const LoadJobPtr &) {
                    --jobs_left;
                };
                LoadJobSet jobs;
                for (size_t j = 0; j < jobs_left; j++)
                    jobs.insert(makeLoadJob({}, fmt::format("sub job {}", j), sub_job_func));
                waitLoad(makeLoadTask(loader, std::move(jobs)));
            };
            auto job = makeLoadJob({}, "main job", job_func);
            return load_task = makeLoadTask(loader, { job });
        }

        bool isLoaded() const
        {
            return jobs_left == 0;
        }

    private:
        AsyncLoader & loader;
        std::atomic<int> jobs_left;
        // It is a good practice to keep load task inside the component:
        // 1) to make sure it outlives its load jobs;
        // 2) to avoid removing load jobs from `system.async_loader` while we use the component
        LoadTaskPtr load_task;
    };

    for (double jobs_per_thread : std::array{0.5, 1.0, 2.0})
    {
        for (size_t threads = 1; threads <= 32; threads *= 2)
        {
            t.loader.setMaxThreads(0, threads);
            std::list<MyComponent> components;
            LoadTaskPtrs tasks;
            size_t size = static_cast<size_t>(jobs_per_thread * threads);
            tasks.reserve(size);
            for (size_t j = 0; j < size; j++)
            {
                components.emplace_back(t.loader, 5);
                tasks.emplace_back(components.back().loadAsync());
            }
            waitLoad(tasks); // waits for all main jobs and, transitively, their sub jobs
            for (const auto & component: components)
                ASSERT_TRUE(component.isLoaded());
        }
    }
}
/// Verifies that a job may recursively schedule and wait for another job in the
/// same pool (a chain of depth `jobs_left`), across varying thread counts.
TEST(AsyncLoader, RecursiveJob)
{
    AsyncLoaderTest t(1);
    t.loader.start();

    // An example of component with an asynchronous loading interface (a complicated one)
    class MyComponent : boost::noncopyable {
    public:
        MyComponent(AsyncLoader & loader_, int jobs)
            : loader(loader_)
            , jobs_left(jobs)
        {}

        [[nodiscard]] LoadTaskPtr loadAsync()
        {
            return load_task = loadAsyncImpl(jobs_left);
        }

        bool isLoaded() const
        {
            return jobs_left == 0;
        }

    private:
        [[nodiscard]] LoadTaskPtr loadAsyncImpl(int id)
        {
            auto job_func = [this] (AsyncLoader &, const LoadJobPtr & self) {
                jobFunction(self);
            };
            auto job = makeLoadJob({}, fmt::format("job{}", id), job_func);
            auto task = makeLoadTask(loader, { job });
            return task;
        }

        // Each job decrements the counter, then recursively schedules and waits for the
        // next job in the same pool until the counter reaches zero.
        void jobFunction(const LoadJobPtr & self)
        {
            int next = --jobs_left;
            if (next > 0)
                waitLoad(self->pool(), loadAsyncImpl(next));
        }

        AsyncLoader & loader;
        std::atomic<int> jobs_left;
        // It is a good practice to keep load task inside the component:
        // 1) to make sure it outlives its load jobs;
        // 2) to avoid removing load jobs from `system.async_loader` while we use the component
        LoadTaskPtr load_task;
    };

    for (double jobs_per_thread : std::array{0.5, 1.0, 2.0})
    {
        for (size_t threads = 1; threads <= 32; threads *= 2)
        {
            t.loader.setMaxThreads(0, threads);
            std::list<MyComponent> components;
            LoadTaskPtrs tasks;
            size_t size = static_cast<size_t>(jobs_per_thread * threads);
            tasks.reserve(size);
            for (size_t j = 0; j < size; j++)
            {
                components.emplace_back(t.loader, 5);
                tasks.emplace_back(components.back().loadAsync());
            }
            waitLoad(tasks);
            for (const auto & component: components)
                ASSERT_TRUE(component.isLoaded());
        }
    }
}

View File

@ -139,9 +139,9 @@ void DeflateQplJobHWPool::unLockJob(UInt32 index)
hw_job_ptr_locks[index].store(false);
}
//HardwareCodecDeflateQpl
HardwareCodecDeflateQpl::HardwareCodecDeflateQpl()
:log(&Poco::Logger::get("HardwareCodecDeflateQpl"))
HardwareCodecDeflateQpl::HardwareCodecDeflateQpl(SoftwareCodecDeflateQpl & sw_codec_)
: log(&Poco::Logger::get("HardwareCodecDeflateQpl"))
, sw_codec(sw_codec_)
{
}
@ -169,7 +169,7 @@ Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source
UInt32 compressed_size = 0;
if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
{
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doCompressData->acquireJob fail, probably job pool exhausted)");
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doCompressData->acquireJob fail, probably job pool exhausted)");
return RET_ERROR;
}
@ -189,7 +189,7 @@ Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source
}
else
{
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doCompressData->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doCompressData->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
DeflateQplJobHWPool::instance().releaseJob(job_id);
return RET_ERROR;
}
@ -202,7 +202,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataSynchronous(const char * source,
UInt32 decompressed_size = 0;
if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
{
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataSynchronous->acquireJob fail, probably job pool exhausted)");
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->acquireJob fail, probably job pool exhausted)");
return RET_ERROR;
}
@ -214,17 +214,29 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataSynchronous(const char * source,
job_ptr->available_out = uncompressed_size;
job_ptr->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST;
if (auto status = qpl_submit_job(job_ptr); status != QPL_STS_OK)
auto status = qpl_submit_job(job_ptr);
if (status != QPL_STS_OK)
{
DeflateQplJobHWPool::instance().releaseJob(job_id);
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataSynchronous->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->qpl_submit_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
return RET_ERROR;
}
/// Busy waiting till job complete.
UInt32 num_checks = 0;
do
{
_tpause(1, __rdtsc() + 1000);
} while (qpl_check_job(job_ptr) == QPL_STS_BEING_PROCESSED);
status = qpl_check_job(job_ptr);
++num_checks;
} while (status == QPL_STS_BEING_PROCESSED && num_checks < MAX_CHECKS);
if (status != QPL_STS_OK)
{
DeflateQplJobHWPool::instance().releaseJob(job_id);
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->qpl_submit_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
return RET_ERROR;
}
decompressed_size = job_ptr->total_out;
DeflateQplJobHWPool::instance().releaseJob(job_id);
@ -237,7 +249,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataAsynchronous(const char * source,
qpl_job * job_ptr = nullptr;
if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
{
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataAsynchronous->acquireJob fail, probably job pool exhausted)");
LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataAsynchronous->acquireJob fail, probably job pool exhausted)");
return RET_ERROR;
}
@ -257,7 +269,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataAsynchronous(const char * source,
else
{
DeflateQplJobHWPool::instance().releaseJob(job_id);
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataAsynchronous->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataAsynchronous->qpl_submit_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
return RET_ERROR;
}
}
@ -266,6 +278,7 @@ void HardwareCodecDeflateQpl::flushAsynchronousDecompressRequests()
{
auto n_jobs_processing = decomp_async_job_map.size();
std::map<UInt32, qpl_job *>::iterator it = decomp_async_job_map.begin();
UInt32 num_checks = 0;
while (n_jobs_processing)
{
@ -274,22 +287,34 @@ void HardwareCodecDeflateQpl::flushAsynchronousDecompressRequests()
job_id = it->first;
job_ptr = it->second;
if (qpl_check_job(job_ptr) == QPL_STS_BEING_PROCESSED)
auto status = qpl_check_job(job_ptr);
if ((status == QPL_STS_BEING_PROCESSED) && (num_checks < MAX_CHECKS))
{
it++;
}
else
{
if (status != QPL_STS_OK)
{
sw_codec.doDecompressData(
reinterpret_cast<const char * >(job_ptr->next_in_ptr),
job_ptr->available_in,
reinterpret_cast<char *>(job_ptr->next_out_ptr),
job_ptr->available_out);
LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: flushAsynchronousDecompressRequests with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
}
it = decomp_async_job_map.erase(it);
DeflateQplJobHWPool::instance().releaseJob(job_id);
n_jobs_processing--;
if (n_jobs_processing <= 0)
break;
}
if (it == decomp_async_job_map.end())
{
it = decomp_async_job_map.begin();
_tpause(1, __rdtsc() + 1000);
++num_checks;
}
}
}
@ -364,8 +389,8 @@ void SoftwareCodecDeflateQpl::doDecompressData(const char * source, UInt32 sourc
}
CompressionCodecDeflateQpl::CompressionCodecDeflateQpl()
: hw_codec(std::make_unique<HardwareCodecDeflateQpl>())
, sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
: sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
, hw_codec(std::make_unique<HardwareCodecDeflateQpl>(*sw_codec))
{
setCodecDescription("DEFLATE_QPL");
}

View File

@ -65,8 +65,10 @@ class HardwareCodecDeflateQpl
public:
/// RET_ERROR stands for hardware codec fail, needs fallback to software codec.
static constexpr Int32 RET_ERROR = -1;
/// Maximum times to check if hardware job complete, otherwise fallback to software codec.
static constexpr UInt32 MAX_CHECKS = UINT16_MAX;
HardwareCodecDeflateQpl();
HardwareCodecDeflateQpl(SoftwareCodecDeflateQpl & sw_codec_);
~HardwareCodecDeflateQpl();
Int32 doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const;
@ -87,6 +89,8 @@ private:
/// For flush, pop out job ID && job object from this map. Use job ID to release job lock and use job object to check job status till complete.
std::map<UInt32, qpl_job *> decomp_async_job_map;
Poco::Logger * log;
/// Provides a fallback in case of errors.
SoftwareCodecDeflateQpl & sw_codec;
};
class CompressionCodecDeflateQpl final : public ICompressionCodec
@ -110,8 +114,8 @@ protected:
private:
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
std::unique_ptr<HardwareCodecDeflateQpl> hw_codec;
std::unique_ptr<SoftwareCodecDeflateQpl> sw_codec;
std::unique_ptr<HardwareCodecDeflateQpl> hw_codec;
};
}

View File

@ -101,6 +101,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo
auto client = S3::ClientFactory::instance().create(
client_configuration,
new_uri.is_virtual_hosted_style,
/* disable_checksum= */ false,
credentials.GetAWSAccessKeyId(),
credentials.GetAWSSecretKey(),
auth_settings.server_side_encryption_customer_key_base64,

157
src/Core/PlainRanges.cpp Normal file
View File

@ -0,0 +1,157 @@
#include <Core/PlainRanges.h>
namespace DB
{
/// Wrap a single range; a one-element series is trivially plain.
PlainRanges::PlainRanges(const Range & range)
{
    ranges.emplace_back(range);
}
/// Build a plain series from `ranges_`.
/// When intersections are impossible the input is taken as-is (the caller
/// guarantees it is already plain); otherwise it is normalized, sorting first
/// unless the caller promises the input is ordered.
PlainRanges::PlainRanges(const Ranges & ranges_, bool may_have_intersection, bool ordered)
{
    if (!may_have_intersection)
    {
        ranges = ranges_;
        return;
    }
    ranges = ordered ? makePlainFromOrdered(ranges_) : makePlainFromUnordered(ranges_);
}
/// Normalize an already left-ordered series: consecutive overlapping ranges
/// are folded into their union, producing a plain (intersection-free) series.
Ranges PlainRanges::makePlainFromOrdered(const Ranges & ranges_)
{
    if (ranges_.size() <= 1)
        return ranges_;

    Ranges merged;
    merged.push_back(ranges_.front());
    for (size_t pos = 1; pos < ranges_.size(); ++pos)
    {
        const Range & next = ranges_[pos];
        if (merged.back().intersectsRange(next))
            merged.back() = *merged.back().unionWith(next);
        else
            merged.push_back(next);
    }
    return merged;
}
/// Normalize an arbitrary series: sort by left bound, then fold overlaps
/// exactly as in the ordered case.
Ranges PlainRanges::makePlainFromUnordered(Ranges ranges_)
{
    if (ranges_.size() > 1)
    {
        std::sort(ranges_.begin(), ranges_.end(), compareByLeftBound);
        return makePlainFromOrdered(ranges_);
    }
    return ranges_;
}
/// Merge-union of two plain series (both ordered and intersection-free).
/// Walks both lists like a merge sort: a range strictly left of the other side is
/// copied through; overlapping ranges are replaced by their union, and the side
/// whose range ends first advances.
PlainRanges PlainRanges::unionWith(const PlainRanges & other)
{
    Ranges merged;
    auto lhs = ranges.begin();
    auto rhs = other.ranges.begin();

    while (lhs != ranges.end() && rhs != other.ranges.end())
    {
        if (lhs->leftThan(*rhs))
            merged.push_back(*lhs++);
        else if (lhs->rightThan(*rhs))
            merged.push_back(*rhs++);
        else
        {
            /// Overlap: emit the union, then advance whichever range finishes first.
            merged.emplace_back(*(lhs->unionWith(*rhs)));
            if (compareByRightBound(*lhs, *rhs))
                ++lhs;
            else
                ++rhs;
        }
    }

    /// Copy whatever remains on either side.
    merged.insert(merged.end(), lhs, ranges.end());
    merged.insert(merged.end(), rhs, other.ranges.end());

    /// Unions of adjacent results may themselves overlap (e.g. [1, 4], [2, 5]),
    /// so normalize once more before wrapping.
    return PlainRanges(makePlainFromOrdered(merged));
}
/// Merge-intersection of two plain series (both ordered and intersection-free).
/// Ranges with no counterpart on the other side are dropped; overlapping pairs
/// contribute their intersection, and the side whose range ends first advances.
PlainRanges PlainRanges::intersectWith(const PlainRanges & other)
{
    Ranges intersection;
    auto lhs = ranges.begin();
    auto rhs = other.ranges.begin();

    while (lhs != ranges.end() && rhs != other.ranges.end())
    {
        if (lhs->leftThan(*rhs))
            ++lhs;
        else if (lhs->rightThan(*rhs))
            ++rhs;
        else
        {
            /// Overlap: keep the (possibly blank) intersection, skipping blanks.
            if (auto overlapped = lhs->intersectWith(*rhs))
                intersection.emplace_back(*overlapped);
            if (compareByRightBound(*lhs, *rhs))
                ++lhs;
            else
                ++rhs;
        }
    }
    return PlainRanges(intersection);
}
/// Strict-weak ordering of ranges by left bound.
/// Two left-unbounded ranges compare equal. At an equal left point, a range with an
/// excluded bound orders before one with an included bound.
/// NOTE(review): confirm that direction is intended — an included bound starts earlier.
bool PlainRanges::compareByLeftBound(const Range & lhs, const Range & rhs)
{
    if (lhs.left == NEGATIVE_INFINITY && rhs.left == NEGATIVE_INFINITY)
        return false;
    return Range::less(lhs.left, rhs.left) || ((!lhs.left_included && rhs.left_included) && Range::equals(lhs.left, rhs.left));
} /// (removed stray trailing semicolon after the function body)
/// Strict-weak ordering of ranges by right bound, mirroring `compareByLeftBound`.
/// Two right-unbounded ranges compare equal. At an equal right point, a range with an
/// excluded bound orders before one with an included bound.
bool PlainRanges::compareByRightBound(const Range & lhs, const Range & rhs)
{
    if (lhs.right == POSITIVE_INFINITY && rhs.right == POSITIVE_INFINITY)
        return false;
    return Range::less(lhs.right, rhs.right) || ((!lhs.right_included && rhs.right_included) && Range::equals(lhs.right, rhs.right));
} /// (removed stray trailing semicolon after the function body)
std::vector<Ranges> PlainRanges::invert(const Ranges & to_invert_ranges)
{
/// invert a blank ranges
if (to_invert_ranges.empty())
return {makeUniverse().ranges};
std::vector<Ranges> reverted_ranges;
for (const auto & range : to_invert_ranges)
{
if (range.isInfinite())
/// return a blank ranges
return {{}};
reverted_ranges.push_back(range.invertRange());
}
return reverted_ranges;
};
}

46
src/Core/PlainRanges.h Normal file
View File

@ -0,0 +1,46 @@
#pragma once
#include <Core/Range.h>
namespace DB
{
/** A plain ranges is a series of ranges who
  * 1. have no intersection in any two of the ranges
  * 2. ordered by left side
  * 3. does not contain blank range
  *
  * Example:
  * query: (k > 1 and key < 5) or (k > 3 and k < 10) or key in (2, 12)
  * original ranges: (1, 5), (3, 10), [2, 2], [12, 12]
  * plain ranges: (1, 10), [12, 12]
  *
  * If it is blank, ranges is empty.
  */
struct PlainRanges
{
    Ranges ranges;

    explicit PlainRanges(const Range & range);

    /// `may_have_intersection = false` means the caller guarantees the input is already plain;
    /// `ordered = true` means it is sorted by left bound (skips the sort during normalization).
    explicit PlainRanges(const Ranges & ranges_, bool may_have_intersection = false, bool ordered = true);

    /// Merge-union / merge-intersection with another plain series; both inputs stay plain.
    PlainRanges unionWith(const PlainRanges & other);
    PlainRanges intersectWith(const PlainRanges & other);

    /// Union ranges and return a new plain(ordered and no intersection) ranges.
    /// Example:
    ///     [1, 3], [2, 4], [6, 8] -> [1, 4], [6, 8]
    ///     [1, 3], [2, 4], (4, 5] -> [1, 4], [5, 5]
    static Ranges makePlainFromUnordered(Ranges ranges_);
    static Ranges makePlainFromOrdered(const Ranges & ranges_);

    /// Strict-weak orderings by left/right bound used during normalization.
    static bool compareByLeftBound(const Range & lhs, const Range & rhs);
    static bool compareByRightBound(const Range & lhs, const Range & rhs);

    /// Invert each range independently: one complement Ranges per input range.
    static std::vector<Ranges> invert(const Ranges & to_invert_ranges);

    static PlainRanges makeBlank() { return PlainRanges({}); }
    static PlainRanges makeUniverse() { return PlainRanges({Range::createWholeUniverseWithoutNull()}); }
};
}

View File

@ -123,6 +123,27 @@ bool Range::leftThan(const FieldRef & x) const
return less(x, right) || (right_included && equals(x, right));
}
/// True if this range lies entirely to the right of `x` (no common point).
bool Range::rightThan(const Range & x) const
{
    if (less(x.right, left))
        return true;
    /// Touching endpoints share a point only when both bounds are included.
    return equals(left, x.right) && (!left_included || !x.right_included);
}
/// True if this range lies entirely to the left of `x` (no common point).
bool Range::leftThan(const Range & x) const
{
    if (less(right, x.left))
        return true;
    /// Touching endpoints share a point only when both bounds are included.
    return equals(right, x.left) && (!x.left_included || !right_included);
}
/// True for a range bounded on both sides, like [1, 3]
/// (a Null-typed bound represents an infinity).
bool Range::fullBounded() const
{
    const bool left_is_finite = left.getType() != Field::Types::Null;
    const bool right_is_finite = right.getType() != Field::Types::Null;
    return left_is_finite && right_is_finite;
}
/// (-inf, +inf)
/// True only for the whole line (-inf, +inf).
bool Range::isInfinite() const
{
    if (!left.isNegativeInfinity())
        return false;
    return right.isPositiveInfinity();
}
bool Range::intersectsRange(const Range & r) const
{
/// r to the left of me.
@ -159,6 +180,95 @@ void Range::invert()
std::swap(left_included, right_included);
}
/// Complement of the range on the whole line.
/// A fully bounded range inverts into two pieces; a half-line into one;
/// (-inf, +inf) into none. Bound inclusiveness is flipped at each cut point.
Ranges Range::invertRange() const
{
    Ranges result;

    if (isInfinite())
    {
        /// The complement of (-inf, +inf) is empty.
        return result;
    }

    if (fullBounded())
    {
        /// [a, b] -> (-inf, a), (b, +inf)
        result.push_back({NEGATIVE_INFINITY, false, left, !left_included});
        result.push_back({right, !right_included, POSITIVE_INFINITY, false});
        return result;
    }

    /// Exactly one side is unbounded: the complement is a single half-line.
    if (right.isPositiveInfinity())
    {
        /// [a, +inf) -> (-inf, a); (a, +inf) -> (-inf, a]
        result.push_back({NEGATIVE_INFINITY, false, left, !left_included});
    }
    else if (left.isNegativeInfinity())
    {
        /// (-inf, b] -> (b, +inf); (-inf, b) -> [b, +inf)
        result.push_back({right, !right_included, POSITIVE_INFINITY, false});
    }
    else
    {
        /// Degenerate bounds (e.g. a non-infinity Null): preserve the
        /// original fall-through behavior, which emits the swapped bounds.
        Range swapped = *this;
        std::swap(swapped.left, swapped.right);
        result.push_back(swapped);
    }
    return result;
}
/// Intersection of two ranges; std::nullopt if they do not intersect.
/// The intersection takes the tighter (larger) left bound and the
/// tighter (smaller) right bound of the two ranges.
std::optional<Range> Range::intersectWith(const Range & r) const
{
    if (!intersectsRange(r))
        return {};

    bool left_bound_use_mine = true;
    bool right_bound_use_mine = true;

    /// r's left bound is tighter if it is greater, or equal but excluded while ours is included.
    /// (Fixes an inverted tie-break: the original tested `!left_included && r.left_included`,
    /// so intersect((1, 5], [1, 3]) wrongly produced [1, 3] instead of (1, 3].)
    if (less(left, r.left) || ((left_included && !r.left_included) && equals(left, r.left)))
        left_bound_use_mine = false;

    /// r's right bound is tighter if it is smaller, or equal but excluded while ours is included.
    if (less(r.right, right) || ((!r.right_included && right_included) && equals(r.right, right)))
        right_bound_use_mine = false;

    return Range(
        left_bound_use_mine ? left : r.left,
        left_bound_use_mine ? left_included : r.left_included,
        right_bound_use_mine ? right : r.right,
        right_bound_use_mine ? right_included : r.right_included);
}
/// Union of two ranges as a single range; std::nullopt when they neither
/// intersect nor touch (the union would not be contiguous).
/// The union takes the wider (smaller) left bound and the wider (larger)
/// right bound of the two ranges.
std::optional<Range> Range::unionWith(const Range & r) const
{
    if (!intersectsRange(r) && !nearByWith(r))
        return {};

    bool left_bound_use_mine = false;
    bool right_bound_use_mine = false;

    /// Our left bound is wider if it is smaller, or equal but included while r's is excluded.
    /// (Fixes an inverted tie-break: the original tested `!left_included && r.left_included`,
    /// so union([1, 5], (1, 6)) wrongly dropped the point 1 and produced (1, 6).)
    if (less(left, r.left) || ((left_included && !r.left_included) && equals(left, r.left)))
        left_bound_use_mine = true;

    /// Our right bound is wider if it is greater, or equal but included while r's is excluded.
    if (less(r.right, right) || ((!r.right_included && right_included) && equals(r.right, right)))
        right_bound_use_mine = true;

    return Range(
        left_bound_use_mine ? left : r.left,
        left_bound_use_mine ? left_included : r.left_included,
        right_bound_use_mine ? right : r.right,
        right_bound_use_mine ? right_included : r.right_included);
}
/// True if the two ranges touch at a single point so that their union is one
/// contiguous range — i.e. the touching bounds are equal and exactly one of
/// them is included (both included would intersect; both excluded leaves a gap).
bool Range::nearByWith(const Range & r) const
{
    /// This range is on the left: its right endpoint touches r's left endpoint.
    if ((right_included != r.left_included) && equals(right, r.left))
        return true;

    /// r is on the left: its right endpoint touches our left endpoint.
    /// (Fixes a copy-paste bug: the original tested `r.right_included && !left_included`
    /// on both sides of `||`, so the `!r.right_included && left_included` case —
    /// e.g. (0, 1) adjacent to [1, 2] — was never accepted.)
    if ((r.right_included != left_included) && equals(r.right, left))
        return true;

    return false;
}
Range intersect(const Range & a, const Range & b)
{
Range res = Range::createWholeUniverse();

View File

@ -38,6 +38,13 @@ struct FieldRef : public Field
size_t column_idx = 0;
};
/** Range with open or closed ends; possibly unbounded.
*/
struct Range;
/** A series of ranges which may or may not overlap one another.
 */
using Ranges = std::vector<Range>;
/** Range with open or closed ends; possibly unbounded.
*/
struct Range
@ -79,12 +86,37 @@ public:
/// x is to the right
bool leftThan(const FieldRef & x) const;
/// completely right than x
bool rightThan(const Range & x) const;
/// completely left than x
bool leftThan(const Range & x) const;
/// range like [1, 2]
bool fullBounded() const;
/// (-inf, +inf)
bool isInfinite() const;
bool isBlank() const;
bool intersectsRange(const Range & r) const;
bool containsRange(const Range & r) const;
/// Invert left and right
void invert();
/// Invert the range.
/// Example:
/// [1, 3] -> (-inf, 1), (3, +inf)
Ranges invertRange() const;
std::optional<Range> intersectWith(const Range & r) const;
std::optional<Range> unionWith(const Range & r) const;
/// If near by r, they can be combined to a continuous range.
/// TODO If field is integer, case like [2, 3], [4, 5] is excluded.
bool nearByWith(const Range & r) const;
String toString() const;
};

View File

@ -92,14 +92,15 @@ namespace DB
M(UInt64, background_schedule_pool_size, 512, "The maximum number of threads that will be used for constantly executing some lightweight periodic operations.", 0) \
M(UInt64, background_message_broker_schedule_pool_size, 16, "The maximum number of threads that will be used for executing background operations for message streaming.", 0) \
M(UInt64, background_distributed_schedule_pool_size, 16, "The maximum number of threads that will be used for executing distributed sends.", 0) \
M(UInt64, tables_loader_foreground_pool_size, 0, "The maximum number of threads that will be used for foreground (that is being waited for by a query) loading of tables. Also used for synchronous loading of tables before the server start. Zero means use all CPUs.", 0) \
M(UInt64, tables_loader_background_pool_size, 0, "The maximum number of threads that will be used for background async loading of tables. Zero means use all CPUs.", 0) \
M(Bool, async_load_databases, false, "Enable asynchronous loading of databases and tables to speedup server startup. Queries to not yet loaded entity will be blocked until load is finished.", 0) \
M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \
\
M(UInt64, total_memory_profiler_step, 0, "Whenever server memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down server.", 0) \
M(Double, total_memory_tracker_sample_probability, 0, "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \
M(UInt64, total_memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater or equal than specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold to work as expected.", 0) \
M(UInt64, total_memory_profiler_sample_max_allocation_size, 0, "Collect random allocations of size less or equal than specified value with probability equal to `total_memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold to work as expected.", 0) \
M(String, get_client_http_header_forbidden_headers, "", "Comma separated list of http header names that will not be returned by function getClientHTTPHeader.", 0) \
M(Bool, allow_get_client_http_header, false, "Allow function getClientHTTPHeader", 0) \
M(Bool, validate_tcp_client_information, false, "Validate client_information in the query packet over the native TCP protocol.", 0) \
M(Bool, storage_metadata_write_full_object_key, false, "Write disk metadata files with VERSION_FULL_OBJECT_KEY format", 0) \

View File

@ -104,9 +104,10 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \
M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(UInt64, s3_request_timeout_ms, 30000, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \
M(UInt64, s3_http_connection_pool_size, 1000, "How many reusable open connections to keep per S3 endpoint. Only applies to the S3 table engine and table function, not to S3 disks (for disks, use disk config instead). Global setting, can only be set in config, overriding it per session or per query has no effect.", 0) \
M(UInt64, s3_http_connection_pool_size, 1000, "How many reusable open connections to keep per S3 endpoint. This only applies to the S3 table engine and table function, not to S3 disks (for disks, use disk config instead). Global setting, can only be set in config, overriding it per session or per query has no effect.", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \
M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \
@ -122,10 +123,10 @@ class IColumn;
M(UInt64, max_remote_write_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for write.", 0) \
M(UInt64, max_local_read_bandwidth, 0, "The maximum speed of local reads in bytes per second.", 0) \
M(UInt64, max_local_write_bandwidth, 0, "The maximum speed of local writes in bytes per second.", 0) \
M(Bool, stream_like_engine_allow_direct_select, false, "Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled.", 0) \
M(Bool, stream_like_engine_allow_direct_select, false, "Allow direct SELECT query for Kafka, RabbitMQ, FileLog, Redis Streams, and NATS engines. In case there are attached materialized views, SELECT query is not allowed even if this setting is enabled.", 0) \
M(String, stream_like_engine_insert_queue, "", "When stream like engine reads from multiple queues, user will need to select one queue to insert into when writing. Used by Redis Streams and NATS.", 0) \
\
M(Bool, distributed_foreground_insert, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster. \n\nEnables or disables synchronous data insertion into a `Distributed` table.\n\nBy default, when inserting data into a Distributed table, the ClickHouse server sends data to cluster nodes in background. When `distributed_foreground_insert` = 1, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true).", 0) ALIAS(insert_distributed_sync) \
M(Bool, distributed_foreground_insert, false, "If setting is enabled, insert query into distributed waits until data are sent to all nodes in a cluster. \n\nEnables or disables synchronous data insertion into a `Distributed` table.\n\nBy default, when inserting data into a Distributed table, the ClickHouse server sends data to cluster nodes in the background. When `distributed_foreground_insert` = 1, the data is processed synchronously, and the `INSERT` operation succeeds only after all the data is saved on all shards (at least one replica for each shard if `internal_replication` is true).", 0) ALIAS(insert_distributed_sync) \
M(UInt64, distributed_background_insert_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) ALIAS(insert_distributed_timeout) \
M(Milliseconds, distributed_background_insert_sleep_time_ms, 100, "Sleep time for background INSERTs into Distributed, in case of any errors delay grows exponentially.", 0) ALIAS(distributed_directory_monitor_sleep_time_ms) \
M(Milliseconds, distributed_background_insert_max_sleep_time_ms, 30000, "Maximum sleep time for background INSERTs into Distributed, it limits exponential growth too.", 0) ALIAS(distributed_directory_monitor_max_sleep_time_ms) \
@ -575,7 +576,6 @@ class IColumn;
M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0) \
M(Bool, optimize_append_index, false, "Use constraints in order to append index condition (indexHint)", 0) \
M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \
M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \
M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \
M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \
M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \
@ -750,7 +750,7 @@ class IColumn;
M(UInt64, prefetch_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the prefetch buffer to read from the filesystem.", 0) \
M(UInt64, filesystem_prefetch_step_bytes, 0, "Prefetch step in bytes. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task", 0) \
M(UInt64, filesystem_prefetch_step_marks, 0, "Prefetch step in marks. Zero means `auto` - approximately the best prefetch step will be auto deduced, but might not be 100% the best. The actual value might be different because of setting filesystem_prefetch_min_bytes_for_single_read_task", 0) \
M(UInt64, filesystem_prefetch_min_bytes_for_single_read_task, "8Mi", "Do not parallelize within one file read less than this amount of bytes. E.g. one reader will not receive a read task of size less than this amount. This setting is recommended to avoid spikes of time for aws getObject requests to aws", 0) \
M(UInt64, filesystem_prefetch_min_bytes_for_single_read_task, "2Mi", "Do not parallelize within one file read less than this amount of bytes. E.g. one reader will not receive a read task of size less than this amount. This setting is recommended to avoid spikes of time for aws getObject requests to aws", 0) \
M(UInt64, filesystem_prefetch_max_memory_usage, "1Gi", "Maximum memory usage for prefetches.", 0) \
M(UInt64, filesystem_prefetches_limit, 200, "Maximum number of prefetches. Zero means unlimited. A setting `filesystem_prefetches_max_memory_usage` is more recommended if you want to limit the number of prefetches", 0) \
\
@ -856,6 +856,7 @@ class IColumn;
MAKE_OBSOLETE(M, Bool, allow_experimental_window_functions, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_alter_materialized_view_structure, true) \
\
MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \
MAKE_OBSOLETE(M, StreamingHandleErrorMode, handle_kafka_error_mode, StreamingHandleErrorMode::DEFAULT) \

Some files were not shown because too many files have changed in this diff Show More