Merge branch 'master' into feature/more_warnings

2024-11-22 07:31:57 +00:00 · 2023-12-05 11:50:05 +08:00 · 2023-12-05 11:50:05 +08:00 · 9e05e79d66
commit 9e05e79d66
parent 6794bbe196 ab80b2e8e2
360 changed files with 8399 additions and 2938 deletions
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -532,6 +532,11 @@ jobs:
      run_command: |
        cd "$REPO_COPY/tests/ci"

+        mkdir -p "${REPORTS_PATH}/integration"
+        mkdir -p "${REPORTS_PATH}/stateless"
+        cp -r ${REPORTS_PATH}/changed_images* ${REPORTS_PATH}/integration
+        cp -r ${REPORTS_PATH}/changed_images* ${REPORTS_PATH}/stateless
+
        TEMP_PATH="${TEMP_PATH}/integration" \
          REPORTS_PATH="${REPORTS_PATH}/integration" \
          python3 integration_test_check.py "Integration $CHECK_NAME" \
--- a/.gitmodules
+++ b/.gitmodules
@ -357,3 +357,6 @@
 [submodule "contrib/pocketfft"]
 	path = contrib/pocketfft
 	url = https://github.com/mreineck/pocketfft.git
+[submodule "contrib/sqids-cpp"]
+	path = contrib/sqids-cpp
+	url = https://github.com/sqids/sqids-cpp.git
--- a/README.md
+++ b/README.md
@ -35,6 +35,7 @@ curl https://clickhouse.com/ | sh

 * [**ClickHouse Meetup in Berlin**](https://www.meetup.com/clickhouse-berlin-user-group/events/296488501/) - Nov 30
 * [**ClickHouse Meetup in NYC**](https://www.meetup.com/clickhouse-new-york-user-group/events/296488779/) - Dec 11
+* [**ClickHouse Meetup in Sydney**](https://www.meetup.com/clickhouse-sydney-user-group/events/297638812/) - Dec 12
 * [**ClickHouse Meetup in Boston**](https://www.meetup.com/clickhouse-boston-user-group/events/296488840/) - Dec 12

 Also, keep an eye out for upcoming meetups around the world. Somewhere else you want us to be? Please feel free to reach out to tyler <at> clickhouse <dot> com.
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -156,6 +156,7 @@ add_contrib (nuraft-cmake NuRaft)
 add_contrib (fast_float-cmake fast_float)
 add_contrib (datasketches-cpp-cmake datasketches-cpp)
 add_contrib (incbin-cmake incbin)
+add_contrib (sqids-cpp-cmake sqids-cpp)

 option(ENABLE_NLP "Enable NLP functions support" ${ENABLE_LIBRARIES})
 if (ENABLE_NLP)
--- a/contrib/libcxxabi-cmake/CMakeLists.txt
+++ b/contrib/libcxxabi-cmake/CMakeLists.txt
@ -33,7 +33,7 @@ target_include_directories(cxxabi SYSTEM BEFORE
    PRIVATE $<BUILD_INTERFACE:${LIBCXXABI_SOURCE_DIR}/../libcxx/include>
    PRIVATE $<BUILD_INTERFACE:${LIBCXXABI_SOURCE_DIR}/../libcxx/src>
 )
-target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY)
+target_compile_definitions(cxxabi PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DHAS_THREAD_LOCAL)
 target_compile_options(cxxabi PRIVATE -nostdinc++ -fno-sanitize=undefined -Wno-macro-redefined) # If we don't disable UBSan, infinite recursion happens in dynamic_cast.
 target_link_libraries(cxxabi PUBLIC unwind)

--- a/contrib/qpl
+++ b/contrib/qpl
@ -1 +1 @@
-Subproject commit faaf19350459c076e66bb5df11743c3fade59b73
+Subproject commit a61bdd845fd7ca363b2bcc55454aa520dfcd8298
--- a/contrib/sqids-cpp
+++ b/contrib/sqids-cpp
@ -0,0 +1 @@
+Subproject commit 3756e537d4d48cc0dd4176801fe19f99601439b0
--- a/contrib/sqids-cpp-cmake/CMakeLists.txt
+++ b/contrib/sqids-cpp-cmake/CMakeLists.txt
@ -0,0 +1,14 @@
+# sqids-cpp is exposed as the INTERFACE target ch_contrib::sqids; no sources are compiled by this file.
+option (ENABLE_SQIDS "Enable sqids support" ${ENABLE_LIBRARIES})
+if (NOT ENABLE_SQIDS)
+    message (STATUS "Not using sqids")
+    return ()
+endif ()
+
+set (SQIDS_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/sqids-cpp")
+set (SQIDS_INCLUDE_DIR "${SQIDS_SOURCE_DIR}/include")
+
+add_library (_sqids INTERFACE)
+target_compile_definitions (_sqids INTERFACE ENABLE_SQIDS) # inherited by every target that links the library
+target_include_directories (_sqids SYSTEM INTERFACE ${SQIDS_INCLUDE_DIR})
+add_library (ch_contrib::sqids ALIAS _sqids)
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -19,10 +19,14 @@ dpkg -i package_folder/clickhouse-common-static-dbg_*.deb
 dpkg -i package_folder/clickhouse-server_*.deb
 dpkg -i package_folder/clickhouse-client_*.deb

+echo "$BUGFIX_VALIDATE_CHECK"
+
 # Check that the tools are available under short names
-ch --query "SELECT 1" || exit 1
-chl --query "SELECT 1" || exit 1
-chc --version || exit 1
+if [[ -z "$BUGFIX_VALIDATE_CHECK" ]]; then
+    ch --query "SELECT 1" || exit 1
+    chl --query "SELECT 1" || exit 1
+    chc --version || exit 1
+fi

 ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test

@ -46,6 +50,16 @@ fi

 config_logs_export_cluster /etc/clickhouse-server/config.d/system_logs_export.yaml

+if [[ -n "$BUGFIX_VALIDATE_CHECK" ]] && [[ "$BUGFIX_VALIDATE_CHECK" -eq 1 ]]; then
+    sudo cat /etc/clickhouse-server/config.d/zookeeper.xml \
+    | sed "/<use_compression>1<\/use_compression>/d" \
+    > /etc/clickhouse-server/config.d/zookeeper.xml.tmp
+    sudo mv /etc/clickhouse-server/config.d/zookeeper.xml.tmp /etc/clickhouse-server/config.d/zookeeper.xml
+
+    # it contains some new settings, but we can safely remove it
+    rm /etc/clickhouse-server/users.d/s3_cache_new.xml
+fi
+
 # For flaky check we also enable thread fuzzer
 if [ "$NUM_TRIES" -gt "1" ]; then
    export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -191,6 +191,12 @@ sudo cat /etc/clickhouse-server/config.d/logger_trace.xml \
   > /etc/clickhouse-server/config.d/logger_trace.xml.tmp
 mv /etc/clickhouse-server/config.d/logger_trace.xml.tmp /etc/clickhouse-server/config.d/logger_trace.xml

+# Randomize async_load_databases
+if [ $(( $(date +%-d) % 2 )) -eq 1 ]; then
+    sudo echo "<clickhouse><async_load_databases>true</async_load_databases></clickhouse>" \
+        > /etc/clickhouse-server/config.d/enable_async_load_databases.xml
+fi
+
 start

 stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" --global-time-limit 1200 \
--- a/docker/test/upgrade/run.sh
+++ b/docker/test/upgrade/run.sh
@ -79,6 +79,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml
 rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
 rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
 rm /etc/clickhouse-server/users.d/s3_cache_new.xml
+rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml

 start
 stop
@ -116,6 +117,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml
 rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
 rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
 rm /etc/clickhouse-server/users.d/s3_cache_new.xml
+rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml

 start

--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@ -56,7 +56,7 @@ On Linux, macOS and FreeBSD:
  ./clickhouse client
  ClickHouse client version 23.2.1.1501 (official build).
  Connecting to localhost:9000 as user default.
-  Connected to ClickHouse server version 23.2.1 revision 54461.
+  Connected to ClickHouse server version 23.2.1.

  local-host :)
  ```
--- a/docs/en/interfaces/cli.md
+++ b/docs/en/interfaces/cli.md
@ -16,7 +16,7 @@ ClickHouse provides a native command-line client: `clickhouse-client`. The clien
 $ clickhouse-client
 ClickHouse client version 20.13.1.5273 (official build).
 Connecting to localhost:9000 as user default.
-Connected to ClickHouse server version 20.13.1 revision 54442.
+Connected to ClickHouse server version 20.13.1.

 :)
 ```
--- a/docs/en/operations/optimizing-performance/profile-guided-optimization.md
+++ b/docs/en/operations/optimizing-performance/profile-guided-optimization.md
@ -16,9 +16,9 @@ More information about PGO in ClickHouse you can read in the corresponding GitHu

 There are two major kinds of PGO: [Instrumentation](https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers) and [Sampling](https://clang.llvm.org/docs/UsersManual.html#using-sampling-profilers) (also known as AutoFDO). In this guide is described the Instrumentation PGO with ClickHouse.

-1. Build ClickHouse in Instrumented mode. In Clang it can be done via passing `-fprofile-instr-generate` option to `CXXFLAGS`.
+1. Build ClickHouse in Instrumented mode. In Clang it can be done via passing `-fprofile-generate` option to `CXXFLAGS`.
 2. Run instrumented ClickHouse on a sample workload. Here you need to use your usual workload. One of the approaches could be using [ClickBench](https://github.com/ClickHouse/ClickBench) as a sample workload. ClickHouse in the instrumentation mode could work slowly so be ready for that and do not run instrumented ClickHouse in performance-critical environments.
-3. Recompile ClickHouse once again with `-fprofile-instr-use` compiler flags and profiles that are collected from the previous step.
+3. Recompile ClickHouse once again with `-fprofile-use` compiler flags and profiles that are collected from the previous step.

 A more detailed guide on how to apply PGO is in the Clang [documentation](https://clang.llvm.org/docs/UsersManual.html#profile-guided-optimization).

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -1679,6 +1679,45 @@ Default value: `0.5`.



+## async_load_databases {#async_load_databases}
+
+Asynchronous loading of databases and tables.
+
+If `true` all non-system databases with `Ordinary`, `Atomic` and `Replicated` engine will be loaded asynchronously after the ClickHouse server start up. See `system.async_loader` table, `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a table, that is not yet loaded, will wait for exactly this table to be started up. If load job fails, query will rethrow an error (instead of shutting down the whole server in case of `async_load_databases = false`). The table that is waited for by at least one query will be loaded with higher priority. DDL queries on a database will wait for exactly that database to be started up.
+
+If `false`, all databases are loaded when the server starts.
+
+The default is `false`.
+
+**Example**
+
+``` xml
+<async_load_databases>true</async_load_databases>
+```
+
+## tables_loader_foreground_pool_size {#tables_loader_foreground_pool_size}
+
+Sets the number of threads performing load jobs in foreground pool. The foreground pool is used for loading table synchronously before server start listening on a port and for loading tables that are waited for. Foreground pool has higher priority than background pool. It means that no job starts in background pool while there are jobs running in foreground pool.
+
+Possible values:
+
+-   Any positive integer.
+-   Zero. Use all available CPUs.
+
+Default value: 0.
+
+
+## tables_loader_background_pool_size {#tables_loader_background_pool_size}
+
+Sets the number of threads performing asynchronous load jobs in background pool. The background pool is used for loading tables asynchronously after server start in case there are no queries waiting for the table. It could be beneficial to keep low number of threads in background pool if there are a lot of tables. It will reserve CPU resources for concurrent query execution.
+
+Possible values:
+
+-   Any positive integer.
+-   Zero. Use all available CPUs.
+
+Default value: 0.
+

 ## merge_tree {#merge_tree}

@ -2385,7 +2424,7 @@ Path on the local filesystem to store temporary data for processing large querie

 ## user_files_path {#user_files_path}

-The directory with user files. Used in the table function [file()](../../sql-reference/table-functions/file.md).
+The directory with user files. Used in the table functions [file()](../../sql-reference/table-functions/file.md) and [fileCluster()](../../sql-reference/table-functions/fileCluster.md).

 **Example**

--- a/docs/en/operations/settings/merge-tree-settings.md
+++ b/docs/en/operations/settings/merge-tree-settings.md
@ -149,7 +149,7 @@ Possible values:
 - Any positive integer.
 - 0 (disable deduplication)

-Default value: 100.
+Default value: 1000.

 The `Insert` command creates one or more blocks (parts). For [insert deduplication](../../engines/table-engines/mergetree-family/replication.md), when writing into replicated tables, ClickHouse writes the hash sums of the created parts into ClickHouse Keeper. Hash sums are stored only for the most recent `replicated_deduplication_window` blocks. The oldest hash sums are removed from ClickHouse Keeper.
 A large number of `replicated_deduplication_window` slows down `Inserts` because it needs to compare more entries.
--- a/docs/en/operations/system-tables/async_loader.md
+++ b/docs/en/operations/system-tables/async_loader.md
@ -0,0 +1,54 @@
+---
+slug: /en/operations/system-tables/async_loader
+---
+# async_loader
+
+Contains information and status for recent asynchronous jobs (e.g. for tables loading). The table contains a row for every job. There is a tool for visualizing information from this table `utils/async_loader_graph`.
+
+Example:
+
+``` sql
+SELECT *
+FROM system.async_loader
+FORMAT Vertical
+LIMIT 1
+```
+
+``` text
+```
+
+Columns:
+
+- `job` (`String`) - Job name (may be not unique).
+- `job_id` (`UInt64`) - Unique ID of the job.
+- `dependencies` (`Array(UInt64)`) - List of IDs of jobs that should be done before this job.
+- `dependencies_left` (`UInt64`) - Current number of dependencies left to be done.
+- `status` (`Enum`) - Current load status of a job:
+    `PENDING`:  Load job is not started yet.
+    `OK`: Load job executed and was successful.
+    `FAILED`: Load job executed and failed.
+    `CANCELED`: Load job is not going to be executed due to removal or dependency failure.
+
+A pending job might be in one of the following states:
+- `is_executing` (`UInt8`) - The job is currently being executed by a worker.
+- `is_blocked` (`UInt8`) - The job waits for its dependencies to be done.
+- `is_ready` (`UInt8`) - The job is ready to be executed and waits for a worker.
+- `elapsed` (`Float64`) - Seconds elapsed since start of execution. Zero if job is not started. Total execution time if job finished.
+
+Every job has a pool associated with it and is started in this pool. Each pool has a constant priority and a mutable maximum number of workers. Higher priority (lower `priority` value) jobs are run first. No job with lower priority is started while there is at least one higher priority job ready or executing. Job priority can be elevated (but cannot be lowered) by prioritizing it. For example, jobs for loading and starting up a table will be prioritized if an incoming query requires this table. It is possible to prioritize a job during its execution, but the job is not moved from its `execution_pool` to the newly assigned `pool`. The job uses `pool` for creating new jobs to avoid priority inversion. Already started jobs are not preempted by higher priority jobs and always run to completion after start.
+- `pool_id` (`UInt64`) - ID of a pool currently assigned to the job.
+- `pool` (`String`) - Name of `pool_id` pool.
+- `priority` (`Int64`) - Priority of `pool_id` pool.
+- `execution_pool_id` (`UInt64`) - ID of a pool the job is executed in. Equals initially assigned pool before execution starts.
+- `execution_pool` (`String`) - Name of `execution_pool_id` pool.
+- `execution_priority` (`Int64`) - Priority of `execution_pool_id` pool.
+
+- `ready_seqno` (`Nullable(UInt64)`) - Not null for ready jobs. Worker pulls the next job to be executed from a ready queue of its pool. If there are multiple ready jobs, then job with the lowest value of `ready_seqno` is picked.
+- `waiters` (`UInt64`) - The number of threads waiting on this job.
+- `exception` (`Nullable(String)`) - Not null for failed and canceled jobs. Holds error message raised during query execution or error leading to cancelling of this job along with dependency failure chain of job names.
+
+Time instants during job lifetime:
+- `schedule_time` (`DateTime64`) - Time when job was created and scheduled to be executed (usually with all its dependencies).
+- `enqueue_time` (`Nullable(DateTime64)`) - Time when job became ready and was enqueued into a ready queue of its pool. Null if the job is not ready yet.
+- `start_time` (`Nullable(DateTime64)`) - Time when worker dequeues the job from ready queue and starts its execution. Null if the job is not started yet.
+- `finish_time` (`Nullable(DateTime64)`) - Time when job execution is finished. Null if the job is not finished yet.
--- a/docs/en/operations/system-tables/metrics.md
+++ b/docs/en/operations/system-tables/metrics.md
@ -45,6 +45,22 @@ Number of threads in the Aggregator thread pool.

 Number of threads in the Aggregator thread pool running a task.

+### TablesLoaderForegroundThreads
+
+Number of threads in the async loader foreground thread pool.
+
+### TablesLoaderForegroundThreadsActive
+
+Number of threads in the async loader foreground thread pool running a task.
+
+### TablesLoaderBackgroundThreads
+
+Number of threads in the async loader background thread pool.
+
+### TablesLoaderBackgroundThreadsActive
+
+Number of threads in the async loader background thread pool running a task.
+
 ### AsyncInsertCacheSize

 Number of async insert hash id in cache
@ -197,14 +213,6 @@ Number of threads in the DatabaseOnDisk thread pool.

 Number of threads in the DatabaseOnDisk thread pool running a task.

-### DatabaseOrdinaryThreads
-
-Number of threads in the Ordinary database thread pool.
-
-### DatabaseOrdinaryThreadsActive
-
-Number of threads in the Ordinary database thread pool running a task.
-
 ### DelayedInserts

 Number of INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree table.
@ -625,14 +633,6 @@ Number of connections that are sending data for external tables to remote server

 Number of connections that are sending data for scalars to remote servers.

-### StartupSystemTablesThreads
-
-Number of threads in the StartupSystemTables thread pool.
-
-### StartupSystemTablesThreadsActive
-
-Number of threads in the StartupSystemTables thread pool running a task.
-
 ### StorageBufferBytes

 Number of bytes in buffers of Buffer tables
@ -677,14 +677,6 @@ Number of threads in the system.replicas thread pool running a task.

 Number of connections to TCP server (clients with native interface), also included server-server distributed query connections

-### TablesLoaderThreads
-
-Number of threads in the tables loader thread pool.
-
-### TablesLoaderThreadsActive
-
-Number of threads in the tables loader thread pool running a task.
-
 ### TablesToDropQueueSize

 Number of dropped tables, that are waiting for background data removal.
--- a/docs/en/operations/system-tables/numbers.md
+++ b/docs/en/operations/system-tables/numbers.md
@ -31,3 +31,26 @@ SELECT * FROM system.numbers LIMIT 10;

 10 rows in set. Elapsed: 0.001 sec.
 ```
+
+You can also limit the output by predicates.
+
+```sql
+SELECT * FROM system.numbers WHERE number < 10;
+```
+
+```response
+┌─number─┐
+│      0 │
+│      1 │
+│      2 │
+│      3 │
+│      4 │
+│      5 │
+│      6 │
+│      7 │
+│      8 │
+│      9 │
+└────────┘
+
+10 rows in set. Elapsed: 0.001 sec.
+```
--- a/docs/en/operations/system-tables/trace_log.md
+++ b/docs/en/operations/system-tables/trace_log.md
@ -22,7 +22,7 @@ Columns:

 - `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision.

-    When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1 revision 54429.`. This field contains the `revision`, but not the `version` of a server.
+    When connecting to the server by `clickhouse-client`, you see the string similar to `Connected to ClickHouse server version 19.18.1.`. This field contains the `revision`, but not the `version` of a server.

 - `trace_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Trace type:

--- a/docs/en/sql-reference/aggregate-functions/reference/any.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/any.md
@ -5,7 +5,12 @@ sidebar_position: 6

 # any

-Selects the first encountered (non-NULL) value, unless all rows have NULL values in that column.
+Selects the first encountered value of a column.
+
+By default, it ignores NULL values and returns the first NOT NULL value found in the column. Like [`first_value`](../../../sql-reference/aggregate-functions/reference/first_value.md), it supports `RESPECT NULLS`, in which case it will select the first value passed, regardless of whether it is NULL or not.
+
+The return type of the function is the same as the input, except for LowCardinality, which is discarded. This means that given no rows as input it will return the default value of that type (0 for integers, or Null for a Nullable() column). You might use the `-OrNull` [combinator](../../../sql-reference/aggregate-functions/combinators.md) to modify this behaviour.
+
 The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate.
 To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’.

@ -13,4 +18,4 @@ In some cases, you can rely on the order of execution. This applies to cases whe

 When a `SELECT` query has the `GROUP BY` clause or at least one aggregate function, ClickHouse (in contrast to MySQL) requires that all expressions in the `SELECT`, `HAVING`, and `ORDER BY` clauses be calculated from keys or from aggregate functions. In other words, each column selected from the table must be used either in keys or inside aggregate functions. To get behavior like in MySQL, you can put the other columns in the `any` aggregate function.

- Alias: `any_value`
+- Alias: `any_value`, `first_value`.
--- a/docs/en/sql-reference/aggregate-functions/reference/first_value.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/first_value.md
@ -5,9 +5,12 @@ sidebar_position: 7

 # first_value

-Selects the first encountered value, similar to `any`, but could accept NULL.
-Mostly it should be used with [Window Functions](../../window-functions/index.md).
-Without Window Functions the result will be random if the source stream is not ordered.
+It is an alias for [`any`](../../../sql-reference/aggregate-functions/reference/any.md) but it was introduced for compatibility with [Window Functions](../../window-functions/index.md), where sometimes it's necessary to process `NULL` values (by default all ClickHouse aggregate functions ignore NULL values).
+
+It supports declaring a modifier to respect nulls (`RESPECT NULLS`), both under [Window Functions](../../window-functions/index.md) and in normal aggregations.
+
+As with `any`, without Window Functions the result will be random if the source stream is not ordered and the return type
+matches the input type (Null is only returned if the input is Nullable or -OrNull combinator is added).

 ## examples

@ -23,15 +26,15 @@ INSERT INTO test_data (a, b) Values (1,null), (2,3), (4, 5), (6,null);
 ```

 ### example1
-The NULL value is ignored at default.
+By default, the NULL value is ignored.
 ```sql
 select first_value(b) from test_data;
 ```

 ```text
-┌─first_value_ignore_nulls(b)─┐
-│                           3 │
-└─────────────────────────────┘
+┌─any(b)─┐
+│      3 │
+└────────┘
 ```

 ### example2
@ -41,9 +44,9 @@ select first_value(b) ignore nulls from test_data
 ```

 ```text
-┌─first_value_ignore_nulls(b)─┐
-│                           3 │
-└─────────────────────────────┘
+┌─any(b) IGNORE NULLS ─┐
+│                    3 │
+└──────────────────────┘
 ```

 ### example3
@ -53,9 +56,9 @@ select first_value(b) respect nulls from test_data
 ```

 ```text
-┌─first_value_respect_nulls(b)─┐
-│                         ᴺᵁᴸᴸ │
-└──────────────────────────────┘
+┌─any(b) RESPECT NULLS ─┐
+│                  ᴺᵁᴸᴸ │
+└───────────────────────┘
 ```

 ### example4
@ -73,8 +76,8 @@ FROM
 ```

 ```text
-┌─first_value_respect_nulls(b)─┬─first_value(b)─┐
-│                         ᴺᵁᴸᴸ │              3 │
-└──────────────────────────────┴────────────────┘
+┌─any_respect_nulls(b)─┬─any(b)─┐
+│                 ᴺᵁᴸᴸ │      3 │
+└──────────────────────┴────────┘
 ```

--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -1083,7 +1083,7 @@ Result:

 **See also**

- [arrayFold](#arrayFold)
+- [arrayFold](#arrayfold)

 ## arrayReduceInRanges

@ -1175,7 +1175,7 @@ FROM numbers(1,10);

 **See also**

- [arrayReduce](#arrayReduce)
+- [arrayReduce](#arrayreduce)

 ## arrayReverse(arr)

--- a/docs/en/sql-reference/functions/hash-functions.md
+++ b/docs/en/sql-reference/functions/hash-functions.md
@ -1776,3 +1776,34 @@ Result:
 │ (('queries','database','analytical'),('oriented','processing','DBMS')) │
 └────────────────────────────────────────────────────────────────────────┘
 ```
+
+## sqid
+
+Transforms numbers into YouTube-like short URL hash called [Sqid](https://sqids.org/).
+To use this function, set setting `allow_experimental_hash_functions = 1`.
+
+**Syntax**
+
+```sql
+sqid(number1,...)
+```
+
+**Arguments**
+
+- A variable number of UInt8, UInt16, UInt32 or UInt64 numbers.
+
+**Returned Value**
+
+A hash id [String](/docs/en/sql-reference/data-types/string.md).
+
+**Example**
+
+```sql
+SELECT sqid(1, 2, 3, 4, 5);
+```
+
+```response
+┌─sqid(1, 2, 3, 4, 5)─┐
+│ gXHfJ1C6dN          │
+└─────────────────────┘
+```
--- a/docs/en/sql-reference/operators/exists.md
+++ b/docs/en/sql-reference/operators/exists.md
@ -5,7 +5,7 @@ slug: /en/sql-reference/operators/exists

 The `EXISTS` operator checks how many records are in the result of a subquery. If it is empty, then the operator returns `0`. Otherwise, it returns `1`.

-`EXISTS` can be used in a [WHERE](../../sql-reference/statements/select/where.md) clause.
+`EXISTS` can also be used in a [WHERE](../../sql-reference/statements/select/where.md) clause.

 :::tip    
 References to main query tables and columns are not supported in a subquery.
@ -13,12 +13,26 @@ References to main query tables and columns are not supported in a subquery.

 **Syntax**

-```sql
-WHERE EXISTS(subquery)
+``` sql
+EXISTS(subquery)
 ```

 **Example**

+Query checking existence of values in a subquery:
+
+``` sql
+SELECT EXISTS(SELECT * FROM numbers(10) WHERE number > 8), EXISTS(SELECT * FROM numbers(10) WHERE number > 11)
+```
+
+Result:
+
+``` text
+┌─in(1, _subquery1)─┬─in(1, _subquery2)─┐
+│                 1 │                 0 │
+└───────────────────┴───────────────────┘
+```
+
 Query with a subquery returning several rows:

 ``` sql
--- a/docs/en/sql-reference/statements/alter/column.md
+++ b/docs/en/sql-reference/statements/alter/column.md
@ -10,7 +10,7 @@ A set of queries that allow changing the table structure.
 Syntax:

 ``` sql
-ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
+ALTER [TEMPORARY] TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
 ```

 In the query, specify a list of one or more comma-separated actions.
--- a/docs/en/sql-reference/statements/explain.md
+++ b/docs/en/sql-reference/statements/explain.md
@ -415,7 +415,7 @@ ExpressionTransform
        ExpressionTransform × 2
          (SettingQuotaAndLimits)
            (ReadFromStorage)
-            NumbersMt × 2 0 → 1
+            NumbersRange × 2 0 → 1
 ```
 ### EXPLAIN ESTIMATE

--- a/docs/en/sql-reference/table-functions/fileCluster.md
+++ b/docs/en/sql-reference/table-functions/fileCluster.md
@ -0,0 +1,85 @@
+---
+slug: /en/sql-reference/table-functions/fileCluster
+sidebar_position: 61
+sidebar_label: fileCluster
+---
+
+# fileCluster Table Function
+
+Enables simultaneous processing of files matching a specified path across multiple nodes within a cluster. The initiator establishes connections to worker nodes, expands globs in the file path, and delegates file-reading tasks to worker nodes. Each worker node queries the initiator for the next file to process, repeating until all tasks are completed (all files are read).
+
+:::note    
+This function will operate _correctly_ only in case the set of files matching the initially specified path is identical across all nodes, and their content is consistent among different nodes.  
+In case these files differ between nodes, the return value cannot be predetermined and depends on the order in which worker nodes request tasks from the initiator.
+:::
+
+**Syntax**
+
+``` sql
+fileCluster(cluster_name, path[, format, structure, compression_method])
+```
+
+**Arguments**
+
+- `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers.
+- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#user_files_path). Path to file also supports [globs](#globs_in_path).
+- `format` — [Format](../../interfaces/formats.md#formats) of the files. Type: [String](../../sql-reference/data-types/string.md).
+- `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md).
+- `compression_method` — Compression method. Supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`.
+
+**Returned value**
+
+A table with the specified format and structure and with data from files matching the specified path.
+
+**Example**
+
+Given a cluster named `my_cluster` and given the following value of setting `user_files_path`:
+
+``` bash
+$ grep user_files_path /etc/clickhouse-server/config.xml
+    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
+```
+Also, given there are files `file1.csv` and `file2.csv` inside `user_files_path` of each cluster node, and their content is identical across different nodes:
+```bash
+$ cat /var/lib/clickhouse/user_files/file1.csv
+    1,"file1"
+    11,"file11"
+
+$ cat /var/lib/clickhouse/user_files/file2.csv
+    2,"file2"
+    22,"file22"
+```
+
+For example, one can create these files by executing these two queries on every cluster node:
+```sql
+INSERT INTO TABLE FUNCTION file('file1.csv', 'CSV', 'i UInt32, s String') VALUES (1,'file1'), (11,'file11');
+INSERT INTO TABLE FUNCTION file('file2.csv', 'CSV', 'i UInt32, s String') VALUES (2,'file2'), (22,'file22');
+```
+
+Now, read data contents of `file1.csv` and `file2.csv` via `fileCluster` table function:
+
+```sql
+SELECT * FROM fileCluster(
+    'my_cluster', 'file{1,2}.csv', 'CSV', 'i UInt32, s String'
+) ORDER BY (i, s)
+```
+
+```
+┌──i─┬─s──────┐
+│  1 │ file1  │
+│ 11 │ file11 │
+└────┴────────┘
+┌──i─┬─s──────┐
+│  2 │ file2  │
+│ 22 │ file22 │
+└────┴────────┘
+```
+
+
+## Globs in Path {#globs_in_path}
+
+All patterns supported by [File](../../sql-reference/table-functions/file.md#globs-in-path) table function are supported by FileCluster.
+
+**See Also**
+
+- [File table function](../../sql-reference/table-functions/file.md)
--- a/docs/en/sql-reference/table-functions/numbers.md
+++ b/docs/en/sql-reference/table-functions/numbers.md
@ -17,6 +17,8 @@ The following queries are equivalent:
 SELECT * FROM numbers(10);
 SELECT * FROM numbers(0, 10);
 SELECT * FROM system.numbers LIMIT 10;
+SELECT * FROM system.numbers WHERE number BETWEEN 0 AND 9;
+SELECT * FROM system.numbers WHERE number IN (0, 1, 2, 3, 4, 5, 6, 7, 8, 9);
 ```

 Examples:
--- a/docs/ru/interfaces/cli.md
+++ b/docs/ru/interfaces/cli.md
@ -14,7 +14,7 @@ ClickHouse предоставляет собственный клиент ком
 $ clickhouse-client
 ClickHouse client version 20.13.1.5273 (official build).
 Connecting to localhost:9000 as user default.
-Connected to ClickHouse server version 20.13.1 revision 54442.
+Connected to ClickHouse server version 20.13.1.

 :)
 ```
--- a/docs/ru/operations/server-configuration-parameters/settings.md
+++ b/docs/ru/operations/server-configuration-parameters/settings.md
@ -1678,7 +1678,7 @@ TCP порт для защищённого обмена данными с кли

 ## user_files_path {#server_configuration_parameters-user_files_path}

-Каталог с пользовательскими файлами. Используется в табличной функции [file()](../../operations/server-configuration-parameters/settings.md).
+Каталог с пользовательскими файлами. Используется в табличных функциях [file()](../../sql-reference/table-functions/file.md) и [fileCluster()](../../sql-reference/table-functions/fileCluster.md).

 **Пример**

--- a/docs/ru/operations/settings/merge-tree-settings.md
+++ b/docs/ru/operations/settings/merge-tree-settings.md
@ -119,7 +119,7 @@ Eсли суммарное число активных кусков во все
 -   Положительное целое число.
 -   0 (без ограничений).

-Значение по умолчанию: 100.
+Значение по умолчанию: 1000.

 Команда `Insert` создает один или несколько блоков (кусков). При вставке в Replicated таблицы ClickHouse для [дедупликации вставок](../../engines/table-engines/mergetree-family/replication.md) записывает в Zookeeper хеш-суммы созданных кусков. Но хранятся только последние `replicated_deduplication_window` хеш-сумм. Самые старые хеш-суммы удаляются из Zookeeper.
 Большое значение `replicated_deduplication_window` замедляет `Insert`, так как приходится сравнивать большее количество хеш-сумм.
--- a/docs/ru/operations/system-tables/trace_log.md
+++ b/docs/ru/operations/system-tables/trace_log.md
@ -19,7 +19,7 @@ ClickHouse создает эту таблицу когда установлен

 -   `revision`([UInt32](../../sql-reference/data-types/int-uint.md)) — ревизия сборки сервера ClickHouse.

-        Во время соединения с сервером через `clickhouse-client`, вы видите строку похожую на `Connected to ClickHouse server version 19.18.1 revision 54429.`. Это поле содержит номер после `revision`, но не содержит строку после `version`.
+        Во время соединения с сервером через `clickhouse-client` вы видите строку, похожую на `Connected to ClickHouse server version 19.18.1.`. Это поле содержит номер ревизии сборки (`revision`), а не строку версии, указанную после `version`.

 -   `trace_type`([Enum8](../../sql-reference/data-types/enum.md)) — тип трассировки:

--- a/docs/ru/sql-reference/statements/alter/column.md
+++ b/docs/ru/sql-reference/statements/alter/column.md
@ -11,7 +11,7 @@ sidebar_label: "Манипуляции со столбцами"
 Синтаксис:

 ``` sql
-ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
+ALTER [TEMPORARY] TABLE [db].name [ON CLUSTER cluster] ADD|DROP|RENAME|CLEAR|COMMENT|{MODIFY|ALTER}|MATERIALIZE COLUMN ...
 ```

 В запросе можно указать сразу несколько действий над одной таблицей через запятую.
--- a/docs/ru/sql-reference/statements/explain.md
+++ b/docs/ru/sql-reference/statements/explain.md
@ -371,7 +371,7 @@ ExpressionTransform
        ExpressionTransform × 2
          (SettingQuotaAndLimits)
            (ReadFromStorage)
-            NumbersMt × 2 0 → 1
+            NumbersRange × 2 0 → 1
 ```

 ### EXPLAIN ESTIMATE {#explain-estimate}
--- a/docs/ru/sql-reference/table-functions/file.md
+++ b/docs/ru/sql-reference/table-functions/file.md
@ -13,7 +13,7 @@ sidebar_label: file
 **Синтаксис**

 ``` sql
-file(path [,format] [,structure])
+file(path [,format] [,structure] [,compression])
 ```

 **Параметры**
@ -21,6 +21,7 @@ file(path [,format] [,structure])
 -   `path` — относительный путь до файла от [user_files_path](../../sql-reference/table-functions/file.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки.
 -   `format` — [формат](../../interfaces/formats.md#formats) файла.
 -   `structure` — структура таблицы. Формат: `'colunmn1_name column1_ype, column2_name column2_type, ...'`.
+- `compression` — Используемый тип сжатия для запроса SELECT или желаемый тип сжатия для запроса INSERT. Поддерживаемые типы сжатия: `gz`, `br`, `xz`, `zst`, `lz4` и `bz2`.

 **Возвращаемое значение**

--- a/docs/ru/sql-reference/table-functions/fileCluster.md
+++ b/docs/ru/sql-reference/table-functions/fileCluster.md
@ -0,0 +1,84 @@
+---
+slug: /ru/sql-reference/table-functions/fileCluster
+sidebar_position: 38
+sidebar_label: fileCluster
+---
+
+# fileCluster
+
+Позволяет одновременно обрабатывать файлы, находящиеся по указанному пути, на нескольких узлах внутри кластера. Узел-инициатор устанавливает соединения с рабочими узлами (worker nodes), раскрывает шаблоны в пути к файлам и отдаёт задачи по чтению файлов рабочим узлам. Рабочий узел запрашивает у инициатора путь к следующему файлу для обработки, повторяя до тех пор, пока не завершатся все задачи (то есть пока не будут обработаны все файлы).
+
+:::note    
+Эта табличная функция будет работать _корректно_ только в случае, если набор файлов, соответствующих изначально указанному пути, одинаков на всех узлах и содержание этих файлов идентично на различных узлах. В случае, если эти файлы различаются между узлами, результат не предопределён и зависит от очерёдности, с которой рабочие узлы будут запрашивать задачи у инициатора.
+:::
+
+**Синтаксис**
+
+``` sql
+fileCluster(cluster_name, path[, format, structure, compression_method])
+```
+
+**Аргументы**
+
+- `cluster_name` — имя кластера, используемое для создания набора адресов и параметров подключения к удаленным и локальным серверам.
+- `path` — относительный путь до файла от [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает [шаблоны поиска (globs)](#globs_in_path).
+- `format` — [формат](../../interfaces/formats.md#formats) файла.
+- `structure` — структура таблицы. Формат: `'column1_name column1_type, column2_name column2_type, ...'`.
+- `compression_method` — Используемый тип сжатия. Поддерживаемые типы: `gz`, `br`, `xz`, `zst`, `lz4` и `bz2`.
+
+**Возвращаемое значение**
+
+Таблица с указанным форматом и структурой, содержащая данные из файлов, соответствующих указанному пути.
+
+**Пример**
+Пусть есть кластер с именем `my_cluster`, а также установлено нижеследующее значение параметра `user_files_path`:
+
+``` bash
+$ grep user_files_path /etc/clickhouse-server/config.xml
+    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
+```
+
+Пусть также на каждом узле кластера в директории `user_files_path` находятся файлы `test1.csv` и `test2.csv`, и их содержимое идентично на разных узлах:
+```bash
+$ cat /var/lib/clickhouse/user_files/test1.csv
+    1,"file1"
+    11,"file11"
+
+$ cat /var/lib/clickhouse/user_files/test2.csv
+    2,"file2"
+    22,"file22"
+```
+
+Например, эти файлы можно создать, выполнив на каждом узле два запроса:
+```sql
+INSERT INTO TABLE FUNCTION file('file1.csv', 'CSV', 'i UInt32, s String') VALUES (1,'file1'), (11,'file11');
+INSERT INTO TABLE FUNCTION file('file2.csv', 'CSV', 'i UInt32, s String') VALUES (2,'file2'), (22,'file22');
+```
+
+Прочитаем содержимое файлов `test1.csv` и `test2.csv` с помощью табличной функции `fileCluster`:
+
+```sql
+SELECT * from fileCluster(
+    'my_cluster', 'file{1,2}.csv', 'CSV', 'i UInt32, s String'
+) ORDER BY (i, s)
+```
+
+```
+┌──i─┬─s──────┐
+│  1 │ file1  │
+│ 11 │ file11 │
+└────┴────────┘
+┌──i─┬─s──────┐
+│  2 │ file2  │
+│ 22 │ file22 │
+└────┴────────┘
+```
+
+
+## Шаблоны поиска в компонентах пути {#globs_in_path}
+
+Поддерживаются все шаблоны поиска, что поддерживаются табличной функцией [File](../../sql-reference/table-functions/file.md#globs-in-path).
+
+**Смотрите также**
+
+- [File (табличная функция)](../../sql-reference/table-functions/file.md)
--- a/docs/zh/interfaces/cli.md
+++ b/docs/zh/interfaces/cli.md
@ -14,7 +14,7 @@ ClickHouse提供了一个原生命令行客户端`clickhouse-client`客户端支
 $ clickhouse-client
 ClickHouse client version 19.17.1.1579 (official build).
 Connecting to localhost:9000 as user default.
-Connected to ClickHouse server version 19.17.1 revision 54428.
+Connected to ClickHouse server version 19.17.1.

 :)
 ```
--- a/docs/zh/operations/system-tables/trace_log.md
+++ b/docs/zh/operations/system-tables/trace_log.md
@ -22,7 +22,7 @@ ClickHouse创建此表时 [trace_log](../../operations/server-configuration-para

 -   `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse server build revision.

-    通过以下方式连接到服务器 `clickhouse-client`，你看到的字符串类似于 `Connected to ClickHouse server version 19.18.1 revision 54429.`. 该字段包含 `revision`，但不是 `version` 的服务器。
+    通过以下方式连接到服务器 `clickhouse-client`，你看到的字符串类似于 `Connected to ClickHouse server version 19.18.1.`. 该字段包含 `revision`，但不是 `version` 的服务器。

 -   `timer_type` ([枚举8](../../sql-reference/data-types/enum.md)) — Timer type:

--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -493,8 +493,7 @@ void Client::connect()

    if (is_interactive)
    {
-        std::cout << "Connected to " << server_name << " server version " << server_version << " revision " << server_revision << "."
-                    << std::endl << std::endl;
+        std::cout << "Connected to " << server_name << " server version " << server_version << "." << std::endl << std::endl;

        auto client_version_tuple = std::make_tuple(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH);
        auto server_version_tuple = std::make_tuple(server_version_major, server_version_minor, server_version_patch);
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@ -23,6 +23,7 @@
 #include <Common/scope_guard_safe.h>
 #include <Interpreters/Session.h>
 #include <Access/AccessControl.h>
+#include <Common/PoolId.h>
 #include <Common/Exception.h>
 #include <Common/Macros.h>
 #include <Common/Config/ConfigProcessor.h>
@ -742,16 +743,16 @@ void LocalServer::processConfig()
        status.emplace(fs::path(path) / "status", StatusFile::write_full_info);

        LOG_DEBUG(log, "Loading metadata from {}", path);
-        loadMetadataSystem(global_context);
+        auto startup_system_tasks = loadMetadataSystem(global_context);
        attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE));
        attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA));
        attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE));
-        startupSystemTables();
+        waitLoad(TablesLoaderForegroundPoolId, startup_system_tasks);

        if (!config().has("only-system-tables"))
        {
            DatabaseCatalog::instance().createBackgroundTasks();
-            loadMetadata(global_context);
+            waitLoad(loadMetadata(global_context));
            DatabaseCatalog::instance().startupBackgroundTasks();
        }

--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -20,6 +20,7 @@
 #include <base/coverage.h>
 #include <base/getFQDNOrHostName.h>
 #include <base/safeExit.h>
+#include <Common/PoolId.h>
 #include <Common/MemoryTracker.h>
 #include <Common/ClickHouseRevision.h>
 #include <Common/DNSResolver.h>
@ -1339,6 +1340,10 @@ try
            global_context->getMessageBrokerSchedulePool().increaseThreadsCount(server_settings_.background_message_broker_schedule_pool_size);
            global_context->getDistributedSchedulePool().increaseThreadsCount(server_settings_.background_distributed_schedule_pool_size);

+            global_context->getAsyncLoader().setMaxThreads(TablesLoaderForegroundPoolId, server_settings_.tables_loader_foreground_pool_size);
+            global_context->getAsyncLoader().setMaxThreads(TablesLoaderBackgroundLoadPoolId, server_settings_.tables_loader_background_pool_size);
+            global_context->getAsyncLoader().setMaxThreads(TablesLoaderBackgroundStartupPoolId, server_settings_.tables_loader_background_pool_size);
+
            getIOThreadPool().reloadConfiguration(
                server_settings.max_io_thread_pool_size,
                server_settings.max_io_thread_pool_free_size,
@ -1679,17 +1684,18 @@ try

    LOG_INFO(log, "Loading metadata from {}", path_str);

+    LoadTaskPtrs load_metadata_tasks;
    try
    {
        auto & database_catalog = DatabaseCatalog::instance();
        /// We load temporary database first, because projections need it.
        database_catalog.initializeAndLoadTemporaryDatabase();
-        loadMetadataSystem(global_context);
-        maybeConvertSystemDatabase(global_context);
+        auto system_startup_tasks = loadMetadataSystem(global_context);
+        maybeConvertSystemDatabase(global_context, system_startup_tasks);
        /// This has to be done before the initialization of system logs,
        /// otherwise there is a race condition between the system database initialization
        /// and creation of new tables in the database.
-        startupSystemTables();
+        waitLoad(TablesLoaderForegroundPoolId, system_startup_tasks);
        /// After attaching system databases we can initialize system log.
        global_context->initializeSystemLogs();
        global_context->setSystemZooKeeperLogAfterInitializationIfNeeded();
@ -1705,9 +1711,10 @@ try
        /// and so loadMarkedAsDroppedTables() will find it and try to add, and UUID will overlap.
        database_catalog.loadMarkedAsDroppedTables();
        database_catalog.createBackgroundTasks();
-        /// Then, load remaining databases
-        loadMetadata(global_context, default_database);
-        convertDatabasesEnginesIfNeed(global_context);
+        /// Then, load remaining databases (some of them may be loaded asynchronously)
+        load_metadata_tasks = loadMetadata(global_context, default_database, server_settings.async_load_databases);
+        /// If we need to convert database engines, disable async tables loading
+        convertDatabasesEnginesIfNeed(load_metadata_tasks, global_context);
        database_catalog.startupBackgroundTasks();
        /// After loading validate that default database exists
        database_catalog.assertDatabaseExists(default_database);
@ -1719,6 +1726,7 @@ try
        tryLogCurrentException(log, "Caught exception while loading metadata");
        throw;
    }
+
    LOG_DEBUG(log, "Loaded metadata.");

    /// Init trace collector only after trace_log system table was created
@ -1874,9 +1882,14 @@ try
                throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "distributed_ddl.pool_size should be greater then 0");
            global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, global_context, &config(),
                                                                     "distributed_ddl", "DDLWorker",
-                                                                     &CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID));
+                                                                     &CurrentMetrics::MaxDDLEntryID, &CurrentMetrics::MaxPushedDDLEntryID),
+                                         load_metadata_tasks);
        }

+        /// Do not keep tasks in server, they should be kept inside databases. Used here to make dependent tasks only.
+        load_metadata_tasks.clear();
+        load_metadata_tasks.shrink_to_fit();
+
        {
            std::lock_guard lock(servers_lock);
            for (auto & server : servers)
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -364,8 +364,15 @@
    <background_schedule_pool_size>128</background_schedule_pool_size>
    <background_message_broker_schedule_pool_size>16</background_message_broker_schedule_pool_size>
    <background_distributed_schedule_pool_size>16</background_distributed_schedule_pool_size>
+    <tables_loader_foreground_pool_size>0</tables_loader_foreground_pool_size>
+    <tables_loader_background_pool_size>0</tables_loader_background_pool_size>
    -->

+    <!-- Enables asynchronous loading of databases and tables to speed up server startup.
+         Queries to an entity that is not yet loaded will be blocked until its loading is finished.
+      -->
+    <!-- <async_load_databases>true</async_load_databases> -->
+
    <!-- On memory constrained environments you may have to set this to value larger than 1.
      -->
    <max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>
--- a/programs/server/dashboard.html
+++ b/programs/server/dashboard.html
@ -108,7 +108,7 @@
            filter: blur(1px);
        }

-        .chart div { position: absolute; }
+        .chart > div { position: absolute; }

        .inputs {
            height: auto;
@ -215,8 +215,6 @@
            color: var(--text-color);
        }

-        .u-legend th { display: none; }
-
        .themes {
            float: right;
            font-size: 20pt;
@ -433,6 +431,16 @@
            display: none;
        }

+        .u-series {
+            line-height: 0.8;
+        }
+
+        .u-series.footer {
+            font-size: 8px;
+            padding-top: 0;
+            margin-top: 0;
+        }
+
        /* Source: https://cdn.jsdelivr.net/npm/uplot@1.6.21/dist/uPlot.min.css
         * It is copy-pasted to lower the number of requests.
         */
@ -478,7 +486,6 @@
  * - compress the state for URL's #hash;
  * - footer with "about" or a link to source code;
  * - allow to configure a table on a server to save the dashboards;
-  * - multiple lines on chart;
  * - if a query returned one value, display this value instead of a diagram;
  * - if a query returned something unusual, display the table;
  */
@ -520,10 +527,54 @@ let queries = [];
 /// Query parameters with predefined default values.
 /// All other parameters will be automatically found in the queries.
 let params = {
-    "rounding": "60",
-    "seconds": "86400"
+    'rounding': '60',
+    'seconds': '86400'
 };

+/// Palette generation for charts
+function generatePalette(baseColor, numColors) {
+    const baseHSL = hexToHsl(baseColor);
+    const hueStep = 360 / numColors;
+    const palette = [];
+    for (let i = 0; i < numColors; i++) {
+        const hue = Math.round((baseHSL.h + i * hueStep) % 360);
+        const color = `hsl(${hue}, ${baseHSL.s}%, ${baseHSL.l}%)`;
+        palette.push(color);
+    }
+    return palette;
+}
+
+/// Helper function to convert hex color to HSL
+function hexToHsl(hex) {
+    hex = hex.replace(/^#/, '');
+    const bigint = parseInt(hex, 16);
+    const r = (bigint >> 16) & 255;
+    const g = (bigint >> 8) & 255;
+    const b = bigint & 255;
+    const r_norm = r / 255;
+    const g_norm = g / 255;
+    const b_norm = b / 255;
+    const max = Math.max(r_norm, g_norm, b_norm);
+    const min = Math.min(r_norm, g_norm, b_norm);
+    const l = (max + min) / 2;
+    let s = 0;
+    if (max !== min) {
+        s = l > 0.5 ? (max - min) / (2 - max - min) : (max - min) / (max + min);
+    }
+    let h = 0;
+    if (max !== min) {
+        if (max === r_norm) {
+            h = (g_norm - b_norm) / (max - min) + (g_norm < b_norm ? 6 : 0);
+        } else if (max === g_norm) {
+            h = (b_norm - r_norm) / (max - min) + 2;
+        } else {
+            h = (r_norm - g_norm) / (max - min) + 4;
+        }
+    }
+    h = Math.round(h * 60);
+    return { h, s: Math.round(s * 100), l: Math.round(l * 100) };
+}
+
 let theme = 'light';

 function setTheme(new_theme) {
@ -913,6 +964,8 @@ document.getElementById('mass-editor-textarea').addEventListener('input', e => {

 function legendAsTooltipPlugin({ className, style = { background: "var(--legend-background)" } } = {}) {
    let legendEl;
+    let showTop = false;
+    const showLimit = 5;

    function init(u, opts) {
        legendEl = u.root.querySelector(".u-legend");
@ -932,13 +985,28 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
            ...style
        });

-        // hide series color markers
-        const idents = legendEl.querySelectorAll(".u-marker");
+        if (opts.series.length == 2) {
+            const nodes = legendEl.querySelectorAll("th");
+            for (let i = 0; i < nodes.length; i++)
+                nodes[i].style.display = "none";
+        } else {
+            legendEl.querySelector("th").remove();
+            legendEl.querySelector("td").setAttribute('colspan', '2');
+            legendEl.querySelector("td").style.textAlign = 'center';
+        }

-        for (let i = 0; i < idents.length; i++)
-            idents[i].style.display = "none";
+        if (opts.series.length - 1 > showLimit) {
+            showTop = true;
+            let footer = legendEl.insertRow().insertCell();
+            footer.setAttribute('colspan', '2');
+            footer.style.textAlign = 'center';
+            footer.classList.add('u-value');
+            footer.parentNode.classList.add('u-series','footer');
+            footer.textContent = ". . .";
+        }

        const overEl = u.over;
+        overEl.style.overflow = "visible";

        overEl.appendChild(legendEl);

@ -946,11 +1014,28 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
        overEl.addEventListener("mouseleave", () => {legendEl.style.display = "none";});
    }

+    function nodeListToArray(nodeList) {
+        return Array.prototype.slice.call(nodeList);
+    }
+
    function update(u) {
        let { left, top } = u.cursor;
        left -= legendEl.clientWidth / 2;
        top -= legendEl.clientHeight / 2;
        legendEl.style.transform = "translate(" + left + "px, " + top + "px)";
+        if (showTop) {
+            let nodes = nodeListToArray(legendEl.querySelectorAll("tr"));
+            let header = nodes.shift();
+            let footer = nodes.pop();
+            nodes.forEach(function (node) { node._sort_key = +node.querySelector("td").textContent; });
+            nodes.sort((a, b) => +b._sort_key - +a._sort_key);
+            nodes.forEach(function (node) { node.parentNode.appendChild(node); });
+            for (let i = 0; i < nodes.length; i++) {
+                nodes[i].style.display = i < showLimit ? null : "none";
+                delete nodes[i]._sort_key;
+            }
+            footer.parentNode.appendChild(footer);
+        }
    }

    return {
@ -961,12 +1046,13 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
    };
 }

+
 async function doFetch(query, url_params = '') {
    host = document.getElementById('url').value || host;
    user = document.getElementById('user').value;
    password = document.getElementById('password').value;

-    let url = `${host}?default_format=JSONCompactColumns&enable_http_compression=1`
+    let url = `${host}?default_format=JSONColumnsWithMetadata&enable_http_compression=1`

    if (add_http_cors_header) {
        // For debug purposes, you may set add_http_cors_header from a browser console
@ -980,14 +1066,17 @@ async function doFetch(query, url_params = '') {
        url += `&password=${encodeURIComponent(password)}`;
    }

-    let response, data, error;
+    let response, reply, error;
    try {
        response = await fetch(url + url_params, { method: "POST", body: query });
-        data = await response.text();
+        reply = await response.text();
        if (response.ok) {
-            data = JSON.parse(data);
+            reply = JSON.parse(reply);
+            if (reply.exception) {
+                error = reply.exception;
+            }
        } else {
-            error = data;
+            error = reply;
        }
    } catch (e) {
        console.log(e);
@ -1006,7 +1095,7 @@ async function doFetch(query, url_params = '') {
        }
    }

-    return {data, error};
+    return {reply, error};
 }

 async function draw(idx, chart, url_params, query) {
@ -1015,17 +1104,76 @@ async function draw(idx, chart, url_params, query) {
        plots[idx] = null;
    }

-    let {data, error} = await doFetch(query, url_params);
+    let {reply, error} = await doFetch(query, url_params);
+    if (!error) {
+        if (reply.rows.length == 0) {
+            error = "Query returned empty result.";
+        } else if (reply.meta.length < 2) {
+            error = "Query should return at least two columns: unix timestamp and value.";
+        } else {
+            for (let i = 0; i < reply.meta.length; i++) {
+                let label = reply.meta[i].name;
+                let column = reply.data[label];
+                if (!Array.isArray(column) || column.length != reply.data[reply.meta[0].name].length) {
+                    error = "Wrong data format of the query.";
+                    break;
+                }
+            }
+        }
+    }
+
+    // Transform string-labeled data to multi-column data
+    function transformToColumns() {
+        const x = reply.meta[0].name; // time; must be ordered
+        const l = reply.meta[1].name; // string label column to distinguish series; must be ordered
+        const y = reply.meta[2].name; // values; must have single value for (x, l) pair
+        const labels = [...new Set(reply.data[l])].sort((a, b) => a - b);
+        if (labels.includes('__time__')) {
+            error = "The second column is not allowed to contain '__time__' values.";
+            return;
+        }
+        const times = [...new Set(reply.data[x])].sort((a, b) => a - b);
+        let new_meta = [{ name: '__time__', type: reply.meta[0].type }];
+        let new_data = { __time__: [] };
+        for (let label of labels) {
+            new_meta.push({ name: label, type: reply.meta[2].type });
+            new_data[label] = [];
+        }
+        let new_rows = 0;
+        function row_done(row_time) {
+            new_rows++;
+            new_data.__time__.push(row_time);
+            for (let label of labels) {
+                if (new_data[label].length < new_rows) {
+                    new_data[label].push(null);
+                }
+            }
+        }
+        let prev_time = reply.data[x][0];
+        const old_rows = reply.data[x].length;
+        for (let i = 0; i < old_rows; i++) {
+            const time = reply.data[x][i];
+            const label = reply.data[l][i];
+            const value = reply.data[y][i];
+            if (prev_time != time) {
+                row_done(prev_time);
+                prev_time = time;
+            }
+            new_data[label].push(value);
+        }
+        row_done(prev_time);
+        reply.meta = new_meta;
+        reply.data = new_data;
+        reply.rows = new_rows;
+    }
+
+    function isStringColumn(type) {
+        return type === 'String' || type === 'LowCardinality(String)';
+    }

    if (!error) {
-        if (!Array.isArray(data)) {
-            error = "Query should return an array.";
-        } else if (data.length == 0) {
-            error = "Query returned empty result.";
-        } else if (data.length != 2) {
-            error = "Query should return exactly two columns: unix timestamp and value.";
-        } else if (!Array.isArray(data[0]) || !Array.isArray(data[1]) || data[0].length != data[1].length) {
-            error = "Wrong data format of the query.";
+        if (reply.meta.length == 3 && isStringColumn(reply.meta[1].type)) {
+            transformToColumns();
        }
    }

@ -1043,24 +1191,38 @@ async function draw(idx, chart, url_params, query) {
    }

    const [line_color, fill_color, grid_color, axes_color] = theme != 'dark'
-        ? ["#F88", "#FEE", "#EED", "#2c3235"]
-        : ["#864", "#045", "#2c3235", "#c7d0d9"];
+        ? ["#ff8888", "#ffeeee", "#eeeedd", "#2c3235"]
+        : ["#886644", "#004455", "#2c3235", "#c7d0d9"];

    let sync = uPlot.sync("sync");

-    const max_value = Math.max(...data[1]);
+    let axis = {
+        stroke: axes_color,
+        grid: { width: 1 / devicePixelRatio, stroke: grid_color },
+        ticks: { width: 1 / devicePixelRatio, stroke: grid_color }
+    };
+
+    let axes = [axis, axis];
+    let series = [{ label: "x" }];
+    let data = [reply.data[reply.meta[0].name]];
+
+    // Treat every column as series
+    const series_count = reply.meta.length;
+    const fill = series_count == 2 ? fill_color : undefined;
+    const palette = generatePalette(line_color, series_count);
+    let max_value = Number.NEGATIVE_INFINITY;
+    for (let i = 1; i < series_count; i++) {
+        let label = reply.meta[i].name;
+        series.push({ label, stroke: palette[i - 1], fill });
+        data.push(reply.data[label]);
+        max_value = Math.max(max_value, ...reply.data[label]);
+    }

    const opts = {
        width: chart.clientWidth,
        height: chart.clientHeight,
-        axes: [ { stroke: axes_color,
-                  grid: { width: 1 / devicePixelRatio, stroke: grid_color },
-                  ticks: { width: 1 / devicePixelRatio, stroke: grid_color } },
-                { stroke: axes_color,
-                  grid: { width: 1 / devicePixelRatio, stroke: grid_color },
-                  ticks: { width: 1 / devicePixelRatio, stroke: grid_color } } ],
-        series: [ { label: "x" },
-                  { label: "y", stroke: line_color, fill: fill_color } ],
+        axes,
+        series,
        padding: [ null, null, null, (Math.round(max_value * 100) / 100).toString().length * 6 - 10 ],
        plugins: [ legendAsTooltipPlugin() ],
        cursor: {
@ -1216,22 +1378,21 @@ function saveState() {
 }

 async function searchQueries() {
-    let {data, error} = await doFetch(search_query);
+    let {reply, error} = await doFetch(search_query);
    if (error) {
        throw new Error(error);
    }
-    if (!Array.isArray(data)) {
-        throw new Error("Search query should return an array.");
-    } else if (data.length == 0) {
+    let data = reply.data;
+    if (reply.rows == 0) {
        throw new Error("Search query returned empty result.");
-    } else if (data.length != 2) {
+    } else if (reply.meta.length != 2 || reply.meta[0].name != "title" || reply.meta[1].name != "query") {
        throw new Error("Search query should return exactly two columns: title and query.");
-    } else if (!Array.isArray(data[0]) || !Array.isArray(data[1]) || data[0].length != data[1].length) {
+    } else if (!Array.isArray(data.title) || !Array.isArray(data.query) || data.title.length != data.query.length) {
        throw new Error("Wrong data format of the search query.");
    }

-    for (let i = 0; i < data[0].length; i++) {
-        queries.push({title: data[0][i], query: data[1][i]});
+    for (let i = 0; i < data.title.length; i++) {
+        queries.push({title: data.title[i], query: data.query[i]});
    }

    regenerate();
--- a/src/AggregateFunctions/AggregateFunctionAny.cpp
+++ b/src/AggregateFunctions/AggregateFunctionAny.cpp
@ -1,26 +1,213 @@
 #include <AggregateFunctions/AggregateFunctionFactory.h>
 #include <AggregateFunctions/HelpersMinMaxAny.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <base/defines.h>


 namespace DB
 {
 struct Settings;

+namespace ErrorCodes
+{
+    extern const int INCORRECT_DATA;
+    extern const int LOGICAL_ERROR;
+}
+
 namespace
 {
+struct AggregateFunctionAnyRespectNullsData
+{
+    enum Status : UInt8
+    {
+        NotSet = 1,
+        SetNull = 2,
+        SetOther = 3
+    };
+
+    Status status = Status::NotSet;
+    Field value;
+
+    bool isSet() const { return status != Status::NotSet; }
+    void setNull() { status = Status::SetNull; }
+    void setOther() { status = Status::SetOther; }
+};
+
+template <bool First>
+class AggregateFunctionAnyRespectNulls final
+    : public IAggregateFunctionDataHelper<AggregateFunctionAnyRespectNullsData, AggregateFunctionAnyRespectNulls<First>>
+{
+public:
+    using Data = AggregateFunctionAnyRespectNullsData;
+
+    SerializationPtr serialization;
+    const bool returns_nullable_type = false;
+
+    explicit AggregateFunctionAnyRespectNulls(const DataTypePtr & type)
+        : IAggregateFunctionDataHelper<Data, AggregateFunctionAnyRespectNulls<First>>({type}, {}, type)
+        , serialization(type->getDefaultSerialization())
+        , returns_nullable_type(type->isNullable())
+    {
+    }
+
+    String getName() const override
+    {
+        if constexpr (First)
+            return "any_respect_nulls";
+        else
+            return "anyLast_respect_nulls";
+    }
+
+    bool allocatesMemoryInArena() const override { return false; }
+
+    void addNull(AggregateDataPtr __restrict place) const
+    {
+        chassert(returns_nullable_type);
+        auto & d = this->data(place);
+        if (First && d.isSet())
+            return;
+        d.setNull();
+    }
+
+    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
+    {
+        if (columns[0]->isNullable())
+        {
+            if (columns[0]->isNullAt(row_num))
+                return addNull(place);
+        }
+        auto & d = this->data(place);
+        if (First && d.isSet())
+            return;
+        d.setOther();
+        columns[0]->get(row_num, d.value);
+    }
+
+    void addManyDefaults(AggregateDataPtr __restrict place, const IColumn ** columns, size_t, Arena * arena) const override
+    {
+        if (columns[0]->isNullable())
+            addNull(place);
+        else
+            add(place, columns, 0, arena);
+    }
+
+    /// Batch insert: only the first (any) or last (anyLast) eligible row of [row_begin, row_end) is passed to add().
+    void addBatchSinglePlace(
+        size_t row_begin, size_t row_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos)
+        const override
+    {
+        if (if_argument_pos >= 0)
+        {
+            /// Filtered case: walk from the relevant end of the batch and take the nearest row whose flag is set.
+            const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
+            for (size_t i = 0, size = row_end - row_begin; i < size; ++i)
+            {
+                size_t pos = First ? row_begin + i : row_end - 1 - i;
+                if (flags[pos])
+                {
+                    add(place, columns, pos, arena);
+                    break;
+                }
+            }
+        }
+        /// An empty batch has no row to take: reading row_begin or row_end - 1 would be out of range.
+        else if (row_begin < row_end)
+            add(place, columns, First ? row_begin : row_end - 1, arena);
+    }
+
+    void addBatchSinglePlaceNotNull(
+        size_t, size_t, AggregateDataPtr __restrict, const IColumn **, const UInt8 *, Arena *, ssize_t) const override
+    {
+        /// This should not happen since it means somebody else has preprocessed the data (NULLs or IFs) and might
+        /// have discarded values that we need (NULLs)
+        throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionAnyRespectNulls::addBatchSinglePlaceNotNull called");
+    }
+
+    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
+    {
+        auto & d = this->data(place);
+        if (First && d.isSet())
+            return;
+
+        auto & other = this->data(rhs);
+        if (other.isSet())
+        {
+            d.status = other.status;
+            d.value = other.value;
+        }
+    }
+
+    void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
+    {
+        auto & d = this->data(place);
+        UInt8 k = d.status;
+
+        writeBinaryLittleEndian<UInt8>(k, buf);
+        if (k == Data::Status::SetOther)
+            serialization->serializeBinary(d.value, buf, {});
+    }
+
+    void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
+    {
+        auto & d = this->data(place);
+        UInt8 k = Data::Status::NotSet;
+        readBinaryLittleEndian<UInt8>(k, buf);
+        d.status = static_cast<Data::Status>(k);
+        if (d.status == Data::Status::NotSet)
+            return;
+        else if (d.status == Data::Status::SetNull)
+        {
+            if (!returns_nullable_type)
+                throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type (NULL) in non-nullable {}State", getName());
+            return;
+        }
+        else if (d.status == Data::Status::SetOther)
+            serialization->deserializeBinary(d.value, buf, {});
+        else
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect type ({}) in {}State", static_cast<Int8>(k), getName());
+    }
+
+    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
+    {
+        auto & d = this->data(place);
+        if (d.status == Data::Status::SetOther)
+            to.insert(d.value);
+        else
+            to.insertDefault();
+    }
+
+    AggregateFunctionPtr getOwnNullAdapter(
+        const AggregateFunctionPtr & original_function,
+        const DataTypes & /*arguments*/,
+        const Array & /*params*/,
+        const AggregateFunctionProperties & /*properties*/) const override
+    {
+        return original_function;
+    }
+};
+
+
+template <bool First>
+IAggregateFunction * createAggregateFunctionSingleValueRespectNulls(
+    const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
+{
+    assertNoParameters(name, parameters);
+    assertUnary(name, argument_types);
+
+    return new AggregateFunctionAnyRespectNulls<First>(argument_types[0]);
+}

 AggregateFunctionPtr createAggregateFunctionAny(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
 {
    return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData>(name, argument_types, parameters, settings));
 }

-template <bool RespectNulls = false>
-AggregateFunctionPtr createAggregateFunctionNullableAny(
+AggregateFunctionPtr createAggregateFunctionAnyRespectNulls(
    const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
 {
-    return AggregateFunctionPtr(
-        createAggregateFunctionSingleNullableValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData, RespectNulls>(
-            name, argument_types, parameters, settings));
+    return AggregateFunctionPtr(createAggregateFunctionSingleValueRespectNulls<true>(name, argument_types, parameters, settings));
 }

 AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
@ -28,13 +215,10 @@ AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, co
    return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyLastData>(name, argument_types, parameters, settings));
 }

-template <bool RespectNulls = false>
-AggregateFunctionPtr createAggregateFunctionNullableAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
+AggregateFunctionPtr createAggregateFunctionAnyLastRespectNulls(
+    const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
 {
-    return AggregateFunctionPtr(createAggregateFunctionSingleNullableValue<
-                                AggregateFunctionsSingleValue,
-                                AggregateFunctionAnyLastData,
-                                RespectNulls>(name, argument_types, parameters, settings));
+    return AggregateFunctionPtr(createAggregateFunctionSingleValueRespectNulls<false>(name, argument_types, parameters, settings));
 }

 AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
@ -46,26 +230,28 @@ AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, c

 void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
 {
-    AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
+    AggregateFunctionProperties default_properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
+    AggregateFunctionProperties default_properties_for_respect_nulls
+        = {.returns_default_when_only_null = false, .is_order_dependent = true, .is_window_function = true};

-    factory.registerFunction("any", { createAggregateFunctionAny, properties });
+    factory.registerFunction("any", {createAggregateFunctionAny, default_properties});
    factory.registerAlias("any_value", "any", AggregateFunctionFactory::CaseInsensitive);
-    factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
-    factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });
+    factory.registerAlias("first_value", "any", AggregateFunctionFactory::CaseInsensitive);

-    // Synonyms for use as window functions.
-    factory.registerFunction("first_value",
-        { createAggregateFunctionAny, properties },
-        AggregateFunctionFactory::CaseInsensitive);
-    factory.registerFunction("first_value_respect_nulls",
-        { createAggregateFunctionNullableAny<true>, properties },
-        AggregateFunctionFactory::CaseInsensitive);
-    factory.registerFunction("last_value",
-        { createAggregateFunctionAnyLast, properties },
-        AggregateFunctionFactory::CaseInsensitive);
-    factory.registerFunction("last_value_respect_nulls",
-        { createAggregateFunctionNullableAnyLast<true>, properties },
-        AggregateFunctionFactory::CaseInsensitive);
+    factory.registerFunction("any_respect_nulls", {createAggregateFunctionAnyRespectNulls, default_properties_for_respect_nulls});
+    factory.registerAlias("any_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
+    factory.registerAlias("first_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
+
+    factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties});
+    factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::CaseInsensitive);
+
+    factory.registerFunction("anyLast_respect_nulls", {createAggregateFunctionAnyLastRespectNulls, default_properties_for_respect_nulls});
+    factory.registerAlias("last_value_respect_nulls", "anyLast_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
+
+    factory.registerFunction("anyHeavy", {createAggregateFunctionAnyHeavy, default_properties});
+
+    factory.registerNullsActionTransformation("any", "any_respect_nulls");
+    factory.registerNullsActionTransformation("anyLast", "anyLast_respect_nulls");
 }

 }
--- a/src/AggregateFunctions/AggregateFunctionCount.h
+++ b/src/AggregateFunctions/AggregateFunctionCount.h
@ -116,7 +116,7 @@ public:
        /// Return normalized state type: count()
        AggregateFunctionProperties properties;
        return std::make_shared<DataTypeAggregateFunction>(
-            AggregateFunctionFactory::instance().get(getName(), {}, {}, properties), DataTypes{}, Array{});
+            AggregateFunctionFactory::instance().get(getName(), NullsAction::EMPTY, {}, {}, properties), DataTypes{}, Array{});
    }

    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
@ -267,7 +267,7 @@ public:
        /// Return normalized state type: count()
        AggregateFunctionProperties properties;
        return std::make_shared<DataTypeAggregateFunction>(
-            AggregateFunctionFactory::instance().get(getName(), {}, {}, properties), DataTypes{}, Array{});
+            AggregateFunctionFactory::instance().get(getName(), NullsAction::EMPTY, {}, {}, properties), DataTypes{}, Array{});
    }

    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
--- a/src/AggregateFunctions/AggregateFunctionFactory.cpp
+++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp
@ -1,23 +1,11 @@
 #include <AggregateFunctions/AggregateFunctionFactory.h>
 #include <AggregateFunctions/Combinators/AggregateFunctionCombinatorFactory.h>

-#include <DataTypes/DataTypeAggregateFunction.h>
-#include <DataTypes/DataTypeNullable.h>
-#include <DataTypes/DataTypesNumber.h>
 #include <DataTypes/DataTypeLowCardinality.h>
-
-#include <IO/WriteHelpers.h>
-
-#include <Interpreters/Context.h>
-
-#include <Common/StringUtils/StringUtils.h>
-#include <Common/typeid_cast.h>
-#include <Common/CurrentThread.h>
-
-#include <Poco/String.h>
-
+#include <DataTypes/DataTypesNumber.h>
 #include <Functions/FunctionFactory.h>
-
+#include <IO/WriteHelpers.h>
+#include <Interpreters/Context.h>

 static constexpr size_t MAX_AGGREGATE_FUNCTION_NAME_LENGTH = 1000;

@ -28,10 +16,11 @@ struct Settings;

 namespace ErrorCodes
 {
-    extern const int UNKNOWN_AGGREGATE_FUNCTION;
-    extern const int LOGICAL_ERROR;
    extern const int ILLEGAL_AGGREGATION;
+    extern const int LOGICAL_ERROR;
+    extern const int NOT_IMPLEMENTED;
    extern const int TOO_LARGE_STRING_SIZE;
+    extern const int UNKNOWN_AGGREGATE_FUNCTION;
 }

 const String & getAggregateFunctionCanonicalNameIfAny(const String & name)
@ -59,6 +48,23 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat
    }
 }

+void AggregateFunctionFactory::registerNullsActionTransformation(const String & source_ignores_nulls, const String & target_respect_nulls)
+{
+    /// Both functions must already be present in aggregate_functions; the name that is missing is put into the message.
+    if (!aggregate_functions.contains(source_ignores_nulls))
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Source aggregation '{}' not found", source_ignores_nulls);
+    if (!aggregate_functions.contains(target_respect_nulls))
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Target aggregation '{}' not found", target_respect_nulls);
+    /// Record the mapping in both directions; emplace() refuses a key that already has a mapping, which is an error here.
+    if (!respect_nulls.emplace(source_ignores_nulls, target_respect_nulls).second)
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Assignment from '{}' is not unique", source_ignores_nulls);
+
+    if (!ignore_nulls.emplace(target_respect_nulls, source_ignores_nulls).second)
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR, "registerNullsActionTransformation: Assignment from '{}' is not unique", target_respect_nulls);
+}
+
 static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types)
 {
    DataTypes res_types;
@ -70,7 +76,11 @@ static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types)
 }

 AggregateFunctionPtr AggregateFunctionFactory::get(
-    const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const
+    const String & name,
+    NullsAction action,
+    const DataTypes & argument_types,
+    const Array & parameters,
+    AggregateFunctionProperties & out_properties) const
 {
    /// This to prevent costly string manipulation in parsing the aggregate function combinators.
    /// Example: avgArrayArrayArrayArray...(1000 times)...Array
@ -81,8 +91,9 @@ AggregateFunctionPtr AggregateFunctionFactory::get(

    /// If one of the types is Nullable, we apply aggregate function combinator "Null" if it's not window function.
    /// Window functions are not real aggregate functions. Applying combinators doesn't make sense for them,
-    /// they must handle the nullability themselves
-    auto properties = tryGetProperties(name);
+    /// they must handle the nullability themselves.
+    /// Aggregate functions such as any_value_respect_nulls are considered window functions in that sense
+    auto properties = tryGetProperties(name, action);
    bool is_window_function = properties.has_value() && properties->is_window_function;
    if (!is_window_function && std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
        [](const auto & type) { return type->isNullable(); }))
@ -98,8 +109,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
        bool has_null_arguments = std::any_of(types_without_low_cardinality.begin(), types_without_low_cardinality.end(),
            [](const auto & type) { return type->onlyNull(); });

-        AggregateFunctionPtr nested_function = getImpl(
-            name, nested_types, nested_parameters, out_properties, has_null_arguments);
+        AggregateFunctionPtr nested_function = getImpl(name, action, nested_types, nested_parameters, out_properties, has_null_arguments);

        // Pure window functions are not real aggregate functions. Applying
        // combinators doesn't make sense for them, they must handle the
@ -110,22 +120,54 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
            return combinator->transformAggregateFunction(nested_function, out_properties, types_without_low_cardinality, parameters);
    }

-    auto with_original_arguments = getImpl(name, types_without_low_cardinality, parameters, out_properties, false);
+    auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false);

    if (!with_original_arguments)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: AggregateFunctionFactory returned nullptr");
    return with_original_arguments;
 }

+std::optional<AggregateFunctionWithProperties>
+AggregateFunctionFactory::getAssociatedFunctionByNullsAction(const String & name, NullsAction action) const
+{
+    if (action == NullsAction::RESPECT_NULLS)
+    {
+        if (auto it = respect_nulls.find(name); it == respect_nulls.end())
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Function {} does not support RESPECT NULLS", name);
+        else if (auto associated_it = aggregate_functions.find(it->second); associated_it != aggregate_functions.end())
+            return {associated_it->second};
+        else
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR, "Unable to find the function {} (equivalent to '{} RESPECT NULLS')", it->second, name);
+    }
+
+    if (action == NullsAction::IGNORE_NULLS)
+    {
+        if (auto it = ignore_nulls.find(name); it != ignore_nulls.end())
+        {
+            if (auto associated_it = aggregate_functions.find(it->second); associated_it != aggregate_functions.end())
+                return {associated_it->second};
+            else
+                throw Exception(
+                    ErrorCodes::LOGICAL_ERROR, "Unable to find the function {} (equivalent to '{} IGNORE NULLS')", it->second, name);
+        }
+        /// We don't throw for IGNORE NULLS of other functions because that's the default in CH
+    }
+
+    return {};
+}
+

 AggregateFunctionPtr AggregateFunctionFactory::getImpl(
    const String & name_param,
+    NullsAction action,
    const DataTypes & argument_types,
    const Array & parameters,
    AggregateFunctionProperties & out_properties,
    bool has_null_arguments) const
 {
    String name = getAliasToOrName(name_param);
+    String case_insensitive_name;
    bool is_case_insensitive = false;
    Value found;

@ -135,10 +177,14 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
        found = it->second;
    }

-    if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end())
+    if (!found.creator)
    {
-        found = jt->second;
-        is_case_insensitive = true;
+        case_insensitive_name = Poco::toLower(name);
+        if (auto jt = case_insensitive_aggregate_functions.find(case_insensitive_name); jt != case_insensitive_aggregate_functions.end())
+        {
+            found = jt->second;
+            is_case_insensitive = true;
+        }
    }

    ContextPtr query_context;
@ -147,11 +193,14 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(

    if (found.creator)
    {
-        out_properties = found.properties;
+        auto opt = getAssociatedFunctionByNullsAction(is_case_insensitive ? case_insensitive_name : name, action);
+        if (opt)
+            found = *opt;

+        out_properties = found.properties;
        if (query_context && query_context->getSettingsRef().log_queries)
            query_context->addQueryFactoriesInfo(
-                    Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? Poco::toLower(name) : name);
+                Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? case_insensitive_name : name);

        /// The case when aggregate function should return NULL on NULL arguments. This case is handled in "get" method.
        if (!out_properties.returns_default_when_only_null && has_null_arguments)
@ -196,7 +245,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
        DataTypes nested_types = combinator->transformArguments(argument_types);
        Array nested_parameters = combinator->transformParameters(parameters);

-        AggregateFunctionPtr nested_function = get(nested_name, nested_types, nested_parameters, out_properties);
+        AggregateFunctionPtr nested_function = get(nested_name, action, nested_types, nested_parameters, out_properties);
        return combinator->transformAggregateFunction(nested_function, out_properties, argument_types, parameters);
    }

@ -213,16 +262,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl(
        throw Exception(ErrorCodes::UNKNOWN_AGGREGATE_FUNCTION, "Unknown aggregate function {}{}", name, extra_info);
 }

-
-AggregateFunctionPtr AggregateFunctionFactory::tryGet(
-    const String & name, const DataTypes & argument_types, const Array & parameters, AggregateFunctionProperties & out_properties) const
-{
-    return isAggregateFunctionName(name)
-        ? get(name, argument_types, parameters, out_properties)
-        : nullptr;
-}
-
-std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetProperties(String name) const
+std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetProperties(String name, NullsAction action) const
 {
    if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
        throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH);
@ -231,6 +271,8 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
    {
        name = getAliasToOrName(name);
        Value found;
+        String lower_case_name;
+        bool is_case_insensitive = false;

        /// Find by exact match.
        if (auto it = aggregate_functions.find(name); it != aggregate_functions.end())
@ -238,11 +280,23 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
            found = it->second;
        }

-        if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end())
-            found = jt->second;
+        if (!found.creator)
+        {
+            lower_case_name = Poco::toLower(name);
+            if (auto jt = case_insensitive_aggregate_functions.find(lower_case_name); jt != case_insensitive_aggregate_functions.end())
+            {
+                is_case_insensitive = true;
+                found = jt->second;
+            }
+        }

        if (found.creator)
+        {
+            auto opt = getAssociatedFunctionByNullsAction(is_case_insensitive ? lower_case_name : name, action);
+            if (opt)
+                return opt->properties;
            return found.properties;
+        }

        /// Combinators of aggregate functions.
        /// For every aggregate function 'agg' and combiner '-Comb' there is a combined aggregate function with the name 'aggComb',
@ -262,27 +316,29 @@ std::optional<AggregateFunctionProperties> AggregateFunctionFactory::tryGetPrope
 }


-bool AggregateFunctionFactory::isAggregateFunctionName(String name) const
+bool AggregateFunctionFactory::isAggregateFunctionName(const String & name_) const
 {
-    if (name.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
+    if (name_.size() > MAX_AGGREGATE_FUNCTION_NAME_LENGTH)
        throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too long name of aggregate function, maximum: {}", MAX_AGGREGATE_FUNCTION_NAME_LENGTH);

-    while (true)
+    if (aggregate_functions.contains(name_) || isAlias(name_))
+        return true;
+
+    String name_lowercase = Poco::toLower(name_);
+    if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase))
+        return true;
+
+    String name = name_;
+    while (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name))
    {
-        if (aggregate_functions.contains(name) || isAlias(name))
-            return true;
+        name = name.substr(0, name.size() - combinator->getName().size());
+        name_lowercase = name_lowercase.substr(0, name_lowercase.size() - combinator->getName().size());

-        String name_lowercase = Poco::toLower(name);
-        if (case_insensitive_aggregate_functions.contains(name_lowercase) || isAlias(name_lowercase))
+        if (aggregate_functions.contains(name) || isAlias(name) || case_insensitive_aggregate_functions.contains(name_lowercase)
+            || isAlias(name_lowercase))
            return true;
-
-        if (AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(name))
-        {
-            name = name.substr(0, name.size() - combinator->getName().size());
-        }
-        else
-            return false;
    }
+    return false;
 }

 AggregateFunctionFactory & AggregateFunctionFactory::instance()
--- a/src/AggregateFunctions/AggregateFunctionFactory.h
+++ b/src/AggregateFunctions/AggregateFunctionFactory.h
@ -1,9 +1,9 @@
 #pragma once

 #include <AggregateFunctions/IAggregateFunction.h>
-#include <Common/IFactoryWithAliases.h>
 #include <Parsers/ASTFunction.h>
-
+#include <Parsers/NullsAction.h>
+#include <Common/IFactoryWithAliases.h>

 #include <functional>
 #include <memory>
@ -62,36 +62,44 @@ public:
        Value creator,
        CaseSensitiveness case_sensitiveness = CaseSensitive);

+    /// Register how to transform from one aggregate function to another based on NullsAction
+    /// Registers them both ways:
+    /// SOURCE + RESPECT NULLS will be transformed to TARGET
+    /// TARGET + IGNORE NULLS will be transformed to SOURCE
+    void registerNullsActionTransformation(const String & source_ignores_nulls, const String & target_respect_nulls);
+
    /// Throws an exception if not found.
    AggregateFunctionPtr
    get(const String & name,
-        const DataTypes & argument_types,
-        const Array & parameters,
-        AggregateFunctionProperties & out_properties) const;
-
-    /// Returns nullptr if not found.
-    AggregateFunctionPtr tryGet(
-        const String & name,
+        NullsAction action,
        const DataTypes & argument_types,
        const Array & parameters,
        AggregateFunctionProperties & out_properties) const;

    /// Get properties if the aggregate function exists.
-    std::optional<AggregateFunctionProperties> tryGetProperties(String name) const;
+    std::optional<AggregateFunctionProperties> tryGetProperties(String name, NullsAction action) const;

-    bool isAggregateFunctionName(String name) const;
+    bool isAggregateFunctionName(const String & name) const;

 private:
    AggregateFunctionPtr getImpl(
        const String & name,
+        NullsAction action,
        const DataTypes & argument_types,
        const Array & parameters,
        AggregateFunctionProperties & out_properties,
        bool has_null_arguments) const;

    using AggregateFunctions = std::unordered_map<String, Value>;
+    using ActionMap = std::unordered_map<String, String>;

    AggregateFunctions aggregate_functions;
+    /// Mapping from functions with `RESPECT NULLS` modifier to actual aggregate function names
+    /// Example: `any(x) RESPECT NULLS` should be executed as function `any_respect_nulls`
+    ActionMap respect_nulls;
+    /// Same as above for `IGNORE NULLS` modifier
+    ActionMap ignore_nulls;
+    std::optional<AggregateFunctionWithProperties> getAssociatedFunctionByNullsAction(const String & name, NullsAction action) const;

    /// Case insensitive aggregate functions will be additionally added here with lowercased name.
    AggregateFunctions case_insensitive_aggregate_functions;
--- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
+++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
@ -771,26 +771,18 @@ static_assert(


 /// For any other value types.
-template <bool RESULT_IS_NULLABLE = false>
 struct SingleValueDataGeneric
 {
 private:
    using Self = SingleValueDataGeneric;
-
    Field value;
-    bool has_value = false;

 public:
-    static constexpr bool result_is_nullable = RESULT_IS_NULLABLE;
-    static constexpr bool should_skip_null_arguments = !RESULT_IS_NULLABLE;
+    static constexpr bool result_is_nullable = false;
+    static constexpr bool should_skip_null_arguments = true;
    static constexpr bool is_any = false;

-    bool has() const
-    {
-        if constexpr (result_is_nullable)
-            return has_value;
-        return !value.isNull();
-    }
+    bool has() const { return !value.isNull(); }

    void insertResultInto(IColumn & to) const
    {
@ -820,19 +812,9 @@ public:
            serialization.deserializeBinary(value, buf, {});
    }

-    void change(const IColumn & column, size_t row_num, Arena *)
-    {
-        column.get(row_num, value);
-        if constexpr (result_is_nullable)
-            has_value = true;
-    }
+    void change(const IColumn & column, size_t row_num, Arena *) { column.get(row_num, value); }

-    void change(const Self & to, Arena *)
-    {
-        value = to.value;
-        if constexpr (result_is_nullable)
-            has_value = true;
-    }
+    void change(const Self & to, Arena *) { value = to.value; }

    bool changeFirstTime(const IColumn & column, size_t row_num, Arena * arena)
    {
@ -847,7 +829,7 @@ public:

    bool changeFirstTime(const Self & to, Arena * arena)
    {
-        if (!has() && (result_is_nullable || to.has()))
+        if (!has() && to.has())
        {
            change(to, arena);
            return true;
@ -882,30 +864,15 @@ public:
        }
        else
        {
-            if constexpr (result_is_nullable)
+            Field new_value;
+            column.get(row_num, new_value);
+            if (new_value < value)
            {
-                Field new_value;
-                column.get(row_num, new_value);
-                if (!value.isNull() && (new_value.isNull() || new_value < value))
-                {
-                    value = new_value;
-                    return true;
-                }
-                else
-                    return false;
+                value = new_value;
+                return true;
            }
            else
-            {
-                Field new_value;
-                column.get(row_num, new_value);
-                if (new_value < value)
-                {
-                    value = new_value;
-                    return true;
-                }
-                else
-                    return false;
-            }
+                return false;
        }
    }

@ -913,30 +880,13 @@ public:
    {
        if (!to.has())
            return false;
-        if constexpr (result_is_nullable)
+        if (!has() || to.value < value)
        {
-            if (!has())
-            {
-                change(to, arena);
-                return true;
-            }
-            if (to.value.isNull() || (!value.isNull() && to.value < value))
-            {
-                value = to.value;
-                return true;
-            }
-            return false;
+            change(to, arena);
+            return true;
        }
        else
-        {
-            if (!has() || to.value < value)
-            {
-                change(to, arena);
-                return true;
-            }
-            else
-                return false;
-        }
+            return false;
    }

    bool changeIfGreater(const IColumn & column, size_t row_num, Arena * arena)
@ -948,29 +898,15 @@ public:
        }
        else
        {
-            if constexpr (result_is_nullable)
+            Field new_value;
+            column.get(row_num, new_value);
+            if (new_value > value)
            {
-                Field new_value;
-                column.get(row_num, new_value);
-                if (!value.isNull() && (new_value.isNull() || value < new_value))
-                {
-                    value = new_value;
-                    return true;
-                }
-                return false;
+                value = new_value;
+                return true;
            }
            else
-            {
-                Field new_value;
-                column.get(row_num, new_value);
-                if (new_value > value)
-                {
-                    value = new_value;
-                    return true;
-                }
-                else
-                    return false;
-            }
+                return false;
        }
    }

@ -978,36 +914,18 @@ public:
    {
        if (!to.has())
            return false;
-        if constexpr (result_is_nullable)
+        if (!has() || to.value > value)
        {
-            if (!value.isNull() && (to.value.isNull() || value < to.value))
-            {
-                value = to.value;
-                return true;
-            }
-            return false;
+            change(to, arena);
+            return true;
        }
        else
-        {
-            if (!has() || to.value > value)
-            {
-                change(to, arena);
-                return true;
-            }
-            else
-                return false;
-        }
+            return false;
    }

-    bool isEqualTo(const IColumn & column, size_t row_num) const
-    {
-        return has() && value == column[row_num];
-    }
+    bool isEqualTo(const IColumn & column, size_t row_num) const { return has() && value == column[row_num]; }

-    bool isEqualTo(const Self & to) const
-    {
-        return has() && to.value == value;
-    }
+    bool isEqualTo(const Self & to) const { return has() && to.value == value; }

    static bool allocatesMemoryInArena()
    {
--- a/src/AggregateFunctions/AggregateFunctionQuantile.h
+++ b/src/AggregateFunctions/AggregateFunctionQuantile.h
@ -150,7 +150,7 @@ public:
        AggregateFunctionProperties properties;
        return std::make_shared<DataTypeAggregateFunction>(
            AggregateFunctionFactory::instance().get(
-                GatherFunctionQuantileData::toFusedNameOrSelf(getName()), this->argument_types, params, properties),
+                GatherFunctionQuantileData::toFusedNameOrSelf(getName()), NullsAction::EMPTY, this->argument_types, params, properties),
            this->argument_types,
            params);
    }
--- a/src/AggregateFunctions/Combinators/AggregateFunctionArgMinMax.cpp
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionArgMinMax.cpp
@ -20,7 +20,7 @@ template <template <typename> class Data>
 class AggregateFunctionCombinatorArgMinMax final : public IAggregateFunctionCombinator
 {
 public:
-    String getName() const override { return Data<SingleValueDataGeneric<>>::name(); }
+    String getName() const override { return Data<SingleValueDataGeneric>::name(); }

    DataTypes transformArguments(const DataTypes & arguments) const override
    {
@ -66,7 +66,7 @@ public:
        if (which.idx == TypeIndex::String)
            return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataString>>>(nested_function, arguments, params);

-        return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataGeneric<>>>>(nested_function, arguments, params);
+        return std::make_shared<AggregateFunctionArgMinMax<Data<SingleValueDataGeneric>>>(nested_function, arguments, params);
    }
 };

--- a/src/AggregateFunctions/Combinators/AggregateFunctionIf.h
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionIf.h
@ -33,6 +33,8 @@ class AggregateFunctionIf final : public IAggregateFunctionHelper<AggregateFunct
 private:
    AggregateFunctionPtr nested_func;
    size_t num_arguments;
+    /// We accept Nullable(Nothing) as condition, but callees always expect UInt8 so we need to avoid calling them
+    bool only_null_condition = false;

 public:
    AggregateFunctionIf(AggregateFunctionPtr nested, const DataTypes & types, const Array & params_)
@ -42,7 +44,9 @@ public:
        if (num_arguments == 0)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require at least one argument", getName());

-        if (!isUInt8(types.back()) && !types.back()->onlyNull())
+        only_null_condition = types.back()->onlyNull();
+
+        if (!isUInt8(types.back()) && !only_null_condition)
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Last argument for aggregate function {} must be UInt8", getName());
    }

@ -108,6 +112,8 @@ public:

    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
    {
+        if (only_null_condition)
+            return;
        if (assert_cast<const ColumnUInt8 &>(*columns[num_arguments - 1]).getData()[row_num])
            nested_func->add(place, columns, row_num, arena);
    }
@ -121,6 +127,8 @@ public:
        Arena * arena,
        ssize_t) const override
    {
+        if (only_null_condition)
+            return;
        nested_func->addBatch(row_begin, row_end, places, place_offset, columns, arena, num_arguments - 1);
    }

@ -132,6 +140,8 @@ public:
        Arena * arena,
        ssize_t) const override
    {
+        if (only_null_condition)
+            return;
        nested_func->addBatchSinglePlace(row_begin, row_end, place, columns, arena, num_arguments - 1);
    }

@ -144,6 +154,8 @@ public:
        Arena * arena,
        ssize_t) const override
    {
+        if (only_null_condition)
+            return;
        nested_func->addBatchSinglePlaceNotNull(row_begin, row_end, place, columns, null_map, arena, num_arguments - 1);
    }

--- a/src/AggregateFunctions/Combinators/AggregateFunctionMap.cpp
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionMap.cpp
@ -447,7 +447,8 @@ public:
            {
                AggregateFunctionProperties out_properties;
                auto & aggr_func_factory = AggregateFunctionFactory::instance();
-                return aggr_func_factory.get(nested_func_name + "MappedArrays", arguments, params, out_properties);
+                auto action = NullsAction::EMPTY;
+                return aggr_func_factory.get(nested_func_name + "MappedArrays", action, arguments, params, out_properties);
            }
            else
                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregation '{}Map' is not implemented for mapped arrays",
--- a/src/AggregateFunctions/Combinators/AggregateFunctionSimpleState.h
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionSimpleState.h
@ -35,8 +35,8 @@ public:
        auto storage_type_out = DataTypeFactory::instance().get(nested_->getResultType()->getName());
        // Need to make a new function with promoted argument types because SimpleAggregates requires arg_type = return_type.
        AggregateFunctionProperties properties;
-        auto function
-            = AggregateFunctionFactory::instance().get(nested_->getName(), {storage_type_out}, nested_->getParameters(), properties);
+        auto function = AggregateFunctionFactory::instance().get(
+            nested_->getName(), NullsAction::EMPTY, {storage_type_out}, nested_->getParameters(), properties);

        // Need to make a clone because it'll be customized.
        auto storage_type_arg = DataTypeFactory::instance().get(nested_->getResultType()->getName());
--- a/src/AggregateFunctions/HelpersMinMaxAny.h
+++ b/src/AggregateFunctions/HelpersMinMaxAny.h
@ -14,8 +14,9 @@ namespace DB
 struct Settings;

 /// min, max, any, anyLast, anyHeavy, etc...
-template <template <typename> class AggregateFunctionTemplate, template <typename> class Data>
-static IAggregateFunction * createAggregateFunctionSingleValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
+template <template <typename> class AggregateFunctionTemplate, template <typename, bool...> class Data>
+static IAggregateFunction *
+createAggregateFunctionSingleValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
 {
    assertNoParameters(name, parameters);
    assertUnary(name, argument_types);
@ -44,31 +45,9 @@ static IAggregateFunction * createAggregateFunctionSingleValue(const String & na
    if (which.idx == TypeIndex::String)
        return new AggregateFunctionTemplate<Data<SingleValueDataString>>(argument_type);

-    return new AggregateFunctionTemplate<Data<SingleValueDataGeneric<>>>(argument_type);
+    return new AggregateFunctionTemplate<Data<SingleValueDataGeneric>>(argument_type);
 }

-template <template <typename> class AggregateFunctionTemplate, template <typename> class Data, bool RespectNulls = false>
-static IAggregateFunction * createAggregateFunctionSingleNullableValue(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
-{
-    assertNoParameters(name, parameters);
-    assertUnary(name, argument_types);
-
-    const DataTypePtr & argument_type = argument_types[0];
-    WhichDataType which(argument_type);
-    // If the result value could be null (excluding the case that no row is matched),
-    // use SingleValueDataGeneric.
-    if constexpr (!RespectNulls)
-    {
-        return createAggregateFunctionSingleValue<AggregateFunctionTemplate, Data>(name, argument_types, Array(), settings);
-    }
-    else
-    {
-        return new AggregateFunctionTemplate<Data<SingleValueDataGeneric<true>>>(argument_type);
-    }
-    UNREACHABLE();
-}
-
-
 /// argMin, argMax
 template <template <typename> class MinMaxData, typename ResData>
 static IAggregateFunction * createAggregateFunctionArgMinMaxSecond(const DataTypePtr & res_type, const DataTypePtr & val_type)
@ -98,7 +77,7 @@ static IAggregateFunction * createAggregateFunctionArgMinMaxSecond(const DataTyp
    if (which.idx == TypeIndex::String)
        return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataString>>>(res_type, val_type);

-    return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataGeneric<>>>>(res_type, val_type);
+    return new AggregateFunctionArgMinMax<AggregateFunctionArgMinMaxData<ResData, MinMaxData<SingleValueDataGeneric>>>(res_type, val_type);
 }

 template <template <typename> class MinMaxData>
@ -134,7 +113,7 @@ static IAggregateFunction * createAggregateFunctionArgMinMax(const String & name
    if (which.idx == TypeIndex::String)
        return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataString>(res_type, val_type);

-    return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataGeneric<>>(res_type, val_type);
+    return createAggregateFunctionArgMinMaxSecond<MinMaxData, SingleValueDataGeneric>(res_type, val_type);
 }

 }
--- a/src/Analyzer/FunctionNode.cpp
+++ b/src/Analyzer/FunctionNode.cpp
@ -113,6 +113,11 @@ void FunctionNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state

    buffer << ", function_type: " << function_type;

+    if (nulls_action == NullsAction::RESPECT_NULLS)
+        buffer << ", nulls_action : RESPECT_NULLS";
+    else if (nulls_action == NullsAction::IGNORE_NULLS)
+        buffer << ", nulls_action : IGNORE_NULLS";
+
    if (function)
        buffer << ", result_type: " + getResultType()->getName();

@ -140,10 +145,9 @@ void FunctionNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state
 bool FunctionNode::isEqualImpl(const IQueryTreeNode & rhs) const
 {
    const auto & rhs_typed = assert_cast<const FunctionNode &>(rhs);
-    if (function_name != rhs_typed.function_name ||
-        isAggregateFunction() != rhs_typed.isAggregateFunction() ||
-        isOrdinaryFunction() != rhs_typed.isOrdinaryFunction() ||
-        isWindowFunction() != rhs_typed.isWindowFunction())
+    if (function_name != rhs_typed.function_name || isAggregateFunction() != rhs_typed.isAggregateFunction()
+        || isOrdinaryFunction() != rhs_typed.isOrdinaryFunction() || isWindowFunction() != rhs_typed.isWindowFunction()
+        || nulls_action != rhs_typed.nulls_action)
        return false;

    if (isResolved() != rhs_typed.isResolved())
@ -171,6 +175,7 @@ void FunctionNode::updateTreeHashImpl(HashState & hash_state) const
    hash_state.update(isOrdinaryFunction());
    hash_state.update(isAggregateFunction());
    hash_state.update(isWindowFunction());
+    hash_state.update(nulls_action);

    if (!isResolved())
        return;
@ -192,6 +197,7 @@ QueryTreeNodePtr FunctionNode::cloneImpl() const
      */
    result_function->function = function;
    result_function->kind = kind;
+    result_function->nulls_action = nulls_action;
    result_function->wrap_with_nullable = wrap_with_nullable;

    return result_function;
@ -202,6 +208,7 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const
    auto function_ast = std::make_shared<ASTFunction>();

    function_ast->name = function_name;
+    function_ast->nulls_action = nulls_action;

    if (function_name == "nothing")
    {
--- a/src/Analyzer/FunctionNode.h
+++ b/src/Analyzer/FunctionNode.h
@ -5,11 +5,12 @@
 #include <Analyzer/ConstantValue.h>
 #include <Analyzer/IQueryTreeNode.h>
 #include <Analyzer/ListNode.h>
-#include <Common/typeid_cast.h>
 #include <Core/ColumnsWithTypeAndName.h>
 #include <Core/IResolvedFunction.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <Functions/IFunction.h>
+#include <Parsers/NullsAction.h>
+#include <Common/typeid_cast.h>

 namespace DB
 {
@ -63,6 +64,10 @@ public:
    /// Get function name
    const String & getFunctionName() const { return function_name; }

+    /// Get or set the NullsAction modifier
+    NullsAction getNullsAction() const { return nulls_action; }
+    void setNullsAction(NullsAction action) { nulls_action = action; }
+
    /// Get parameters
    const ListNode & getParameters() const { return children[parameters_child_index]->as<const ListNode &>(); }

@ -214,6 +219,7 @@ protected:
 private:
    String function_name;
    FunctionKind kind = FunctionKind::UNKNOWN;
+    NullsAction nulls_action = NullsAction::EMPTY;
    IResolvedFunctionPtr function;
    bool wrap_with_nullable = false;

--- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp
+++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp
@ -184,10 +184,9 @@ private:
        auto function_aggregate_function = function_node.getAggregateFunction();

        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name,
-            { argument->getResultType() },
-            function_aggregate_function->getParameters(),
-            properties);
+        auto action = NullsAction::EMPTY;
+        auto aggregate_function = AggregateFunctionFactory::instance().get(
+            aggregate_function_name, action, {argument->getResultType()}, function_aggregate_function->getParameters(), properties);

        function_node.resolveAsAggregateFunction(std::move(aggregate_function));
    }
--- a/src/Analyzer/Passes/CountDistinctPass.cpp
+++ b/src/Analyzer/Passes/CountDistinctPass.cpp
@ -76,7 +76,8 @@ public:
        /// Replace `countDistinct` of initial query into `count`
        auto result_type = function_node->getResultType();
        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
+        auto action = NullsAction::EMPTY;
+        auto aggregate_function = AggregateFunctionFactory::instance().get("count", action, {}, {}, properties);
        function_node->resolveAsAggregateFunction(std::move(aggregate_function));
        function_node->getArguments().getNodes().clear();
    }
--- a/src/Analyzer/Passes/FuseFunctionsPass.cpp
+++ b/src/Analyzer/Passes/FuseFunctionsPass.cpp
@ -78,9 +78,11 @@ QueryTreeNodePtr createResolvedFunction(const ContextPtr & context, const String
    return function_node;
 }

-FunctionNodePtr createResolvedAggregateFunction(const String & name, const QueryTreeNodePtr & argument, const Array & parameters = {})
+FunctionNodePtr createResolvedAggregateFunction(
+    const String & name, const QueryTreeNodePtr & argument, const Array & parameters = {}, NullsAction action = NullsAction::EMPTY)
 {
    auto function_node = std::make_shared<FunctionNode>(name);
+    function_node->setNullsAction(action);

    if (!parameters.empty())
    {
@ -92,11 +94,7 @@ FunctionNodePtr createResolvedAggregateFunction(const String & name, const Query
    function_node->getArguments().getNodes() = { argument };

    AggregateFunctionProperties properties;
-    auto aggregate_function = AggregateFunctionFactory::instance().get(
-        name,
-        { argument->getResultType() },
-        parameters,
-        properties);
+    auto aggregate_function = AggregateFunctionFactory::instance().get(name, action, {argument->getResultType()}, parameters, properties);
    function_node->resolveAsAggregateFunction(std::move(aggregate_function));

    return function_node;
--- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp
+++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp
@ -56,7 +56,7 @@ private:
    static inline void resolveAsCountAggregateFunction(FunctionNode & function_node)
    {
        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
+        auto aggregate_function = AggregateFunctionFactory::instance().get("count", NullsAction::EMPTY, {}, {}, properties);

        function_node.resolveAsAggregateFunction(std::move(aggregate_function));
    }
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@ -118,6 +118,7 @@ namespace ErrorCodes
    extern const int ILLEGAL_COLUMN;
    extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
    extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
+    extern const int SYNTAX_ERROR;
 }

 /** Query analyzer implementation overview. Please check documentation in QueryAnalysisPass.h first.
@ -1208,7 +1209,8 @@ private:

    static void expandGroupByAll(QueryNode & query_tree_node_typed);

-    static std::string rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, const ContextPtr & context);
+    static std::string
+    rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context);

    static std::optional<JoinTableSide> getColumnSideFromJoinTree(const QueryTreeNodePtr & resolved_identifier, const JoinNode & join_node)
    {
@ -2310,7 +2312,8 @@ void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed)
        recursivelyCollectMaxOrdinaryExpressions(node, group_by_nodes);
 }

-std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, const ContextPtr & context)
+std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(
+    const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context)
 {
    std::string result_aggregate_function_name = aggregate_function_name;
    auto aggregate_function_name_lowercase = Poco::toLower(aggregate_function_name);
@ -2337,7 +2340,7 @@ std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::strin
    bool need_add_or_null = settings.aggregate_functions_null_for_empty && !result_aggregate_function_name.ends_with("OrNull");
    if (need_add_or_null)
    {
-        auto properties = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name);
+        auto properties = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name, action);
        if (!properties->returns_default_when_only_null)
            result_aggregate_function_name += "OrNull";
    }
@ -2349,7 +2352,7 @@ std::string QueryAnalyzer::rewriteAggregateFunctionNameIfNeeded(const std::strin
      */
    if (result_aggregate_function_name.ends_with("OrNull"))
    {
-        auto function_properies = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name);
+        auto function_properies = AggregateFunctionFactory::instance().tryGetProperties(result_aggregate_function_name, action);
        if (function_properies && !function_properies->returns_default_when_only_null)
        {
            size_t function_name_size = result_aggregate_function_name.size();
@ -4591,6 +4594,19 @@ ProjectionNames QueryAnalyzer::resolveLambda(const QueryTreeNodePtr & lambda_nod
    return result_projection_names;
 }

+namespace
+{
+void checkFunctionNodeHasEmptyNullsAction(FunctionNode const & node)
+{
+    if (node.getNullsAction() != NullsAction::EMPTY)
+        throw Exception(
+            ErrorCodes::SYNTAX_ERROR,
+            "Function with name '{}' cannot use {} NULLS",
+            node.getFunctionName(),
+            node.getNullsAction() == NullsAction::IGNORE_NULLS ? "IGNORE" : "RESPECT");
+}
+}
+
 /** Resolve function node in scope.
  * During function node resolve, function node can be replaced with another expression (if it match lambda or sql user defined function),
  * with constant (if it allow constant folding), or with expression list. It is caller responsibility to handle such cases appropriately.
@ -4749,6 +4765,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi

    if (is_special_function_exists)
    {
+        checkFunctionNodeHasEmptyNullsAction(*function_node_ptr);
        /// Rewrite EXISTS (subquery) into 1 IN (SELECT 1 FROM (subquery) LIMIT 1).
        auto & exists_subquery_argument = function_node_ptr->getArguments().getNodes().at(0);

@ -4769,6 +4786,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi

    if (is_special_function_if && !function_node_ptr->getArguments().getNodes().empty())
    {
+        checkFunctionNodeHasEmptyNullsAction(*function_node_ptr);
        /** Handle special case with constant If function, even if some of the arguments are invalid.
          *
          * SELECT if(hasColumnInTable('system', 'numbers', 'not_existing_column'), not_existing_column, 5) FROM system.numbers;
@ -4834,6 +4852,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
    /// Replace right IN function argument if it is table or table function with subquery that read ordinary columns
    if (is_special_function_in)
    {
+        checkFunctionNodeHasEmptyNullsAction(function_node);
        if (scope.context->getSettingsRef().transform_null_in)
        {
            static constexpr std::array<std::pair<std::string_view, std::string_view>, 4> in_function_to_replace_null_in_function_map =
@ -5012,6 +5031,8 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
                    lambda_expression_untyped->formatASTForErrorMessage(),
                    scope.scope_node->formatASTForErrorMessage());

+            checkFunctionNodeHasEmptyNullsAction(function_node);
+
            if (!parameters.empty())
            {
                throw Exception(
@ -5041,6 +5062,8 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
                    "Function 'untuple' must have 1 argument. In scope {}",
                    scope.scope_node->formatASTForErrorMessage());

+            checkFunctionNodeHasEmptyNullsAction(function_node);
+
            const auto & untuple_argument = function_arguments[0];
            auto result_type = untuple_argument->getResultType();
            const auto * tuple_data_type = typeid_cast<const DataTypeTuple *>(result_type.get());
@ -5091,6 +5114,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
                throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION,
                    "Function GROUPING can have up to 64 arguments, but {} provided",
                    function_arguments_size);
+            checkFunctionNodeHasEmptyNullsAction(function_node);

            bool force_grouping_standard_compatibility = scope.context->getSettingsRef().force_grouping_standard_compatibility;
            auto grouping_function = std::make_shared<FunctionGrouping>(force_grouping_standard_compatibility);
@ -5115,10 +5139,12 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
                "Window function '{}' does not support lambda arguments",
                function_name);

-        std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, scope.context);
+        auto action = function_node_ptr->getNullsAction();
+        std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, action, scope.context);

        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name, argument_types, parameters, properties);
+        auto aggregate_function
+            = AggregateFunctionFactory::instance().get(aggregate_function_name, action, argument_types, parameters, properties);

        function_node.resolveAsWindowFunction(std::move(aggregate_function));

@ -5142,7 +5168,11 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
        is_executable_udf = false;
    }

-    if (!function)
+    if (function)
+    {
+        checkFunctionNodeHasEmptyNullsAction(function_node);
+    }
+    else
    {
        if (!AggregateFunctionFactory::instance().isAggregateFunctionName(function_name))
        {
@ -5181,10 +5211,12 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
                "Aggregate function '{}' does not support lambda arguments",
                function_name);

-        std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, scope.context);
+        auto action = function_node_ptr->getNullsAction();
+        std::string aggregate_function_name = rewriteAggregateFunctionNameIfNeeded(function_name, action, scope.context);

        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name, argument_types, parameters, properties);
+        auto aggregate_function
+            = AggregateFunctionFactory::instance().get(aggregate_function_name, action, argument_types, parameters, properties);

        function_node.resolveAsAggregateFunction(std::move(aggregate_function));

--- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp
+++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp
@ -97,6 +97,7 @@ private:
        AggregateFunctionProperties properties;
        auto aggregate_function = AggregateFunctionFactory::instance().get(
            function_node.getFunctionName() + suffix,
+            function_node.getNullsAction(),
            argument_types,
            function_node.getAggregateFunction()->getParameters(),
            properties);
--- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp
+++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp
@ -157,10 +157,8 @@ private:
    static inline void resolveAsCountIfAggregateFunction(FunctionNode & function_node, const DataTypePtr & argument_type)
    {
        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get("countIf",
-            {argument_type},
-            function_node.getAggregateFunction()->getParameters(),
-            properties);
+        auto aggregate_function = AggregateFunctionFactory::instance().get(
+            "countIf", NullsAction::EMPTY, {argument_type}, function_node.getAggregateFunction()->getParameters(), properties);

        function_node.resolveAsAggregateFunction(std::move(aggregate_function));
    }
--- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp
+++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp
@ -76,7 +76,9 @@ public:
            argument_types.emplace_back(function_node_argument->getResultType());

        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get(function_node->getFunctionName(),
+        auto aggregate_function = AggregateFunctionFactory::instance().get(
+            function_node->getFunctionName(),
+            NullsAction::EMPTY,
            argument_types,
            function_node->getAggregateFunction()->getParameters(),
            properties);
--- a/src/Analyzer/Passes/UniqToCountPass.cpp
+++ b/src/Analyzer/Passes/UniqToCountPass.cpp
@ -176,7 +176,7 @@ public:
        if (match_subquery_with_distinct() || match_subquery_with_group_by())
        {
            AggregateFunctionProperties properties;
-            auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
+            auto aggregate_function = AggregateFunctionFactory::instance().get("count", NullsAction::EMPTY, {}, {}, properties);

            function_node->getArguments().getNodes().clear();
            function_node->resolveAsAggregateFunction(std::move(aggregate_function));
--- a/src/Analyzer/QueryTreeBuilder.cpp
+++ b/src/Analyzer/QueryTreeBuilder.cpp
@ -607,6 +607,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
        else
        {
            auto function_node = std::make_shared<FunctionNode>(function->name);
+            function_node->setNullsAction(function->nulls_action);

            if (function->parameters)
            {
--- a/src/Analyzer/Utils.cpp
+++ b/src/Analyzer/Utils.cpp
@ -544,11 +544,8 @@ inline AggregateFunctionPtr resolveAggregateFunction(FunctionNode * function_nod
        argument_types.emplace_back(function_node_argument->getResultType());

    AggregateFunctionProperties properties;
-    return AggregateFunctionFactory::instance().get(
-        function_node->getFunctionName(),
-        argument_types,
-        parameters,
-        properties);
+    auto action = NullsAction::EMPTY;
+    return AggregateFunctionFactory::instance().get(function_node->getFunctionName(), action, argument_types, parameters, properties);
 }

 }
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -451,17 +451,25 @@ void BackupEntriesCollector::gatherDatabaseMetadata(
        }
        catch (...)
        {
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Couldn't get a create query for database {}", database_name);
+            /// Probably the database has been just removed.
+            if (throw_if_database_not_found)
+                throw;
+            LOG_WARNING(log, "Couldn't get a create query for database {}", backQuoteIfNeed(database_name));
+            return;
+        }
+
+        auto * create = create_database_query->as<ASTCreateQuery>();
+        if (create->getDatabase() != database_name)
+        {
+            /// Probably the database has been just renamed. Use the older name for backup to keep the backup consistent.
+            LOG_WARNING(log, "Got a create query with unexpected name {} for database {}",
+                        backQuoteIfNeed(create->getDatabase()), backQuoteIfNeed(database_name));
+            create_database_query = create_database_query->clone();
+            create = create_database_query->as<ASTCreateQuery>();
+            create->setDatabase(database_name);
        }

        database_info.create_database_query = create_database_query;
-        const auto & create = create_database_query->as<const ASTCreateQuery &>();
-
-        if (create.getDatabase() != database_name)
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
-                            "Got a create query with unexpected name {} for database {}",
-                            backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(database_name));
-
        String new_database_name = renaming_map.getNewDatabaseName(database_name);
        database_info.metadata_path_in_backup = root_path_in_backup / "metadata" / (escapeForFileName(new_database_name) + ".sql");
    }
@ -582,26 +590,34 @@ std::vector<std::pair<ASTPtr, StoragePtr>> BackupEntriesCollector::findTablesInD
    }

    std::unordered_set<String> found_table_names;
-    for (const auto & db_table : db_tables)
+    for (auto & db_table : db_tables)
    {
-        const auto & create_table_query = db_table.first;
-        const auto & create = create_table_query->as<const ASTCreateQuery &>();
-        found_table_names.emplace(create.getTable());
+        auto create_table_query = db_table.first;
+        auto * create = create_table_query->as<ASTCreateQuery>();
+        found_table_names.emplace(create->getTable());

        if (database_name == DatabaseCatalog::TEMPORARY_DATABASE)
        {
-            if (!create.temporary)
-                throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
+            if (!create->temporary)
+            {
+                throw Exception(ErrorCodes::LOGICAL_ERROR,
                                "Got a non-temporary create query for {}",
-                                tableNameWithTypeToString(database_name, create.getTable(), false));
+                                tableNameWithTypeToString(database_name, create->getTable(), false));
+            }
        }
        else
        {
-            if (create.getDatabase() != database_name)
-                throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
-                                "Got a create query with unexpected database name {} for {}",
-                                backQuoteIfNeed(create.getDatabase()),
-                                tableNameWithTypeToString(database_name, create.getTable(), false));
+            if (create->getDatabase() != database_name)
+            {
+                /// Probably the table has been just renamed. Use the older name for backup to keep the backup consistent.
+                LOG_WARNING(log, "Got a create query with unexpected database name {} for {}",
+                            backQuoteIfNeed(create->getDatabase()),
+                            tableNameWithTypeToString(database_name, create->getTable(), false));
+                create_table_query = create_table_query->clone();
+                create = create_table_query->as<ASTCreateQuery>();
+                create->setDatabase(database_name);
+                db_table.first = create_table_query;
+            }
        }
    }

--- a/src/Backups/WithRetries.cpp
+++ b/src/Backups/WithRetries.cpp
@ -55,6 +55,10 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const

        callback(my_faulty_zookeeper);
    }
+    else
+    {
+        my_faulty_zookeeper->setKeeper(zookeeper);
+    }
 }

 const WithRetries::KeeperSettings & WithRetries::getKeeperSettings() const
--- a/src/Client/QueryFuzzer.cpp
+++ b/src/Client/QueryFuzzer.cpp
@ -46,6 +46,7 @@
 #include <Common/assert_cast.h>
 #include <Common/typeid_cast.h>

+#include <AggregateFunctions/AggregateFunctionFactory.h>

 namespace DB
 {
@ -384,6 +385,39 @@ void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast)
    // the generic recursion into IAST.children.
 }

+/// Randomly mutates `action` in place.
+/// An EMPTY action is switched to RESPECT_NULLS or IGNORE_NULLS when `fuzz_rand() % 100 == 0`.
+/// In every other case (including an EMPTY action that was not switched above), when `fuzz_rand() % 20 == 0`
+/// the action is overwritten with one of the three values chosen by `fuzz_rand() % 3`.
+/// Otherwise `action` is left untouched.
+void QueryFuzzer::fuzzNullsAction(NullsAction & action)
+{
+    /// If it's not using actions, then there is a high chance it doesn't support them to begin with
+    if ((action == NullsAction::EMPTY) && (fuzz_rand() % 100 == 0))
+    {
+        if (fuzz_rand() % 2 == 0)
+            action = NullsAction::RESPECT_NULLS;
+        else
+            action = NullsAction::IGNORE_NULLS;
+    }
+    else if (fuzz_rand() % 20 == 0)
+    {
+        switch (fuzz_rand() % 3)
+        {
+            case 0:
+            {
+                action = NullsAction::EMPTY;
+                break;
+            }
+            case 1:
+            {
+                action = NullsAction::RESPECT_NULLS;
+                break;
+            }
+            default:
+            {
+                action = NullsAction::IGNORE_NULLS;
+                break;
+            }
+        }
+    }
+}
+
 void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
 {
    switch (fuzz_rand() % 40)
@ -966,6 +1000,9 @@ void QueryFuzzer::fuzz(ASTPtr & ast)
        fuzzColumnLikeExpressionList(fn->arguments.get());
        fuzzColumnLikeExpressionList(fn->parameters.get());

+        if (AggregateUtils::isAggregateFunction(*fn))
+            fuzzNullsAction(fn->nulls_action);
+
        if (fn->is_window_function && fn->window_definition)
        {
            auto & def = fn->window_definition->as<ASTWindowDefinition &>();
--- a/src/Client/QueryFuzzer.h
+++ b/src/Client/QueryFuzzer.h
@ -10,6 +10,7 @@
 #include <Core/Field.h>
 #include <Parsers/ASTExplainQuery.h>
 #include <Parsers/IAST.h>
+#include <Parsers/NullsAction.h>
 #include <Common/randomSeed.h>
 #include "Parsers/IAST_fwd.h"

@ -86,6 +87,7 @@ struct QueryFuzzer
    void fuzzOrderByElement(ASTOrderByElement * elem);
    void fuzzOrderByList(IAST * ast);
    void fuzzColumnLikeExpressionList(IAST * ast);
+    void fuzzNullsAction(NullsAction & action);
    void fuzzWindowFrame(ASTWindowDefinition & def);
    void fuzzCreateQuery(ASTCreateQuery & create);
    void fuzzExplainQuery(ASTExplainQuery & explain);
--- a/src/Client/Suggest.cpp
+++ b/src/Client/Suggest.cpp
@ -32,21 +32,23 @@ namespace ErrorCodes
 Suggest::Suggest()
 {
    /// Keywords may be not up to date with ClickHouse parser.
-    addWords({
-        "CREATE",       "DATABASE", "IF",     "NOT",       "EXISTS",   "TEMPORARY",   "TABLE",    "ON",          "CLUSTER", "DEFAULT",
-        "MATERIALIZED", "ALIAS",    "ENGINE", "AS",        "VIEW",     "POPULATE",    "SETTINGS", "ATTACH",      "DETACH",  "DROP",
-        "RENAME",       "TO",       "ALTER",  "ADD",       "MODIFY",   "CLEAR",       "COLUMN",   "AFTER",       "COPY",    "PROJECT",
-        "PRIMARY",      "KEY",      "CHECK",  "PARTITION", "PART",     "FREEZE",      "FETCH",    "FROM",        "SHOW",    "INTO",
-        "OUTFILE",      "FORMAT",   "TABLES", "DATABASES", "LIKE",     "PROCESSLIST", "CASE",     "WHEN",        "THEN",    "ELSE",
-        "END",          "DESCRIBE", "DESC",   "USE",       "SET",      "OPTIMIZE",    "FINAL",    "DEDUPLICATE", "INSERT",  "VALUES",
-        "SELECT",       "DISTINCT", "SAMPLE", "ARRAY",     "JOIN",     "GLOBAL",      "LOCAL",    "ANY",         "ALL",     "INNER",
-        "LEFT",         "RIGHT",    "FULL",   "OUTER",     "CROSS",    "USING",       "PREWHERE", "WHERE",       "GROUP",   "BY",
-        "WITH",         "TOTALS",   "HAVING", "ORDER",     "COLLATE",  "LIMIT",       "UNION",    "AND",         "OR",      "ASC",
-        "IN",           "KILL",     "QUERY",  "SYNC",      "ASYNC",    "TEST",        "BETWEEN",  "TRUNCATE",    "USER",    "ROLE",
-        "PROFILE",      "QUOTA",    "POLICY", "ROW",       "GRANT",    "REVOKE",      "OPTION",   "ADMIN",       "EXCEPT",  "REPLACE",
-        "IDENTIFIED",   "HOST",     "NAME",   "READONLY",  "WRITABLE", "PERMISSIVE",  "FOR",      "RESTRICTIVE", "RANDOMIZED",
-        "INTERVAL",     "LIMITS",   "ONLY",   "TRACKING",  "IP",       "REGEXP",      "ILIKE",    "CLEANUP",     "APPEND"
-    });
+    addWords({"CREATE",       "DATABASE",      "IF",           "NOT",        "EXISTS",   "TEMPORARY",   "TABLE",      "ON",
+              "CLUSTER",      "DEFAULT",       "MATERIALIZED", "ALIAS",      "ENGINE",   "AS",          "VIEW",       "POPULATE",
+              "SETTINGS",     "ATTACH",        "DETACH",       "DROP",       "RENAME",   "TO",          "ALTER",      "ADD",
+              "MODIFY",       "CLEAR",         "COLUMN",       "AFTER",      "COPY",     "PROJECT",     "PRIMARY",    "KEY",
+              "CHECK",        "PARTITION",     "PART",         "FREEZE",     "FETCH",    "FROM",        "SHOW",       "INTO",
+              "OUTFILE",      "FORMAT",        "TABLES",       "DATABASES",  "LIKE",     "PROCESSLIST", "CASE",       "WHEN",
+              "THEN",         "ELSE",          "END",          "DESCRIBE",   "DESC",     "USE",         "SET",        "OPTIMIZE",
+              "FINAL",        "DEDUPLICATE",   "INSERT",       "VALUES",     "SELECT",   "DISTINCT",    "SAMPLE",     "ARRAY",
+              "JOIN",         "GLOBAL",        "LOCAL",        "ANY",        "ALL",      "INNER",       "LEFT",       "RIGHT",
+              "FULL",         "OUTER",         "CROSS",        "USING",      "PREWHERE", "WHERE",       "GROUP",      "BY",
+              "WITH",         "TOTALS",        "HAVING",       "ORDER",      "COLLATE",  "LIMIT",       "UNION",      "AND",
+              "OR",           "ASC",           "IN",           "KILL",       "QUERY",    "SYNC",        "ASYNC",      "TEST",
+              "BETWEEN",      "TRUNCATE",      "USER",         "ROLE",       "PROFILE",  "QUOTA",       "POLICY",     "ROW",
+              "GRANT",        "REVOKE",        "OPTION",       "ADMIN",      "EXCEPT",   "REPLACE",     "IDENTIFIED", "HOST",
+              "NAME",         "READONLY",      "WRITABLE",     "PERMISSIVE", "FOR",      "RESTRICTIVE", "RANDOMIZED", "INTERVAL",
+              "LIMITS",       "ONLY",          "TRACKING",     "IP",         "REGEXP",   "ILIKE",       "CLEANUP",    "APPEND",
+              "IGNORE NULLS", "RESPECT NULLS", "OVER"});
 }

 static String getLoadSuggestionQuery(Int32 suggestion_limit, bool basic_suggestion)
--- a/src/Common/Arena.h
+++ b/src/Common/Arena.h
@ -3,11 +3,11 @@
 #include <cstring>
 #include <memory>
 #include <vector>
-#include <boost/noncopyable.hpp>
 #include <Core/Defines.h>
-#include <Common/memcpySmall.h>
-#include <Common/ProfileEvents.h>
+#include <boost/noncopyable.hpp>
 #include <Common/Allocator.h>
+#include <Common/ProfileEvents.h>
+#include <Common/memcpySmall.h>

 #if __has_include(<sanitizer/asan_interface.h>) && defined(ADDRESS_SANITIZER)
 #   include <sanitizer/asan_interface.h>
@ -180,7 +180,7 @@ public:
    char * alloc(size_t size)
    {
        used_bytes += size;
-        if (unlikely(head.empty() || static_cast<std::ptrdiff_t>(size) > head.end - head.pos))
+        if (unlikely(head.empty() || size > head.remaining()))
            addMemoryChunk(size);

        char * res = head.pos;
@ -193,6 +193,9 @@ public:
    char * alignedAlloc(size_t size, size_t alignment)
    {
        used_bytes += size;
+        if (unlikely(head.empty() || size > head.remaining()))
+            addMemoryChunk(size + alignment);
+
        do
        {
            void * head_pos = head.pos;
--- a/src/Common/AsyncLoader.cpp
+++ b/src/Common/AsyncLoader.cpp
@ -1,12 +1,24 @@
 #include <Common/AsyncLoader.h>

+#include <limits>
+#include <optional>
 #include <base/defines.h>
+#include <base/scope_guard.h>
 #include <Common/ErrorCodes.h>
 #include <Common/Exception.h>
 #include <Common/noexcept_scope.h>
 #include <Common/setThreadName.h>
 #include <Common/logger_useful.h>
 #include <Common/ThreadPool.h>
+#include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Common/ProfileEvents.h>
+#include <Common/Stopwatch.h>
+
+
+namespace ProfileEvents
+{
+    extern const Event AsyncLoaderWaitMicroseconds;
+}

 namespace DB
 {
@ -16,6 +28,7 @@ namespace ErrorCodes
    extern const int ASYNC_LOAD_CYCLE;
    extern const int ASYNC_LOAD_FAILED;
    extern const int ASYNC_LOAD_CANCELED;
+    extern const int LOGICAL_ERROR;
 }

 static constexpr size_t PRINT_MESSAGE_EACH_N_OBJECTS = 256;
@ -52,63 +65,48 @@ size_t LoadJob::pool() const
    return pool_id;
 }

-void LoadJob::wait() const
-{
-    std::unique_lock lock{mutex};
-    waiters++;
-    finished.wait(lock, [this] { return load_status != LoadStatus::PENDING; });
-    waiters--;
-    if (load_exception)
-        std::rethrow_exception(load_exception);
-}
-
-void LoadJob::waitNoThrow() const noexcept
-{
-    std::unique_lock lock{mutex};
-    waiters++;
-    finished.wait(lock, [this] { return load_status != LoadStatus::PENDING; });
-    waiters--;
-}
-
 size_t LoadJob::waitersCount() const
 {
    std::unique_lock lock{mutex};
    return waiters;
 }

-void LoadJob::ok()
+size_t LoadJob::ok()
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::OK;
-    finish();
+    return finish();
 }

-void LoadJob::failed(const std::exception_ptr & ptr)
+size_t LoadJob::failed(const std::exception_ptr & ptr)
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::FAILED;
    load_exception = ptr;
-    finish();
+    return finish();
 }

-void LoadJob::canceled(const std::exception_ptr & ptr)
+size_t LoadJob::canceled(const std::exception_ptr & ptr)
 {
    std::unique_lock lock{mutex};
    load_status = LoadStatus::CANCELED;
    load_exception = ptr;
-    finish();
+    return finish();
 }

-void LoadJob::finish()
+size_t LoadJob::finish()
 {
-    func = {}; // To ensure job function is destructed before `AsyncLoader::wait()` and `LoadJob::wait()` return
+    func = {}; // To ensure job function is destructed before `AsyncLoader::wait()` return
    finish_time = std::chrono::system_clock::now();
    if (waiters > 0)
        finished.notify_all();
+    return std::exchange(suspended_waiters, 0);
 }

-void LoadJob::scheduled()
+void LoadJob::scheduled(UInt64 job_id_)
 {
+    chassert(job_id == 0); // Job cannot be scheduled twice
+    job_id = job_id_;
    schedule_time = std::chrono::system_clock::now();
 }

@ -118,11 +116,11 @@ void LoadJob::enqueued()
        enqueue_time = std::chrono::system_clock::now();
 }

-void LoadJob::execute(size_t pool, const LoadJobPtr & self)
+void LoadJob::execute(AsyncLoader & loader, size_t pool, const LoadJobPtr & self)
 {
    execution_pool_id = pool;
    start_time = std::chrono::system_clock::now();
-    func(self);
+    func(loader, self);
 }


@ -180,11 +178,11 @@ AsyncLoader::AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool lo
                init.metric_threads,
                init.metric_active_threads,
                init.metric_scheduled_threads,
-                init.max_threads,
-                /* max_free_threads = */ 0,
-                init.max_threads),
+                /* max_threads = */ std::numeric_limits<size_t>::max(), // Unlimited number of threads, we do worker management ourselves
+                /* max_free_threads = */ 0, // We do not require free threads
+                /* queue_size = */0), // Unlimited queue to avoid blocking during worker spawning
            .ready_queue = {},
-            .max_threads = init.max_threads
+            .max_threads = init.max_threads > 0 ? init.max_threads : getNumberOfPhysicalCPUCores()
        });
 }

@ -228,16 +226,16 @@ void AsyncLoader::stop()
 void AsyncLoader::schedule(LoadTask & task)
 {
    chassert(this == &task.loader);
-    scheduleImpl(task.jobs);
+    schedule(task.jobs);
 }

 void AsyncLoader::schedule(const LoadTaskPtr & task)
 {
    chassert(this == &task->loader);
-    scheduleImpl(task->jobs);
+    schedule(task->jobs);
 }

-void AsyncLoader::schedule(const std::vector<LoadTaskPtr> & tasks)
+void AsyncLoader::schedule(const LoadTaskPtrs & tasks)
 {
    LoadJobSet all_jobs;
    for (const auto & task : tasks)
@ -245,10 +243,10 @@ void AsyncLoader::schedule(const std::vector<LoadTaskPtr> & tasks)
        chassert(this == &task->loader);
        all_jobs.insert(task->jobs.begin(), task->jobs.end());
    }
-    scheduleImpl(all_jobs);
+    schedule(all_jobs);
 }

-void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
+void AsyncLoader::schedule(const LoadJobSet & jobs_to_schedule)
 {
    std::unique_lock lock{mutex};

@ -264,7 +262,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
    // 1) exclude already scheduled or finished jobs
    // 2) include assigned job dependencies (that are not yet scheduled)
    LoadJobSet jobs;
-    for (const auto & job : input_jobs)
+    for (const auto & job : jobs_to_schedule)
        gatherNotScheduled(job, jobs, lock);

    // Ensure scheduled_jobs graph will have no cycles. The only way to get a cycle is to add a cycle, assuming old jobs cannot reference new ones.
@ -280,7 +278,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
        NOEXCEPT_SCOPE({
            ALLOW_ALLOCATIONS_IN_SCOPE;
            scheduled_jobs.try_emplace(job);
-            job->scheduled();
+            job->scheduled(++last_job_id);
        });
    }

@ -365,11 +363,20 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool)
    if (!job)
        return;
    chassert(new_pool < pools.size());
+
    DENY_ALLOCATIONS_IN_SCOPE;
    std::unique_lock lock{mutex};
    prioritize(job, new_pool, lock);
 }

+// Waits for `job` to leave the PENDING status while holding `job->mutex`.
+// Unless `no_throw` is set, rethrows the exception stored in the job (if any) after the wait.
+void AsyncLoader::wait(const LoadJobPtr & job, bool no_throw)
+{
+    std::unique_lock guard{job->mutex};
+    wait(guard, job);
+
+    if (no_throw || !job->load_exception)
+        return;
+    std::rethrow_exception(job->load_exception);
+}
+
 void AsyncLoader::remove(const LoadJobSet & jobs)
 {
    DENY_ALLOCATIONS_IN_SCOPE;
@ -397,9 +404,10 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
        if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
        {
            // Job is currently executing
+            ALLOW_ALLOCATIONS_IN_SCOPE;
            chassert(info->second.isExecuting());
            lock.unlock();
-            job->waitNoThrow(); // Wait for job to finish
+            wait(job, /* no_throw = */ true); // Wait for job to finish
            lock.lock();
        }
    }
@ -415,10 +423,12 @@ void AsyncLoader::remove(const LoadJobSet & jobs)

 void AsyncLoader::setMaxThreads(size_t pool, size_t value)
 {
+    if (value == 0)
+        value = getNumberOfPhysicalCPUCores();
    std::unique_lock lock{mutex};
    auto & p = pools[pool];
-    p.thread_pool->setMaxThreads(value);
-    p.thread_pool->setQueueSize(value); // Keep queue size equal max threads count to avoid blocking during spawning
+    // Note that underlying `ThreadPool` always has unlimited `queue_size` and `max_threads`.
+    // Worker management is done by `AsyncLoader` based on `Pool::max_threads + Pool::suspended_workers` instead.
    p.max_threads = value;
    if (!is_running)
        return;
@ -442,7 +452,6 @@ Priority AsyncLoader::getPoolPriority(size_t pool) const
    return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
 }

-
 size_t AsyncLoader::getScheduledJobCount() const
 {
    std::unique_lock lock{mutex};
@ -479,11 +488,11 @@ void AsyncLoader::checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mute
    while (!left.empty())
    {
        LoadJobPtr job = *left.begin();
-        checkCycleImpl(job, left, visited, lock);
+        checkCycle(job, left, visited, lock);
    }
 }

-String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock)
+String AsyncLoader::checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock)
 {
    if (!left.contains(job))
        return {}; // Do not consider external dependencies and already processed jobs
@ -494,7 +503,7 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
    }
    for (const auto & dep : job->dependencies)
    {
-        if (auto chain = checkCycleImpl(dep, left, visited, lock); !chain.empty())
+        if (auto chain = checkCycle(dep, left, visited, lock); !chain.empty())
        {
            if (!visited.contains(job)) // Check for cycle end
                throw Exception(ErrorCodes::ASYNC_LOAD_CYCLE, "Load job dependency cycle detected: {} -> {}", job->name, chain);
@ -509,10 +518,11 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
 void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock)
 {
    chassert(scheduled_jobs.contains(job)); // Job was pending
+    size_t resumed_workers = 0; // Number of workers resumed in the execution pool of the job
    if (status == LoadStatus::OK)
    {
        // Notify waiters
-        job->ok();
+        resumed_workers += job->ok();

        // Update dependent jobs and enqueue if ready
        for (const auto & dep : scheduled_jobs[job].dependent_jobs)
@ -528,9 +538,9 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
    {
        // Notify waiters
        if (status == LoadStatus::FAILED)
-            job->failed(exception_from_job);
+            resumed_workers += job->failed(exception_from_job);
        else if (status == LoadStatus::CANCELED)
-            job->canceled(exception_from_job);
+            resumed_workers += job->canceled(exception_from_job);

        Info & info = scheduled_jobs[job];
        if (info.isReady())
@ -572,35 +582,40 @@ void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::excepti
        if (log_progress)
            logAboutProgress(log, finished_jobs.size() - old_jobs, finished_jobs.size() + scheduled_jobs.size() - old_jobs, stopwatch);
    });
+
+    if (resumed_workers)
+    {
+        Pool & pool = pools[job->executionPool()];
+        pool.suspended_workers -= resumed_workers;
+    }
 }

 void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock)
 {
+    Pool & old_pool = pools[job->pool_id];
+    Pool & new_pool = pools[new_pool_id];
+    if (old_pool.priority <= new_pool.priority)
+        return; // Never lower priority or change pool leaving the same priority
+
+    // Note that there is no point in prioritizing finished jobs, but because we do not lock `job.mutex` here (due to recursion),
+    // races are inevitable, so we prioritize all jobs unconditionally: both finished and pending.
+
    if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
    {
-        Pool & old_pool = pools[job->pool_id];
-        Pool & new_pool = pools[new_pool_id];
-        if (old_pool.priority <= new_pool.priority)
-            return; // Never lower priority or change pool leaving the same priority
-
-        // Update priority and push job forward through ready queue if needed
-        UInt64 ready_seqno = info->second.ready_seqno;
-
        // Requeue job into the new pool queue without allocations
-        if (ready_seqno)
+        if (UInt64 ready_seqno = info->second.ready_seqno)
        {
            new_pool.ready_queue.insert(old_pool.ready_queue.extract(ready_seqno));
            if (canSpawnWorker(new_pool, lock))
                spawn(new_pool, lock);
        }
-
-        // Set user-facing pool (may affect executing jobs)
-        job->pool_id.store(new_pool_id);
-
-        // Recurse into dependencies
-        for (const auto & dep : job->dependencies)
-            prioritize(dep, new_pool_id, lock);
    }
+
+    job->pool_id.store(new_pool_id);
+
+    // Recurse into dependencies
+    for (const auto & dep : job->dependencies)
+        prioritize(dep, new_pool_id, lock);
 }

 void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock)
@ -620,11 +635,102 @@ void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<
        spawn(pool, lock);
 }

+// Keep track of currently executing load jobs to be able to:
+// 1) Detect "wait dependent" deadlocks -- throw LOGICAL_ERROR
+//    (when job A function waits for job B that depends on job A)
+// 2) Detect "wait not scheduled" deadlocks -- throw LOGICAL_ERROR
+//    (thread T is waiting on an assigned job A, but job A is not yet scheduled)
+// 3) Resolve "priority inversion" deadlocks -- apply priority inheritance
+//    (when high-priority job A function waits for a lower-priority job B, and B never starts due to its priority)
+// 4) Resolve "blocked pool" deadlocks -- spawn more workers
+//    (when job A in pool P waits for another ready job B in P, but B never starts because there are no free workers in P)
+thread_local LoadJob * current_load_job = nullptr;
+
+// Returns the execution pool of the load job running on this thread, or `pool` if no load job is running here.
+size_t currentPoolOr(size_t pool)
+{
+    if (current_load_job)
+        return current_load_job->executionPool();
+    return pool;
+}
+
+// Returns true if `waited` is the load job running on this thread, or (transitively) depends on it.
+bool detectWaitDependentDeadlock(const LoadJobPtr & waited)
+{
+    // The job running on this thread was reached while walking the dependency graph
+    if (current_load_job == waited.get())
+        return true;
+
+    // Depth-first search through the dependencies of the waited job
+    for (const auto & dependency : waited->dependencies)
+    {
+        if (detectWaitDependentDeadlock(dependency))
+            return true;
+    }
+
+    return false;
+}
+
+// Blocks the calling thread until `job` reaches a non-PENDING status.
+// `job_lock` is expected to hold `job->mutex` on entry; it may be released temporarily for deadlock resolution
+// and is held again when the function returns.
+// Throws LOGICAL_ERROR if `job` was never scheduled, or if `job` is (or transitively depends on)
+// the load job executing on this thread.
+void AsyncLoader::wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr & job)
+{
+    // Ensure job we are going to wait was scheduled to avoid "wait not scheduled" deadlocks
+    if (job->job_id == 0)
+    {
+        // `current_load_job` is null when the caller is not executing a load job,
+        // so it must not be dereferenced unconditionally while building the error message.
+        if (current_load_job)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Load job '{}' waits for not scheduled load job '{}'", current_load_job->name, job->name);
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Waiting for not scheduled load job '{}'", job->name);
+    }
+
+    // Deadlock detection and resolution (only relevant when the waiter is itself a load job)
+    if (current_load_job && job->load_status == LoadStatus::PENDING)
+    {
+        if (detectWaitDependentDeadlock(job))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Load job '{}' waits for dependent load job '{}'", current_load_job->name, job->name);
+
+        auto worker_pool = current_load_job->executionPool();
+        auto worker_priority = getPoolPriority(worker_pool);
+        auto job_priority = getPoolPriority(job->pool_id);
+
+        // Waiting for a lower-priority job ("priority inversion" deadlock) is resolved using priority inheritance.
+        if (worker_priority < job_priority)
+        {
+            job_lock.unlock(); // Avoid reverse locking order
+            prioritize(job, worker_pool);
+            job_lock.lock();
+        }
+
+        // Spawn more workers to avoid exhaustion of worker pool ("blocked pool" deadlock)
+        if (worker_pool == job->pool_id)
+        {
+            job_lock.unlock(); // Avoid reverse locking order
+            workerIsSuspendedByWait(worker_pool, job);
+            job_lock.lock();
+        }
+    }
+
+    // The predicate is evaluated before blocking, so a job that finished while the lock was released is handled correctly
+    Stopwatch watch;
+    job->waiters++;
+    job->finished.wait(job_lock, [&] { return job->load_status != LoadStatus::PENDING; });
+    job->waiters--;
+    ProfileEvents::increment(ProfileEvents::AsyncLoaderWaitMicroseconds, watch.elapsedMicroseconds());
+}
+
+// Accounts for a worker of pool `pool_id` that is about to block waiting for `job`.
+// Takes the loader mutex first and the job mutex second. If the job is still PENDING, the worker is counted
+// as suspended both in the pool and in the job, and one more worker is spawned when `canSpawnWorker()` allows it.
+void AsyncLoader::workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job)
+{
+    std::unique_lock lock{mutex};
+    std::unique_lock job_lock{job->mutex};
+
+    if (job->load_status != LoadStatus::PENDING)
+        return; // Job is already done, worker can continue execution
+
+    // To resolve "blocked pool" deadlocks we spawn a new worker for every suspended worker, if required
+    // This can lead to a visible excess of `max_threads` specified for a pool,
+    // but actual number of NOT suspended workers may exceed `max_threads` ONLY in intermittent state.
+    Pool & pool = pools[pool_id];
+    pool.suspended_workers++;
+    job->suspended_waiters++;
+    if (canSpawnWorker(pool, lock))
+        spawn(pool, lock);
+
+    // TODO(serxa): it is a good idea to propagate `job` and all its dependencies in `pool.ready_queue` by introducing
+    // key {suspended_waiters, ready_seqno} instead of plain `ready_seqno`, to force newly spawned workers to work on jobs
+    // that are being waited. But it doesn't affect correctness. So let's not complicate it for time being.
+}
+
 bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
 {
+    // TODO(serxa): optimization: we should not spawn new worker on the first enqueue during `finish()` because current worker will take this job.
    return is_running
        && !pool.ready_queue.empty()
-        && pool.workers < pool.max_threads
+        && pool.workers < pool.max_threads + pool.suspended_workers
        && (!current_priority || *current_priority >= pool.priority);
 }

@ -632,7 +738,7 @@ bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
 {
    return is_running
        && !pool.ready_queue.empty()
-        && pool.workers <= pool.max_threads
+        && pool.workers <= pool.max_threads + pool.suspended_workers
        && (!current_priority || *current_priority >= pool.priority);
 }

@ -705,7 +811,9 @@ void AsyncLoader::worker(Pool & pool)

        try
        {
-            job->execute(pool_id, job);
+            current_load_job = job.get();
+            SCOPE_EXIT({ current_load_job = nullptr; }); // Note that recursive job execution is not supported
+            job->execute(*this, pool_id, job);
            exception_from_job = {};
        }
        catch (...)
--- a/src/Common/AsyncLoader.h
+++ b/src/Common/AsyncLoader.h
@ -21,6 +21,16 @@ namespace Poco { class Logger; }
 namespace DB
 {

+// TERMINOLOGY:
+// Job (`LoadJob`) - The smallest part of loading process, executed by worker. Job can depend on the other jobs. Jobs are grouped in tasks.
+// Task (`LoadTask`) - Owning holder of a set of jobs. Should be held during the whole job lifetime. Cancels all jobs on destruction.
+// Goal jobs (goals) - a subset of "final" jobs of a task (usually no job in task depends on a goal job).
+//      By default all jobs in task are included in goal jobs.
+//      Goals should be used if you need to create a job that depends on a task (to avoid placing all jobs of the task in dependencies).
+// Pool (worker pool) - A set of workers with specific priority. Every job is assigned to a pool. Job can change its pool dynamically.
+// Priority (pool priority) - Constant integer value showing relative priority of a pool. Lower value means higher priority.
+// AsyncLoader - scheduling system responsible for job dependency tracking and worker management respecting pool priorities.
+
 class LoadJob;
 using LoadJobPtr = std::shared_ptr<LoadJob>;
 using LoadJobSet = std::unordered_set<LoadJobPtr>;
@ -43,6 +53,7 @@ enum class LoadStatus
 // Smallest indivisible part of a loading process. Load job can have multiple dependencies, thus jobs constitute a direct acyclic graph (DAG).
 // Job encapsulates a function to be executed by `AsyncLoader` as soon as job functions of all dependencies are successfully executed.
 // Job can be waited for by an arbitrary number of threads. See `AsyncLoader` class description for more details.
+// WARNING: jobs are usually held with ownership by tasks (see `LoadTask`). You are encouraged to add jobs into a task as soon as they are created.
 class LoadJob : private boost::noncopyable
 {
 public:
@ -50,6 +61,7 @@ public:
    LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, Func && func_)
        : dependencies(std::forward<LoadJobSetType>(dependencies_))
        , name(std::move(name_))
+        , execution_pool_id(pool_id_)
        , pool_id(pool_id_)
        , func(std::forward<Func>(func_))
    {}
@ -67,18 +79,12 @@ public:
    // Value may change during job execution by `prioritize()`.
    size_t pool() const;

-    // Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
-    // Throws if job is FAILED or CANCELED. Returns or throws immediately if called on non-pending job.
-    void wait() const;
-
-    // Wait for a job to reach any non PENDING status.
-    void waitNoThrow() const noexcept;
-
-    // Returns number of threads blocked by `wait()` or `waitNoThrow()` calls.
+    // Returns number of threads blocked by `wait()` calls.
    size_t waitersCount() const;

    // Introspection
    using TimePoint = std::chrono::system_clock::time_point;
+    UInt64 jobId() const { return job_id; }
    TimePoint scheduleTime() const { return schedule_time; }
    TimePoint enqueueTime() const { return enqueue_time; }
    TimePoint startTime() const { return start_time; }
@ -90,22 +96,24 @@ public:
 private:
    friend class AsyncLoader;

-    void ok();
-    void failed(const std::exception_ptr & ptr);
-    void canceled(const std::exception_ptr & ptr);
-    void finish();
+    [[nodiscard]] size_t ok();
+    [[nodiscard]] size_t failed(const std::exception_ptr & ptr);
+    [[nodiscard]] size_t canceled(const std::exception_ptr & ptr);
+    [[nodiscard]] size_t finish();

-    void scheduled();
+    void scheduled(UInt64 job_id_);
    void enqueued();
-    void execute(size_t pool, const LoadJobPtr & self);
+    void execute(AsyncLoader & loader, size_t pool, const LoadJobPtr & self);

+    std::atomic<UInt64> job_id{0};
    std::atomic<size_t> execution_pool_id;
    std::atomic<size_t> pool_id;
-    std::function<void(const LoadJobPtr & self)> func;
+    std::function<void(AsyncLoader & loader, const LoadJobPtr & self)> func;

    mutable std::mutex mutex;
    mutable std::condition_variable finished;
-    mutable size_t waiters = 0;
+    mutable size_t waiters = 0; // All waiters, including suspended
+    mutable size_t suspended_waiters = 0;
    LoadStatus load_status{LoadStatus::PENDING};
    std::exception_ptr load_exception;

@ -117,7 +125,7 @@ private:

 struct EmptyJobFunc
 {
-    void operator()(const LoadJobPtr &) {}
+    void operator()(AsyncLoader &, const LoadJobPtr &) {}
 };

 template <class Func = EmptyJobFunc>
@ -144,6 +152,7 @@ LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String n
    return std::make_shared<LoadJob>(dependencies, std::move(name), pool_id, std::forward<Func>(func));
 }

+
 // Represents a logically connected set of LoadJobs required to achieve some goals (final LoadJob in the set).
 class LoadTask : private boost::noncopyable
 {
@ -168,10 +177,11 @@ public:
    //   auto load_task = loadSomethingAsync(async_loader, load_after_task.goals(), something);
    const LoadJobSet & goals() const { return goal_jobs.empty() ? jobs : goal_jobs; }

+    AsyncLoader & loader;
+
 private:
    friend class AsyncLoader;

-    AsyncLoader & loader;
    LoadJobSet jobs;
    LoadJobSet goal_jobs;
 };
@ -181,91 +191,6 @@ inline LoadTaskPtr makeLoadTask(AsyncLoader & loader, LoadJobSet && jobs, LoadJo
    return std::make_shared<LoadTask>(loader, std::move(jobs), std::move(goals));
 }

-inline void scheduleLoad(const LoadTaskPtr & task)
-{
-    task->schedule();
-}
-
-inline void scheduleLoad(const LoadTaskPtrs & tasks)
-{
-    for (const auto & task : tasks)
-        task->schedule();
-}
-
-template <class... Args>
-inline void scheduleLoadAll(Args && ... args)
-{
-    (scheduleLoad(std::forward<Args>(args)), ...);
-}
-
-inline void waitLoad(const LoadJobSet & jobs)
-{
-    for (const auto & job : jobs)
-        job->wait();
-}
-
-inline void waitLoad(const LoadTaskPtr & task)
-{
-    waitLoad(task->goals());
-}
-
-inline void waitLoad(const LoadTaskPtrs & tasks)
-{
-    for (const auto & task : tasks)
-        waitLoad(task->goals());
-}
-
-template <class... Args>
-inline void waitLoadAll(Args && ... args)
-{
-    (waitLoad(std::forward<Args>(args)), ...);
-}
-
-template <class... Args>
-inline void scheduleAndWaitLoadAll(Args && ... args)
-{
-    scheduleLoadAll(std::forward<Args>(args)...);
-    waitLoadAll(std::forward<Args>(args)...);
-}
-
-inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
-{
-    LoadJobSet result;
-    for (const auto & task : tasks)
-        result.insert(task->goals().begin(), task->goals().end());
-    return result;
-}
-
-inline LoadJobSet getGoalsOr(const LoadTaskPtrs & tasks, const LoadJobSet & alternative)
-{
-    LoadJobSet result;
-    for (const auto & task : tasks)
-        result.insert(task->goals().begin(), task->goals().end());
-    return result.empty() ? alternative : result;
-}
-
-inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
-{
-    LoadJobSet result;
-    if (!jobs1.empty())
-        result.insert(jobs1.begin(), jobs1.end());
-    if (!jobs2.empty())
-        result.insert(jobs2.begin(), jobs2.end());
-    return result;
-}
-
-inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs & tasks2)
-{
-    if (tasks1.empty())
-        return tasks2;
-    if (tasks2.empty())
-        return tasks1;
-    LoadTaskPtrs result;
-    result.reserve(tasks1.size() + tasks2.size());
-    result.insert(result.end(), tasks1.begin(), tasks1.end());
-    result.insert(result.end(), tasks2.begin(), tasks2.end());
-    return result;
-}

 // `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
 // Basic usage example:
@ -277,8 +202,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
 //
 //     // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
 //     // Job2 and job3 depend on job1 and are run only after job1 completion.
-//     auto job_func = [&] (const LoadJobPtr & self) {
-//         LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
+//     auto job_func = [&] (AsyncLoader & loader, const LoadJobPtr & self) {
+//         LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, loader.getPoolName(self->pool()));
 //     };
 //     auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
 //     auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
@ -287,8 +212,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
 //     task.schedule();
 //
 //     // Another thread may prioritize a job by changing its pool and wait for it:
-//     async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
-//     job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
+//     async_loader.prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
+//     async_loader.wait(job3); // Blocks until job completion or cancellation and rethrow an exception (if any)
 //
 // Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
 // Each pool has a constant priority and a mutable maximum number of threads.
@ -341,7 +266,8 @@ private:
        std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
        std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
        size_t max_threads; // Max number of workers to be spawn
-        size_t workers = 0; // Number of currently execution workers
+        size_t workers = 0; // Number of currently executing workers
+        size_t suspended_workers = 0; // Number of workers that are blocked by `wait()` call on a job executing in the same pool (for deadlock resolution)

        bool isActive() const { return workers > 0 || !ready_queue.empty(); }
    };
@ -369,7 +295,7 @@ public:
        Metric metric_threads;
        Metric metric_active_threads;
        Metric metric_scheduled_threads;
-        size_t max_threads;
+        size_t max_threads; // Zero means use all CPU cores
        Priority priority;
    };

@ -399,6 +325,7 @@ public:
    // and are removed from AsyncLoader, so it is thread-safe to destroy them.
    void schedule(LoadTask & task);
    void schedule(const LoadTaskPtr & task);
+    void schedule(const LoadJobSet & jobs_to_schedule);

    // Schedule all tasks atomically. To ensure only highest priority jobs among all tasks are run first.
    void schedule(const LoadTaskPtrs & tasks);
@ -407,6 +334,11 @@ public:
    // Jobs from higher (than `new_pool`) priority pools are not changed.
    void prioritize(const LoadJobPtr & job, size_t new_pool);

+    // Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
+    // Throws if job is FAILED or CANCELED unless `no_throw` is set. Returns or throws immediately if called on non-pending job.
+    // If job was not scheduled, it will be implicitly scheduled before the wait (deadlock auto-resolution).
+    void wait(const LoadJobPtr & job, bool no_throw = false);
+
    // Remove finished jobs, cancel scheduled jobs, wait for executing jobs to finish and remove them.
    void remove(const LoadJobSet & jobs);

@ -430,23 +362,26 @@ public:
        bool is_executing = false;
    };

-    // For introspection and debug only, see `system.async_loader` table
+    // For introspection and debug only, see `system.async_loader` table.
    std::vector<JobState> getJobStates() const;

+    // For deadlock resolution. Should not be used directly.
+    void workerIsSuspendedByWait(size_t pool_id, const LoadJobPtr & job);
+
 private:
    void checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
-    String checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
+    String checkCycle(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
    void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock);
-    void scheduleImpl(const LoadJobSet & input_jobs);
    void gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
    void prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock);
    void enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock);
-    bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &);
-    bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &);
-    void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> &);
-    void spawn(Pool & pool, std::unique_lock<std::mutex> &);
+    void wait(std::unique_lock<std::mutex> & job_lock, const LoadJobPtr & job);
+    bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> & lock);
+    bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> & lock);
+    void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock);
+    void spawn(Pool & pool, std::unique_lock<std::mutex> & lock);
    void worker(Pool & pool);
-    bool hasWorker(std::unique_lock<std::mutex> &) const;
+    bool hasWorker(std::unique_lock<std::mutex> & lock) const;

    // Logging
    const bool log_failures; // Worker should log all exceptions caught from job functions.
@ -457,6 +392,7 @@ private:
    bool is_running = true;
    std::optional<Priority> current_priority; // highest priority among active pools
    UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
+    UInt64 last_job_id = 0; // Increasing counter for job IDs
    std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
    std::vector<Pool> pools; // Thread pools for job execution and ready queues
    LoadJobSet finished_jobs; // Set of finished jobs (for introspection only, until jobs are removed).
@ -465,4 +401,136 @@ private:
    std::chrono::system_clock::time_point busy_period_start_time;
 };

+// === HELPER FUNCTIONS ===
+// There are three types of helper functions:
+//  scheduleLoad([loader], {jobs|task|tasks}):
+//      Just schedule jobs for async loading.
+//      Note that normally function `doSomethingAsync()` returns you a task which is NOT scheduled.
+//      This is done to allow you:
+//          (1) construct complex dependency graph offline.
+//          (2) schedule tasks simultaneously to respect their relative priorities.
+//          (3) do prioritization independently, before scheduling.
+//  prioritizeLoad([loader], pool_id, {jobs|task|tasks}):
+//      Prioritize jobs w/o waiting for them.
+//      Note that prioritization may be done
+//          (1) before scheduling (to ensure all jobs are started in the correct pools)
+//          (2) after scheduling (for dynamic prioritization, e.g. when new query arrives)
+//  waitLoad([loader], pool_id, {jobs|task|tasks}, [no_throw]):
+//      Prioritize and wait for jobs.
+//      Note that to avoid deadlocks it implicitly schedules all the jobs before waiting for them.
+//      Also to avoid priority inversion you should never wait for a job that has lower priority.
+//      So it prioritizes all jobs, then schedules all jobs and waits for every job.
+//      IMPORTANT: Any load error will be rethrown, unless `no_throw` is set.
+//      Common usage pattern is:
+//          waitLoad(currentPoolOr(foreground_pool_id), tasks);
+
+// Returns the current execution pool if it is called from a load job, or `pool` otherwise.
+// It should be used for waiting for other load jobs in places that can be executed from load jobs.
+size_t currentPoolOr(size_t pool);
+
+inline void scheduleLoad(AsyncLoader & loader, const LoadJobSet & jobs)
+{
+    loader.schedule(jobs);
+}
+
+inline void scheduleLoad(const LoadTaskPtr & task)
+{
+    task->schedule();
+}
+
+inline void scheduleLoad(const LoadTaskPtrs & tasks)
+{
+    if (tasks.empty())
+        return;
+    // NOTE: it is assumed that all tasks use the same `AsyncLoader`
+    AsyncLoader & loader = tasks.front()->loader;
+    loader.schedule(tasks);
+}
+
+inline void waitLoad(AsyncLoader & loader, const LoadJobSet & jobs, bool no_throw = false)
+{
+    scheduleLoad(loader, jobs);
+    for (const auto & job : jobs)
+        loader.wait(job, no_throw);
+}
+
+inline void waitLoad(const LoadTaskPtr & task, bool no_throw = false)
+{
+    scheduleLoad(task);
+    waitLoad(task->loader, task->goals(), no_throw);
+}
+
+inline void waitLoad(const LoadTaskPtrs & tasks, bool no_throw = false)
+{
+    scheduleLoad(tasks);
+    for (const auto & task : tasks)
+        waitLoad(task->loader, task->goals(), no_throw);
+}
+
+inline void prioritizeLoad(AsyncLoader & loader, size_t pool_id, const LoadJobSet & jobs)
+{
+    for (const auto & job : jobs)
+        loader.prioritize(job, pool_id);
+}
+
+inline void prioritizeLoad(size_t pool_id, const LoadTaskPtr & task)
+{
+    prioritizeLoad(task->loader, pool_id, task->goals());
+}
+
+inline void prioritizeLoad(size_t pool_id, const LoadTaskPtrs & tasks)
+{
+    for (const auto & task : tasks)
+        prioritizeLoad(task->loader, pool_id, task->goals());
+}
+
+inline void waitLoad(AsyncLoader & loader, size_t pool_id, const LoadJobSet & jobs, bool no_throw = false)
+{
+    prioritizeLoad(loader, pool_id, jobs);
+    waitLoad(loader, jobs, no_throw);
+}
+
+inline void waitLoad(size_t pool_id, const LoadTaskPtr & task, bool no_throw = false)
+{
+    prioritizeLoad(task->loader, pool_id, task->goals());
+    waitLoad(task->loader, task->goals(), no_throw);
+}
+
+inline void waitLoad(size_t pool_id, const LoadTaskPtrs & tasks, bool no_throw = false)
+{
+    prioritizeLoad(pool_id, tasks);
+    waitLoad(tasks, no_throw);
+}
+
+inline LoadJobSet getGoals(const LoadTaskPtrs & tasks, const LoadJobSet & alternative = {})
+{
+    LoadJobSet result;
+    for (const auto & task : tasks)
+        result.insert(task->goals().begin(), task->goals().end());
+    return result.empty() ? alternative : result;
+}
+
+inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
+{
+    LoadJobSet result;
+    if (!jobs1.empty())
+        result.insert(jobs1.begin(), jobs1.end());
+    if (!jobs2.empty())
+        result.insert(jobs2.begin(), jobs2.end());
+    return result;
+}
+
+inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs & tasks2)
+{
+    if (tasks1.empty())
+        return tasks2;
+    if (tasks2.empty())
+        return tasks1;
+    LoadTaskPtrs result;
+    result.reserve(tasks1.size() + tasks2.size());
+    result.insert(result.end(), tasks1.begin(), tasks1.end());
+    result.insert(result.end(), tasks2.begin(), tasks2.end());
+    return result;
+}
+
 }
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@ -110,12 +110,12 @@
    M(StorageHiveThreads, "Number of threads in the StorageHive thread pool.") \
    M(StorageHiveThreadsActive, "Number of threads in the StorageHive thread pool running a task.") \
    M(StorageHiveThreadsScheduled, "Number of queued or active jobs in the StorageHive thread pool.") \
-    M(TablesLoaderThreads, "Number of threads in the tables loader thread pool.") \
-    M(TablesLoaderThreadsActive, "Number of threads in the tables loader thread pool running a task.") \
-    M(TablesLoaderThreadsScheduled, "Number of queued or active jobs in the tables loader thread pool.") \
-    M(DatabaseOrdinaryThreads, "Number of threads in the Ordinary database thread pool.") \
-    M(DatabaseOrdinaryThreadsActive, "Number of threads in the Ordinary database thread pool running a task.") \
-    M(DatabaseOrdinaryThreadsScheduled, "Number of queued or active jobs in the Ordinary database thread pool.") \
+    M(TablesLoaderBackgroundThreads, "Number of threads in the tables loader background thread pool.") \
+    M(TablesLoaderBackgroundThreadsActive, "Number of threads in the tables loader background thread pool running a task.") \
+    M(TablesLoaderBackgroundThreadsScheduled, "Number of queued or active jobs in the tables loader background thread pool.") \
+    M(TablesLoaderForegroundThreads, "Number of threads in the tables loader foreground thread pool.") \
+    M(TablesLoaderForegroundThreadsActive, "Number of threads in the tables loader foreground thread pool running a task.") \
+    M(TablesLoaderForegroundThreadsScheduled, "Number of queued or active jobs in the tables loader foreground thread pool.") \
    M(DatabaseOnDiskThreads, "Number of threads in the DatabaseOnDisk thread pool.") \
    M(DatabaseOnDiskThreadsActive, "Number of threads in the DatabaseOnDisk thread pool running a task.") \
    M(DatabaseOnDiskThreadsScheduled, "Number of queued or active jobs in the DatabaseOnDisk thread pool.") \
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -588,6 +588,7 @@
    M(706, LIBSSH_ERROR) \
    M(707, GCP_ERROR) \
    M(708, ILLEGAL_STATISTIC) \
+    M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/PoolId.h
+++ b/src/Common/PoolId.h
@ -0,0 +1,32 @@
+#pragma once
+
+#include <Common/Priority.h>
+
+namespace DB
+{
+
+/// Indices and priorities of `AsyncLoader` pools.
+
+/// The most important difference from regular ThreadPools is priorities of pools:
+///  * Pools that have different priorities do NOT run jobs simultaneously (with small exception due to dynamic prioritization).
+///  * Pools with lower priority wait for all jobs in higher priority pools to be done.
+
+/// Note that pools also have different configurable sizes not listed here. See `Context::getAsyncLoader()` for details.
+
+/// WARNING: `*PoolId` values must be unique and sequential w/o gaps.
+
+/// Used for executing load jobs that are waited for by queries or in case of synchronous table loading.
+constexpr size_t TablesLoaderForegroundPoolId = 0;
+constexpr Priority TablesLoaderForegroundPriority{0};
+
+/// Has lower priority and is used by table load jobs.
+constexpr size_t TablesLoaderBackgroundLoadPoolId = 1;
+constexpr Priority TablesLoaderBackgroundLoadPriority{1};
+
+/// Has even lower priority and is used by startup jobs.
+/// NOTE: This pool is required to begin table startup only after all tables are loaded.
+/// NOTE: Which is needed to prevent heavy merges/mutations from consuming all the resources, slowing table loading down.
+constexpr size_t TablesLoaderBackgroundStartupPoolId = 2;
+constexpr Priority TablesLoaderBackgroundStartupPriority{2};
+
+}
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -444,8 +444,13 @@ The server successfully detected this situation and will download merged part fr
    M(WaitPrefetchTaskMicroseconds, "Time spend waiting for prefetched reader") \
    \
    M(ThreadpoolReaderTaskMicroseconds, "Time spent getting the data in asynchronous reading") \
+    M(ThreadpoolReaderPrepareMicroseconds, "Time spent on preparation (e.g. call to reader seek() method)") \
    M(ThreadpoolReaderReadBytes, "Bytes read from a threadpool task in asynchronous reading") \
    M(ThreadpoolReaderSubmit, "Bytes read from a threadpool task in asynchronous reading") \
+    M(ThreadpoolReaderSubmitReadSynchronously, "How many times we haven't scheduled a task on the thread pool and read synchronously instead") \
+    M(ThreadpoolReaderSubmitReadSynchronouslyBytes, "How many bytes were read synchronously") \
+    M(ThreadpoolReaderSubmitReadSynchronouslyMicroseconds, "How much time we spent reading synchronously") \
+    M(AsynchronousReaderIgnoredBytes, "Number of bytes ignored during asynchronous reading") \
    \
    M(FileSegmentWaitReadBufferMicroseconds, "Metric per file segment. Time spend waiting for internal read buffer (includes cache waiting)") \
    M(FileSegmentReadMicroseconds, "Metric per file segment. Time spend reading from file") \
@ -569,6 +574,8 @@ The server successfully detected this situation and will download merged part fr
    \
    M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \
    \
+    M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \
+    \
    M(LogTest, "Number of log messages with level Test") \
    M(LogTrace, "Number of log messages with level Trace") \
    M(LogDebug, "Number of log messages with level Debug") \
--- a/src/Common/config.h.in
+++ b/src/Common/config.h.in
@ -27,6 +27,7 @@
 #cmakedefine01 USE_H3
 #cmakedefine01 USE_S2_GEOMETRY
 #cmakedefine01 USE_FASTOPS
+#cmakedefine01 USE_SQIDS
 #cmakedefine01 USE_NLP
 #cmakedefine01 USE_VECTORSCAN
 #cmakedefine01 USE_LIBURING
--- a/src/Common/tests/gtest_async_loader.cpp
+++ b/src/Common/tests/gtest_async_loader.cpp
@ -1,8 +1,12 @@
+#include <boost/core/noncopyable.hpp>
 #include <gtest/gtest.h>

+#include <array>
+#include <list>
 #include <barrier>
 #include <chrono>
 #include <mutex>
+#include <shared_mutex>
 #include <stdexcept>
 #include <string_view>
 #include <vector>
@ -19,9 +23,9 @@ using namespace DB;

 namespace CurrentMetrics
 {
-    extern const Metric TablesLoaderThreads;
-    extern const Metric TablesLoaderThreadsActive;
-    extern const Metric TablesLoaderThreadsScheduled;
+    extern const Metric TablesLoaderBackgroundThreads;
+    extern const Metric TablesLoaderBackgroundThreadsActive;
+    extern const Metric TablesLoaderBackgroundThreadsScheduled;
 }

 namespace DB::ErrorCodes
@ -61,9 +65,9 @@ struct AsyncLoaderTest
        {
            result.push_back({
                .name = fmt::format("Pool{}", pool_id),
-                .metric_threads = CurrentMetrics::TablesLoaderThreads,
-                .metric_active_threads = CurrentMetrics::TablesLoaderThreadsActive,
-                .metric_scheduled_threads = CurrentMetrics::TablesLoaderThreadsScheduled,
+                .metric_threads = CurrentMetrics::TablesLoaderBackgroundThreads,
+                .metric_active_threads = CurrentMetrics::TablesLoaderBackgroundThreadsActive,
+                .metric_scheduled_threads = CurrentMetrics::TablesLoaderBackgroundThreadsScheduled,
                .max_threads = desc.max_threads,
                .priority = desc.priority
            });
@ -155,7 +159,7 @@ TEST(AsyncLoader, Smoke)
    std::atomic<size_t> jobs_done{0};
    std::atomic<size_t> low_priority_jobs_done{0};

-    auto job_func = [&] (const LoadJobPtr & self) {
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self) {
        jobs_done++;
        if (self->pool() == low_priority_pool)
            low_priority_jobs_done++;
@ -172,13 +176,13 @@ TEST(AsyncLoader, Smoke)
        auto job5 = makeLoadJob({ job3, job4 }, low_priority_pool, "job5", job_func);
        task2->merge(t.schedule({ job5 }));

-        std::thread waiter_thread([=] { job5->wait(); });
+        std::thread waiter_thread([&t, job5] { t.loader.wait(job5); });

        t.loader.start();

-        job3->wait();
+        t.loader.wait(job3);
        t.loader.wait();
-        job4->wait();
+        t.loader.wait(job4);

        waiter_thread.join();

@ -196,7 +200,7 @@ TEST(AsyncLoader, CycleDetection)
 {
    AsyncLoaderTest t;

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};

    LoadJobPtr cycle_breaker; // To avoid memleak we introduce with a cycle

@ -241,7 +245,7 @@ TEST(AsyncLoader, CancelPendingJob)
 {
    AsyncLoaderTest t;

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};

    auto job = makeLoadJob({}, "job", job_func);
    auto task = t.schedule({ job });
@ -251,7 +255,7 @@ TEST(AsyncLoader, CancelPendingJob)
    ASSERT_EQ(job->status(), LoadStatus::CANCELED);
    try
    {
-        job->wait();
+        t.loader.wait(job);
        FAIL();
    }
    catch (Exception & e)
@ -264,7 +268,7 @@ TEST(AsyncLoader, CancelPendingTask)
 {
    AsyncLoaderTest t;

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};

    auto job1 = makeLoadJob({}, "job1", job_func);
    auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -277,7 +281,7 @@ TEST(AsyncLoader, CancelPendingTask)

    try
    {
-        job1->wait();
+        t.loader.wait(job1);
        FAIL();
    }
    catch (Exception & e)
@ -287,7 +291,7 @@ TEST(AsyncLoader, CancelPendingTask)

    try
    {
-        job2->wait();
+        t.loader.wait(job2);
        FAIL();
    }
    catch (Exception & e)
@ -300,7 +304,7 @@ TEST(AsyncLoader, CancelPendingDependency)
 {
    AsyncLoaderTest t;

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};

    auto job1 = makeLoadJob({}, "job1", job_func);
    auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -314,7 +318,7 @@ TEST(AsyncLoader, CancelPendingDependency)

    try
    {
-        job1->wait();
+        t.loader.wait(job1);
        FAIL();
    }
    catch (Exception & e)
@ -324,7 +328,7 @@ TEST(AsyncLoader, CancelPendingDependency)

    try
    {
-        job2->wait();
+        t.loader.wait(job2);
        FAIL();
    }
    catch (Exception & e)
@ -340,7 +344,7 @@ TEST(AsyncLoader, CancelExecutingJob)

    std::barrier sync(2);

-    auto job_func = [&] (const LoadJobPtr &)
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        sync.arrive_and_wait(); // (A) sync with main thread
        sync.arrive_and_wait(); // (B) wait for waiter
@ -362,7 +366,7 @@ TEST(AsyncLoader, CancelExecutingJob)
    canceler.join();

    ASSERT_EQ(job->status(), LoadStatus::OK);
-    job->wait();
+    t.loader.wait(job);
 }

 TEST(AsyncLoader, CancelExecutingTask)
@ -371,19 +375,19 @@ TEST(AsyncLoader, CancelExecutingTask)
    t.loader.start();
    std::barrier sync(2);

-    auto blocker_job_func = [&] (const LoadJobPtr &)
+    auto blocker_job_func = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        sync.arrive_and_wait(); // (A) sync with main thread
        sync.arrive_and_wait(); // (B) wait for waiter
        // signals (C)
    };

-    auto job_to_cancel_func = [&] (const LoadJobPtr &)
+    auto job_to_cancel_func = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        FAIL(); // this job should be canceled
    };

-    auto job_to_succeed_func = [&] (const LoadJobPtr &)
+    auto job_to_succeed_func = [&] (AsyncLoader &, const LoadJobPtr &)
    {
    };

@ -430,7 +434,7 @@ TEST(AsyncLoader, DISABLED_JobFailure)

    std::string error_message = "test job failure";

-    auto job_func = [&] (const LoadJobPtr &) {
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {
        throw std::runtime_error(error_message);
    };

@ -442,7 +446,7 @@ TEST(AsyncLoader, DISABLED_JobFailure)
    ASSERT_EQ(job->status(), LoadStatus::FAILED);
    try
    {
-        job->wait();
+        t.loader.wait(job);
        FAIL();
    }
    catch (Exception & e)
@ -459,7 +463,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)

    std::string_view error_message = "test job failure";

-    auto failed_job_func = [&] (const LoadJobPtr &) {
+    auto failed_job_func = [&] (AsyncLoader &, const LoadJobPtr &) {
        throw Exception(ErrorCodes::ASYNC_LOAD_FAILED, "{}", error_message);
    };

@ -468,7 +472,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)

    t.loader.wait();

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};

    auto job1 = makeLoadJob({ failed_job }, "job1", job_func);
    auto job2 = makeLoadJob({ job1 }, "job2", job_func);
@ -480,7 +484,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
    ASSERT_EQ(job2->status(), LoadStatus::CANCELED);
    try
    {
-        job1->wait();
+        t.loader.wait(job1);
        FAIL();
    }
    catch (Exception & e)
@ -490,7 +494,7 @@ TEST(AsyncLoader, ScheduleJobWithFailedDependencies)
    }
    try
    {
-        job2->wait();
+        t.loader.wait(job2);
        FAIL();
    }
    catch (Exception & e)
@ -504,14 +508,14 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
 {
    AsyncLoaderTest t;

-    auto canceled_job_func = [&] (const LoadJobPtr &) {};
+    auto canceled_job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
    auto canceled_job = makeLoadJob({}, "canceled_job", canceled_job_func);
    auto canceled_task = t.schedule({ canceled_job });
    canceled_task->remove();

    t.loader.start();

-    auto job_func = [&] (const LoadJobPtr &) {};
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) {};
    auto job1 = makeLoadJob({ canceled_job }, "job1", job_func);
    auto job2 = makeLoadJob({ job1 }, "job2", job_func);
    auto task = t.schedule({ job1, job2 });
@ -522,7 +526,7 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
    ASSERT_EQ(job2->status(), LoadStatus::CANCELED);
    try
    {
-        job1->wait();
+        t.loader.wait(job1);
        FAIL();
    }
    catch (Exception & e)
@ -531,7 +535,7 @@ TEST(AsyncLoader, ScheduleJobWithCanceledDependencies)
    }
    try
    {
-        job2->wait();
+        t.loader.wait(job2);
        FAIL();
    }
    catch (Exception & e)
@ -550,7 +554,7 @@ TEST(AsyncLoader, TestConcurrency)
        std::barrier sync(concurrency);

        std::atomic<int> executing{0};
-        auto job_func = [&] (const LoadJobPtr &)
+        auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
        {
            executing++;
            ASSERT_LE(executing, concurrency);
@ -577,7 +581,7 @@ TEST(AsyncLoader, TestOverload)

    for (int concurrency = 4; concurrency <= 8; concurrency++)
    {
-        auto job_func = [&] (const LoadJobPtr &)
+        auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
        {
            executing++;
            t.randomSleepUs(100, 200, 100);
@ -613,7 +617,7 @@ TEST(AsyncLoader, StaticPriorities)

    std::string schedule;

-    auto job_func = [&] (const LoadJobPtr & self)
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
    {
        schedule += fmt::format("{}{}", self->name, self->pool());
    };
@ -656,18 +660,18 @@ TEST(AsyncLoader, SimplePrioritization)
    std::atomic<int> executed{0}; // Number of previously executed jobs (to test execution order)
    LoadJobPtr job_to_prioritize;

-    auto job_func_A_booster = [&] (const LoadJobPtr &)
+    auto job_func_A_booster = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        ASSERT_EQ(executed++, 0);
        t.loader.prioritize(job_to_prioritize, 2);
    };

-    auto job_func_B_tester = [&] (const LoadJobPtr &)
+    auto job_func_B_tester = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        ASSERT_EQ(executed++, 2);
    };

-    auto job_func_C_boosted = [&] (const LoadJobPtr &)
+    auto job_func_C_boosted = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        ASSERT_EQ(executed++, 1);
    };
@ -680,7 +684,8 @@ TEST(AsyncLoader, SimplePrioritization)

    job_to_prioritize = jobs[2]; // C

-    scheduleAndWaitLoadAll(task);
+    scheduleLoad(task);
+    waitLoad(task);
 }

 TEST(AsyncLoader, DynamicPriorities)
@ -714,7 +719,7 @@ TEST(AsyncLoader, DynamicPriorities)
        UInt64 ready_seqno_D = 0;
        UInt64 ready_seqno_E = 0;

-        auto job_func = [&] (const LoadJobPtr & self)
+        auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
        {
            {
                std::unique_lock lock{schedule_mutex};
@ -791,7 +796,7 @@ TEST(AsyncLoader, RandomIndependentTasks)
    AsyncLoaderTest t(16);
    t.loader.start();

-    auto job_func = [&] (const LoadJobPtr & self)
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
    {
        for (const auto & dep : self->dependencies)
            ASSERT_EQ(dep->status(), LoadStatus::OK);
@ -818,7 +823,7 @@ TEST(AsyncLoader, RandomDependentTasks)
    std::vector<LoadTaskPtr> tasks;
    std::vector<LoadJobPtr> all_jobs;

-    auto job_func = [&] (const LoadJobPtr & self)
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr & self)
    {
        for (const auto & dep : self->dependencies)
            ASSERT_EQ(dep->status(), LoadStatus::OK);
@ -860,7 +865,7 @@ TEST(AsyncLoader, SetMaxThreads)
        syncs.push_back(std::make_unique<std::barrier<>>(max_threads + 1));


-    auto job_func = [&] (const LoadJobPtr &)
+    auto job_func = [&] (AsyncLoader &, const LoadJobPtr &)
    {
        int idx = sync_index;
        if (idx < syncs.size())
@ -914,10 +919,11 @@ TEST(AsyncLoader, DynamicPools)
    {
        std::atomic<bool> boosted{false}; // Visible concurrency was increased
        std::atomic<int> left{concurrency * jobs_in_chain / 2}; // Number of jobs to start before `prioritize()` call
+        std::shared_mutex prioritization_mutex; // To slow down job execution during prioritization to avoid race condition

        LoadJobSet jobs_to_prioritize;

-        auto job_func = [&] (const LoadJobPtr & self)
+        auto job_func = [&] (AsyncLoader & loader, const LoadJobPtr & self)
        {
            auto pool_id = self->executionPool();
            executing[pool_id]++;
@ -928,10 +934,12 @@ TEST(AsyncLoader, DynamicPools)
            // Dynamic prioritization
            if (--left == 0)
            {
+                std::unique_lock lock{prioritization_mutex};
                for (const auto & job : jobs_to_prioritize)
-                    t.loader.prioritize(job, 1);
+                    loader.prioritize(job, 1);
            }

+            std::shared_lock lock{prioritization_mutex};
            t.randomSleepUs(100, 200, 100);

            ASSERT_LE(executing[pool_id], max_threads[pool_id]);
@ -941,9 +949,10 @@ TEST(AsyncLoader, DynamicPools)
        std::vector<LoadTaskPtr> tasks;
        tasks.reserve(concurrency);
        for (int i = 0; i < concurrency; i++)
-            tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func)));
+            tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func, fmt::format("c{}-j", i))));
        jobs_to_prioritize = getGoals(tasks); // All jobs
-        scheduleAndWaitLoadAll(tasks);
+        scheduleLoad(tasks);
+        waitLoad(tasks);

        ASSERT_EQ(executing[0], 0);
        ASSERT_EQ(executing[1], 0);
@ -952,3 +961,136 @@ TEST(AsyncLoader, DynamicPools)
    }

 }
+
+// Checks that a load job can create its own sub-jobs and wait for them from inside the job function.
+// Every component schedules one "main job" which spawns `jobs_left` sub-jobs and blocks until they finish.
+TEST(AsyncLoader, SubJobs)
+{
+    AsyncLoaderTest t(1);
+    t.loader.start();
+
+    // An example of component with an asynchronous loading interface
+    class MyComponent : boost::noncopyable {
+    public:
+        MyComponent(AsyncLoader & loader_, int jobs)
+            : loader(loader_)
+            , jobs_left(jobs)
+        {}
+
+        // Creates the task that loads this component; the task is also stored in `load_task`.
+        [[nodiscard]] LoadTaskPtr loadAsync()
+        {
+            auto job_func = [this] (AsyncLoader &, const LoadJobPtr &) {
+                // Each sub-job accounts for exactly one unit of remaining work.
+                auto sub_job_func = [this] (AsyncLoader &, const LoadJobPtr &) {
+                    --jobs_left;
+                };
+                LoadJobSet jobs;
+                // Sub-jobs are only created here; they are handed to the loader by the `waitLoad` call below.
+                for (size_t j = 0; j < jobs_left; j++)
+                    jobs.insert(makeLoadJob({}, fmt::format("sub job {}", j), sub_job_func));
+                // Wait for the sub-jobs from within the main job.
+                waitLoad(makeLoadTask(loader, std::move(jobs)));
+            };
+            auto job = makeLoadJob({}, "main job", job_func);
+            return load_task = makeLoadTask(loader, { job });
+        }
+
+        // True once every sub-job has decremented the counter.
+        bool isLoaded() const
+        {
+            return jobs_left == 0;
+        }
+
+    private:
+        AsyncLoader & loader;
+        std::atomic<int> jobs_left;
+        // It is a good practice to keep load task inside the component:
+        // 1) to make sure it outlives its load jobs;
+        // 2) to avoid removing load jobs from `system.async_loader` while we use the component
+        LoadTaskPtr load_task;
+    };
+
+    // Vary the ratio of components to worker threads (fewer, as many, and more components than threads).
+    for (double jobs_per_thread : std::array{0.5, 1.0, 2.0})
+    {
+        // Thread limit takes the values 1, 2, 4, ..., 32.
+        for (size_t threads = 1; threads <= 32; threads *= 2)
+        {
+            t.loader.setMaxThreads(0, threads);
+            // std::list is used because MyComponent is noncopyable and jobs capture `this`.
+            std::list<MyComponent> components;
+            LoadTaskPtrs tasks;
+            size_t size = static_cast<size_t>(jobs_per_thread * threads);
+            tasks.reserve(size);
+            for (size_t j = 0; j < size; j++)
+            {
+                components.emplace_back(t.loader, 5);
+                tasks.emplace_back(components.back().loadAsync());
+            }
+            waitLoad(tasks);
+            for (const auto & component: components)
+                ASSERT_TRUE(component.isLoaded());
+        }
+    }
+}
+
+// Checks that a load job can recursively create and wait for another job of the same kind:
+// each job decrements the counter and, while work remains, schedules the next job and waits for it.
+TEST(AsyncLoader, RecursiveJob)
+{
+    AsyncLoaderTest t(1);
+    t.loader.start();
+
+    // An example of component with an asynchronous loading interface (a complicated one)
+    class MyComponent : boost::noncopyable {
+    public:
+        MyComponent(AsyncLoader & loader_, int jobs)
+            : loader(loader_)
+            , jobs_left(jobs)
+        {}
+
+        // Creates the first job of the chain; the resulting task is also stored in `load_task`.
+        [[nodiscard]] LoadTaskPtr loadAsync()
+        {
+            return load_task = loadAsyncImpl(jobs_left);
+        }
+
+        // True once the counter has been decremented down to zero.
+        bool isLoaded() const
+        {
+            return jobs_left == 0;
+        }
+
+    private:
+        // Builds a single-job task named after `id`; the job body is `jobFunction`.
+        [[nodiscard]] LoadTaskPtr loadAsyncImpl(int id)
+        {
+            auto job_func = [this] (AsyncLoader &, const LoadJobPtr & self) {
+                jobFunction(self);
+            };
+            auto job = makeLoadJob({}, fmt::format("job{}", id), job_func);
+            auto task = makeLoadTask(loader, { job });
+            return task;
+        }
+
+        // Consumes one unit of work and, if anything is left, runs the next job and waits for it
+        // using the pool of the currently running job.
+        void jobFunction(const LoadJobPtr & self)
+        {
+            int next = --jobs_left;
+            if (next > 0)
+                waitLoad(self->pool(), loadAsyncImpl(next));
+        }
+
+        AsyncLoader & loader;
+        std::atomic<int> jobs_left;
+        // It is a good practice to keep load task inside the component:
+        // 1) to make sure it outlives its load jobs;
+        // 2) to avoid removing load jobs from `system.async_loader` while we use the component
+        LoadTaskPtr load_task;
+    };
+
+    // Vary the ratio of components to worker threads (fewer, as many, and more components than threads).
+    for (double jobs_per_thread : std::array{0.5, 1.0, 2.0})
+    {
+        // Thread limit takes the values 1, 2, 4, ..., 32.
+        for (size_t threads = 1; threads <= 32; threads *= 2)
+        {
+            t.loader.setMaxThreads(0, threads);
+            // std::list is used because MyComponent is noncopyable and jobs capture `this`.
+            std::list<MyComponent> components;
+            LoadTaskPtrs tasks;
+            size_t size = static_cast<size_t>(jobs_per_thread * threads);
+            tasks.reserve(size);
+            for (size_t j = 0; j < size; j++)
+            {
+                components.emplace_back(t.loader, 5);
+                tasks.emplace_back(components.back().loadAsync());
+            }
+            waitLoad(tasks);
+            for (const auto & component: components)
+                ASSERT_TRUE(component.isLoaded());
+        }
+    }
+}
--- a/src/Compression/CompressionCodecDeflateQpl.cpp
+++ b/src/Compression/CompressionCodecDeflateQpl.cpp
@ -139,9 +139,9 @@ void DeflateQplJobHWPool::unLockJob(UInt32 index)
    hw_job_ptr_locks[index].store(false);
 }

-//HardwareCodecDeflateQpl
-HardwareCodecDeflateQpl::HardwareCodecDeflateQpl()
-    :log(&Poco::Logger::get("HardwareCodecDeflateQpl"))
+HardwareCodecDeflateQpl::HardwareCodecDeflateQpl(SoftwareCodecDeflateQpl & sw_codec_)
+    : log(&Poco::Logger::get("HardwareCodecDeflateQpl"))
+    , sw_codec(sw_codec_)
 {
 }

@ -169,7 +169,7 @@ Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source
    UInt32 compressed_size = 0;
    if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
    {
-        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doCompressData->acquireJob fail, probably job pool exhausted)");
+        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doCompressData->acquireJob fail, probably job pool exhausted)");
        return RET_ERROR;
    }

@ -189,7 +189,7 @@ Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source
    }
    else
    {
-        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doCompressData->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
+        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doCompressData->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
        DeflateQplJobHWPool::instance().releaseJob(job_id);
        return RET_ERROR;
    }
@ -202,7 +202,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataSynchronous(const char * source,
    UInt32 decompressed_size = 0;
    if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
    {
-        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataSynchronous->acquireJob fail, probably job pool exhausted)");
+        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->acquireJob fail, probably job pool exhausted)");
        return RET_ERROR;
    }

@ -214,17 +214,29 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataSynchronous(const char * source,
    job_ptr->available_out = uncompressed_size;
    job_ptr->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST;

-    if (auto status = qpl_submit_job(job_ptr); status != QPL_STS_OK)
+    auto status = qpl_submit_job(job_ptr);
+    if (status != QPL_STS_OK)
    {
        DeflateQplJobHWPool::instance().releaseJob(job_id);
-        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataSynchronous->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
+        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->qpl_submit_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
        return RET_ERROR;
    }
+
    /// Busy waiting till job complete.
+    UInt32 num_checks = 0;
    do
    {
        _tpause(1, __rdtsc() + 1000);
-    } while (qpl_check_job(job_ptr) == QPL_STS_BEING_PROCESSED);
+        status = qpl_check_job(job_ptr);
+        ++num_checks;
+    } while (status == QPL_STS_BEING_PROCESSED && num_checks < MAX_CHECKS);
+
+    /// Here `status` is the last result of qpl_check_job (including the case when polling gave up
+    /// after MAX_CHECKS iterations), so the message must name qpl_check_job, not qpl_submit_job.
+    if (status != QPL_STS_OK)
+    {
+        DeflateQplJobHWPool::instance().releaseJob(job_id);
+        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataSynchronous->qpl_check_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
+        return RET_ERROR;
+    }

    decompressed_size = job_ptr->total_out;
    DeflateQplJobHWPool::instance().releaseJob(job_id);
@ -237,7 +249,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataAsynchronous(const char * source,
    qpl_job * job_ptr = nullptr;
    if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
    {
-        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataAsynchronous->acquireJob fail, probably job pool exhausted)");
+        LOG_INFO(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataAsynchronous->acquireJob fail, probably job pool exhausted)");
        return RET_ERROR;
    }

@ -257,7 +269,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataAsynchronous(const char * source,
    else
    {
        DeflateQplJobHWPool::instance().releaseJob(job_id);
-        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec.(Details: doDecompressDataAsynchronous->qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
+        LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: doDecompressDataAsynchronous->qpl_submit_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
        return RET_ERROR;
    }
 }
@ -266,6 +278,7 @@ void HardwareCodecDeflateQpl::flushAsynchronousDecompressRequests()
 {
    auto n_jobs_processing = decomp_async_job_map.size();
    std::map<UInt32, qpl_job *>::iterator it = decomp_async_job_map.begin();
+    UInt32 num_checks = 0;

    while (n_jobs_processing)
    {
@ -274,22 +287,34 @@ void HardwareCodecDeflateQpl::flushAsynchronousDecompressRequests()
        job_id = it->first;
        job_ptr = it->second;

-        if (qpl_check_job(job_ptr) == QPL_STS_BEING_PROCESSED)
+        auto status = qpl_check_job(job_ptr);
+        if ((status == QPL_STS_BEING_PROCESSED) && (num_checks < MAX_CHECKS))
        {
            it++;
        }
        else
        {
+            if (status != QPL_STS_OK)
+            {
+                sw_codec.doDecompressData(
+                    reinterpret_cast<const char * >(job_ptr->next_in_ptr),
+                    job_ptr->available_in,
+                    reinterpret_cast<char *>(job_ptr->next_out_ptr),
+                    job_ptr->available_out);
+                LOG_WARNING(log, "DeflateQpl HW codec failed, falling back to SW codec. (Details: flushAsynchronousDecompressRequests with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", static_cast<UInt32>(status));
+            }
            it = decomp_async_job_map.erase(it);
            DeflateQplJobHWPool::instance().releaseJob(job_id);
            n_jobs_processing--;
            if (n_jobs_processing <= 0)
                break;
        }
+
        if (it == decomp_async_job_map.end())
        {
            it = decomp_async_job_map.begin();
            _tpause(1, __rdtsc() + 1000);
+            ++num_checks;
        }
    }
 }
@ -364,8 +389,8 @@ void SoftwareCodecDeflateQpl::doDecompressData(const char * source, UInt32 sourc
 }

 CompressionCodecDeflateQpl::CompressionCodecDeflateQpl()
-    : hw_codec(std::make_unique<HardwareCodecDeflateQpl>())
-    , sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
+    : sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
+    , hw_codec(std::make_unique<HardwareCodecDeflateQpl>(*sw_codec))
 {
    setCodecDescription("DEFLATE_QPL");
 }
--- a/src/Compression/CompressionCodecDeflateQpl.h
+++ b/src/Compression/CompressionCodecDeflateQpl.h
@ -65,8 +65,10 @@ class HardwareCodecDeflateQpl
 public:
    /// RET_ERROR stands for hardware codec fail, needs fallback to software codec.
    static constexpr Int32 RET_ERROR = -1;
+    /// Maximum times to check if hardware job complete, otherwise fallback to software codec.
+    static constexpr UInt32 MAX_CHECKS = UINT16_MAX;

-    HardwareCodecDeflateQpl();
+    HardwareCodecDeflateQpl(SoftwareCodecDeflateQpl & sw_codec_);
    ~HardwareCodecDeflateQpl();

    Int32 doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const;
@ -87,6 +89,8 @@ private:
    /// For flush, pop out job ID && job object from this map. Use job ID to release job lock and use job object to check job status till complete.
    std::map<UInt32, qpl_job *> decomp_async_job_map;
    Poco::Logger * log;
+    /// Provides a fallback in case of errors.
+    SoftwareCodecDeflateQpl & sw_codec;
 };

 class CompressionCodecDeflateQpl final : public ICompressionCodec
@ -110,8 +114,8 @@ protected:
 private:
    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;

-    std::unique_ptr<HardwareCodecDeflateQpl> hw_codec;
    std::unique_ptr<SoftwareCodecDeflateQpl> sw_codec;
+    std::unique_ptr<HardwareCodecDeflateQpl> hw_codec;
 };

 }
--- a/src/Core/PlainRanges.cpp
+++ b/src/Core/PlainRanges.cpp
@ -0,0 +1,157 @@
+#include <Core/PlainRanges.h>
+
+namespace DB
+{
+
+/// Builds a plain range set that consists of exactly one range.
+PlainRanges::PlainRanges(const Range & range)
+    : ranges(1, range)
+{
+}
+
+
+/// Wraps `ranges_`. When the caller says the input may contain intersections it is normalized first:
+/// `ordered` selects between merging directly and sorting by left bound before merging.
+PlainRanges::PlainRanges(const Ranges & ranges_, bool may_have_intersection, bool ordered)
+{
+    if (!may_have_intersection)
+    {
+        /// The input is taken as is.
+        ranges = ranges_;
+        return;
+    }
+
+    if (ordered)
+        ranges = makePlainFromOrdered(ranges_);
+    else
+        ranges = makePlainFromUnordered(ranges_);
+}
+
+/// Walks the input from left to right and folds every range that intersects the last
+/// accumulated range into it; non-intersecting ranges start a new output entry.
+Ranges PlainRanges::makePlainFromOrdered(const Ranges & ranges_)
+{
+    /// Nothing to merge for zero or one range.
+    if (ranges_.size() < 2)
+        return ranges_;
+
+    Ranges merged;
+    merged.push_back(ranges_.front());
+
+    for (auto it = ranges_.begin() + 1; it != ranges_.end(); ++it)
+    {
+        Range & last = merged.back();
+        if (!last.intersectsRange(*it))
+        {
+            merged.push_back(*it);
+            continue;
+        }
+
+        /// Intersecting ranges always have a union, so the optional is engaged here.
+        last = *last.unionWith(*it);
+    }
+
+    return merged;
+}
+
+/// Sorts its own copy of the input by left bound and then merges it with makePlainFromOrdered.
+Ranges PlainRanges::makePlainFromUnordered(Ranges ranges_)
+{
+    const bool needs_processing = ranges_.size() > 1;
+    if (needs_processing)
+    {
+        std::sort(ranges_.begin(), ranges_.end(), compareByLeftBound);
+        return makePlainFromOrdered(ranges_);
+    }
+
+    /// Zero or one range is already plain.
+    return ranges_;
+}
+
+/// Merge-joins the two range lists: disjoint ranges are copied in order, overlapping pairs are
+/// replaced by their union, and the combined list is normalized once more at the end.
+PlainRanges PlainRanges::unionWith(const PlainRanges & other)
+{
+    Ranges combined;
+
+    auto mine = ranges.begin();
+    auto theirs = other.ranges.begin();
+    const auto mine_end = ranges.end();
+    const auto theirs_end = other.ranges.end();
+
+    while (mine != mine_end && theirs != theirs_end)
+    {
+        if (mine->leftThan(*theirs))
+        {
+            combined.push_back(*mine);
+            ++mine;
+            continue;
+        }
+
+        if (mine->rightThan(*theirs))
+        {
+            combined.push_back(*theirs);
+            ++theirs;
+            continue;
+        }
+
+        /// The two ranges overlap: emit their union and advance the side that ends first.
+        combined.emplace_back(*(mine->unionWith(*theirs)));
+        if (compareByRightBound(*mine, *theirs))
+            ++mine;
+        else
+            ++theirs;
+    }
+
+    /// At most one of the two inputs still has ranges left; copy the tail unchanged.
+    combined.insert(combined.end(), mine, mine_end);
+    combined.insert(combined.end(), theirs, theirs_end);
+
+    /// The combined list may still contain overlapping neighbours such as [1, 4], [2, 5],
+    /// so it has to be made plain before it is returned.
+    return PlainRanges(makePlainFromOrdered(combined));
+}
+
+/// Merge-joins the two range lists and keeps only the parts covered by both of them.
+PlainRanges PlainRanges::intersectWith(const PlainRanges & other)
+{
+    Ranges common;
+
+    auto mine = ranges.begin();
+    auto theirs = other.ranges.begin();
+
+    while (mine != ranges.end() && theirs != other.ranges.end())
+    {
+        /// A range lying entirely before the other one cannot contribute anything.
+        if (mine->leftThan(*theirs))
+        {
+            ++mine;
+            continue;
+        }
+
+        if (mine->rightThan(*theirs))
+        {
+            ++theirs;
+            continue;
+        }
+
+        /// Overlapping pair: record the shared part, skipping a blank result.
+        auto shared = mine->intersectWith(*theirs);
+        if (shared)
+            common.emplace_back(*shared);
+
+        /// Advance the side that ends first; the other one may still overlap the next range.
+        if (compareByRightBound(*mine, *theirs))
+            ++mine;
+        else
+            ++theirs;
+    }
+
+    return PlainRanges(common);
+}
+
+/// Strict ordering of ranges by their left bound, usable as a comparator for std::sort.
+/// Returns true if `lhs` starts strictly before `rhs`.
+bool PlainRanges::compareByLeftBound(const Range & lhs, const Range & rhs)
+{
+    /// Two ranges unbounded on the left start at the same place.
+    if (lhs.left == NEGATIVE_INFINITY && rhs.left == NEGATIVE_INFINITY)
+        return false;
+
+    /// For equal values an included left bound starts earlier than an excluded one:
+    /// [1, ...) begins at 1 while (1, ...) begins just after it, so [1 sorts before (1.
+    return Range::less(lhs.left, rhs.left) || ((lhs.left_included && !rhs.left_included) && Range::equals(lhs.left, rhs.left));
+}
+
+/// Returns true if `lhs` ends strictly before `rhs`.
+bool PlainRanges::compareByRightBound(const Range & lhs, const Range & rhs)
+{
+    /// Two ranges unbounded on the right end at the same place.
+    const bool both_unbounded = lhs.right == POSITIVE_INFINITY && rhs.right == POSITIVE_INFINITY;
+    if (both_unbounded)
+        return false;
+
+    if (Range::less(lhs.right, rhs.right))
+        return true;
+
+    /// For equal values an excluded right bound ends earlier than an included one.
+    const bool only_rhs_inclusive = !lhs.right_included && rhs.right_included;
+    return only_rhs_inclusive && Range::equals(lhs.right, rhs.right);
+}
+
+
+/// Inverts every input range separately; element i of the result is the inversion of input range i.
+/// An empty input yields the universe, and any (-inf, +inf) input yields a single blank entry.
+std::vector<Ranges> PlainRanges::invert(const Ranges & to_invert_ranges)
+{
+    /// The inversion of a blank set of ranges is the whole universe.
+    if (to_invert_ranges.empty())
+        return {makeUniverse().ranges};
+
+    std::vector<Ranges> inverted;
+    for (auto it = to_invert_ranges.begin(); it != to_invert_ranges.end(); ++it)
+    {
+        /// An infinite range covers everything, so nothing is left: return a blank ranges.
+        if (it->isInfinite())
+            return {{}};
+
+        inverted.push_back(it->invertRange());
+    }
+
+    return inverted;
+}
+}
--- a/src/Core/PlainRanges.h
+++ b/src/Core/PlainRanges.h
@ -0,0 +1,46 @@
+#pragma once
+
+#include <Core/Range.h>
+
+namespace DB
+{
+
+/** Plain ranges are a sequence of ranges that
+ *      1. do not intersect each other,
+ *      2. are ordered by their left bound,
+ *      3. do not contain a blank range.
+ *
+ * Example:
+ *      query: (k > 1 and k < 5) or (k > 3 and k < 10) or k in (2, 12)
+ *      original ranges: (1, 5), (3, 10), [2, 2], [12, 12]
+ *      plain ranges: (1, 10), [12, 12]
+ *
+ * A blank (empty) set is represented by an empty `ranges` vector.
+ */
+struct PlainRanges
+{
+    /// The stored ranges. The constructors only normalize the input when asked to (see below).
+    Ranges ranges;
+
+    /// Creates plain ranges holding a single range.
+    explicit PlainRanges(const Range & range);
+
+    /// Creates plain ranges from `ranges_`.
+    /// If `may_have_intersection` is false the input is stored as is.
+    /// Otherwise it is merged, and `ordered` tells whether it is already sorted by left bound.
+    explicit PlainRanges(const Ranges & ranges_, bool may_have_intersection = false, bool ordered = true);
+
+    /// Set union / set intersection with `other`; both return a new object.
+    PlainRanges unionWith(const PlainRanges & other);
+    PlainRanges intersectWith(const PlainRanges & other);
+
+    /// Union ranges and return new plain (ordered and non-intersecting) ranges.
+    /// Example:
+    ///         [1, 3], [2, 4], [6, 8] -> [1, 4], [6, 8]
+    ///         [1, 3], [2, 4], (4, 5] -> [1, 4], [5, 5]
+    /// NOTE(review): the second example looks suspicious - merging is driven by Range::intersectsRange,
+    /// so (4, 5] is presumably kept as is rather than turned into [5, 5]; confirm against the implementation.
+    static Ranges makePlainFromUnordered(Ranges ranges_);
+    static Ranges makePlainFromOrdered(const Ranges & ranges_);
+
+    /// Comparators: return true if `lhs` starts (respectively ends) before `rhs`.
+    static bool compareByLeftBound(const Range & lhs, const Range & rhs);
+    static bool compareByRightBound(const Range & lhs, const Range & rhs);
+
+    /// Inverts each of the given ranges separately; one result entry per input range.
+    static std::vector<Ranges> invert(const Ranges & to_invert_ranges);
+
+    /// The empty set.
+    static PlainRanges makeBlank() { return PlainRanges({}); }
+    /// A single range created by Range::createWholeUniverseWithoutNull().
+    static PlainRanges makeUniverse() { return PlainRanges({Range::createWholeUniverseWithoutNull()}); }
+};
+}
--- a/src/Core/Range.cpp
+++ b/src/Core/Range.cpp
@ -123,6 +123,27 @@ bool Range::leftThan(const FieldRef & x) const
    return less(x, right) || (right_included && equals(x, right));
 }

+/// True if this range lies completely to the right of `x`, i.e. the two ranges share no point.
+bool Range::rightThan(const Range & x) const
+{
+    /// My left bound is strictly after x's right bound.
+    if (less(x.right, left))
+        return true;
+
+    /// The bounds touch at one value: the ranges are disjoint unless both of them include it.
+    const bool share_point = left_included && x.right_included;
+    return !share_point && equals(left, x.right);
+}
+
+/// True if this range lies completely to the left of `x`, i.e. the two ranges share no point.
+bool Range::leftThan(const Range & x) const
+{
+    /// My right bound is strictly before x's left bound.
+    if (less(right, x.left))
+        return true;
+
+    /// The bounds touch at one value: the ranges are disjoint unless both of them include it.
+    const bool share_point = x.left_included && right_included;
+    return !share_point && equals(right, x.left);
+}
+
+/// True if neither bound holds a Field of type Null.
+bool Range::fullBounded() const
+{
+    if (left.getType() == Field::Types::Null)
+        return false;
+
+    return right.getType() != Field::Types::Null;
+}
+
+/// True for the range (-inf, +inf), i.e. when both bounds are infinities.
+bool Range::isInfinite() const
+{
+    if (!left.isNegativeInfinity())
+        return false;
+
+    return right.isPositiveInfinity();
+}
+
 bool Range::intersectsRange(const Range & r) const
 {
    /// r to the left of me.
@ -159,6 +180,95 @@ void Range::invert()
    std::swap(left_included, right_included);
 }

+/// Returns the ranges that cover everything outside of this range:
+/// two ranges for a fully bounded range, none for (-inf, +inf) and one otherwise.
+Ranges Range::invertRange() const
+{
+    Ranges inverted;
+
+    /// A fully bounded range splits the axis in two, e.g. [1, 3] -> (-inf, 1), (3, +inf).
+    if (fullBounded())
+    {
+        inverted.push_back({NEGATIVE_INFINITY, false, left, !left_included});
+        inverted.push_back({right, !right_included, POSITIVE_INFINITY, false});
+        return inverted;
+    }
+
+    /// Nothing is left outside of (-inf, +inf): blank ranges.
+    if (isInfinite())
+        return inverted;
+
+    /// Half-bounded range, (-inf, 1] or [1, +inf): the finite bound moves to the other side
+    /// and its inclusiveness is flipped.
+    Range flipped = *this;
+    std::swap(flipped.left, flipped.right);
+    if (flipped.left.isPositiveInfinity()) /// was [1, +inf)
+    {
+        flipped.left = NEGATIVE_INFINITY;
+        flipped.right_included = !flipped.left_included;
+        flipped.left_included = false;
+    }
+    else if (flipped.right.isNegativeInfinity()) /// was (-inf, 1]
+    {
+        flipped.right = POSITIVE_INFINITY;
+        flipped.left_included = !flipped.right_included;
+        flipped.right_included = false;
+    }
+    inverted.push_back(flipped);
+
+    return inverted;
+}
+
+/// Returns the common part of this range and `r`, or an empty optional if they do not intersect.
+/// The result starts at the greater of the two left bounds and ends at the smaller of the two right bounds.
+std::optional<Range> Range::intersectWith(const Range & r) const
+{
+    if (!intersectsRange(r))
+        return {};
+
+    bool left_bound_use_mine = true;
+    bool right_bound_use_mine = true;
+
+    /// My left bound is smaller than r's, so the intersection starts at r's left bound.
+    /// For equal values an included bound is the smaller one: [1 starts before (1.
+    if (less(left, r.left) || ((left_included && !r.left_included) && equals(left, r.left)))
+        left_bound_use_mine = false;
+
+    /// r's right bound is smaller than mine, so the intersection ends at r's right bound.
+    /// For equal values an excluded bound is the smaller one: 5) ends before 5].
+    if (less(r.right, right) || ((!r.right_included && right_included) && equals(r.right, right)))
+        right_bound_use_mine = false;
+
+    return Range(
+        left_bound_use_mine ? left : r.left,
+        left_bound_use_mine ? left_included : r.left_included,
+        right_bound_use_mine ? right : r.right,
+        right_bound_use_mine ? right_included : r.right_included);
+}
+
+/// Returns the single continuous range covering both this range and `r`,
+/// or an empty optional if they neither intersect nor touch each other.
+/// The result starts at the smaller of the two left bounds and ends at the greater of the two right bounds.
+std::optional<Range> Range::unionWith(const Range & r) const
+{
+    if (!intersectsRange(r) && !nearByWith(r))
+        return {};
+
+    bool left_bound_use_mine = false;
+    bool right_bound_use_mine = false;
+
+    /// My left bound is smaller than r's, so the union starts at my left bound.
+    /// For equal values an included bound is the smaller one: [1 starts before (1.
+    if (less(left, r.left) || ((left_included && !r.left_included) && equals(left, r.left)))
+        left_bound_use_mine = true;
+
+    /// r's right bound is smaller than mine, so the union ends at my right bound.
+    /// For equal values an excluded bound is the smaller one: 5) ends before 5].
+    if (less(r.right, right) || ((!r.right_included && right_included) && equals(r.right, right)))
+        right_bound_use_mine = true;
+
+    return Range(
+        left_bound_use_mine ? left : r.left,
+        left_bound_use_mine ? left_included : r.left_included,
+        right_bound_use_mine ? right : r.right,
+        right_bound_use_mine ? right_included : r.right_included);
+}
+
+/// True if the two ranges touch without intersecting: they share an endpoint value that
+/// exactly one of them includes (e.g. [1, 2) and [2, 3]), so their union is one continuous range.
+bool Range::nearByWith(const Range & r) const
+{
+    /// This range is on the left: my right bound touches r's left bound.
+    if ((right_included != r.left_included) && equals(right, r.left))
+        return true;
+
+    /// r is on the left: r's right bound touches my left bound.
+    /// Both combinations must be accepted, (.., 2] + (2, ..) as well as (.., 2) + [2, ..).
+    if ((r.right_included != left_included) && equals(r.right, left))
+        return true;
+
+    return false;
+}
+
 Range intersect(const Range & a, const Range & b)
 {
    Range res = Range::createWholeUniverse();
--- a/src/Core/Range.h
+++ b/src/Core/Range.h
@ -38,6 +38,13 @@ struct FieldRef : public Field
    size_t column_idx = 0;
 };

+/** Range with open or closed ends; possibly unbounded.
+ */
+struct Range;
+/** A series of ranges that may or may not overlap.
+ */
+using Ranges = std::vector<Range>;
+
 /** Range with open or closed ends; possibly unbounded.
  */
 struct Range
@ -79,12 +86,37 @@ public:
    /// x is to the right
    bool leftThan(const FieldRef & x) const;

+    /// This range lies completely to the right of x
+    bool rightThan(const Range & x) const;
+    /// This range lies completely to the left of x
+    bool leftThan(const Range & x) const;
+
+    /// range like [1, 2]
+    bool fullBounded() const;
+    /// (-inf, +inf)
+    bool isInfinite() const;
+
+    bool isBlank() const;
+
    bool intersectsRange(const Range & r) const;

    bool containsRange(const Range & r) const;

+    /// Invert left and right
    void invert();

+    /// Invert the range.
+    /// Example:
+    ///     [1, 3] -> (-inf, 1), (3, +inf)
+    Ranges invertRange() const;
+
+    std::optional<Range> intersectWith(const Range & r) const;
+    std::optional<Range> unionWith(const Range & r) const;
+
+    /// If near by r, they can be combined to a continuous range.
+    /// TODO If field is integer, case like [2, 3], [4, 5] is excluded.
+    bool nearByWith(const Range & r) const;
+
    String toString() const;
 };

--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@ -95,6 +95,9 @@ namespace DB
    M(UInt64, background_schedule_pool_size, 512, "The maximum number of threads that will be used for constantly executing some lightweight periodic operations.", 0) \
    M(UInt64, background_message_broker_schedule_pool_size, 16, "The maximum number of threads that will be used for executing background operations for message streaming.", 0) \
    M(UInt64, background_distributed_schedule_pool_size, 16, "The maximum number of threads that will be used for executing distributed sends.", 0) \
+    M(UInt64, tables_loader_foreground_pool_size, 0, "The maximum number of threads that will be used for foreground (that is being waited for by a query) loading of tables. Also used for synchronous loading of tables before the server start. Zero means use all CPUs.", 0) \
+    M(UInt64, tables_loader_background_pool_size, 0, "The maximum number of threads that will be used for background async loading of tables. Zero means use all CPUs.", 0) \
+    M(Bool, async_load_databases, false, "Enable asynchronous loading of databases and tables to speedup server startup. Queries to not yet loaded entity will be blocked until load is finished.", 0) \
    M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \
    \
    M(UInt64, total_memory_profiler_step, 0, "Whenever server memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down server.", 0) \
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -575,7 +575,6 @@ class IColumn;
    M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0)                                                                                                                                         \
    M(Bool, optimize_append_index, false, "Use constraints in order to append index condition (indexHint)", 0) \
    M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \
-    M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \
    M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \
    M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \
    M(Bool, materialized_views_ignore_errors, false, "Allows to ignore errors for MATERIALIZED VIEW, and deliver original block to the table regardless of MVs", 0) \
@ -856,6 +855,7 @@ class IColumn;
    MAKE_OBSOLETE(M, Bool, allow_experimental_window_functions, true) \
    MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \
    MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \
+    MAKE_OBSOLETE(M, Bool, allow_experimental_alter_materialized_view_structure, true) \
    \
    MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \
    MAKE_OBSOLETE(M, StreamingHandleErrorMode, handle_kafka_error_mode, StreamingHandleErrorMode::DEFAULT) \
--- a/src/DataTypes/DataTypeAggregateFunction.cpp
+++ b/src/DataTypes/DataTypeAggregateFunction.cpp
@ -166,7 +166,6 @@ SerializationPtr DataTypeAggregateFunction::doGetDefaultSerialization() const
 static DataTypePtr create(const ASTPtr & arguments)
 {
    String function_name;
-    AggregateFunctionPtr function;
    DataTypes argument_types;
    Array params_row;
    std::optional<size_t> version;
@ -193,12 +192,14 @@ static DataTypePtr create(const ASTPtr & arguments)
        argument_types_start_idx = 2;
    }

+    auto action = NullsAction::EMPTY;
    if (const auto * parametric = data_type_ast->as<ASTFunction>())
    {
        if (parametric->parameters)
            throw Exception(ErrorCodes::SYNTAX_ERROR, "Unexpected level of parameters to aggregate function");

        function_name = parametric->name;
+        action = parametric->nulls_action;

        if (parametric->arguments)
        {
@ -241,7 +242,7 @@ static DataTypePtr create(const ASTPtr & arguments)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed");

    AggregateFunctionProperties properties;
-    function = AggregateFunctionFactory::instance().get(function_name, argument_types, params_row, properties);
+    AggregateFunctionPtr function = AggregateFunctionFactory::instance().get(function_name, action, argument_types, params_row, properties);
    return std::make_shared<DataTypeAggregateFunction>(function, argument_types, params_row, version);
 }

--- a/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp
+++ b/src/DataTypes/DataTypeCustomSimpleAggregateFunction.cpp
@ -144,7 +144,9 @@ static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & argum
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed");

    AggregateFunctionProperties properties;
-    function = AggregateFunctionFactory::instance().get(function_name, argument_types, params_row, properties);
+    /// NullsAction is not part of the type definition; instead, it will already have transformed the function into a different one
+    auto action = NullsAction::EMPTY;
+    function = AggregateFunctionFactory::instance().get(function_name, action, argument_types, params_row, properties);

    DataTypeCustomSimpleAggregateFunction::checkSupportedFunctions(function);

--- a/src/Databases/DatabaseAtomic.cpp
+++ b/src/Databases/DatabaseAtomic.cpp
@ -5,6 +5,7 @@
 #include <IO/WriteHelpers.h>
 #include <IO/ReadBufferFromFile.h>
 #include <Parsers/formatAST.h>
+#include <Common/PoolId.h>
 #include <Common/atomicRename.h>
 #include <Common/filesystemHelpers.h>
 #include <Storages/StorageMaterializedView.h>
@ -74,6 +75,7 @@ String DatabaseAtomic::getTableDataPath(const ASTCreateQuery & query) const

 void DatabaseAtomic::drop(ContextPtr)
 {
+    waitDatabaseStarted(false);
    assert(TSA_SUPPRESS_WARNING_FOR_READ(tables).empty());
    try
    {
@ -112,6 +114,7 @@ StoragePtr DatabaseAtomic::detachTable(ContextPtr /* context */, const String &

 void DatabaseAtomic::dropTable(ContextPtr local_context, const String & table_name, bool sync)
 {
+    waitDatabaseStarted(false);
    auto table = tryGetTable(table_name, local_context);
    /// Remove the inner table (if any) to avoid deadlock
    /// (due to attempt to execute DROP from the worker thread)
@ -175,6 +178,8 @@ void DatabaseAtomic::renameTable(ContextPtr local_context, const String & table_
    if (exchange && !supportsAtomicRename())
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "RENAME EXCHANGE is not supported");

+    waitDatabaseStarted(false);
+
    auto & other_db = dynamic_cast<DatabaseAtomic &>(to_database);
    bool inside_database = this == &other_db;

@ -412,7 +417,7 @@ void DatabaseAtomic::assertCanBeDetached(bool cleanup)
 DatabaseTablesIteratorPtr
 DatabaseAtomic::getTablesIterator(ContextPtr local_context, const IDatabase::FilterByNameFunction & filter_by_table_name) const
 {
-    auto base_iter = DatabaseWithOwnTablesBase::getTablesIterator(local_context, filter_by_table_name);
+    auto base_iter = DatabaseOrdinary::getTablesIterator(local_context, filter_by_table_name);
    return std::make_unique<AtomicDatabaseTablesSnapshotIterator>(std::move(typeid_cast<DatabaseTablesSnapshotIterator &>(*base_iter)));
 }

@ -441,28 +446,34 @@ void DatabaseAtomic::beforeLoadingMetadata(ContextMutablePtr /*context*/, Loadin
    }
 }

-void DatabaseAtomic::loadStoredObjects(ContextMutablePtr local_context, LoadingStrictnessLevel mode)
+LoadTaskPtr DatabaseAtomic::startupDatabaseAsync(AsyncLoader & async_loader, LoadJobSet startup_after, LoadingStrictnessLevel mode)
 {
-    beforeLoadingMetadata(local_context, mode);
-    DatabaseOrdinary::loadStoredObjects(local_context, mode);
+    auto base = DatabaseOrdinary::startupDatabaseAsync(async_loader, std::move(startup_after), mode);
+    auto job = makeLoadJob(
+        base->goals(),
+        TablesLoaderBackgroundStartupPoolId,
+        fmt::format("startup Atomic database {}", getDatabaseName()),
+        [this, mode] (AsyncLoader &, const LoadJobPtr &)
+        {
+            if (mode < LoadingStrictnessLevel::FORCE_RESTORE)
+                return;
+            NameToPathMap table_names;
+            {
+                std::lock_guard lock{mutex};
+                table_names = table_name_to_path;
+            }
+
+            fs::create_directories(path_to_table_symlinks);
+            for (const auto & table : table_names)
+                tryCreateSymlink(table.first, table.second, true);
+        });
+    return startup_atomic_database_task = makeLoadTask(async_loader, {job});
 }

-void DatabaseAtomic::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
+void DatabaseAtomic::waitDatabaseStarted(bool no_throw) const
 {
-    DatabaseOrdinary::startupTables(thread_pool, mode);
-
-    if (mode < LoadingStrictnessLevel::FORCE_RESTORE)
-        return;
-
-    NameToPathMap table_names;
-    {
-        std::lock_guard lock{mutex};
-        table_names = table_name_to_path;
-    }
-
-    fs::create_directories(path_to_table_symlinks);
-    for (const auto & table : table_names)
-        tryCreateSymlink(table.first, table.second, true);
+    if (startup_atomic_database_task)
+        waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_atomic_database_task, no_throw);
 }

 void DatabaseAtomic::tryCreateSymlink(const String & table_name, const String & actual_data_path, bool if_data_path_exist)
@ -532,6 +543,8 @@ void DatabaseAtomic::renameDatabase(ContextPtr query_context, const String & new
 {
    /// CREATE, ATTACH, DROP, DETACH and RENAME DATABASE must hold DDLGuard

+    waitDatabaseStarted(false);
+
    bool check_ref_deps = query_context->getSettingsRef().check_referential_table_dependencies;
    bool check_loading_deps = !check_ref_deps && query_context->getSettingsRef().check_table_dependencies;
    if (check_ref_deps || check_loading_deps)
--- a/src/Databases/DatabaseAtomic.h
+++ b/src/Databases/DatabaseAtomic.h
@ -48,11 +48,10 @@ public:

    DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override;

-    void loadStoredObjects(ContextMutablePtr context, LoadingStrictnessLevel mode) override;
-
    void beforeLoadingMetadata(ContextMutablePtr context, LoadingStrictnessLevel mode) override;

-    void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
+    LoadTaskPtr startupDatabaseAsync(AsyncLoader & async_loader, LoadJobSet startup_after, LoadingStrictnessLevel mode) override;
+    void waitDatabaseStarted(bool no_throw) const override;

    /// Atomic database cannot be detached if there is detached table which still in use
    void assertCanBeDetached(bool cleanup) override;
@ -87,6 +86,8 @@ protected:
    String path_to_table_symlinks;
    String path_to_metadata_symlink;
    const UUID db_uuid;
+
+    LoadTaskPtr startup_atomic_database_task;
 };

 }
--- a/src/Databases/DatabaseMemory.cpp
+++ b/src/Databases/DatabaseMemory.cpp
@ -20,7 +20,6 @@ namespace ErrorCodes
 {
    extern const int UNKNOWN_TABLE;
    extern const int LOGICAL_ERROR;
-    extern const int INCONSISTENT_METADATA_FOR_BACKUP;
 }

 DatabaseMemory::DatabaseMemory(const String & name_, ContextPtr context_)
@ -177,21 +176,30 @@ std::vector<std::pair<ASTPtr, StoragePtr>> DatabaseMemory::getTablesForBackup(co

        auto storage_id = local_context->tryResolveStorageID(StorageID{"", table_name}, Context::ResolveExternal);
        if (!storage_id)
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
-                            "Couldn't resolve the name of temporary table {}", backQuoteIfNeed(table_name));
+        {
+            LOG_WARNING(log, "Couldn't resolve the name of temporary table {}", backQuoteIfNeed(table_name));
+            continue;
+        }

        /// Here `storage_id.table_name` looks like looks like "_tmp_ab9b15a3-fb43-4670-abec-14a0e9eb70f1"
        /// it's not the real name of the table.
        auto create_table_query = tryGetCreateTableQuery(storage_id.table_name, local_context);
        if (!create_table_query)
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
-                            "Couldn't get a create query for temporary table {}", backQuoteIfNeed(table_name));
+        {
+            LOG_WARNING(log, "Couldn't get a create query for temporary table {}", backQuoteIfNeed(table_name));
+            continue;
+        }

-        const auto & create = create_table_query->as<const ASTCreateQuery &>();
-        if (create.getTable() != table_name)
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP,
-                            "Got a create query with unexpected name {} for temporary table {}",
-                            backQuoteIfNeed(create.getTable()), backQuoteIfNeed(table_name));
+        auto * create = create_table_query->as<ASTCreateQuery>();
+        if (create->getTable() != table_name)
+        {
+            /// Probably the database has just been renamed. Use the older name for backup to keep the backup consistent.
+            LOG_WARNING(log, "Got a create query with unexpected name {} for temporary table {}",
+                        backQuoteIfNeed(create->getTable()), backQuoteIfNeed(table_name));
+            create_table_query = create_table_query->clone();
+            create = create_table_query->as<ASTCreateQuery>();
+            create->setTable(table_name);
+        }

        chassert(storage);
        storage->adjustCreateQueryForBackup(create_table_query);
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@ -163,6 +163,13 @@ DatabaseOnDisk::DatabaseOnDisk(
 }


+void DatabaseOnDisk::shutdown()
+{
+    waitDatabaseStarted(/* no_throw = */ true);
+    DatabaseWithOwnTablesBase::shutdown();
+}
+
+
 void DatabaseOnDisk::createTable(
    ContextPtr local_context,
    const String & table_name,
@ -189,6 +196,8 @@ void DatabaseOnDisk::createTable(
        throw Exception(
            ErrorCodes::TABLE_ALREADY_EXISTS, "Table {}.{} already exists", backQuote(getDatabaseName()), backQuote(table_name));

+    waitDatabaseStarted(false);
+
    String table_metadata_path = getObjectMetadataPath(table_name);

    if (create.attach_short_syntax)
@ -278,6 +287,8 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora

 void DatabaseOnDisk::detachTablePermanently(ContextPtr query_context, const String & table_name)
 {
+    waitDatabaseStarted(false);
+
    auto table = detachTable(query_context, table_name);

    fs::path detached_permanently_flag(getObjectMetadataPath(table_name) + detached_suffix);
@ -294,6 +305,8 @@ void DatabaseOnDisk::detachTablePermanently(ContextPtr query_context, const Stri

 void DatabaseOnDisk::dropTable(ContextPtr local_context, const String & table_name, bool /*sync*/)
 {
+    waitDatabaseStarted(false);
+
    String table_metadata_path = getObjectMetadataPath(table_name);
    String table_metadata_path_drop = table_metadata_path + drop_suffix;
    String table_data_path_relative = getTableDataPath(table_name);
@ -378,6 +391,8 @@ void DatabaseOnDisk::renameTable(
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases of different engines is not supported");
    }

+    waitDatabaseStarted(false);
+
    auto table_data_relative_path = getTableDataPath(table_name);
    TableExclusiveLockHolder table_lock;
    String table_metadata_path;
@ -519,6 +534,8 @@ ASTPtr DatabaseOnDisk::getCreateDatabaseQuery() const

 void DatabaseOnDisk::drop(ContextPtr local_context)
 {
+    waitDatabaseStarted(false);
+
    assert(TSA_SUPPRESS_WARNING_FOR_READ(tables).empty());
    if (local_context->getSettingsRef().force_remove_data_recursively_on_drop)
    {
--- a/src/Databases/DatabaseOnDisk.h
+++ b/src/Databases/DatabaseOnDisk.h
@ -32,6 +32,8 @@ class DatabaseOnDisk : public DatabaseWithOwnTablesBase
 public:
    DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, ContextPtr context);

+    void shutdown() override;
+
    void createTable(
        ContextPtr context,
        const String & table_name,
--- a/src/Databases/DatabaseOrdinary.cpp
+++ b/src/Databases/DatabaseOrdinary.cpp
@ -22,6 +22,7 @@
 #include <Parsers/queryToString.h>
 #include <Common/Stopwatch.h>
 #include <Common/ThreadPool.h>
+#include <Common/PoolId.h>
 #include <Common/escapeForFileName.h>
 #include <Common/quoteString.h>
 #include <Common/typeid_cast.h>
@ -30,13 +31,6 @@

 namespace fs = std::filesystem;

-namespace CurrentMetrics
-{
-    extern const Metric DatabaseOrdinaryThreads;
-    extern const Metric DatabaseOrdinaryThreadsActive;
-    extern const Metric DatabaseOrdinaryThreadsScheduled;
-}
-
 namespace DB
 {

@ -47,38 +41,6 @@ namespace ErrorCodes

 static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;

-namespace
-{
-    void tryAttachTable(
-        ContextMutablePtr context,
-        const ASTCreateQuery & query,
-        DatabaseOrdinary & database,
-        const String & database_name,
-        const String & metadata_path,
-        bool force_restore)
-    {
-        try
-        {
-            auto [table_name, table] = createTableFromAST(
-                query,
-                database_name,
-                database.getTableDataPath(query),
-                context,
-                force_restore);
-
-            database.attachTable(context, table_name, table, database.getTableDataPath(query));
-        }
-        catch (Exception & e)
-        {
-            e.addMessage(
-                "Cannot attach table " + backQuote(database_name) + "." + backQuote(query.getTable()) + " from metadata file " + metadata_path
-                + " from query " + serializeAST(query));
-            throw;
-        }
-    }
-}
-
-
 DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, ContextPtr context_)
    : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_)
 {
@ -90,75 +52,10 @@ DatabaseOrdinary::DatabaseOrdinary(
 {
 }

-void DatabaseOrdinary::loadStoredObjects(ContextMutablePtr local_context, LoadingStrictnessLevel mode)
+void DatabaseOrdinary::loadStoredObjects(ContextMutablePtr, LoadingStrictnessLevel)
 {
-    /** Tables load faster if they are loaded in sorted (by name) order.
-      * Otherwise (for the ext4 filesystem), `DirectoryIterator` iterates through them in some order,
-      *  which does not correspond to order tables creation and does not correspond to order of their location on disk.
-      */
-
-    ParsedTablesMetadata metadata;
-    bool force_attach = LoadingStrictnessLevel::FORCE_ATTACH <= mode;
-    loadTablesMetadata(local_context, metadata, force_attach);
-
-    size_t total_tables = metadata.parsed_tables.size() - metadata.total_dictionaries;
-
-    AtomicStopwatch watch;
-    std::atomic<size_t> dictionaries_processed{0};
-    std::atomic<size_t> tables_processed{0};
-
-    ThreadPool pool(CurrentMetrics::DatabaseOrdinaryThreads, CurrentMetrics::DatabaseOrdinaryThreadsActive, CurrentMetrics::DatabaseOrdinaryThreadsScheduled);
-
-    /// We must attach dictionaries before attaching tables
-    /// because while we're attaching tables we may need to have some dictionaries attached
-    /// (for example, dictionaries can be used in the default expressions for some tables).
-    /// On the other hand we can attach any dictionary (even sourced from ClickHouse table)
-    /// without having any tables attached. It is so because attaching of a dictionary means
-    /// loading of its config only, it doesn't involve loading the dictionary itself.
-
-    /// Attach dictionaries.
-    for (const auto & name_with_path_and_query : metadata.parsed_tables)
-    {
-        const auto & name = name_with_path_and_query.first;
-        const auto & path = name_with_path_and_query.second.path;
-        const auto & ast = name_with_path_and_query.second.ast;
-        const auto & create_query = ast->as<const ASTCreateQuery &>();
-
-        if (create_query.is_dictionary)
-        {
-            pool.scheduleOrThrowOnError([&]()
-            {
-                loadTableFromMetadata(local_context, path, name, ast, mode);
-
-                /// Messages, so that it's not boring to wait for the server to load for a long time.
-                logAboutProgress(log, ++dictionaries_processed, metadata.total_dictionaries, watch);
-            });
-        }
-    }
-
-    pool.wait();
-
-    /// Attach tables.
-    for (const auto & name_with_path_and_query : metadata.parsed_tables)
-    {
-        const auto & name = name_with_path_and_query.first;
-        const auto & path = name_with_path_and_query.second.path;
-        const auto & ast = name_with_path_and_query.second.ast;
-        const auto & create_query = ast->as<const ASTCreateQuery &>();
-
-        if (!create_query.is_dictionary)
-        {
-            pool.scheduleOrThrowOnError([&]()
-            {
-                loadTableFromMetadata(local_context, path, name, ast, mode);
-
-                /// Messages, so that it's not boring to wait for the server to load for a long time.
-                logAboutProgress(log, ++tables_processed, total_tables, watch);
-            });
-        }
-    }
-
-    pool.wait();
+    // Because this database supportsLoadingInTopologicalOrder, this loading method is not needed.
+    throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented");
 }

 void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTablesMetadata & metadata, bool is_startup)
@ -232,59 +129,143 @@ void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTables
             TSA_SUPPRESS_WARNING_FOR_READ(database_name), tables_in_database, dictionaries_in_database);
 }

-void DatabaseOrdinary::loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast,
+void DatabaseOrdinary::loadTableFromMetadata(
+    ContextMutablePtr local_context,
+    const String & file_path,
+    const QualifiedTableName & name,
+    const ASTPtr & ast,
    LoadingStrictnessLevel mode)
 {
    assert(name.database == TSA_SUPPRESS_WARNING_FOR_READ(database_name));
-    const auto & create_query = ast->as<const ASTCreateQuery &>();
-
-    tryAttachTable(
-        local_context,
-        create_query,
-        *this,
-        name.database,
-        file_path, LoadingStrictnessLevel::FORCE_RESTORE <= mode);
-}
-
-void DatabaseOrdinary::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel /*mode*/)
-{
-    LOG_INFO(log, "Starting up tables.");
-
-    /// NOTE No concurrent writes are possible during database loading
-    const size_t total_tables = TSA_SUPPRESS_WARNING_FOR_READ(tables).size();
-    if (!total_tables)
-        return;
-
-    AtomicStopwatch watch;
-    std::atomic<size_t> tables_processed{0};
-
-    auto startup_one_table = [&](const StoragePtr & table)
-    {
-        /// Since startup() method can use physical paths on disk we don't allow any exclusive actions (rename, drop so on)
-        /// until startup finished.
-        auto table_lock_holder = table->lockForShare(RWLockImpl::NO_QUERY, getContext()->getSettingsRef().lock_acquire_timeout);
-        table->startup();
-        logAboutProgress(log, ++tables_processed, total_tables, watch);
-    };
-
+    const auto & query = ast->as<const ASTCreateQuery &>();

    try
    {
-        for (const auto & table : TSA_SUPPRESS_WARNING_FOR_READ(tables))
-            thread_pool.scheduleOrThrowOnError([&]() { startup_one_table(table.second); });
+        auto [table_name, table] = createTableFromAST(
+            query,
+            name.database,
+            getTableDataPath(query),
+            local_context,
+            LoadingStrictnessLevel::FORCE_RESTORE <= mode);
+
+        attachTable(local_context, table_name, table, getTableDataPath(query));
    }
-    catch (...)
+    catch (Exception & e)
    {
-        /// We have to wait for jobs to finish here, because job function has reference to variables on the stack of current thread.
-        thread_pool.wait();
+        e.addMessage(
+            "Cannot attach table " + backQuote(name.database) + "." + backQuote(query.getTable()) + " from metadata file " + file_path
+            + " from query " + serializeAST(query));
        throw;
    }
-    thread_pool.wait();
+}
+
+LoadTaskPtr DatabaseOrdinary::loadTableFromMetadataAsync(
+    AsyncLoader & async_loader,
+    LoadJobSet load_after,
+    ContextMutablePtr local_context,
+    const String & file_path,
+    const QualifiedTableName & name,
+    const ASTPtr & ast,
+    LoadingStrictnessLevel mode)
+{
+    std::scoped_lock lock(mutex);
+    auto job = makeLoadJob(
+        std::move(load_after),
+        TablesLoaderBackgroundLoadPoolId,
+        fmt::format("load table {}", name.getFullName()),
+        [this, local_context, file_path, name, ast, mode] (AsyncLoader &, const LoadJobPtr &)
+        {
+            loadTableFromMetadata(local_context, file_path, name, ast, mode);
+        });
+
+    return load_table[name.table] = makeLoadTask(async_loader, {job});
+}
+
+LoadTaskPtr DatabaseOrdinary::startupTableAsync(
+    AsyncLoader & async_loader,
+    LoadJobSet startup_after,
+    const QualifiedTableName & name,
+    LoadingStrictnessLevel /*mode*/)
+{
+    std::scoped_lock lock(mutex);
+
+    /// Initialize progress indication on the first call
+    if (total_tables_to_startup == 0)
+    {
+        total_tables_to_startup = tables.size();
+        startup_watch.restart();
+    }
+
+    auto job = makeLoadJob(
+        std::move(startup_after),
+        TablesLoaderBackgroundStartupPoolId,
+        fmt::format("startup table {}", name.getFullName()),
+        [this, name] (AsyncLoader &, const LoadJobPtr &)
+        {
+            if (auto table = tryGetTableNoWait(name.table))
+            {
+                /// Since the startup() method can use physical paths on disk, we don't allow any exclusive actions (rename, drop and so on)
+                /// until startup has finished.
+                auto table_lock_holder = table->lockForShare(RWLockImpl::NO_QUERY, getContext()->getSettingsRef().lock_acquire_timeout);
+                table->startup();
+                logAboutProgress(log, ++tables_started, total_tables_to_startup, startup_watch);
+            }
+            else
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Table {}.{} doesn't exist during startup",
+                    backQuote(name.database), backQuote(name.table));
+        });
+
+    return startup_table[name.table] = makeLoadTask(async_loader, {job});
+}
+
+LoadTaskPtr DatabaseOrdinary::startupDatabaseAsync(
+    AsyncLoader & async_loader,
+    LoadJobSet startup_after,
+    LoadingStrictnessLevel /*mode*/)
+{
+    // NOTE: this task is empty, but it is required for correct dependency handling (startup should be done after tables loading)
+    auto job = makeLoadJob(
+        std::move(startup_after),
+        TablesLoaderBackgroundStartupPoolId,
+        fmt::format("startup Ordinary database {}", getDatabaseName()));
+    return startup_database_task = makeLoadTask(async_loader, {job});
+}
+
+void DatabaseOrdinary::waitTableStarted(const String & name) const
+{
+    /// Prioritize jobs (load and startup the table) to be executed in foreground pool and wait for them synchronously
+    LoadTaskPtr task;
+    {
+        std::scoped_lock lock(mutex);
+        if (auto it = startup_table.find(name); it != startup_table.end())
+            task = it->second;
+    }
+
+    if (task)
+        waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), task);
+}
+
+void DatabaseOrdinary::waitDatabaseStarted(bool no_throw) const
+{
+    /// Prioritize load and startup of all tables and database itself and wait for them synchronously
+    if (startup_database_task)
+        waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_database_task, no_throw);
+}
+
+DatabaseTablesIteratorPtr DatabaseOrdinary::getTablesIterator(ContextPtr local_context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) const
+{
+    auto result = DatabaseWithOwnTablesBase::getTablesIterator(local_context, filter_by_table_name);
+    std::scoped_lock lock(mutex);
+    typeid_cast<DatabaseTablesSnapshotIterator &>(*result).setLoadTasks(startup_table);
+    return result;
 }

 void DatabaseOrdinary::alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata)
 {
+    waitDatabaseStarted(false);
+
    String table_name = table_id.table_name;
+
    /// Read the definition of the table and replace the necessary parts with new ones.
    String table_metadata_path = getObjectMetadataPath(table_name);
    String table_metadata_tmp_path = table_metadata_path + ".tmp";
--- a/src/Databases/DatabaseOrdinary.h
+++ b/src/Databases/DatabaseOrdinary.h
@ -27,10 +27,35 @@ public:

    void loadTablesMetadata(ContextPtr context, ParsedTablesMetadata & metadata, bool is_startup) override;

-    void loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast,
+    void loadTableFromMetadata(
+        ContextMutablePtr local_context,
+        const String & file_path,
+        const QualifiedTableName & name,
+        const ASTPtr & ast,
        LoadingStrictnessLevel mode) override;

-    void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
+    LoadTaskPtr loadTableFromMetadataAsync(
+        AsyncLoader & async_loader,
+        LoadJobSet load_after,
+        ContextMutablePtr local_context,
+        const String & file_path,
+        const QualifiedTableName & name,
+        const ASTPtr & ast,
+        LoadingStrictnessLevel mode) override;
+
+    LoadTaskPtr startupTableAsync(
+        AsyncLoader & async_loader,
+        LoadJobSet startup_after,
+        const QualifiedTableName & name,
+        LoadingStrictnessLevel mode) override;
+
+    void waitTableStarted(const String & name) const override;
+
+    void waitDatabaseStarted(bool no_throw) const override;
+
+    LoadTaskPtr startupDatabaseAsync(AsyncLoader & async_loader, LoadJobSet startup_after, LoadingStrictnessLevel mode) override;
+
+    DatabaseTablesIteratorPtr getTablesIterator(ContextPtr local_context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) const override;

    void alterTable(
        ContextPtr context,
@ -48,6 +73,13 @@ protected:
        ContextPtr query_context);

    Strings permanently_detached_tables;
+
+    std::unordered_map<String, LoadTaskPtr> load_table TSA_GUARDED_BY(mutex);
+    std::unordered_map<String, LoadTaskPtr> startup_table TSA_GUARDED_BY(mutex);
+    LoadTaskPtr startup_database_task;
+    std::atomic<size_t> total_tables_to_startup{0};
+    std::atomic<size_t> tables_started{0};
+    AtomicStopwatch startup_watch;
 };

 }
--- a/src/Databases/DatabaseReplicated.cpp
+++ b/src/Databases/DatabaseReplicated.cpp
@ -12,6 +12,7 @@
 #include <Common/ZooKeeper/KeeperException.h>
 #include <Common/ZooKeeper/Types.h>
 #include <Common/ZooKeeper/ZooKeeper.h>
+#include <Common/PoolId.h>
 #include <Databases/DatabaseReplicated.h>
 #include <Databases/DatabaseReplicatedWorker.h>
 #include <Databases/DDLDependencyVisitor.h>
@ -53,7 +54,7 @@ namespace ErrorCodes
    extern const int INCORRECT_QUERY;
    extern const int ALL_CONNECTION_TRIES_FAILED;
    extern const int NO_ACTIVE_REPLICAS;
-    extern const int INCONSISTENT_METADATA_FOR_BACKUP;
+    extern const int CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT;
    extern const int CANNOT_RESTORE_TABLE;
 }

@ -533,41 +534,54 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt
    createEmptyLogEntry(current_zookeeper);
 }

-void DatabaseReplicated::beforeLoadingMetadata(ContextMutablePtr /*context*/, LoadingStrictnessLevel mode)
+void DatabaseReplicated::beforeLoadingMetadata(ContextMutablePtr context_, LoadingStrictnessLevel mode) /// Pre-load hook: runs the DatabaseAtomic hook, then calls tryConnectToZooKeeperAndInitDatabase.
 {
+    DatabaseAtomic::beforeLoadingMetadata(context_, mode); /// The context is now forwarded to the base class; previously the parameter was unnamed and unused.
    tryConnectToZooKeeperAndInitDatabase(mode); /// NOTE(review): the "try" in the name suggests a failed connection is tolerated here - confirm in its definition.
 }

-void DatabaseReplicated::loadStoredObjects(ContextMutablePtr local_context, LoadingStrictnessLevel mode)
-{
-    beforeLoadingMetadata(local_context, mode);
-    DatabaseAtomic::loadStoredObjects(local_context, mode);
-}
-
 UInt64 DatabaseReplicated::getMetadataHash(const String & table_name) const /// Hash of a single table's metadata.
 {
    return DB::getMetadataHash(table_name, readMetadataFile(table_name)); /// Delegates to the free function, passing the table name and the result of readMetadataFile() for that table.
 }

-void DatabaseReplicated::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
+LoadTaskPtr DatabaseReplicated::startupDatabaseAsync(AsyncLoader & async_loader, LoadJobSet startup_after, LoadingStrictnessLevel mode) /// Async replacement for startupTables(): builds a Replicated-specific job on top of DatabaseAtomic's startup task and returns the new task.
 {
-    DatabaseAtomic::startupTables(thread_pool, mode);
+    auto base = DatabaseAtomic::startupDatabaseAsync(async_loader, std::move(startup_after), mode); /// Base-class startup task; `startup_after` is handed over to it.
+    auto job = makeLoadJob(
+        base->goals(), /// NOTE(review): presumably the dependency set, so this job runs only after the base task's goals - confirm against AsyncLoader.
+        TablesLoaderBackgroundStartupPoolId,
+        fmt::format("startup Replicated database {}", getDatabaseName()),
+        [this] (AsyncLoader &, const LoadJobPtr &) /// Job body: compute the metadata digest, publish it, then start the DDL worker.
+        {
+            UInt64 digest = 0;
+            {
+                std::lock_guard lock{mutex}; /// `tables` is now read under `mutex`; the removed code used TSA_SUPPRESS_WARNING_FOR_READ instead.
+                for (const auto & table : tables)
+                    digest += getMetadataHash(table.first); /// The digest is the plain sum of the per-table metadata hashes.
+                LOG_DEBUG(log, "Calculated metadata digest of {} tables: {}", tables.size(), digest);
+            }

-    /// TSA: No concurrent writes are possible during loading
-    UInt64 digest = 0;
-    for (const auto & table : TSA_SUPPRESS_WARNING_FOR_READ(tables))
-        digest += getMetadataHash(table.first);
+            {
+                std::lock_guard lock{metadata_mutex}; /// The digest is stored under metadata_mutex, a different lock from the one taken for `tables` above.
+                chassert(!tables_metadata_digest); /// The stored digest is expected to still be zero at this point.
+                tables_metadata_digest = digest;
+            }

-    LOG_DEBUG(log, "Calculated metadata digest of {} tables: {}", TSA_SUPPRESS_WARNING_FOR_READ(tables).size(), digest);
-    chassert(!TSA_SUPPRESS_WARNING_FOR_READ(tables_metadata_digest));
-    TSA_SUPPRESS_WARNING_FOR_WRITE(tables_metadata_digest) = digest;
+            if (is_probably_dropped) /// When this flag is set the job ends here and no DDL worker is created.
+                return;

-    if (is_probably_dropped)
-        return;
+            ddl_worker = std::make_unique<DatabaseReplicatedDDLWorker>(this, getContext());
+            ddl_worker->startup();
+            ddl_worker_initialized = true; /// Set only after startup() has returned.
+        });
+    return startup_replicated_database_task = makeLoadTask(async_loader, {job}); /// The task is stored in a member so that waitDatabaseStarted() can wait on it, and is also returned to the caller.
+}

-    ddl_worker = std::make_unique<DatabaseReplicatedDDLWorker>(this, getContext());
-    ddl_worker->startup();
-    ddl_worker_initialized = true;
+void DatabaseReplicated::waitDatabaseStarted(bool no_throw) const /// Waits on the task stored by startupDatabaseAsync(); does nothing when no task has been stored.
+{
+    if (startup_replicated_database_task)
+        waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_replicated_database_task, no_throw); /// `no_throw` is forwarded to waitLoad unchanged.
 }

 bool DatabaseReplicated::checkDigestValid(const ContextPtr & local_context, bool debug_check /* = true */) const
@ -728,6 +742,7 @@ void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_

 BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, QueryFlags flags)
 {
+    waitDatabaseStarted(false);

    if (query_context->getCurrentTransaction() && query_context->getSettingsRef().throw_on_unsupported_query_inside_transaction)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Distributed DDL queries inside transactions are not supported");
@ -791,6 +806,8 @@ static UUID getTableUUIDIfReplicated(const String & metadata, ContextPtr context

 void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 & max_log_ptr)
 {
+    waitDatabaseStarted(false);
+
    is_recovering = true;
    SCOPE_EXIT({ is_recovering = false; });

@ -1107,31 +1124,43 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
 }

 std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr) /// Thin wrapper over getConsistentMetadataSnapshotImpl for an unfiltered snapshot.
+{
+    return getConsistentMetadataSnapshotImpl(zookeeper, {}, /* max_retries= */ 10, max_log_ptr); /// Empty filter, so no table is excluded; at most 10 attempts, after which the implementation throws CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT.
+}
+
+std::map<String, String> DatabaseReplicated::getConsistentMetadataSnapshotImpl(
+    const ZooKeeperPtr & zookeeper,
+    const FilterByNameFunction & filter_by_table_name,
+    size_t max_retries,
+    UInt32 & max_log_ptr) const
 {
    std::map<String, String> table_name_to_metadata;
-    constexpr int max_retries = 10;
-    int iteration = 0;
+    size_t iteration = 0;
    while (++iteration <= max_retries)
    {
        table_name_to_metadata.clear();
        LOG_DEBUG(log, "Trying to get consistent metadata snapshot for log pointer {}", max_log_ptr);
-        Strings table_names = zookeeper->getChildren(zookeeper_path + "/metadata");
+
+        Strings escaped_table_names;
+        escaped_table_names = zookeeper->getChildren(zookeeper_path + "/metadata");
+        if (filter_by_table_name)
+            std::erase_if(escaped_table_names, [&](const String & table) { return !filter_by_table_name(unescapeForFileName(table)); });

        std::vector<zkutil::ZooKeeper::FutureGet> futures;
-        futures.reserve(table_names.size());
-        for (const auto & table : table_names)
+        futures.reserve(escaped_table_names.size());
+        for (const auto & table : escaped_table_names)
            futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/metadata/" + table));

-        for (size_t i = 0; i < table_names.size(); ++i)
+        for (size_t i = 0; i < escaped_table_names.size(); ++i)
        {
            auto res = futures[i].get();
            if (res.error != Coordination::Error::ZOK)
                break;
-            table_name_to_metadata.emplace(unescapeForFileName(table_names[i]), res.data);
+            table_name_to_metadata.emplace(unescapeForFileName(escaped_table_names[i]), res.data);
        }

        UInt32 new_max_log_ptr = parse<UInt32>(zookeeper->get(zookeeper_path + "/max_log_ptr"));
-        if (new_max_log_ptr == max_log_ptr && table_names.size() == table_name_to_metadata.size())
+        if (new_max_log_ptr == max_log_ptr && escaped_table_names.size() == table_name_to_metadata.size())
            break;

        if (max_log_ptr < new_max_log_ptr)
@ -1142,13 +1171,13 @@ std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(co
        else
        {
            chassert(max_log_ptr == new_max_log_ptr);
-            chassert(table_names.size() != table_name_to_metadata.size());
+            chassert(escaped_table_names.size() != table_name_to_metadata.size());
            LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry");
        }
    }

    if (max_retries < iteration)
-        throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot get consistent metadata snapshot");
+        throw Exception(ErrorCodes::CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT, "Cannot get consistent metadata snapshot");

    LOG_DEBUG(log, "Got consistent metadata snapshot for log pointer {}", max_log_ptr);

@ -1221,6 +1250,8 @@ void DatabaseReplicated::drop(ContextPtr context_)
        return;
    }

+    waitDatabaseStarted(false);
+
    auto current_zookeeper = getZooKeeper();
    current_zookeeper->set(replica_path, DROPPED_MARK, -1);
    createEmptyLogEntry(current_zookeeper);
@ -1238,6 +1269,7 @@ void DatabaseReplicated::drop(ContextPtr context_)

 void DatabaseReplicated::stopReplication() /// Shuts down the DDL worker if one exists.
 {
+    waitDatabaseStarted(/* no_throw = */ true); /// Waits for the async startup task first; this call passes no_throw = true, whereas the other waitDatabaseStarted() calls added in this file pass false.
    if (ddl_worker) /// May be null: the startup job returns before creating the worker when is_probably_dropped is set.
        ddl_worker->shutdown();
 }
@ -1253,6 +1285,8 @@ void DatabaseReplicated::shutdown()

 void DatabaseReplicated::dropTable(ContextPtr local_context, const String & table_name, bool sync)
 {
+    waitDatabaseStarted(false);
+
    auto txn = local_context->getZooKeeperMetadataTransaction();
    assert(!ddl_worker || !ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id."));
    if (txn && txn->isInitialQuery() && !txn->isCreateOrReplaceQuery())
@ -1295,6 +1329,8 @@ void DatabaseReplicated::renameTable(ContextPtr local_context, const String & ta
    if (exchange && !to_database.isTableExist(to_table_name, local_context))
        throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name);

+    waitDatabaseStarted(false);
+
    String statement = readMetadataFile(table_name);
    String statement_to;
    if (exchange)
@ -1395,6 +1431,8 @@ bool DatabaseReplicated::canExecuteReplicatedMetadataAlter() const

 void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const String & table_name)
 {
+    waitDatabaseStarted(false);
+
    auto txn = local_context->getZooKeeperMetadataTransaction();
    assert(!ddl_worker->isCurrentlyActive() || txn);
    if (txn && txn->isInitialQuery())
@ -1418,6 +1456,8 @@ void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const

 void DatabaseReplicated::removeDetachedPermanentlyFlag(ContextPtr local_context, const String & table_name, const String & table_metadata_path, bool attach)
 {
+    waitDatabaseStarted(false);
+
    auto txn = local_context->getZooKeeperMetadataTransaction();
    assert(!ddl_worker->isCurrentlyActive() || txn);
    if (txn && txn->isInitialQuery() && attach)
@ -1454,23 +1494,19 @@ String DatabaseReplicated::readMetadataFile(const String & table_name) const
 std::vector<std::pair<ASTPtr, StoragePtr>>
 DatabaseReplicated::getTablesForBackup(const FilterByNameFunction & filter, const ContextPtr &) const
 {
+    waitDatabaseStarted(false);
+
    /// Here we read metadata from ZooKeeper. We could do that by simple call of DatabaseAtomic::getTablesForBackup() however
    /// reading from ZooKeeper is better because thus we won't be dependent on how fast the replication queue of this database is.
-    std::vector<std::pair<ASTPtr, StoragePtr>> res;
    auto zookeeper = getContext()->getZooKeeper();
-    auto escaped_table_names = zookeeper->getChildren(zookeeper_path + "/metadata");
-    for (const auto & escaped_table_name : escaped_table_names)
+    UInt32 snapshot_version = parse<UInt32>(zookeeper->get(zookeeper_path + "/max_log_ptr"));
+    auto snapshot = getConsistentMetadataSnapshotImpl(zookeeper, filter, /* max_retries= */ 20, snapshot_version);
+
+    std::vector<std::pair<ASTPtr, StoragePtr>> res;
+    for (const auto & [table_name, metadata] : snapshot)
    {
-        String table_name = unescapeForFileName(escaped_table_name);
-        if (!filter(table_name))
-            continue;
-
-        String zk_metadata;
-        if (!zookeeper->tryGet(zookeeper_path + "/metadata/" + escaped_table_name, zk_metadata))
-            throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Metadata for table {} was not found in ZooKeeper", table_name);
-
        ParserCreateQuery parser;
-        auto create_table_query = parseQuery(parser, zk_metadata, 0, getContext()->getSettingsRef().max_parser_depth);
+        auto create_table_query = parseQuery(parser, metadata, 0, getContext()->getSettingsRef().max_parser_depth);

        auto & create = create_table_query->as<ASTCreateQuery &>();
        create.attach = false;
@ -1501,6 +1537,8 @@ void DatabaseReplicated::createTableRestoredFromBackup(
    std::shared_ptr<IRestoreCoordination> restore_coordination,
    UInt64 timeout_ms)
 {
+    waitDatabaseStarted(false);
+
    /// Because of the replication multiple nodes can try to restore the same tables again and failed with "Table already exists"
    /// because of some table could be restored already on other node and then replicated to this node.
    /// To solve this problem we use the restore coordination: the first node calls
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Subproject commit 3756e537d4d48cc0dd4176801fe19f99601439b0`