Merge branch 'master' into alexey-milovidov-patch-4

This commit is contained in:
Alexey Milovidov 2022-08-20 13:42:39 +02:00
commit c5a7e75c88
449 changed files with 12246 additions and 3844 deletions

View File

@ -13,13 +13,24 @@ on: # yamllint disable-line rule:truthy
- 'v*-prestable'
- 'v*-stable'
- 'v*-lts'
workflow_dispatch:
inputs:
tag:
description: 'Test tag'
required: true
type: string
jobs:
UpdateVersions:
runs-on: [self-hosted, style-checker]
steps:
- name: Set test tag
if: github.event_name == 'workflow_dispatch'
run: |
echo "GITHUB_TAG=${{ github.event.inputs.tag }}" >> "$GITHUB_ENV"
- name: Get tag name
if: github.event_name != 'workflow_dispatch'
run: |
echo "GITHUB_TAG=${GITHUB_REF#refs/tags/}" >> "$GITHUB_ENV"
- name: Check out repository code
@ -35,12 +46,15 @@ jobs:
GID=$(id -g "${UID}")
docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \
--volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \
/ClickHouse/utils/changelog/changelog.py -vv --gh-user-or-token="$GITHUB_TOKEN" \
--output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" --jobs=5 "${GITHUB_TAG}"
/ClickHouse/utils/changelog/changelog.py -v --debug-helpers \
--gh-user-or-token="$GITHUB_TOKEN" --jobs=5 \
--output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" "${GITHUB_TAG}"
git add "./docs/changelogs/${GITHUB_TAG}.md"
git diff HEAD
- name: Create Pull Request
uses: peter-evans/create-pull-request@v3
env:
GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }}
with:
author: "robot-clickhouse <robot-clickhouse@users.noreply.github.com>"
committer: "robot-clickhouse <robot-clickhouse@users.noreply.github.com>"
@ -48,6 +62,7 @@ jobs:
branch: auto/${{ env.GITHUB_TAG }}
delete-branch: true
title: Update version_date.tsv and changelogs after ${{ env.GITHUB_TAG }}
labels: do not test
body: |
Update version_date.tsv and changelogs after ${{ env.GITHUB_TAG }}

View File

@ -1,4 +1,5 @@
### Table of Contents
**[ClickHouse release v22.8, 2022-08-18](#228)**<br/>
**[ClickHouse release v22.7, 2022-07-21](#227)**<br/>
**[ClickHouse release v22.6, 2022-06-16](#226)**<br/>
**[ClickHouse release v22.5, 2022-05-19](#225)**<br/>
@ -8,6 +9,148 @@
**[ClickHouse release v22.1, 2022-01-18](#221)**<br/>
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**<br/>
### <a id="228"></a> ClickHouse release 22.8, 2022-08-18
#### Backward Incompatible Change
* Extended range of `Date32` and `DateTime64` to support dates from the year 1900 to 2299 (see the example after this list). In previous versions, the supported interval was only from the year 1925 to 2283. The implementation uses the proleptic Gregorian calendar (which is conformant with [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601):2004 (clause 3.2.1 The Gregorian calendar)) instead of accounting for historical transitions from the Julian to the Gregorian calendar. This change affects implementation-specific behavior for out-of-range arguments: e.g. where previous versions clamped the value `1899-01-01` to `1925-01-01`, the new version clamps it to `1900-01-01`. It also changes the behavior of rounding with `toStartOfInterval` by up to one quarter if you pass `INTERVAL 3 QUARTER`, because the intervals are counted from an implementation-specific point in time. Closes [#28216](https://github.com/ClickHouse/ClickHouse/issues/28216), improves [#38393](https://github.com/ClickHouse/ClickHouse/issues/38393). [#39425](https://github.com/ClickHouse/ClickHouse/pull/39425) ([Roman Vasin](https://github.com/rvasin)).
* Now, all relevant dictionary sources respect `remote_url_allow_hosts` setting. It was already done for HTTP, Cassandra, Redis. Added ClickHouse, MongoDB, MySQL, PostgreSQL. Host is checked only for dictionaries created from DDL. [#39184](https://github.com/ClickHouse/ClickHouse/pull/39184) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Prebuilt ClickHouse x86 binaries now require support for AVX instructions, i.e. a CPU not older than Intel Sandy Bridge / AMD Bulldozer, both released in 2011. [#39000](https://github.com/ClickHouse/ClickHouse/pull/39000) ([Robert Schulze](https://github.com/rschu1ze)).
* Make the remote filesystem cache composable and allow certain files (e.g. index and mark files) to be exempt from eviction; delete the old cache version. It is now possible to configure the cache over an Azure Blob Storage disk, over a Local disk, over a StaticWeb disk, etc. This PR is marked backward incompatible because the cache configuration changes, and the config file needs to be updated for the cache to work; the old cache will still be used with the new configuration, and the server will start up fine with the old cache configuration. Closes [#36140](https://github.com/ClickHouse/ClickHouse/issues/36140). Closes [#37889](https://github.com/ClickHouse/ClickHouse/issues/37889). [#36171](https://github.com/ClickHouse/ClickHouse/pull/36171) ([Kseniia Sumarokova](https://github.com/kssenii)).
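A quick illustration of the clamping change from the `Date32`/`DateTime64` entry above. This is a sketch based on the entry's own example, not the exact output of any particular build:

```sql
-- Out-of-range Date32 values are clamped to the boundaries of the supported range.
-- The lower boundary moved from 1925-01-01 (22.7 and earlier) to 1900-01-01 (22.8).
SELECT toDate32('1899-01-01') AS clamped;
-- 22.7 and earlier: 1925-01-01
-- 22.8:             1900-01-01
```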
#### New Feature
* Support the SQL standard `DELETE FROM` syntax on MergeTree tables, with a lightweight delete implementation for the MergeTree family (see the sketch after this list). [#37893](https://github.com/ClickHouse/ClickHouse/pull/37893) ([Jianmei Zhang](https://github.com/zhangjmruc)) ([Alexander Gololobov](https://github.com/davenger)). Note: this new feature does not make ClickHouse an HTAP DBMS.
* Query parameters can be set in interactive mode as `SET param_abc = 'def'` and transferred via the native protocol as settings (also shown in the sketch after this list). [#39906](https://github.com/ClickHouse/ClickHouse/pull/39906) ([Nikita Taranov](https://github.com/nickitat)).
* The quota key can now be set in the native protocol ([Yakov Olkhovskiy](https://github.com/ClickHouse/ClickHouse/pull/39874)).
* Added a setting `exact_rows_before_limit` (0/1). When enabled, ClickHouse will provide the exact value for the `rows_before_limit_at_least` statistic, at the cost that the data before the limit will have to be read completely. This closes [#6613](https://github.com/ClickHouse/ClickHouse/issues/6613). [#25333](https://github.com/ClickHouse/ClickHouse/pull/25333) ([kevin wan](https://github.com/MaxWk)).
* Added support for parallel distributed insert select with `s3Cluster` table function into tables with `Distributed` and `Replicated` engine [#34670](https://github.com/ClickHouse/ClickHouse/issues/34670). [#39107](https://github.com/ClickHouse/ClickHouse/pull/39107) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add new settings to control schema inference from text formats: - `input_format_try_infer_dates` - try to infer dates from strings. - `input_format_try_infer_datetimes` - try to infer datetimes from strings. - `input_format_try_infer_integers` - try to infer `Int64` instead of `Float64`. - `input_format_json_try_infer_numbers_from_strings` - try to infer numbers from JSON strings in JSON formats. [#39186](https://github.com/ClickHouse/ClickHouse/pull/39186) ([Kruglov Pavel](https://github.com/Avogar)).
* An option to provide JSON formatted log output. The purpose is to allow easier ingestion and query in log analysis tools. [#39277](https://github.com/ClickHouse/ClickHouse/pull/39277) ([Mallik Hassan](https://github.com/SadiHassan)).
* Add function `nowInBlock` which allows getting the current time during long-running and continuous queries. Closes [#39522](https://github.com/ClickHouse/ClickHouse/issues/39522). Note: there are no functions `now64InBlock` nor `todayInBlock`. [#39533](https://github.com/ClickHouse/ClickHouse/pull/39533) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add ability to specify settings for an `executable()` table function. [#39681](https://github.com/ClickHouse/ClickHouse/pull/39681) ([Constantine Peresypkin](https://github.com/pkit)).
* Implemented automatic conversion of database engine from `Ordinary` to `Atomic`. Create empty `convert_ordinary_to_atomic` file in `flags` directory and all `Ordinary` databases will be converted automatically on next server start. Resolves [#39546](https://github.com/ClickHouse/ClickHouse/issues/39546). [#39933](https://github.com/ClickHouse/ClickHouse/pull/39933) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Support `SELECT ... INTO OUTFILE '...' AND STDOUT`. [#37490](https://github.com/ClickHouse/ClickHouse/issues/37490). [#39054](https://github.com/ClickHouse/ClickHouse/pull/39054) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Add formats `PrettyMonoBlock`, `PrettyNoEscapesMonoBlock`, `PrettyCompactNoEscapes`, `PrettyCompactNoEscapesMonoBlock`, `PrettySpaceNoEscapes`, `PrettySpaceMonoBlock`, `PrettySpaceNoEscapesMonoBlock`. [#39646](https://github.com/ClickHouse/ClickHouse/pull/39646) ([Kruglov Pavel](https://github.com/Avogar)).
* Add new setting `schema_inference_hints` that allows specifying structure hints in schema inference for specific columns. Closes [#39569](https://github.com/ClickHouse/ClickHouse/issues/39569). [#40068](https://github.com/ClickHouse/ClickHouse/pull/40068) ([Kruglov Pavel](https://github.com/Avogar)).
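Hedged sketches of two of the features above: lightweight `DELETE FROM` and query parameters. The table, column, and parameter names are hypothetical, and the gating setting name (`allow_experimental_lightweight_delete`) is our assumption for this release:

```sql
-- Lightweight DELETE on a MergeTree table (experimental in 22.8).
CREATE TABLE hits (user_id UInt64, url String) ENGINE = MergeTree ORDER BY user_id;
SET allow_experimental_lightweight_delete = 1;  -- assumed gating setting
DELETE FROM hits WHERE user_id = 42;

-- Query parameters set in interactive mode, referenced with {name:Type} syntax.
SET param_target_url = 'https://clickhouse.com';
SELECT count() FROM hits WHERE url = {target_url:String};
```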
#### Performance Improvement
* Improved memory usage during memory efficient merging of aggregation results. [#39429](https://github.com/ClickHouse/ClickHouse/pull/39429) ([Nikita Taranov](https://github.com/nickitat)).
* Added concurrency control logic to limit the total number of concurrent threads created by queries. [#37558](https://github.com/ClickHouse/ClickHouse/pull/37558) ([Sergei Trifonov](https://github.com/serxa)). Add the `concurrent_threads_soft_limit` parameter to increase performance in case of high QPS by limiting the total number of threads for all queries. [#37285](https://github.com/ClickHouse/ClickHouse/pull/37285) ([Roman Vasin](https://github.com/rvasin)).
* Add `SLRU` cache policy for the uncompressed cache and the marks cache. ([Kseniia Sumarokova](https://github.com/kssenii)). [#34651](https://github.com/ClickHouse/ClickHouse/pull/34651) ([alexX512](https://github.com/alexX512)). Decouple the local cache function from the cache algorithm. [#38048](https://github.com/ClickHouse/ClickHouse/pull/38048) ([Han Shukai](https://github.com/KinderRiven)).
* Intel® In-Memory Analytics Accelerator (Intel® IAA) is a hardware accelerator available in the upcoming generation of Intel® Xeon® Scalable processors ("Sapphire Rapids"). Its goal is to speed up common operations in analytics like data (de)compression and filtering. ClickHouse gained the new "DeflateQpl" compression codec which utilizes the Intel® IAA offloading technology to provide a high-performance DEFLATE implementation. The codec uses the [Intel® Query Processing Library (QPL)](https://github.com/intel/qpl) which abstracts access to the hardware accelerator, respectively to a software fallback in case the hardware accelerator is not available. DEFLATE provides in general higher compression rates than ClickHouse's LZ4 default codec, and as a result, offers less disk I/O and lower main memory consumption. [#36654](https://github.com/ClickHouse/ClickHouse/pull/36654) ([jasperzhu](https://github.com/jinjunzh)). [#39494](https://github.com/ClickHouse/ClickHouse/pull/39494) ([Robert Schulze](https://github.com/rschu1ze)).
* `DISTINCT` in order with `ORDER BY`: deduce how to sort based on the input stream's sort description; skip sorting if the input stream is already sorted (see the example after this list). [#38719](https://github.com/ClickHouse/ClickHouse/pull/38719) ([Igor Nikonov](https://github.com/devcrafter)). Improve memory usage (significantly) and query execution time: use `DistinctSortedChunkTransform` for the final distinct when `DISTINCT` columns match `ORDER BY` columns (renamed to `DistinctSortedStreamTransform` in `EXPLAIN PIPELINE`), and remove unnecessary allocations in the hot loop of `DistinctSortedChunkTransform`. [#39432](https://github.com/ClickHouse/ClickHouse/pull/39432) ([Igor Nikonov](https://github.com/devcrafter)). Use `DistinctSortedTransform` only when the sort description is applicable to the `DISTINCT` columns, otherwise fall back to the ordinary `DISTINCT` implementation; this allows fewer checks during `DistinctSortedTransform` execution. [#39528](https://github.com/ClickHouse/ClickHouse/pull/39528) ([Igor Nikonov](https://github.com/devcrafter)). Fix: `DistinctSortedTransform` didn't take advantage of sorting; it never cleared its HashSet since the clearing columns were detected incorrectly (always empty), so it basically worked as ordinary `DISTINCT` (`DistinctTransform`). The fix reduces memory usage significantly. [#39538](https://github.com/ClickHouse/ClickHouse/pull/39538) ([Igor Nikonov](https://github.com/devcrafter)).
* Use local node as first priority to get structure of remote table when executing `cluster` and similar table functions. [#39440](https://github.com/ClickHouse/ClickHouse/pull/39440) ([Mingliang Pan](https://github.com/liangliangpan)).
* Optimize filtering by numeric columns with AVX512VBMI2 compress store. [#39633](https://github.com/ClickHouse/ClickHouse/pull/39633) ([Guo Wangyang](https://github.com/guowangy)). For systems with AVX512 VBMI2, this improves performance by ca. 6% for SSB benchmark queries 3.1, 3.2 and 3.3 (SF=100). Tested on a 2-socket Intel Icelake Xeon 8380. [#40033](https://github.com/ClickHouse/ClickHouse/pull/40033) ([Robert Schulze](https://github.com/rschu1ze)).
* Optimize index analysis with functional expressions in multi-thread scenario. [#39812](https://github.com/ClickHouse/ClickHouse/pull/39812) ([Guo Wangyang](https://github.com/guowangy)).
* Optimizations for complex queries: Don't visit the AST for UDFs if none are registered. [#40069](https://github.com/ClickHouse/ClickHouse/pull/40069) ([Raúl Marín](https://github.com/Algunenano)). Optimize CurrentMemoryTracker alloc and free. [#40078](https://github.com/ClickHouse/ClickHouse/pull/40078) ([Raúl Marín](https://github.com/Algunenano)).
* Improved Base58 encoding/decoding. [#39292](https://github.com/ClickHouse/ClickHouse/pull/39292) ([Andrey Zvonov](https://github.com/zvonand)).
* Improve bytes to bits mask transform for SSE/AVX/AVX512. [#39586](https://github.com/ClickHouse/ClickHouse/pull/39586) ([Guo Wangyang](https://github.com/guowangy)).
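A minimal sketch of the `DISTINCT` in order optimization described above, assuming a hypothetical table whose sorting key starts with the `DISTINCT` column (the gating setting name `optimize_distinct_in_order` is our assumption):

```sql
CREATE TABLE events (user UInt64, ts DateTime) ENGINE = MergeTree ORDER BY (user, ts);
SET optimize_distinct_in_order = 1;  -- assumed gating setting

-- DISTINCT columns match a prefix of the sorting key, so the sorted variant applies;
-- per the entry above, the pipeline should show DistinctSortedStreamTransform.
EXPLAIN PIPELINE SELECT DISTINCT user FROM events ORDER BY user;
```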
#### Improvement
* Normalize `AggregateFunction` types and state representations, because optimizations like [#35788](https://github.com/ClickHouse/ClickHouse/pull/35788) will treat `count(not null columns)` as `count()`, which might confuse distributed interpreters with the following error: `Conversion from AggregateFunction(count) to AggregateFunction(count, Int64) is not supported`. [#39420](https://github.com/ClickHouse/ClickHouse/pull/39420) ([Amos Bird](https://github.com/amosbird)). Functions with identical states can be used in materialized views interchangeably.
* Rework and simplify the `system.backups` table, remove the `internal` column, allow user to set the ID of operation, add columns `num_files`, `uncompressed_size`, `compressed_size`, `start_time`, `end_time`. [#39503](https://github.com/ClickHouse/ClickHouse/pull/39503) ([Vitaly Baranov](https://github.com/vitlibar)).
* Improved structure of the DDL query result table for the `Replicated` database (separate columns with shard and replica name, clearer status) - `CREATE TABLE ... ON CLUSTER` queries can be normalized on the initiator first if `distributed_ddl_entry_format_version` is set to 3 (default value). It means that `ON CLUSTER` queries may not work if the initiator does not belong to the cluster specified in the query. Fixes [#37318](https://github.com/ClickHouse/ClickHouse/issues/37318), [#39500](https://github.com/ClickHouse/ClickHouse/issues/39500) - Ignore the `ON CLUSTER` clause if the database is `Replicated` and the cluster name equals the database name. Related to [#35570](https://github.com/ClickHouse/ClickHouse/issues/35570) - Miscellaneous minor fixes for the `Replicated` database engine - Check metadata consistency when starting up a `Replicated` database, and start replica recovery in case of a mismatch between local metadata and metadata in Keeper. Resolves [#24880](https://github.com/ClickHouse/ClickHouse/issues/24880). [#37198](https://github.com/ClickHouse/ClickHouse/pull/37198) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add `result_rows` and `result_bytes` to progress reports (`X-ClickHouse-Summary`). [#39567](https://github.com/ClickHouse/ClickHouse/pull/39567) ([Raúl Marín](https://github.com/Algunenano)).
* Improve primary key analysis for MergeTree. [#25563](https://github.com/ClickHouse/ClickHouse/pull/25563) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* `timeSlots` now works with `DateTime64`; subsecond durations and slot sizes are available when working with `DateTime64`. [#37951](https://github.com/ClickHouse/ClickHouse/pull/37951) ([Andrey Zvonov](https://github.com/zvonand)).
* Added support of `LEFT SEMI` and `LEFT ANTI` direct join with `EmbeddedRocksDB` tables. [#38956](https://github.com/ClickHouse/ClickHouse/pull/38956) ([Vladimir C](https://github.com/vdimir)).
* Add profile events for fsync operations. [#39179](https://github.com/ClickHouse/ClickHouse/pull/39179) ([Azat Khuzhin](https://github.com/azat)).
* Add a second argument to the ordinary function `file(path[, default])`: the value the function returns when the file does not exist (see the example after this list). [#39218](https://github.com/ClickHouse/ClickHouse/pull/39218) ([Nikolay Degterinsky](https://github.com/evillique)).
* Some small fixes for reading via HTTP; allow retrying a partial read even if the response was 200 OK. [#39244](https://github.com/ClickHouse/ClickHouse/pull/39244) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Support queries `CREATE TEMPORARY TABLE ... (<list of columns>) AS ...`. [#39462](https://github.com/ClickHouse/ClickHouse/pull/39462) ([Kruglov Pavel](https://github.com/Avogar)).
* Add support of `!`/`*` (exclamation/asterisk) in custom TLDs (`cutToFirstSignificantSubdomainCustom()`/`cutToFirstSignificantSubdomainCustomWithWWW()`/`firstSignificantSubdomainCustom()`). [#39496](https://github.com/ClickHouse/ClickHouse/pull/39496) ([Azat Khuzhin](https://github.com/azat)).
* Add support for TLS connections to NATS. Implements [#39525](https://github.com/ClickHouse/ClickHouse/issues/39525). [#39527](https://github.com/ClickHouse/ClickHouse/pull/39527) ([Constantine Peresypkin](https://github.com/pkit)).
* `clickhouse-obfuscator` (a tool for database obfuscation for testing and load generation) now has the new `--save` and `--load` parameters to work with pre-trained models. This closes [#39534](https://github.com/ClickHouse/ClickHouse/issues/39534). [#39541](https://github.com/ClickHouse/ClickHouse/pull/39541) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix incorrect behavior of log rotation during restart. [#39558](https://github.com/ClickHouse/ClickHouse/pull/39558) ([Nikolay Degterinsky](https://github.com/evillique)).
* Fix building aggregate projections when external aggregation is on. Marked as an improvement because the case is rare and there exists an easy workaround via changing settings. This fixes [#39667](https://github.com/ClickHouse/ClickHouse/issues/39667). [#39671](https://github.com/ClickHouse/ClickHouse/pull/39671) ([Amos Bird](https://github.com/amosbird)).
* Allow executing hash functions with arguments of type `Map`. [#39685](https://github.com/ClickHouse/ClickHouse/pull/39685) ([Anton Popov](https://github.com/CurtizJ)).
* Add a configuration parameter to hide addresses in stack traces. It may improve security a little but generally, it is harmful and should not be used. [#39690](https://github.com/ClickHouse/ClickHouse/pull/39690) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Change the prefix size of AggregateFunctionDistinct to make sure nested function data memory segment is aligned. [#39696](https://github.com/ClickHouse/ClickHouse/pull/39696) ([Pxl](https://github.com/BiteTheDDDDt)).
* Properly escape credentials passed to the `clickhouse-diagnostic` tool. [#39707](https://github.com/ClickHouse/ClickHouse/pull/39707) ([Dale McDiarmid](https://github.com/gingerwizard)).
* ClickHouse Keeper improvement: create a snapshot on exit. It can be controlled with the config `keeper_server.create_snapshot_on_exit`, `true` by default. [#39755](https://github.com/ClickHouse/ClickHouse/pull/39755) ([Antonio Andelic](https://github.com/antonio2368)).
* Support primary key analysis for `row_policy_filter` and `additional_filter`. It also helps fix issues like [#37454](https://github.com/ClickHouse/ClickHouse/issues/37454). [#39826](https://github.com/ClickHouse/ClickHouse/pull/39826) ([Amos Bird](https://github.com/amosbird)).
* Fix two usability issues in Play UI: - it was non-pixel-perfect on iPad due to parasitic border radius and margins; - the progress indication did not display after the first query. This closes [#39957](https://github.com/ClickHouse/ClickHouse/issues/39957). This closes [#39960](https://github.com/ClickHouse/ClickHouse/issues/39960). [#39961](https://github.com/ClickHouse/ClickHouse/pull/39961) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Play UI: add row numbers; add cell selection on click; add hysteresis for table cells. [#39962](https://github.com/ClickHouse/ClickHouse/pull/39962) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Play UI: recognize tab key in textarea, but at the same time don't mess up with tab navigation. [#40053](https://github.com/ClickHouse/ClickHouse/pull/40053) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* The client will show server-side elapsed time. This is important for the performance comparison of ClickHouse services in remote datacenters. This closes [#38070](https://github.com/ClickHouse/ClickHouse/issues/38070). See also [this](https://github.com/ClickHouse/ClickBench/blob/main/hardware/benchmark-cloud.sh#L37) for motivation. [#39968](https://github.com/ClickHouse/ClickHouse/pull/39968) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add `parseDateTime64BestEffortUS`, `parseDateTime64BestEffortUSOrNull`, and `parseDateTime64BestEffortUSOrZero` functions (see the example after this list), closing [#37492](https://github.com/ClickHouse/ClickHouse/issues/37492). [#40015](https://github.com/ClickHouse/ClickHouse/pull/40015) ([Tanya Bragin](https://github.com/tbragin)).
* Extend the `system.processors_profile_log` with more information such as input rows. [#40121](https://github.com/ClickHouse/ClickHouse/pull/40121) ([Amos Bird](https://github.com/amosbird)).
* Display server-side time in `clickhouse-benchmark` by default if it is available (since ClickHouse version 22.8). This is needed to correctly compare the performance of clouds. This behavior can be changed with the new `--client-side-time` command line option. Change the `--randomize` command line option from `--randomize 1` to the form without argument. [#40193](https://github.com/ClickHouse/ClickHouse/pull/40193) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add counters (ProfileEvents) for cases when a query complexity limitation has been set and has been reached (a separate counter for `overflow_mode` = `break` and `throw`). For example, if you have set up `max_rows_to_read` with `read_overflow_mode = 'break'`, looking at the value of the `OverflowBreak` counter will allow distinguishing incomplete results. [#40205](https://github.com/ClickHouse/ClickHouse/pull/40205) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix memory accounting in case of "Memory limit exceeded" errors (previously [peak] memory usage took failed allocations into account). [#40249](https://github.com/ClickHouse/ClickHouse/pull/40249) ([Azat Khuzhin](https://github.com/azat)).
* Add metrics for filesystem cache: `FilesystemCacheSize` and `FilesystemCacheElements`. [#40260](https://github.com/ClickHouse/ClickHouse/pull/40260) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Support Hadoop secure RPC transfer (`hadoop.rpc.protection=privacy` and `hadoop.rpc.protection=integrity`). [#39411](https://github.com/ClickHouse/ClickHouse/pull/39411) ([michael1589](https://github.com/michael1589)).
* Avoid continuously growing memory consumption of the pattern cache when using the `multi(Fuzzy)Match(Any|AllIndices|AnyIndex)()` functions. [#40264](https://github.com/ClickHouse/ClickHouse/pull/40264) ([Robert Schulze](https://github.com/rschu1ze)).
* Add a cache for schema inference for the file/s3/hdfs/url table functions. Now, schema inference will be performed only on the first query to a file; all subsequent queries to the same file will use the schema from the cache if the data hasn't changed. Adds the system table `system.schema_inference_cache` with all current schemas in the cache, and the system queries `SYSTEM DROP SCHEMA CACHE [FOR FILE/S3/HDFS/URL]` to drop schemas from the cache (see the example after this list). [#38286](https://github.com/ClickHouse/ClickHouse/pull/38286) ([Kruglov Pavel](https://github.com/Avogar)).
* Add support for LARGE_BINARY/LARGE_STRING with Arrow (Closes [#32401](https://github.com/ClickHouse/ClickHouse/issues/32401)). [#40293](https://github.com/ClickHouse/ClickHouse/pull/40293) ([Josh Taylor](https://github.com/joshuataylor)).
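Hedged examples for three of the items above: `file()` with a default value, the `parseDateTime64BestEffortUS` family, and the schema inference cache. The file path and timestamp literal are hypothetical:

```sql
-- file() now takes a second argument: the value returned when the file does not exist.
SELECT file('greeting.txt', 'no greeting yet');

-- US-style (MM/DD/YYYY) parsing into DateTime64.
SELECT parseDateTime64BestEffortUS('02/10/2022 10:12:13.345');

-- Inspect the new schema inference cache and drop cached schemas for one source.
SELECT * FROM system.schema_inference_cache;
SYSTEM DROP SCHEMA CACHE FOR S3;
```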
#### Build/Testing/Packaging Improvement
* [ClickFiddle](https://fiddle.clickhouse.com/): A new tool for testing ClickHouse versions in read/write mode (**Igor Baliuk**).
* ClickHouse binary is made self-extracting [#35775](https://github.com/ClickHouse/ClickHouse/pull/35775) ([Yakov Olkhovskiy, Arthur Filatenkov](https://github.com/yakov-olkhovskiy)).
* Update tzdata to 2022b to support the new timezone changes. See https://github.com/google/cctz/pull/226. Chile's 2022 DST start is delayed from September 4 to September 11. Iran plans to stop observing DST permanently, after it falls back on 2022-09-21. There are corrections of the historical time zone of Asia/Tehran in the year 1977: Iran adopted standard time in 1935, not 1946. In 1977 it observed DST from 03-21 23:00 to 10-20 24:00; its 1978 transitions were on 03-24 and 08-05, not 03-20 and 10-20; and its spring 1979 transition was on 05-27, not 03-21 (https://data.iana.org/time-zones/tzdb/NEWS). ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Former packages used to install the systemd.service file into `/etc`. The files there are marked as `conf` and are neither cleaned out nor updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Ensure LSan is effective. [#39430](https://github.com/ClickHouse/ClickHouse/pull/39430) ([Azat Khuzhin](https://github.com/azat)).
* TSAN has issues with clang-14 (https://github.com/google/sanitizers/issues/1552, https://github.com/google/sanitizers/issues/1540), so here we build the TSAN binaries with clang-15. [#39450](https://github.com/ClickHouse/ClickHouse/pull/39450) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Remove the option to build ClickHouse tools as separate executable programs. This fixes [#37847](https://github.com/ClickHouse/ClickHouse/issues/37847). [#39520](https://github.com/ClickHouse/ClickHouse/pull/39520) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Small preparations for build on s390x (which is big-endian). [#39627](https://github.com/ClickHouse/ClickHouse/pull/39627) ([Harry Lee](https://github.com/HarryLeeIBM)). Fixed an Endian issue in BitHelpers for s390x. [#39656](https://github.com/ClickHouse/ClickHouse/pull/39656) ([Harry Lee](https://github.com/HarryLeeIBM)). Implement a piece of code related to SipHash for the s390x architecture (which is not supported by ClickHouse). [#39732](https://github.com/ClickHouse/ClickHouse/pull/39732) ([Harry Lee](https://github.com/HarryLeeIBM)). Fixed an Endian issue in the Coordination snapshot code for s390x. [#39931](https://github.com/ClickHouse/ClickHouse/pull/39931) ([Harry Lee](https://github.com/HarryLeeIBM)). Fixed Endian issues in Codec code for s390x. [#40008](https://github.com/ClickHouse/ClickHouse/pull/40008) ([Harry Lee](https://github.com/HarryLeeIBM)). Fixed Endian issues in reading/writing BigEndian binary data in ReadHelpers and WriteHelpers for s390x. [#40179](https://github.com/ClickHouse/ClickHouse/pull/40179) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Support build with `clang-16` (trunk). This closes [#39949](https://github.com/ClickHouse/ClickHouse/issues/39949). [#40181](https://github.com/ClickHouse/ClickHouse/pull/40181) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Prepare RISC-V 64 build to run in CI. This is for [#40141](https://github.com/ClickHouse/ClickHouse/issues/40141). [#40197](https://github.com/ClickHouse/ClickHouse/pull/40197) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Simplified the function registration macro interface (`FUNCTION_REGISTER*`) to eliminate the step of adding and calling an extern function in registerFunctions.cpp; it also makes incremental builds of a new function faster. [#38615](https://github.com/ClickHouse/ClickHouse/pull/38615) ([Li Yin](https://github.com/liyinsg)).
* Docker: entrypoint.sh in the Docker image now creates and chowns all folders it finds in the config, for multi-disk setups [#17717](https://github.com/ClickHouse/ClickHouse/issues/17717). [#39121](https://github.com/ClickHouse/ClickHouse/pull/39121) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
#### Bug Fix
* Fix possible segfault in the `CapnProto` input format. This bug was found and sent through the ClickHouse bug-bounty [program](https://github.com/ClickHouse/ClickHouse/issues/38986) by *kiojj*. [#40241](https://github.com/ClickHouse/ClickHouse/pull/40241) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix a very rare case of incorrect behavior of array subscript operator. This closes [#28720](https://github.com/ClickHouse/ClickHouse/issues/28720). [#40185](https://github.com/ClickHouse/ClickHouse/pull/40185) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix insufficient argument check for encryption functions (found by query fuzzer). This closes [#39987](https://github.com/ClickHouse/ClickHouse/issues/39987). [#40194](https://github.com/ClickHouse/ClickHouse/pull/40194) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix the case when the order of columns can be incorrect if the `IN` operator is used with a table with `ENGINE = Set` containing multiple columns. This fixes [#13014](https://github.com/ClickHouse/ClickHouse/issues/13014). [#40225](https://github.com/ClickHouse/ClickHouse/pull/40225) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix seeking while reading from encrypted disk. This PR fixes [#38381](https://github.com/ClickHouse/ClickHouse/issues/38381). [#39687](https://github.com/ClickHouse/ClickHouse/pull/39687) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix duplicate columns in join plan. Finally, solve [#26809](https://github.com/ClickHouse/ClickHouse/issues/26809). [#40009](https://github.com/ClickHouse/ClickHouse/pull/40009) ([Vladimir C](https://github.com/vdimir)).
* Fixed query hanging for SELECT with ORDER BY WITH FILL with different date/time types. [#37849](https://github.com/ClickHouse/ClickHouse/pull/37849) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix `ORDER BY` that matches a projection's `ORDER BY` (previously it simply returned an unsorted result). [#38725](https://github.com/ClickHouse/ClickHouse/pull/38725) ([Azat Khuzhin](https://github.com/azat)).
* Do not optimise functions in GROUP BY statements if they shadow one of the table columns or expressions. Fixes [#37032](https://github.com/ClickHouse/ClickHouse/issues/37032). [#39103](https://github.com/ClickHouse/ClickHouse/pull/39103) ([Anton Kozlov](https://github.com/tonickkozlov)).
* Fix wrong table name in logs after RENAME TABLE. This fixes [#38018](https://github.com/ClickHouse/ClickHouse/issues/38018). [#39227](https://github.com/ClickHouse/ClickHouse/pull/39227) ([Amos Bird](https://github.com/amosbird)).
* Fix positional arguments in case of columns pruning when optimising the query. Closes [#38433](https://github.com/ClickHouse/ClickHouse/issues/38433). [#39293](https://github.com/ClickHouse/ClickHouse/pull/39293) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix a bug in schema inference for empty messages in Protobuf/CapnProto formats that allowed creating a column with an empty `Tuple` type. Closes [#39051](https://github.com/ClickHouse/ClickHouse/issues/39051). Adds 2 new settings `input_format_{protobuf/capnproto}_skip_fields_with_unsupported_types_in_schema_inference` that allow skipping fields with unsupported types during schema inference for the Protobuf and CapnProto formats. [#39357](https://github.com/ClickHouse/ClickHouse/pull/39357) ([Kruglov Pavel](https://github.com/Avogar)).
* (Window View is an experimental feature) Fix segmentation fault on `CREATE WINDOW VIEW .. ON CLUSTER ... INNER`. Closes [#39363](https://github.com/ClickHouse/ClickHouse/issues/39363). [#39384](https://github.com/ClickHouse/ClickHouse/pull/39384) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix `WriteBuffer` finalize when cancelling an insert into a table function (in previous versions it could lead to `std::terminate`). [#39458](https://github.com/ClickHouse/ClickHouse/pull/39458) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix storing of columns of type `Object` in sparse serialization. [#39464](https://github.com/ClickHouse/ClickHouse/pull/39464) ([Anton Popov](https://github.com/CurtizJ)).
* Fix possible "Not found column in block" exception when using projections. This closes [#39469](https://github.com/ClickHouse/ClickHouse/issues/39469). [#39470](https://github.com/ClickHouse/ClickHouse/pull/39470) ([小路](https://github.com/nicelulu)).
* Fix exception on race between DROP and INSERT with materialized views. [#39477](https://github.com/ClickHouse/ClickHouse/pull/39477) ([Azat Khuzhin](https://github.com/azat)).
* A bug in Apache Avro library: fix data race and possible heap-buffer-overflow in Avro format. Closes [#39094](https://github.com/ClickHouse/ClickHouse/issues/39094) Closes [#33652](https://github.com/ClickHouse/ClickHouse/issues/33652). [#39498](https://github.com/ClickHouse/ClickHouse/pull/39498) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix rare bug in asynchronous reading (with setting `local_filesystem_read_method='pread_threadpool'`) with enabled `O_DIRECT` (enabled by setting `min_bytes_to_use_direct_io`). [#39506](https://github.com/ClickHouse/ClickHouse/pull/39506) ([Anton Popov](https://github.com/CurtizJ)).
* (only on FreeBSD) Fixes "Code: 49. DB::Exception: FunctionFactory: the function name '' is not unique. (LOGICAL_ERROR)" observed on FreeBSD when starting clickhouse. [#39551](https://github.com/ClickHouse/ClickHouse/pull/39551) ([Alexander Gololobov](https://github.com/davenger)).
* Fix bug with the recently introduced "maxsplit" argument for `splitByChar`, which was not working correctly. [#39552](https://github.com/ClickHouse/ClickHouse/pull/39552) ([filimonov](https://github.com/filimonov)).
* Fix bug in ASOF JOIN with `enable_optimize_predicate_expression`, close [#37813](https://github.com/ClickHouse/ClickHouse/issues/37813). [#39556](https://github.com/ClickHouse/ClickHouse/pull/39556) ([Vladimir C](https://github.com/vdimir)).
* Fixed `CREATE/DROP INDEX` query with `ON CLUSTER` or `Replicated` database and `ReplicatedMergeTree`. It used to be executed on all replicas (causing error or DDL queue stuck). Fixes [#39511](https://github.com/ClickHouse/ClickHouse/issues/39511). [#39565](https://github.com/ClickHouse/ClickHouse/pull/39565) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix "column not found" error for push down with join, close [#39505](https://github.com/ClickHouse/ClickHouse/issues/39505). [#39575](https://github.com/ClickHouse/ClickHouse/pull/39575) ([Vladimir C](https://github.com/vdimir)).
* Fix the wrong `REGEXP_REPLACE` alias. This fixes https://github.com/ClickHouse/ClickBench/issues/9. [#39592](https://github.com/ClickHouse/ClickHouse/pull/39592) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fixed point of origin for exponential decay window functions to the last value in window. Previously, decay was calculated by formula `exp((t - curr_row_t) / decay_length)`, which is incorrect when right boundary of window is not `CURRENT ROW`. It was changed to: `exp((t - last_row_t) / decay_length)`. There is no change in results for windows with `ROWS BETWEEN (smth) AND CURRENT ROW`. [#39593](https://github.com/ClickHouse/ClickHouse/pull/39593) ([Vladimir Chebotaryov](https://github.com/quickhouse)).
* Fix Decimal division overflow, which can be detected based on operands scale. [#39600](https://github.com/ClickHouse/ClickHouse/pull/39600) ([Andrey Zvonov](https://github.com/zvonand)).
* Fix settings `output_format_arrow_string_as_string` and `output_format_arrow_low_cardinality_as_dictionary` work in combination. Closes [#39624](https://github.com/ClickHouse/ClickHouse/issues/39624). [#39647](https://github.com/ClickHouse/ClickHouse/pull/39647) ([Kruglov Pavel](https://github.com/Avogar)).
* Fixed a bug in default database resolution in distributed table reads. [#39674](https://github.com/ClickHouse/ClickHouse/pull/39674) ([Anton Kozlov](https://github.com/tonickkozlov)).
* (Only with the obsolete Ordinary databases) A SELECT might read data of a dropped table if the cache for mmap IO is used, the database engine is Ordinary, and a new table was created with the same name as the dropped one. This is fixed. [#39708](https://github.com/ClickHouse/ClickHouse/pull/39708) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix possible error `Invalid column type for ColumnUnique::insertRangeFrom. Expected String, got ColumnLowCardinality`. Fixes [#38460](https://github.com/ClickHouse/ClickHouse/issues/38460). [#39716](https://github.com/ClickHouse/ClickHouse/pull/39716) ([Arthur Passos](https://github.com/arthurpassos)).
* Field names in the `meta` section of JSON format were erroneously double escaped. This closes [#39693](https://github.com/ClickHouse/ClickHouse/issues/39693). [#39747](https://github.com/ClickHouse/ClickHouse/pull/39747) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix wrong index analysis with tuples and operator `IN`, which could lead to wrong query result. [#39752](https://github.com/ClickHouse/ClickHouse/pull/39752) ([Anton Popov](https://github.com/CurtizJ)).
* Fix `EmbeddedRocksDB` tables filtering by key using params. [#39757](https://github.com/ClickHouse/ClickHouse/pull/39757) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix error `Invalid number of columns in chunk pushed to OutputPort` which was caused by ARRAY JOIN optimization. Fixes [#39164](https://github.com/ClickHouse/ClickHouse/issues/39164). [#39799](https://github.com/ClickHouse/ClickHouse/pull/39799) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* A workaround for a bug in the Linux kernel. Fix `CANNOT_READ_ALL_DATA` exception with `local_filesystem_read_method=pread_threadpool`. This bug affected only Linux kernel versions 5.9 and 5.10 according to [man](https://manpages.debian.org/testing/manpages-dev/preadv2.2.en.html#BUGS). [#39800](https://github.com/ClickHouse/ClickHouse/pull/39800) ([Anton Popov](https://github.com/CurtizJ)).
* (Only on NFS) Fix broken NFS mkdir for root-squashed volumes. [#39898](https://github.com/ClickHouse/ClickHouse/pull/39898) ([Constantine Peresypkin](https://github.com/pkit)).
* Remove dictionaries from prometheus metrics on DETACH/DROP. [#39926](https://github.com/ClickHouse/ClickHouse/pull/39926) ([Azat Khuzhin](https://github.com/azat)).
* Fix read of StorageFile with virtual columns. Closes [#39907](https://github.com/ClickHouse/ClickHouse/issues/39907). [#39943](https://github.com/ClickHouse/ClickHouse/pull/39943) ([flynn](https://github.com/ucasfl)).
* Fix big memory usage during fetches. Fixes [#39915](https://github.com/ClickHouse/ClickHouse/issues/39915). [#39990](https://github.com/ClickHouse/ClickHouse/pull/39990) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* (experimental feature) Fix `hashId` crash and salt parameter not being used. [#40002](https://github.com/ClickHouse/ClickHouse/pull/40002) ([Raúl Marín](https://github.com/Algunenano)).
* `EXCEPT` and `INTERSECT` operators could crash if a specific combination of constant and non-constant columns was used. [#40020](https://github.com/ClickHouse/ClickHouse/pull/40020) ([Duc Canh Le](https://github.com/canhld94)).
* Fixed "Part directory doesn't exist" and "`tmp_<part_name>` ... No such file or directory" errors during too slow INSERT or too long merge/mutation. Also fixed issue that may cause some replication queue entries to stuck without any errors or warnings in logs if previous attempt to fetch part failed, but `tmp-fetch_<part_name>` directory was not cleaned up. [#40031](https://github.com/ClickHouse/ClickHouse/pull/40031) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix rare cases of parsing of arrays of tuples in format `Values`. [#40034](https://github.com/ClickHouse/ClickHouse/pull/40034) ([Anton Popov](https://github.com/CurtizJ)).
* Fix Arrow column conversion: `Dictionary(X)` and `Dictionary(Nullable(X))` now convert to ClickHouse `LowCardinality(X)` and `LowCardinality(Nullable(X))` respectively. [#40037](https://github.com/ClickHouse/ClickHouse/pull/40037) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix potential deadlock in writing to S3 during task scheduling failure. [#40070](https://github.com/ClickHouse/ClickHouse/pull/40070) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix a bug in `collectFilesToSkip()` by adding the correct file extension (`.idx` or `.idx2`) for indexes to be recalculated, avoiding wrong hard links. Fixed [#39896](https://github.com/ClickHouse/ClickHouse/issues/39896). [#40095](https://github.com/ClickHouse/ClickHouse/pull/40095) ([Jianmei Zhang](https://github.com/zhangjmruc)).
* A fix for reverse DNS resolution. [#40134](https://github.com/ClickHouse/ClickHouse/pull/40134) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix unexpected result of `arrayDifference` for `Array(UInt32)` (see the check after this list). [#40211](https://github.com/ClickHouse/ClickHouse/pull/40211) ([Duc Canh Le](https://github.com/canhld94)).
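A small check for the `arrayDifference` fix above. The input values are hypothetical; the point is that differences of unsigned elements may be negative and should not wrap around:

```sql
-- After the fix, differences of UInt32 elements are returned as signed values
-- instead of wrapped-around unsigned ones.
SELECT arrayDifference(CAST([65, 16, 16], 'Array(UInt32)'));
-- expected: [0, -49, 0]
```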
### <a id="227"></a> ClickHouse release 22.7, 2022-07-21
#### Upgrade Notes
@ -258,7 +401,7 @@
* Allows providing `NULL`/`NOT NULL` right after type in column declaration. [#37337](https://github.com/ClickHouse/ClickHouse/pull/37337) ([Igor Nikonov](https://github.com/devcrafter)).
* Optimize getting a read buffer for `PARTIALLY_DOWNLOADED` file segments. [#37338](https://github.com/ClickHouse/ClickHouse/pull/37338) ([xiedeyantu](https://github.com/xiedeyantu)).
* Try to improve short circuit functions processing to fix problems with stress tests. [#37384](https://github.com/ClickHouse/ClickHouse/pull/37384) ([Kruglov Pavel](https://github.com/Avogar)).
* Closes [#37395](https://github.com/ClickHouse/ClickHouse/issues/37395). [#37415](https://github.com/ClickHouse/ClickHouse/pull/37415) ([Memo](https://github.com/Joeywzr)).
* Allow generating multiple independent UUID columns in one query by giving `generateUUIDv4` a dummy argument, e.g. `generateUUIDv4(1)`, `generateUUIDv4(2)` (see the example below). Closes [#37395](https://github.com/ClickHouse/ClickHouse/issues/37395). [#37415](https://github.com/ClickHouse/ClickHouse/pull/37415) ([Memo](https://github.com/Joeywzr)).
* Fix extremely rare deadlock during part fetch in zero-copy replication. Fixes [#37423](https://github.com/ClickHouse/ClickHouse/issues/37423). [#37424](https://github.com/ClickHouse/ClickHouse/pull/37424) ([metahys](https://github.com/metahys)).
* Don't allow to create storage with unknown data format. [#37450](https://github.com/ClickHouse/ClickHouse/pull/37450) ([Kruglov Pavel](https://github.com/Avogar)).
* Set `global_memory_usage_overcommit_max_wait_microseconds` default value to 5 seconds. Add info about `OvercommitTracker` to OOM exception message. Add `MemoryOvercommitWaitTimeMicroseconds` profile event. [#37460](https://github.com/ClickHouse/ClickHouse/pull/37460) ([Dmitry Novik](https://github.com/novikd)).
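A short example of the `generateUUIDv4` change above: giving each call a distinct dummy argument prevents the calls from being collapsed into one expression, so each column gets its own UUID:

```sql
-- With distinct dummy arguments, each call is evaluated independently.
SELECT generateUUIDv4(1) AS a, generateUUIDv4(2) AS b;
```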

View File

@ -2,11 +2,11 @@
# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54465)
SET(VERSION_REVISION 54466)
SET(VERSION_MAJOR 22)
SET(VERSION_MINOR 8)
SET(VERSION_MINOR 9)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH f4f05ec786a8b8966dd0ea2a2d7e39a8c7db24f4)
SET(VERSION_DESCRIBE v22.8.1.1-testing)
SET(VERSION_STRING 22.8.1.1)
SET(VERSION_GITHASH 09a2ff88435f79e5279745bbe1dc0e5e401df38d)
SET(VERSION_DESCRIBE v22.9.1.1-testing)
SET(VERSION_STRING 22.9.1.1)
# end of autochange

contrib/cctz vendored

@ -1 +1 @@
Subproject commit 8c71d74bdf76c3fa401da845089ae60a6c0aeefa
Subproject commit 49c656c62fbd36a1bc20d64c476853bdb7cf7bb9

contrib/librdkafka vendored

@ -1 +1 @@
Subproject commit 6062e711a919fb3b669b243b7dceabd045d0e4a2
Subproject commit ff32b4e9eeafd0b276f010ee969179e4e9e6d0b2

View File

@ -107,6 +107,13 @@ fi
if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
# port is needed to check if clickhouse-server is ready for connections
HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)"
HTTPS_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=https_port)"
if [ -n "$HTTP_PORT" ]; then
URL="http://127.0.0.1:$HTTP_PORT/ping"
else
URL="https://127.0.0.1:$HTTPS_PORT/ping"
fi
# Listen only on localhost until the initialization is done
/usr/bin/clickhouse su "${USER}:${GROUP}" /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 &
@ -115,7 +122,7 @@ if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
# check if clickhouse is ready to accept connections
# will try to ping clickhouse via the URL (http or https port; max 12 retries by default, with 1 sec timeout and 1 sec delay between retries)
tries=${CLICKHOUSE_INIT_TIMEOUT:-12}
while ! wget --spider -T 1 -q "http://127.0.0.1:$HTTP_PORT/ping" 2>/dev/null; do
while ! wget --spider --no-check-certificate -T 1 -q "$URL" 2>/dev/null; do
if [ "$tries" -le "0" ]; then
echo >&2 'ClickHouse init process failed.'
exit 1

View File

@ -284,13 +284,21 @@ function run_tests
# Use awk because bash doesn't support floating point arithmetic.
profile_seconds=$(awk "BEGIN { print ($profile_seconds_left > 0 ? 10 : 0) }")
if [ "$(grep -c $(basename $test) changed-test-definitions.txt)" -gt 0 ]
then
# Run all queries from changed test files to ensure that all new queries will be tested.
max_queries=0
else
max_queries=$CHPC_MAX_QUERIES
fi
(
set +x
argv=(
--host localhost localhost
--port "$LEFT_SERVER_PORT" "$RIGHT_SERVER_PORT"
--runs "$CHPC_RUNS"
--max-queries "$CHPC_MAX_QUERIES"
--max-queries "$max_queries"
--profile-seconds "$profile_seconds"
"$test"

View File

@ -7,6 +7,8 @@ RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
python3-requests \
nodejs \
npm \
&& apt-get clean
COPY s3downloader /s3downloader
@ -14,5 +16,7 @@ COPY s3downloader /s3downloader
ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com"
ENV DATASETS="hits visits"
RUN npm install -g azurite
COPY run.sh /
CMD ["/bin/bash", "/run.sh"]

View File

@ -17,6 +17,7 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
./setup_minio.sh stateful
function start()

View File

@ -17,6 +17,8 @@ RUN apt-get update -y \
mysql-client=8.0* \
ncdu \
netcat-openbsd \
nodejs \
npm \
openjdk-11-jre-headless \
openssl \
postgresql-client \
@ -75,6 +77,8 @@ ENV MINIO_ROOT_USER="clickhouse"
ENV MINIO_ROOT_PASSWORD="clickhouse"
ENV EXPORT_S3_STORAGE_POLICIES=1
RUN npm install -g azurite
COPY run.sh /
COPY setup_minio.sh /
COPY setup_hdfs_minicluster.sh /

View File

@ -18,6 +18,12 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
echo "Azure is disabled"
else
azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
fi
./setup_minio.sh stateless
./setup_hdfs_minicluster.sh

View File

@ -178,6 +178,7 @@ install_packages package_folder
configure
azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
./setup_minio.sh stateful # to have a proper environment
start
@ -314,6 +315,11 @@ else
# Avoid "Setting allow_deprecated_database_ordinary is neither a builtin setting..."
rm -f /etc/clickhouse-server/users.d/database_ordinary.xml ||:
# Remove s3 related configs to avoid "there is no disk type `cache`"
rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||:
rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||:
# Disable aggressive cleanup of tmp dirs (it worked incorrectly before 22.8)
rm -f /etc/clickhouse-server/config.d/merge_tree_old_dirs_cleanup.xml ||:
@ -393,6 +399,7 @@ else
-e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
-e "This engine is deprecated and is not supported in transactions" \
-e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
-e "The set of parts restored in place of" \
/var/log/clickhouse-server/clickhouse-server.backward.clean.log | zgrep -Fa "<Error>" > /test_output/bc_check_error_messages.txt \
&& echo -e 'Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)\tFAIL' >> /test_output/test_results.tsv \
|| echo -e 'Backward compatibility check: No Error messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv

View File

@ -0,0 +1,374 @@
---
sidebar_position: 1
sidebar_label: 2022
---
# 2022 Changelog
### ClickHouse release v22.8.1.2097-lts (09a2ff88435) FIXME as compared to v22.7.1.2484-stable (f4f05ec786a)
#### Backward Incompatible Change
* Make the cache composable and allow certain files (e.g. index and mark files) to be exempt from eviction; delete the old cache version. Now it is possible to configure the cache over an Azure Blob Storage disk, over a Local disk, over a StaticWeb disk, etc. This PR is marked backward incompatible because the cache configuration changes and the config file needs to be updated for the cache to work; the old cache will still be used with the new configuration, and the server will start up fine with the old cache configuration. Closes [#36140](https://github.com/ClickHouse/ClickHouse/issues/36140). Closes [#37889](https://github.com/ClickHouse/ClickHouse/issues/37889). [#36171](https://github.com/ClickHouse/ClickHouse/pull/36171) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Now, all relevant dictionary sources respect `remote_url_allow_hosts` setting. It was already done for HTTP, Cassandra, Redis. Added ClickHouse, MongoDB, MySQL, PostgreSQL. Host is checked only for dictionaries created from DDL. [#39184](https://github.com/ClickHouse/ClickHouse/pull/39184) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Extended range of `Date32` and `DateTime64` to support dates from the year 1900 to 2299. In previous versions, the supported interval was only from the year 1925 to 2283. The implementation uses the proleptic Gregorian calendar (which is conformant with [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601):2004 (clause 3.2.1 The Gregorian calendar)) instead of accounting for historical transitions from the Julian to the Gregorian calendar. This change affects implementation-specific behavior for out-of-range arguments: e.g. where previous versions clamped the value `1899-01-01` to `1925-01-01`, the new version clamps it to `1900-01-01`. It also changes the behavior of rounding with `toStartOfInterval` by up to one quarter if you pass `INTERVAL 3 QUARTER`, because the intervals are counted from an implementation-specific point in time. Closes [#28216](https://github.com/ClickHouse/ClickHouse/issues/28216), improves [#38393](https://github.com/ClickHouse/ClickHouse/issues/38393). [#39425](https://github.com/ClickHouse/ClickHouse/pull/39425) ([Roman Vasin](https://github.com/rvasin)).
#### New Feature
* Added a setting `exact_rows_before_limit` (0/1). When enabled, ClickHouse will provide the exact value for the `rows_before_limit_at_least` statistic, at the cost that the data before the limit will have to be read completely. This closes [#6613](https://github.com/ClickHouse/ClickHouse/issues/6613). [#25333](https://github.com/ClickHouse/ClickHouse/pull/25333) ([kevin wan](https://github.com/MaxWk)).
* Add SLRU cache policy for uncompressed cache and marks cache. [#34651](https://github.com/ClickHouse/ClickHouse/pull/34651) ([alexX512](https://github.com/alexX512)).
* Intel® In-Memory Analytics Accelerator (Intel® IAA) is a hardware accelerator available in the upcoming generation of Intel® Xeon® Scalable processors ("Sapphire Rapids"). Its goal is to speed up common operations in analytics like data (de)compression and filtering. ClickHouse gained the new "DeflateQpl" compression codec which utilizes the Intel® IAA offloading technology to provide a high-performance DEFLATE implementation. The codec uses the [Intel® Query Processing Library (QPL)](https://github.com/intel/qpl) which abstracts access to the hardware accelerator, respectively to a software fallback in case the hardware accelerator is not available. DEFLATE provides in general higher compression rates than ClickHouse's LZ4 default codec, and as a result, offers less disk I/O and lower main memory consumption. [#36654](https://github.com/ClickHouse/ClickHouse/pull/36654) ([jasperzhu](https://github.com/jinjunzh)).
* Add the `concurrent_threads_soft_limit` parameter to increase performance in case of high RPS by limiting the total number of threads for all queries. [#37285](https://github.com/ClickHouse/ClickHouse/pull/37285) ([Roman Vasin](https://github.com/rvasin)).
* Added concurrency control logic to limit total number of concurrent threads created by queries. [#37558](https://github.com/ClickHouse/ClickHouse/pull/37558) ([Sergei Trifonov](https://github.com/serxa)).
* Added support for parallel distributed insert select into tables with Distributed and Replicated engine [#34670](https://github.com/ClickHouse/ClickHouse/issues/34670). [#39107](https://github.com/ClickHouse/ClickHouse/pull/39107) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add new settings to control schema inference from text formats: - `input_format_try_infer_dates` - try infer dates from strings. - `input_format_try_infer_datetimes` - try infer datetimes from strings. - `input_format_try_infer_integers` - try infer `Int64` instead of `Float64`. - `input_format_json_try_infer_numbers_from_strings` - try infer numbers from json strings in JSON formats. [#39186](https://github.com/ClickHouse/ClickHouse/pull/39186) ([Kruglov Pavel](https://github.com/Avogar)).
* Provide JSON-formatted log output to the console, to allow easier ingestion and querying in log analysis tools. [#39277](https://github.com/ClickHouse/ClickHouse/pull/39277) ([Mallik Hassan](https://github.com/SadiHassan)).
* The "DeflateQpl" compression codec based on Intel® IAA offloading, as described in the entry above; this PR is part of the same work. [#39494](https://github.com/ClickHouse/ClickHouse/pull/39494) ([Robert Schulze](https://github.com/rschu1ze)).
* Add function `nowInBlock` which allows getting the current time during long-running and continuous queries. Closes [#39522](https://github.com/ClickHouse/ClickHouse/issues/39522). Notes: there are no functions `now64InBlock` neither `todayInBlock`. [#39533](https://github.com/ClickHouse/ClickHouse/pull/39533) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add `result_rows` and `result_bytes` to progress reports (`X-ClickHouse-Summary`). [#39567](https://github.com/ClickHouse/ClickHouse/pull/39567) ([Raúl Marín](https://github.com/Algunenano)).
* Add the ability to specify settings for an `executable()` table function. [#39681](https://github.com/ClickHouse/ClickHouse/pull/39681) ([Constantine Peresypkin](https://github.com/pkit)).
* Implemented automatic conversion of database engine from `Ordinary` to `Atomic`. Create an empty `convert_ordinary_to_atomic` file in the `flags` directory, and all `Ordinary` databases will be converted automatically on the next server start. Resolves [#39546](https://github.com/ClickHouse/ClickHouse/issues/39546). [#39933](https://github.com/ClickHouse/ClickHouse/pull/39933) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add new setting `schema_inference_hints` that allows specifying structure hints in schema inference for particular columns. Closes [#39569](https://github.com/ClickHouse/ClickHouse/issues/39569). [#40068](https://github.com/ClickHouse/ClickHouse/pull/40068) ([Kruglov Pavel](https://github.com/Avogar)).
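
  A hedged sketch (the data is illustrative): the hint pins one column's type while the rest are still inferred automatically.

  ```sql
  -- 'age' is forced to UInt8 by the hint; 'id' and 'name' are inferred.
  DESC format(JSONEachRow, '{"id" : 1, "age" : 25, "name" : "Alice"}')
  SETTINGS schema_inference_hints = 'age UInt8';
  ```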
#### Performance Improvement
* Stop query analysis early if it gets stuck on a complex query, instead of hanging. [#38185](https://github.com/ClickHouse/ClickHouse/pull/38185) ([Vladimir C](https://github.com/vdimir)).
* Deduce the sorting method from the input stream's sort description, and skip sorting if the input stream is already sorted. [#38719](https://github.com/ClickHouse/ClickHouse/pull/38719) ([Igor Nikonov](https://github.com/devcrafter)).
* `DISTINCT` in order with `ORDER BY` significantly improves memory usage and query execution time if the `DISTINCT` columns match, or form a prefix of, the `ORDER BY` columns. [#39432](https://github.com/ClickHouse/ClickHouse/pull/39432) ([Igor Nikonov](https://github.com/devcrafter)).
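
  A hedged sketch of a query shape that benefits (the table and data are illustrative):

  ```sql
  CREATE TABLE events (user_id UInt64, ts DateTime)
  ENGINE = MergeTree ORDER BY (user_id, ts);

  -- The DISTINCT column user_id is a prefix of the sorting key,
  -- so distinct values can be produced in a streaming fashion.
  SELECT DISTINCT user_id FROM events ORDER BY user_id;
  ```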
* Use the local node as the first priority when getting the structure of a remote table while executing `cluster` and similar table functions. [#39440](https://github.com/ClickHouse/ClickHouse/pull/39440) ([Mingliang Pan](https://github.com/liangliangpan)).
* Use `DistinctSortedTransform` only when the sort description is applicable to the DISTINCT columns; otherwise fall back to the ordinary DISTINCT implementation. This allows fewer checks during `DistinctSortedTransform` execution. [#39528](https://github.com/ClickHouse/ClickHouse/pull/39528) ([Igor Nikonov](https://github.com/devcrafter)).
* `DistinctSortedTransform` didn't take advantage of sorting, i.e. it worked like the ordinary `DISTINCT` implementation. The fix reduces memory usage significantly. [#39538](https://github.com/ClickHouse/ClickHouse/pull/39538) ([Igor Nikonov](https://github.com/devcrafter)).
* ColumnVector: optimize filter with AVX512VBMI2 compress store. [#39633](https://github.com/ClickHouse/ClickHouse/pull/39633) ([Guo Wangyang](https://github.com/guowangy)).
* KeyCondition: optimize `applyFunction` in multi-threaded scenarios. [#39812](https://github.com/ClickHouse/ClickHouse/pull/39812) ([Guo Wangyang](https://github.com/guowangy)).
* For systems with AVX512 VBMI2, this PR improves performance by ca. 6% for SSB benchmark queries 3.1, 3.2, and 3.3 (SF=100). Tested on a 2-socket Intel Icelake Xeon 8380. [#40033](https://github.com/ClickHouse/ClickHouse/pull/40033) ([Robert Schulze](https://github.com/rschu1ze)).
* Don't visit the AST for UDFs if none are registered. [#40069](https://github.com/ClickHouse/ClickHouse/pull/40069) ([Raúl Marín](https://github.com/Algunenano)).
* Optimize CurrentMemoryTracker alloc and free. [#40078](https://github.com/ClickHouse/ClickHouse/pull/40078) ([Raúl Marín](https://github.com/Algunenano)).
#### Improvement
* Change the way the primary key is analyzed for MergeTree. [#25563](https://github.com/ClickHouse/ClickHouse/pull/25563) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Improved the structure of the DDL query result table for `Replicated` databases (separate columns for shard and replica name, clearer status). `CREATE TABLE ... ON CLUSTER` queries can be normalized on the initiator first if `distributed_ddl_entry_format_version` is set to 3 (the default value); this means `ON CLUSTER` queries may not work if the initiator does not belong to the cluster specified in the query. Fixes [#37318](https://github.com/ClickHouse/ClickHouse/issues/37318), [#39500](https://github.com/ClickHouse/ClickHouse/issues/39500). The `ON CLUSTER` clause is now ignored if the database is `Replicated` and the cluster name equals the database name (related to [#35570](https://github.com/ClickHouse/ClickHouse/issues/35570)). Miscellaneous minor fixes for the `Replicated` database engine. Metadata consistency is checked when starting up a `Replicated` database, and replica recovery starts in case of a mismatch between local metadata and the metadata in Keeper. Resolves [#24880](https://github.com/ClickHouse/ClickHouse/issues/24880). [#37198](https://github.com/ClickHouse/ClickHouse/pull/37198) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Support the SQL-standard `DELETE FROM` syntax on MergeTree tables (a lightweight delete implementation for the MergeTree family). [#37893](https://github.com/ClickHouse/ClickHouse/pull/37893) ([Jianmei Zhang](https://github.com/zhangjmruc)).
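
  A hedged sketch (in this release, lightweight deletes presumably still had to be enabled explicitly; the table and data are illustrative):

  ```sql
  SET allow_experimental_lightweight_delete = 1;

  CREATE TABLE visits (id UInt64, url String)
  ENGINE = MergeTree ORDER BY id;
  INSERT INTO visits VALUES (1, '/a'), (2, '/b');

  -- Standard SQL syntax; deleted rows are masked immediately
  -- and physically removed by later merges.
  DELETE FROM visits WHERE id = 2;
  ```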
* `timeSlots` now works with `DateTime64`; subsecond durations and slot sizes are available when working with `DateTime64`. [#37951](https://github.com/ClickHouse/ClickHouse/pull/37951) ([Andrey Zvonov](https://github.com/zvonand)).
* Add a cache for schema inference for the file/s3/hdfs/url table functions. Now schema inference is performed only on the first query to a file; all subsequent queries to the same file use the schema from the cache if the data hasn't changed. Add the system table `system.schema_inference_cache` with all current schemas in the cache, and the system queries `SYSTEM DROP SCHEMA CACHE [FOR FILE/S3/HDFS/URL]` to drop schemas from the cache. [#38286](https://github.com/ClickHouse/ClickHouse/pull/38286) ([Kruglov Pavel](https://github.com/Avogar)).
* Simplified the function registration macro interface (`FUNCTION_REGISTER*`) to eliminate the step of adding and calling an extern function in registerFunctions.cpp; this also makes incremental builds of a new function faster. [#38615](https://github.com/ClickHouse/ClickHouse/pull/38615) ([Li Yin](https://github.com/liyinsg)).
* Added support for `LEFT SEMI` and `LEFT ANTI` direct joins with RocksDB. [#38956](https://github.com/ClickHouse/ClickHouse/pull/38956) ([Vladimir C](https://github.com/vdimir)).
* Resolves [#37490](https://github.com/ClickHouse/ClickHouse/issues/37490). [#39054](https://github.com/ClickHouse/ClickHouse/pull/39054) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Store Keeper API version inside a predefined path. [#39096](https://github.com/ClickHouse/ClickHouse/pull/39096) ([Antonio Andelic](https://github.com/antonio2368)).
* Now `entrypoint.sh` in the Docker image creates and chowns all folders it finds in the config, for multi-disk setups [#17717](https://github.com/ClickHouse/ClickHouse/issues/17717). [#39121](https://github.com/ClickHouse/ClickHouse/pull/39121) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add profile events for fsync. [#39179](https://github.com/ClickHouse/ClickHouse/pull/39179) ([Azat Khuzhin](https://github.com/azat)).
* Add a second argument to the ordinary function `file(path[, default])`: the value that the function returns in case the file does not exist. [#39218](https://github.com/ClickHouse/ClickHouse/pull/39218) ([Nikolay Degterinsky](https://github.com/evillique)).
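
  A hedged sketch (assuming the path is resolved inside the server's `user_files` directory, as for the one-argument form):

  ```sql
  -- Returns the file contents if the file exists,
  -- otherwise the provided default value.
  SELECT file('maybe_missing.txt', 'fallback contents');
  ```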
* Some small fixes for reading via HTTP: allow retrying a partial-content read even if a 200 OK response was received. [#39244](https://github.com/ClickHouse/ClickHouse/pull/39244) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Improved Base58 encoding/decoding. [#39292](https://github.com/ClickHouse/ClickHouse/pull/39292) ([Andrey Zvonov](https://github.com/zvonand)).
* Normalize `AggregateFunction` types and state representations, because optimizations like https://github.com/ClickHouse/ClickHouse/pull/35788 will treat `count(not null columns)` as `count()`, which might confuse distributed interpreters with the following error: `Conversion from AggregateFunction(count) to AggregateFunction(count, Int64) is not supported`. [#39420](https://github.com/ClickHouse/ClickHouse/pull/39420) ([Amos Bird](https://github.com/amosbird)).
* Improved memory usage during memory efficient merging of aggregation results. [#39429](https://github.com/ClickHouse/ClickHouse/pull/39429) ([Nikita Taranov](https://github.com/nickitat)).
* Support queries `CREATE TEMPORARY TABLE ... (<list of columns>) AS ...`. [#39462](https://github.com/ClickHouse/ClickHouse/pull/39462) ([Kruglov Pavel](https://github.com/Avogar)).
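
  A hedged sketch, assuming the `AS SELECT` form is among the supported ones:

  ```sql
  -- Explicit column list combined with AS SELECT.
  CREATE TEMPORARY TABLE tmp_nums (n UInt64, s String)
  AS SELECT number, toString(number) FROM numbers(3);

  SELECT * FROM tmp_nums;
  ```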
* Add support for `!`/`*` (exclamation/asterisk) in custom TLDs (`cutToFirstSignificantSubdomainCustom()`/`cutToFirstSignificantSubdomainCustomWithWWW()`/`firstSignificantSubdomainCustom()`). [#39496](https://github.com/ClickHouse/ClickHouse/pull/39496) ([Azat Khuzhin](https://github.com/azat)).
* Rework and simplify the `system.backups` table: remove the `internal` column, allow the user to set the ID of an operation, add the columns `num_files`, `uncompressed_size`, `compressed_size`, `start_time`, `end_time`. [#39503](https://github.com/ClickHouse/ClickHouse/pull/39503) ([Vitaly Baranov](https://github.com/vitlibar)).
* Minor refactoring; removed duplicate code. [#39509](https://github.com/ClickHouse/ClickHouse/pull/39509) ([Simon Liu](https://github.com/monadbobo)).
* Add support for TLS connections to NATS. Implements [#39525](https://github.com/ClickHouse/ClickHouse/issues/39525). [#39527](https://github.com/ClickHouse/ClickHouse/pull/39527) ([Constantine Peresypkin](https://github.com/pkit)).
* `clickhouse-obfuscator` (a tool for database obfuscation for testing and load generation) now has the new `--save` and `--load` parameters to work with pre-trained models. This closes [#39534](https://github.com/ClickHouse/ClickHouse/issues/39534). [#39541](https://github.com/ClickHouse/ClickHouse/pull/39541) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix incorrect behavior of log rotation during restart. [#39558](https://github.com/ClickHouse/ClickHouse/pull/39558) ([Nikolay Degterinsky](https://github.com/evillique)).
* Improve bytes to bits mask transform for SSE/AVX/AVX512. [#39586](https://github.com/ClickHouse/ClickHouse/pull/39586) ([Guo Wangyang](https://github.com/guowangy)).
* Add formats `PrettyMonoBlock`, `PrettyNoEscapesMonoBlock`, `PrettyCompactNoEscapes`, `PrettyCompactNoEscapesMonoBlock`, `PrettySpaceNoEscapes`, `PrettySpaceMonoBlock`, `PrettySpaceNoEscapesMonoBlock`. [#39646](https://github.com/ClickHouse/ClickHouse/pull/39646) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix building aggregate projections when external aggregation is on. Marked as an improvement because the case is rare and there is an easy workaround via changing settings. This fixes [#39667](https://github.com/ClickHouse/ClickHouse/issues/39667). [#39671](https://github.com/ClickHouse/ClickHouse/pull/39671) ([Amos Bird](https://github.com/amosbird)).
* Allow executing hash functions with arguments of type `Map`. [#39685](https://github.com/ClickHouse/ClickHouse/pull/39685) ([Anton Popov](https://github.com/CurtizJ)).
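
  A hedged sketch (illustrative values):

  ```sql
  -- Hash functions now accept Map arguments.
  SELECT cityHash64(map('a', 1, 'b', 2)) AS h1,
         sipHash64(map('k', 'v'))        AS h2;
  ```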
* Add a configuration parameter to hide addresses in stack traces. It may improve security a little, but generally it is harmful and should not be used. [#39690](https://github.com/ClickHouse/ClickHouse/pull/39690) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Change the prefix size of `AggregateFunctionDistinct` to make sure the nested function's data is memory-aligned. [#39696](https://github.com/ClickHouse/ClickHouse/pull/39696) ([Pxl](https://github.com/BiteTheDDDDt)).
* Properly escape credentials passed to the `clickhouse-diagnostic` tool. [#39707](https://github.com/ClickHouse/ClickHouse/pull/39707) ([Dale McDiarmid](https://github.com/gingerwizard)).
* Keeper improvement: create a snapshot on exit. This can be controlled with the config `keeper_server.create_snapshot_on_exit`, `true` by default. [#39755](https://github.com/ClickHouse/ClickHouse/pull/39755) ([Antonio Andelic](https://github.com/antonio2368)).
* Support primary key analysis for `row_policy_filter` and `additional_filter`. It also helps fix issues like [#37454](https://github.com/ClickHouse/ClickHouse/issues/37454). [#39826](https://github.com/ClickHouse/ClickHouse/pull/39826) ([Amos Bird](https://github.com/amosbird)).
* Parameters are now transferred in `Query` packets right after the query text in the same serialisation format as the settings. [#39906](https://github.com/ClickHouse/ClickHouse/pull/39906) ([Nikita Taranov](https://github.com/nickitat)).
* Fix two usability issues in Play UI: it was not pixel-perfect on iPad due to parasitic border radius and margins, and the progress indication did not display after the first query. This closes [#39957](https://github.com/ClickHouse/ClickHouse/issues/39957). This closes [#39960](https://github.com/ClickHouse/ClickHouse/issues/39960). [#39961](https://github.com/ClickHouse/ClickHouse/pull/39961) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Play UI: add row numbers; add cell selection on click; add hysteresis for table cells. [#39962](https://github.com/ClickHouse/ClickHouse/pull/39962) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* The client will show server-side elapsed time. This is important for the performance comparison of ClickHouse services in remote datacenters. This closes [#38070](https://github.com/ClickHouse/ClickHouse/issues/38070). See also [this](https://github.com/ClickHouse/ClickBench/blob/main/hardware/benchmark-cloud.sh#L37) for motivation. [#39968](https://github.com/ClickHouse/ClickHouse/pull/39968) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Adds `parseDateTime64BestEffortUS`, `parseDateTime64BestEffortUSOrNull`, `parseDateTime64BestEffortUSOrZero` functions, closing [#37492](https://github.com/ClickHouse/ClickHouse/issues/37492). [#40015](https://github.com/ClickHouse/ClickHouse/pull/40015) ([Tanya Bragin](https://github.com/tbragin)).
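
  A hedged sketch (illustrative values; the US variants parse month-before-day):

  ```sql
  SELECT
      parseDateTime64BestEffortUS('02/10/2021 12:34:56.789') AS parsed,  -- February 10
      parseDateTime64BestEffortUSOrNull('not a timestamp')   AS as_null, -- NULL
      parseDateTime64BestEffortUSOrZero('not a timestamp')   AS as_zero; -- zero timestamp
  ```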
* Add an observer mode to the (Zoo)Keeper cluster discovery feature: in this mode, the node itself doesn't belong to the cluster. [#40035](https://github.com/ClickHouse/ClickHouse/pull/40035) ([Vladimir C](https://github.com/vdimir)).
* Play UI: recognize the Tab key in the textarea, but don't interfere with tab navigation. [#40053](https://github.com/ClickHouse/ClickHouse/pull/40053) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Extend `processors_profile_log` with more information, such as input rows. [#40121](https://github.com/ClickHouse/ClickHouse/pull/40121) ([Amos Bird](https://github.com/amosbird)).
* Update tzdata to 2022b to support the new timezone changes. See https://github.com/google/cctz/pull/226. Chile's 2022 DST start is delayed from September 4 to September 11. Iran plans to stop observing DST permanently, after it falls back on 2022-09-21. There are corrections of the historical time zone of Asia/Tehran in the year 1977: Iran adopted standard time in 1935, not 1946. In 1977 it observed DST from 03-21 23:00 to 10-20 24:00; its 1978 transitions were on 03-24 and 08-05, not 03-20 and 10-20; and its spring 1979 transition was on 05-27, not 03-21 (https://data.iana.org/time-zones/tzdb/NEWS). [#40184](https://github.com/ClickHouse/ClickHouse/pull/40184) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Display server-side time in `clickhouse-benchmark` by default if it is available (since ClickHouse version 22.8). This is needed to correctly compare the performance of clouds. This behavior can be changed with the new `--client-side-time` command line option. Change the `--randomize` command line option from `--randomize 1` to the form without argument. [#40193](https://github.com/ClickHouse/ClickHouse/pull/40193) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add counters (ProfileEvents) for cases when a query complexity limitation has been set and reached (a separate counter for `overflow_mode` = `break` and `throw`). For example, if you have set up `max_rows_to_read` with `read_overflow_mode = 'break'`, looking at the value of the `OverflowBreak` counter allows distinguishing incomplete results. [#40205](https://github.com/ClickHouse/ClickHouse/pull/40205) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
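
  A hedged sketch (`big_table` is illustrative; the counter name is taken from the entry above):

  ```sql
  SET max_rows_to_read = 1000000, read_overflow_mode = 'break';
  SELECT count() FROM big_table;  -- may return a partial result

  -- A non-zero value means some query stopped early due to the limit.
  SELECT value FROM system.events WHERE event = 'OverflowBreak';
  ```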
* Fix memory accounting in case of `MEMORY_LIMIT_EXCEEDED` errors (previously, [peak] memory usage took failed allocations into account). [#40249](https://github.com/ClickHouse/ClickHouse/pull/40249) ([Azat Khuzhin](https://github.com/azat)).
* Add current metrics for fs cache: `FilesystemCacheSize` and `FilesystemCacheElements`. [#40260](https://github.com/ClickHouse/ClickHouse/pull/40260) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Add support for LARGE_BINARY/LARGE_STRING with Arrow (Closes [#32401](https://github.com/ClickHouse/ClickHouse/issues/32401)). [#40293](https://github.com/ClickHouse/ClickHouse/pull/40293) ([Josh Taylor](https://github.com/joshuataylor)).
#### Bug Fix
* Support Hadoop secure RPC transfer (`hadoop.rpc.protection=privacy` and `hadoop.rpc.protection=integrity`). [#39411](https://github.com/ClickHouse/ClickHouse/pull/39411) ([michael1589](https://github.com/michael1589)).
* Fix seeking while reading from encrypted disk. This PR fixes [#38381](https://github.com/ClickHouse/ClickHouse/issues/38381). [#39687](https://github.com/ClickHouse/ClickHouse/pull/39687) ([Vitaly Baranov](https://github.com/vitlibar)).
* Fix duplicate columns in the join plan. Finally solves [#26809](https://github.com/ClickHouse/ClickHouse/issues/26809). [#40009](https://github.com/ClickHouse/ClickHouse/pull/40009) ([Vladimir C](https://github.com/vdimir)).
#### Build/Testing/Packaging Improvement
* Prebuilt ClickHouse x86 binaries now require support for AVX instructions, i.e. a CPU not older than Intel Sandy Bridge / AMD Bulldozer, both released in 2011. [#39000](https://github.com/ClickHouse/ClickHouse/pull/39000) ([Robert Schulze](https://github.com/rschu1ze)).
* Former packages used to install the systemd.service file to `/etc`. Files there are marked as `conf` and are neither cleaned out nor updated automatically. This PR cleans them out. [#39323](https://github.com/ClickHouse/ClickHouse/pull/39323) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix LSan by fixing getauxval(). [#39430](https://github.com/ClickHouse/ClickHouse/pull/39430) ([Azat Khuzhin](https://github.com/azat)).
* TSAN has issues with clang-14 (https://github.com/google/sanitizers/issues/1552, https://github.com/google/sanitizers/issues/1540), so here we temporarily build the TSAN binaries with clang-13. [#39450](https://github.com/ClickHouse/ClickHouse/pull/39450) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Remove the option to build ClickHouse tools as separate executable programs. This fixes [#37847](https://github.com/ClickHouse/ClickHouse/issues/37847). [#39520](https://github.com/ClickHouse/ClickHouse/pull/39520) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fixed Unit tests for wide integers on s390x. [#39627](https://github.com/ClickHouse/ClickHouse/pull/39627) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Increase max cache size for clang-tidy builds. Try to avoid flushing it out between builds. [#39652](https://github.com/ClickHouse/ClickHouse/pull/39652) ([Nikita Taranov](https://github.com/nickitat)).
* There is no need to use a fixed IP when using a cluster with SSL; using the same fixed IP could trigger collisions between tests. With this change, the server's certificate is generated for a designated host name (see `server-ext.cnf` in each test), and the client checks the server's certificate against that name accordingly. [#40007](https://github.com/ClickHouse/ClickHouse/pull/40007) ([Sema Checherinda](https://github.com/CheSema)).
* Support build with `clang-16` (trunk). This closes [#39949](https://github.com/ClickHouse/ClickHouse/issues/39949). [#40181](https://github.com/ClickHouse/ClickHouse/pull/40181) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Prepare RISC-V 64 build to run in CI. This is for [#40141](https://github.com/ClickHouse/ClickHouse/issues/40141). [#40197](https://github.com/ClickHouse/ClickHouse/pull/40197) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
#### Bug Fix (user-visible misbehavior in official stable or prestable release)
* Fixed query hanging for SELECT with ORDER BY WITH FILL with different date/time types. [#37849](https://github.com/ClickHouse/ClickHouse/pull/37849) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix `ORDER BY` that matches a projection's `ORDER BY` (previously it simply returned an unsorted result). [#38725](https://github.com/ClickHouse/ClickHouse/pull/38725) ([Azat Khuzhin](https://github.com/azat)).
* Do not optimise functions in GROUP BY statements if they shadow one of the table columns or expressions. Fixes [#37032](https://github.com/ClickHouse/ClickHouse/issues/37032). [#39103](https://github.com/ClickHouse/ClickHouse/pull/39103) ([Anton Kozlov](https://github.com/tonickkozlov)).
* Fix wrong table name in logs after RENAME TABLE. This fixes [#38018](https://github.com/ClickHouse/ClickHouse/issues/38018). [#39227](https://github.com/ClickHouse/ClickHouse/pull/39227) ([Amos Bird](https://github.com/amosbird)).
* Fix positional arguments in case of columns pruning when optimising the query. Closes [#38433](https://github.com/ClickHouse/ClickHouse/issues/38433). [#39293](https://github.com/ClickHouse/ClickHouse/pull/39293) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix a bug in schema inference for empty messages in Protobuf/CapnProto formats that allowed creating a column with an empty `Tuple` type. Closes [#39051](https://github.com/ClickHouse/ClickHouse/issues/39051). Add 2 new settings `input_format_{protobuf/capnproto}_skip_fields_with_unsupported_types_in_schema_inference` that allow skipping fields with unsupported types during schema inference for Protobuf and CapnProto formats. [#39357](https://github.com/ClickHouse/ClickHouse/pull/39357) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix segmentation fault on `CREATE WINDOW VIEW .. ON CLUSTER ... INNER`. Closes [#39363](https://github.com/ClickHouse/ClickHouse/issues/39363). [#39384](https://github.com/ClickHouse/ClickHouse/pull/39384) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix `WriteBuffer` finalize when an `INSERT` into a table function is cancelled. A proper version of https://github.com/ClickHouse/ClickHouse/pull/39396, which was reverted. [#39458](https://github.com/ClickHouse/ClickHouse/pull/39458) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix storing of columns of type `Object` in sparse serialization. [#39464](https://github.com/ClickHouse/ClickHouse/pull/39464) ([Anton Popov](https://github.com/CurtizJ)).
* Fix possible "Not found column in block" exception when using projections. This closes [#39469](https://github.com/ClickHouse/ClickHouse/issues/39469). [#39470](https://github.com/ClickHouse/ClickHouse/pull/39470) ([小路](https://github.com/nicelulu)).
* Fix LOGICAL_ERROR on race between DROP and INSERT with materialized views. [#39477](https://github.com/ClickHouse/ClickHouse/pull/39477) ([Azat Khuzhin](https://github.com/azat)).
* Fix a data race and a possible heap-buffer-overflow in the Avro format. Closes [#39094](https://github.com/ClickHouse/ClickHouse/issues/39094), closes [#33652](https://github.com/ClickHouse/ClickHouse/issues/33652). [#39498](https://github.com/ClickHouse/ClickHouse/pull/39498) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix rare bug in asynchronous reading (with setting `local_filesystem_read_method='pread_threadpool'`) with enabled `O_DIRECT` (enabled by setting `min_bytes_to_use_direct_io`). [#39506](https://github.com/ClickHouse/ClickHouse/pull/39506) ([Anton Popov](https://github.com/CurtizJ)).
* Fixes "Code: 49. DB::Exception: FunctionFactory: the function name '' is not unique. (LOGICAL_ERROR)" observed on FreeBSD when starting clickhouse. [#39551](https://github.com/ClickHouse/ClickHouse/pull/39551) ([Alexander Gololobov](https://github.com/davenger)).
* Fix a bug with the `maxsplit` argument of `splitByChar`, which was not working correctly. [#39552](https://github.com/ClickHouse/ClickHouse/pull/39552) ([filimonov](https://github.com/filimonov)).
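
  A minimal sketch of the argument; the exact splitting semantics follow this version's documentation:

  ```sql
  -- The third argument limits how many substrings are produced.
  SELECT splitByChar(',', 'a,b,c,d', 2);
  ```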
* Fix a bug in ASOF JOIN with `enable_optimize_predicate_expression`, close [#37813](https://github.com/ClickHouse/ClickHouse/issues/37813). [#39556](https://github.com/ClickHouse/ClickHouse/pull/39556) ([Vladimir C](https://github.com/vdimir)).
* Fixed `CREATE/DROP INDEX` query with `ON CLUSTER` or `Replicated` database and `ReplicatedMergeTree`. It used to be executed on all replicas (causing error or DDL queue stuck). Fixes [#39511](https://github.com/ClickHouse/ClickHouse/issues/39511). [#39565](https://github.com/ClickHouse/ClickHouse/pull/39565) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix "column not found" error for push down with join, close [#39505](https://github.com/ClickHouse/ClickHouse/issues/39505). [#39575](https://github.com/ClickHouse/ClickHouse/pull/39575) ([Vladimir C](https://github.com/vdimir)).
* Fix the wrong `REGEXP_REPLACE` alias. This fixes https://github.com/ClickHouse/ClickBench/issues/9. [#39592](https://github.com/ClickHouse/ClickHouse/pull/39592) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fixed the point of origin for exponential decay window functions to be the last value in the window. Previously, decay was calculated by the formula `exp((t - curr_row_t) / decay_length)`, which is incorrect when the right boundary of the window is not `CURRENT ROW`. It was changed to `exp((t - last_row_t) / decay_length)`. There is no change in results for windows with `ROWS BETWEEN (smth) AND CURRENT ROW`. [#39593](https://github.com/ClickHouse/ClickHouse/pull/39593) ([Vladimir Chebotaryov](https://github.com/quickhouse)).
* Fix Decimal division overflow, which can be detected based on operands scale. [#39600](https://github.com/ClickHouse/ClickHouse/pull/39600) ([Andrey Zvonov](https://github.com/zvonand)).
* Fix the settings `output_format_arrow_string_as_string` and `output_format_arrow_low_cardinality_as_dictionary` to work in combination. Closes [#39624](https://github.com/ClickHouse/ClickHouse/issues/39624). [#39647](https://github.com/ClickHouse/ClickHouse/pull/39647) ([Kruglov Pavel](https://github.com/Avogar)).
* Fixed a bug in default database resolution in distributed table reads. [#39674](https://github.com/ClickHouse/ClickHouse/pull/39674) ([Anton Kozlov](https://github.com/tonickkozlov)).
* A `SELECT` might read data of a dropped table if the cache for mmap IO is used, the database engine is `Ordinary`, and a new table was created with the same name as the dropped one. It's fixed. [#39708](https://github.com/ClickHouse/ClickHouse/pull/39708) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix possible error `Invalid column type for ColumnUnique::insertRangeFrom. Expected String, got ColumnLowCardinality`. Fixes [#38460](https://github.com/ClickHouse/ClickHouse/issues/38460). [#39716](https://github.com/ClickHouse/ClickHouse/pull/39716) ([Arthur Passos](https://github.com/arthurpassos)).
* Field names in the `meta` section of JSON format were erroneously double escaped. This closes [#39693](https://github.com/ClickHouse/ClickHouse/issues/39693). [#39747](https://github.com/ClickHouse/ClickHouse/pull/39747) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix wrong index analysis with tuples and operator `IN`, which could lead to wrong query result. [#39752](https://github.com/ClickHouse/ClickHouse/pull/39752) ([Anton Popov](https://github.com/CurtizJ)).
* Fix EmbeddedRocksDB filtering by key using params. [#39757](https://github.com/ClickHouse/ClickHouse/pull/39757) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix the error `Invalid number of columns in chunk pushed to OutputPort`, which was caused by the `ARRAY JOIN` optimization. Fixes [#39164](https://github.com/ClickHouse/ClickHouse/issues/39164). [#39799](https://github.com/ClickHouse/ClickHouse/pull/39799) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix `CANNOT_READ_ALL_DATA` exception with `local_filesystem_read_method=pread_threadpool`. This bug affected only Linux kernel version 5.9 and 5.10 according to [man](https://manpages.debian.org/testing/manpages-dev/preadv2.2.en.html#BUGS). [#39800](https://github.com/ClickHouse/ClickHouse/pull/39800) ([Anton Popov](https://github.com/CurtizJ)).
* Fix `quota_key` application on connect. [#39874](https://github.com/ClickHouse/ClickHouse/pull/39874) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix query exceptions like `DB::Exception: Cannot open file /media/ssd1/fordata/clickhouse/data/data/perf/perf_log_local_v3_1/20220618_17233_17238_1/namespace.dict.bin, errno: 24, strerror: Too many open files`. [#39886](https://github.com/ClickHouse/ClickHouse/pull/39886) ([Fangyuan Deng](https://github.com/pzhdfy)).
* Fix broken NFS `mkdir` for root-squashed volumes. [#39898](https://github.com/ClickHouse/ClickHouse/pull/39898) ([Constantine Peresypkin](https://github.com/pkit)).
* Remove dictionaries from Prometheus metrics on DETACH/DROP. [#39926](https://github.com/ClickHouse/ClickHouse/pull/39926) ([Azat Khuzhin](https://github.com/azat)).
* Fix read of StorageFile with virtual columns. Closes [#39907](https://github.com/ClickHouse/ClickHouse/issues/39907). [#39943](https://github.com/ClickHouse/ClickHouse/pull/39943) ([flynn](https://github.com/ucasfl)).
* Fix big memory usage during fetches. Fixes [#39915](https://github.com/ClickHouse/ClickHouse/issues/39915). [#39990](https://github.com/ClickHouse/ClickHouse/pull/39990) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix a `hashId` crash and the salt parameter not being used. [#40002](https://github.com/ClickHouse/ClickHouse/pull/40002) ([Raúl Marín](https://github.com/Algunenano)).
* Fix `HashMethodOneNumber` getting a wrong key value when the column is const. [#40020](https://github.com/ClickHouse/ClickHouse/pull/40020) ([Duc Canh Le](https://github.com/canhld94)).
* Fixed "Part directory doesn't exist" and "`tmp_<part_name>` ... No such file or directory" errors during too slow INSERT or too long merge/mutation. Also fixed issue that may cause some replication queue entries to stuck without any errors or warnings in logs if previous attempt to fetch part failed, but `tmp-fetch_<part_name>` directory was not cleaned up. [#40031](https://github.com/ClickHouse/ClickHouse/pull/40031) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix rare cases of parsing of arrays of tuples in format `Values`. [#40034](https://github.com/ClickHouse/ClickHouse/pull/40034) ([Anton Popov](https://github.com/CurtizJ)).
* Fix the ArrowColumn format's `Dictionary(X)` and `Dictionary(Nullable(X))` conversion to ClickHouse `LowCardinality(X)` and `LowCardinality(Nullable(X))`, respectively. [#40037](https://github.com/ClickHouse/ClickHouse/pull/40037) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix potential deadlock in WriteBufferFromS3 during task scheduling failure. [#40070](https://github.com/ClickHouse/ClickHouse/pull/40070) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix a bug in `collectFilesToSkip()` by adding the correct file extension (`.idx` or `.idx2`) for indexes to be recalculated, avoiding wrong hard links. Fixed [#39896](https://github.com/ClickHouse/ClickHouse/issues/39896). [#40095](https://github.com/ClickHouse/ClickHouse/pull/40095) ([Jianmei Zhang](https://github.com/zhangjmruc)).
* Fix a reported segmentation fault with `CaresPTRResolver::resolve` in the stack trace. [#40134](https://github.com/ClickHouse/ClickHouse/pull/40134) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix a very rare case of incorrect behavior of array subscript operator. This closes [#28720](https://github.com/ClickHouse/ClickHouse/issues/28720). [#40185](https://github.com/ClickHouse/ClickHouse/pull/40185) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix insufficient argument check for encryption functions (found by query fuzzer). This closes [#39987](https://github.com/ClickHouse/ClickHouse/issues/39987). [#40194](https://github.com/ClickHouse/ClickHouse/pull/40194) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix an unexpected result of `arrayDifference` for `Array(UInt32)`. [#40211](https://github.com/ClickHouse/ClickHouse/pull/40211) ([Duc Canh Le](https://github.com/canhld94)).
* Fix the case when the order of columns can be incorrect if the `IN` operator is used with a table with `ENGINE = Set` containing multiple columns. This fixes [#13014](https://github.com/ClickHouse/ClickHouse/issues/13014). [#40225](https://github.com/ClickHouse/ClickHouse/pull/40225) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
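
  A hedged sketch of the affected pattern (the table and values are illustrative):

  ```sql
  CREATE TABLE allowed (id UInt32, code String) ENGINE = Set;
  INSERT INTO allowed VALUES (1, 'a'), (2, 'b');

  -- Tuple elements must be matched against the Set columns in order;
  -- previously the column order could be applied incorrectly.
  SELECT (1, 'a') IN allowed, (2, 'x') IN allowed;
  ```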
* Fix a possible segfault in the CapnProto input format. This bug was found and sent through the ClickHouse bug-bounty [program](https://github.com/ClickHouse/ClickHouse/issues/38986) by *kiojj*. [#40241](https://github.com/ClickHouse/ClickHouse/pull/40241) ([Kruglov Pavel](https://github.com/Avogar)).
* Avoid continuously growing memory consumption of the pattern cache when using the `multi(Fuzzy)Match(Any|AllIndices|AnyIndex)()` functions. [#40264](https://github.com/ClickHouse/ClickHouse/pull/40264) ([Robert Schulze](https://github.com/rschu1ze)).
#### Build
* Fix a build error on macOS in `src/Common/waitForPid.cpp`: `error: identifier '__kevp__' is reserved because it starts with '__' [-Werror,-Wreserved-identifier]`, triggered by the expansion of the `EV_SET` macro from the SDK header `sys/event.h`. [#39493](https://github.com/ClickHouse/ClickHouse/pull/39493) ([小路](https://github.com/nicelulu)).
#### Build Improvement
* Fixed Endian issue in BitHelpers for s390x. [#39656](https://github.com/ClickHouse/ClickHouse/pull/39656) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Implement a piece of code related to SipHash for s390x architecture (which is not supported by ClickHouse). [#39732](https://github.com/ClickHouse/ClickHouse/pull/39732) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Fixed an Endian issue in Coordination snapshot code for s390x architecture (which is not supported by ClickHouse). [#39931](https://github.com/ClickHouse/ClickHouse/pull/39931) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Fixed Endian issues in Codec code for s390x architecture (which is not supported by ClickHouse). [#40008](https://github.com/ClickHouse/ClickHouse/pull/40008) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Fixed Endian issues in reading/writing BigEndian binary data in ReadHelpers and WriteHelpers code for s390x architecture (which is not supported by ClickHouse). [#40179](https://github.com/ClickHouse/ClickHouse/pull/40179) ([Harry Lee](https://github.com/HarryLeeIBM)).
#### NO CL ENTRY
* NO CL ENTRY: 'Revert "tests: enable back 02232_dist_insert_send_logs_level_hung"'. [#39788](https://github.com/ClickHouse/ClickHouse/pull/39788) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Update arrow to fix possible data race"'. [#39804](https://github.com/ClickHouse/ClickHouse/pull/39804) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Revert "Update arrow to fix possible data race""'. [#39811](https://github.com/ClickHouse/ClickHouse/pull/39811) ([Kruglov Pavel](https://github.com/Avogar)).
* NO CL ENTRY: 'Revert "Limit number of analyze for one query"'. [#39816](https://github.com/ClickHouse/ClickHouse/pull/39816) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Revert "tests: enable back 02232_dist_insert_send_logs_level_hung""'. [#39817](https://github.com/ClickHouse/ClickHouse/pull/39817) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Prepare library-bridge for catboost integration'. [#39904](https://github.com/ClickHouse/ClickHouse/pull/39904) ([Robert Schulze](https://github.com/rschu1ze)).
* NO CL ENTRY: 'Revert "ColumnVector: optimize filter with AVX512VBMI2 compress store"'. [#39963](https://github.com/ClickHouse/ClickHouse/pull/39963) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* NO CL ENTRY: 'Revert "copy self-extracting to output"'. [#40005](https://github.com/ClickHouse/ClickHouse/pull/40005) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Use separate counter for RSS in global memory tracker."'. [#40199](https://github.com/ClickHouse/ClickHouse/pull/40199) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* NO CL ENTRY: 'Revert "tests/performance: cover sparse_hashed dictionary"'. [#40268](https://github.com/ClickHouse/ClickHouse/pull/40268) ([Alexander Tokmakov](https://github.com/tavplubix)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Test/insert deduplication token materialized views [#34662](https://github.com/ClickHouse/ClickHouse/pull/34662) ([Denny Crane](https://github.com/den-crane)).
* Merging [#34372](https://github.com/ClickHouse/ClickHouse/issues/34372) [#35968](https://github.com/ClickHouse/ClickHouse/pull/35968) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Decoupling local cache function and cache algorithm [#38048](https://github.com/ClickHouse/ClickHouse/pull/38048) ([Han Shukai](https://github.com/KinderRiven)).
* Use separate counter for RSS in global memory tracker. [#38682](https://github.com/ClickHouse/ClickHouse/pull/38682) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Build self-extracting-executable utils [#38936](https://github.com/ClickHouse/ClickHouse/pull/38936) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Improvements in integration tests [#38978](https://github.com/ClickHouse/ClickHouse/pull/38978) ([Ilya Yatsishin](https://github.com/qoega)).
* More readable regexp in `test_quota` [#39084](https://github.com/ClickHouse/ClickHouse/pull/39084) ([Vladimir Chebotaryov](https://github.com/quickhouse)).
* Fixed regexp in `test_match_process_uid_against_data_owner` [#39085](https://github.com/ClickHouse/ClickHouse/pull/39085) ([Vladimir Chebotaryov](https://github.com/quickhouse)).
* tests: enable back 02232_dist_insert_send_logs_level_hung [#39124](https://github.com/ClickHouse/ClickHouse/pull/39124) ([Azat Khuzhin](https://github.com/azat)).
* Add connection info for Distributed sends log message [#39178](https://github.com/ClickHouse/ClickHouse/pull/39178) ([Azat Khuzhin](https://github.com/azat)).
* Forbid defining non-default disk with default path from <path> [#39183](https://github.com/ClickHouse/ClickHouse/pull/39183) ([Azat Khuzhin](https://github.com/azat)).
* Fix LZ4 decompression issue for s390x [#39195](https://github.com/ClickHouse/ClickHouse/pull/39195) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Do not report "Failed communicating with" on and on for parts exchange [#39222](https://github.com/ClickHouse/ClickHouse/pull/39222) ([Azat Khuzhin](https://github.com/azat)).
* Improve logging around replicated merges [#39230](https://github.com/ClickHouse/ClickHouse/pull/39230) ([Raúl Marín](https://github.com/Algunenano)).
* Cleanup logic around join_algorithm setting, add docs [#39271](https://github.com/ClickHouse/ClickHouse/pull/39271) ([Vladimir C](https://github.com/vdimir)).
* Possible fix for flaky `test_keeper_force_recovery` [#39321](https://github.com/ClickHouse/ClickHouse/pull/39321) ([Antonio Andelic](https://github.com/antonio2368)).
* tests/performance: improve parallel_mv test [#39325](https://github.com/ClickHouse/ClickHouse/pull/39325) ([Azat Khuzhin](https://github.com/azat)).
* Update azure library (removed "harmful" function) [#39327](https://github.com/ClickHouse/ClickHouse/pull/39327) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Refactor PreparedSets/SubqueryForSet [#39343](https://github.com/ClickHouse/ClickHouse/pull/39343) ([Vladimir C](https://github.com/vdimir)).
* Small doc updates [#39362](https://github.com/ClickHouse/ClickHouse/pull/39362) ([Robert Schulze](https://github.com/rschu1ze)).
* Even less usage of StringRef [#39364](https://github.com/ClickHouse/ClickHouse/pull/39364) ([Robert Schulze](https://github.com/rschu1ze)).
* Automatic fixes for black formatting for domestic repo PRs [#39390](https://github.com/ClickHouse/ClickHouse/pull/39390) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Clickhouse-local fixes [#39404](https://github.com/ClickHouse/ClickHouse/pull/39404) ([Heena Bansal](https://github.com/HeenaBansal2009)).
* Uppercase `ROWS`, `GROUPS`, `RANGE` in queries with windows [#39410](https://github.com/ClickHouse/ClickHouse/pull/39410) ([Vladimir Chebotaryov](https://github.com/quickhouse)).
* GitHub helper [#39421](https://github.com/ClickHouse/ClickHouse/pull/39421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* ShellCommand wait pid refactoring [#39426](https://github.com/ClickHouse/ClickHouse/pull/39426) ([Maksim Kita](https://github.com/kitaisreal)).
* Require clear style check to continue building [#39428](https://github.com/ClickHouse/ClickHouse/pull/39428) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* DirectDictionary improve performance of dictHas with duplicate keys [#39449](https://github.com/ClickHouse/ClickHouse/pull/39449) ([Maksim Kita](https://github.com/kitaisreal)).
* Commit status names: remove "actions" [#39454](https://github.com/ClickHouse/ClickHouse/pull/39454) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Improve synchronization between hosts in distributed backup and fix locks [#39455](https://github.com/ClickHouse/ClickHouse/pull/39455) ([Vitaly Baranov](https://github.com/vitlibar)).
* Remove some dead and commented code [#39460](https://github.com/ClickHouse/ClickHouse/pull/39460) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add Build Check and Special Build Check to SimpleCheck [#39467](https://github.com/ClickHouse/ClickHouse/pull/39467) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Update version after release [#39474](https://github.com/ClickHouse/ClickHouse/pull/39474) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update version_date.tsv and changelogs after v22.7.1.2484-stable [#39475](https://github.com/ClickHouse/ClickHouse/pull/39475) ([github-actions[bot]](https://github.com/apps/github-actions)).
* Update README.md [#39478](https://github.com/ClickHouse/ClickHouse/pull/39478) ([Dan Roscigno](https://github.com/DanRoscigno)).
* Remove unused constructor [#39491](https://github.com/ClickHouse/ClickHouse/pull/39491) ([alesapin](https://github.com/alesapin)).
* Mark new codec DEFLATE_QPL as experimental + cosmetics [#39495](https://github.com/ClickHouse/ClickHouse/pull/39495) ([Robert Schulze](https://github.com/rschu1ze)).
* Update arrow to fix possible data race [#39510](https://github.com/ClickHouse/ClickHouse/pull/39510) ([Kruglov Pavel](https://github.com/Avogar)).
* fix `-DENABLE_EXAMPLES=1` in master [#39517](https://github.com/ClickHouse/ClickHouse/pull/39517) ([Constantine Peresypkin](https://github.com/pkit)).
* LZ4_decompress_faster.cpp: remove endianness-dependent code [#39523](https://github.com/ClickHouse/ClickHouse/pull/39523) ([Ignat Loskutov](https://github.com/loskutov)).
* Fix 02286_parallel_final [#39524](https://github.com/ClickHouse/ClickHouse/pull/39524) ([Nikita Taranov](https://github.com/nickitat)).
* add Equinix metal N3 Xlarge [#39532](https://github.com/ClickHouse/ClickHouse/pull/39532) ([Tyler Hannan](https://github.com/tylerhannan)).
* Less usage of StringRef [#39535](https://github.com/ClickHouse/ClickHouse/pull/39535) ([Robert Schulze](https://github.com/rschu1ze)).
* Follow up to [#37827](https://github.com/ClickHouse/ClickHouse/issues/37827) [#39557](https://github.com/ClickHouse/ClickHouse/pull/39557) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Temporarily disable all tests with MaterializedPostgreSQL [#39564](https://github.com/ClickHouse/ClickHouse/pull/39564) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Update version_date.tsv after v22.3.9.19-lts [#39576](https://github.com/ClickHouse/ClickHouse/pull/39576) ([github-actions[bot]](https://github.com/apps/github-actions)).
* free compression and decompression contexts [#39578](https://github.com/ClickHouse/ClickHouse/pull/39578) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Update version_date.tsv and changelogs after v22.6.4.35-stable [#39579](https://github.com/ClickHouse/ClickHouse/pull/39579) ([github-actions[bot]](https://github.com/apps/github-actions)).
* Merge Woboq code browser page into "Getting Started" document [#39596](https://github.com/ClickHouse/ClickHouse/pull/39596) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix Chain::addSink [#39601](https://github.com/ClickHouse/ClickHouse/pull/39601) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Update NuRaft to latest master [#39609](https://github.com/ClickHouse/ClickHouse/pull/39609) ([Antonio Andelic](https://github.com/antonio2368)).
* copy self-extracting to output [#39617](https://github.com/ClickHouse/ClickHouse/pull/39617) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Replace MemoryTrackerBlockerInThread to LockMemoryExceptionInThread [#39619](https://github.com/ClickHouse/ClickHouse/pull/39619) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Combining sumIf->countIf and multiIf->if opt. [#39621](https://github.com/ClickHouse/ClickHouse/pull/39621) ([Amos Bird](https://github.com/amosbird)).
* Update README.md [#39622](https://github.com/ClickHouse/ClickHouse/pull/39622) ([Ivan Blinkov](https://github.com/blinkov)).
* Disable 02327_capnproto_protobuf_empty_messages with Ordinary [#39623](https://github.com/ClickHouse/ClickHouse/pull/39623) ([Alexander Tokmakov](https://github.com/tavplubix)).
* add Dell PowerEdge R740XD results [#39625](https://github.com/ClickHouse/ClickHouse/pull/39625) ([Tyler Hannan](https://github.com/tylerhannan)).
* Attempt to fix wrong workflow_run data for rerun [#39630](https://github.com/ClickHouse/ClickHouse/pull/39630) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Run tests with Replicated database in master [#39653](https://github.com/ClickHouse/ClickHouse/pull/39653) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Rollback request in Keeper if storing log fails [#39673](https://github.com/ClickHouse/ClickHouse/pull/39673) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix utils build on CI [#39679](https://github.com/ClickHouse/ClickHouse/pull/39679) ([Azat Khuzhin](https://github.com/azat)).
* Add duration_ms into system.zookeeper_log [#39686](https://github.com/ClickHouse/ClickHouse/pull/39686) ([Azat Khuzhin](https://github.com/azat)).
* Fix DISTINCT: handle all const columns case correctly [#39688](https://github.com/ClickHouse/ClickHouse/pull/39688) ([Igor Nikonov](https://github.com/devcrafter)).
* Update README.md [#39692](https://github.com/ClickHouse/ClickHouse/pull/39692) ([Yuko Takagi](https://github.com/yukotakagi)).
* Update Keeper version for digest [#39698](https://github.com/ClickHouse/ClickHouse/pull/39698) ([Antonio Andelic](https://github.com/antonio2368)).
* Change mysql-odbc url [#39702](https://github.com/ClickHouse/ClickHouse/pull/39702) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Avoid recursive destruction of AST. [#39705](https://github.com/ClickHouse/ClickHouse/pull/39705) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Update ccache to the latest available version [#39709](https://github.com/ClickHouse/ClickHouse/pull/39709) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Join enums refactoring [#39718](https://github.com/ClickHouse/ClickHouse/pull/39718) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix flaky test `02360_send_logs_level_colors` [#39720](https://github.com/ClickHouse/ClickHouse/pull/39720) ([Anton Popov](https://github.com/CurtizJ)).
* Fix cherry-pick for cases, when assignee is not set for PR [#39723](https://github.com/ClickHouse/ClickHouse/pull/39723) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Jepsen label [#39730](https://github.com/ClickHouse/ClickHouse/pull/39730) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix redirecting of logs to stdout in clickhouse-client [#39731](https://github.com/ClickHouse/ClickHouse/pull/39731) ([Anton Popov](https://github.com/CurtizJ)).
* CI: refactor Simple Check, use statuses to make it stateful [#39735](https://github.com/ClickHouse/ClickHouse/pull/39735) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Use different root path for total-queue Jepsen test [#39738](https://github.com/ClickHouse/ClickHouse/pull/39738) ([Antonio Andelic](https://github.com/antonio2368)).
* Simple refactoring: ordinary DISTINCT implementation [#39740](https://github.com/ClickHouse/ClickHouse/pull/39740) ([Igor Nikonov](https://github.com/devcrafter)).
* Cleanup usages of `allow_experimental_projection_optimization` setting, part 1 [#39746](https://github.com/ClickHouse/ClickHouse/pull/39746) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Enable SQL function getOSKernelVersion() on all platforms [#39751](https://github.com/ClickHouse/ClickHouse/pull/39751) ([Robert Schulze](https://github.com/rschu1ze)).
* Try clang-15 for build with tsan [#39758](https://github.com/ClickHouse/ClickHouse/pull/39758) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Rename "splitted build" to "shared libraries build" in CI tools [#39759](https://github.com/ClickHouse/ClickHouse/pull/39759) ([Robert Schulze](https://github.com/rschu1ze)).
* Use std::popcount, ::countl_zero, ::countr_zero functions [#39760](https://github.com/ClickHouse/ClickHouse/pull/39760) ([Robert Schulze](https://github.com/rschu1ze)).
* Self-extracting - run resulting executable with execvp [#39763](https://github.com/ClickHouse/ClickHouse/pull/39763) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix non-deterministic queries in distinct_in_order test [#39772](https://github.com/ClickHouse/ClickHouse/pull/39772) ([Igor Nikonov](https://github.com/devcrafter)).
* Fix some flaky integration tests [#39775](https://github.com/ClickHouse/ClickHouse/pull/39775) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Retry inserts with ClickHouseHelper [#39780](https://github.com/ClickHouse/ClickHouse/pull/39780) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Add cloudflare DNS as a fallback [#39795](https://github.com/ClickHouse/ClickHouse/pull/39795) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update README.md [#39796](https://github.com/ClickHouse/ClickHouse/pull/39796) ([Yuko Takagi](https://github.com/yukotakagi)).
* Minor fix for Stress Tests [#39798](https://github.com/ClickHouse/ClickHouse/pull/39798) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Typos [#39813](https://github.com/ClickHouse/ClickHouse/pull/39813) ([Robert Schulze](https://github.com/rschu1ze)).
* Update settings changes history [#39839](https://github.com/ClickHouse/ClickHouse/pull/39839) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix post-build script for building utils/self-extracting-executable/compressor [#39843](https://github.com/ClickHouse/ClickHouse/pull/39843) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Add hasJoin method into ASTSelectQuery [#39850](https://github.com/ClickHouse/ClickHouse/pull/39850) ([Maksim Kita](https://github.com/kitaisreal)).
* Update tweak on version part update [#39853](https://github.com/ClickHouse/ClickHouse/pull/39853) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update version_date.tsv and changelogs after v22.7.2.15-stable [#39854](https://github.com/ClickHouse/ClickHouse/pull/39854) ([github-actions[bot]](https://github.com/apps/github-actions)).
* Fix typo and extra dots in exception messages from OverCommitTracker [#39858](https://github.com/ClickHouse/ClickHouse/pull/39858) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Fix flaky integration test test_async_backups_to_same_destination. [#39859](https://github.com/ClickHouse/ClickHouse/pull/39859) ([Vitaly Baranov](https://github.com/vitlibar)).
* Better total part size calculation on mutation [#39860](https://github.com/ClickHouse/ClickHouse/pull/39860) ([alesapin](https://github.com/alesapin)).
* typo: PostgerSQL -> PostgreSQL [#39861](https://github.com/ClickHouse/ClickHouse/pull/39861) ([nathanbegbie](https://github.com/nathanbegbie)).
* Remove prefer_localhost_replica from test [#39862](https://github.com/ClickHouse/ClickHouse/pull/39862) ([Igor Nikonov](https://github.com/devcrafter)).
* Block memory tracker in Keeper during commit [#39867](https://github.com/ClickHouse/ClickHouse/pull/39867) ([Antonio Andelic](https://github.com/antonio2368)).
* Update version_date.tsv after v22.3.10.22-lts [#39868](https://github.com/ClickHouse/ClickHouse/pull/39868) ([github-actions[bot]](https://github.com/apps/github-actions)).
* fix incorrect format for functions with settings [#39869](https://github.com/ClickHouse/ClickHouse/pull/39869) ([Constantine Peresypkin](https://github.com/pkit)).
* Get api url from event, not from const/ENV [#39871](https://github.com/ClickHouse/ClickHouse/pull/39871) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Cleanup unused dirs from `store/` on all disks [#39872](https://github.com/ClickHouse/ClickHouse/pull/39872) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Update 02354_distributed_with_external_aggregation_memory_usage.sql [#39893](https://github.com/ClickHouse/ClickHouse/pull/39893) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix the race between waitMutation and updating local queue from ZK [#39900](https://github.com/ClickHouse/ClickHouse/pull/39900) ([Alexander Gololobov](https://github.com/davenger)).
* Improve 02354_distributed_with_external_aggregation_memory_usage [#39908](https://github.com/ClickHouse/ClickHouse/pull/39908) ([Nikita Taranov](https://github.com/nickitat)).
* Move username and password from URL parameters to Basic Authentication [#39910](https://github.com/ClickHouse/ClickHouse/pull/39910) ([San](https://github.com/santrancisco)).
* Remove cache flush from the Docs Check [#39911](https://github.com/ClickHouse/ClickHouse/pull/39911) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix flaky tests (`Tried to commit obsolete part`) [#39922](https://github.com/ClickHouse/ClickHouse/pull/39922) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add logging to debug flaky tests [#39925](https://github.com/ClickHouse/ClickHouse/pull/39925) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix flaky test `02360_send_logs_level_colors` [#39927](https://github.com/ClickHouse/ClickHouse/pull/39927) ([Anton Popov](https://github.com/CurtizJ)).
* Don't create self-extracting clickhouse for split build [#39936](https://github.com/ClickHouse/ClickHouse/pull/39936) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* tests/stress: add dmesg output (to see OOM details) [#39939](https://github.com/ClickHouse/ClickHouse/pull/39939) ([Azat Khuzhin](https://github.com/azat)).
* Create metadata directory on CREATE for FileLog engine [#39940](https://github.com/ClickHouse/ClickHouse/pull/39940) ([Azat Khuzhin](https://github.com/azat)).
* tests: fix 02352_rwlock flakiness [#39941](https://github.com/ClickHouse/ClickHouse/pull/39941) ([Azat Khuzhin](https://github.com/azat)).
* Remove old code from the website [#39947](https://github.com/ClickHouse/ClickHouse/pull/39947) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove debug trace from DistinctStep [#39955](https://github.com/ClickHouse/ClickHouse/pull/39955) ([Igor Nikonov](https://github.com/devcrafter)).
* IAST destructor intrusive list [#39956](https://github.com/ClickHouse/ClickHouse/pull/39956) ([Maksim Kita](https://github.com/kitaisreal)).
* Remove old code from the website (part 2) [#39959](https://github.com/ClickHouse/ClickHouse/pull/39959) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add Stateful tests (release), Stateless tests (release) to Mergeable Check [#39967](https://github.com/ClickHouse/ClickHouse/pull/39967) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Change font in CI reports [#39969](https://github.com/ClickHouse/ClickHouse/pull/39969) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add setting type to support special 'auto' value [#39974](https://github.com/ClickHouse/ClickHouse/pull/39974) ([Vladimir C](https://github.com/vdimir)).
* Update 02354_distributed_with_external_aggregation_memory_usage.sql [#39979](https://github.com/ClickHouse/ClickHouse/pull/39979) ([Nikita Taranov](https://github.com/nickitat)).
* tests/stress: fix dmesg reading [#39980](https://github.com/ClickHouse/ClickHouse/pull/39980) ([Azat Khuzhin](https://github.com/azat)).
* Disable 02380_insert_mv_race.sh with Ordinary [#39985](https://github.com/ClickHouse/ClickHouse/pull/39985) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Mention how the build can be sped up by disabling self-extraction [#39988](https://github.com/ClickHouse/ClickHouse/pull/39988) ([Robert Schulze](https://github.com/rschu1ze)).
* Use different root path for Jepsen Counter test [#39992](https://github.com/ClickHouse/ClickHouse/pull/39992) ([Antonio Andelic](https://github.com/antonio2368)).
* ActionsDAG rename index to outputs [#39998](https://github.com/ClickHouse/ClickHouse/pull/39998) ([Maksim Kita](https://github.com/kitaisreal)).
* Added H literal for Hour IntervalKind [#39999](https://github.com/ClickHouse/ClickHouse/pull/39999) ([Heena Bansal](https://github.com/HeenaBansal2009)).
* Try to avoid timeouts when checking for replication consistency [#40001](https://github.com/ClickHouse/ClickHouse/pull/40001) ([Alexander Tokmakov](https://github.com/tavplubix)).
* More generic check for MergeTree table family [#40004](https://github.com/ClickHouse/ClickHouse/pull/40004) ([Alexander Gololobov](https://github.com/davenger)).
* Further preparation for catboost integration into library-bridge [#40010](https://github.com/ClickHouse/ClickHouse/pull/40010) ([Robert Schulze](https://github.com/rschu1ze)).
* Self-extracting: decompressor, extract real path of executable instead of argv[0] [#40011](https://github.com/ClickHouse/ClickHouse/pull/40011) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* copy self-extracting to output [#40017](https://github.com/ClickHouse/ClickHouse/pull/40017) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Update 02354_distributed_with_external_aggregation_memory_usage.sql [#40024](https://github.com/ClickHouse/ClickHouse/pull/40024) ([Nikita Taranov](https://github.com/nickitat)).
* Fix segfault in `DataTypeAggregateFunction` [#40025](https://github.com/ClickHouse/ClickHouse/pull/40025) ([Anton Popov](https://github.com/CurtizJ)).
* tests/performance: cover sparse_hashed dictionary [#40027](https://github.com/ClickHouse/ClickHouse/pull/40027) ([Azat Khuzhin](https://github.com/azat)).
* Cleanup docs of parseDateTime*() function family [#40030](https://github.com/ClickHouse/ClickHouse/pull/40030) ([Robert Schulze](https://github.com/rschu1ze)).
* Job url [#40032](https://github.com/ClickHouse/ClickHouse/pull/40032) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update version_date.tsv and changelogs after v22.6.5.22-stable [#40036](https://github.com/ClickHouse/ClickHouse/pull/40036) ([github-actions[bot]](https://github.com/apps/github-actions)).
* Non-significant changes [#40038](https://github.com/ClickHouse/ClickHouse/pull/40038) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* tests: attempt to make 02293_part_log_has_merge_reason less flaky [#40047](https://github.com/ClickHouse/ClickHouse/pull/40047) ([Azat Khuzhin](https://github.com/azat)).
* Remove documentation templates [#40048](https://github.com/ClickHouse/ClickHouse/pull/40048) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Move images to clickhouse-presentations repository. [#40049](https://github.com/ClickHouse/ClickHouse/pull/40049) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix broken image in test-visualizer [#40050](https://github.com/ClickHouse/ClickHouse/pull/40050) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add a test for query parameters in HTTP POST [#40055](https://github.com/ClickHouse/ClickHouse/pull/40055) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix clickhouse-test hang in case of CREATE DATABASE fails [#40057](https://github.com/ClickHouse/ClickHouse/pull/40057) ([Azat Khuzhin](https://github.com/azat)).
* tests: fix 02380_insert_mv_race for Ordinary database [#40058](https://github.com/ClickHouse/ClickHouse/pull/40058) ([Azat Khuzhin](https://github.com/azat)).
* Skip newlines before Tags in clickhouse-test [#40061](https://github.com/ClickHouse/ClickHouse/pull/40061) ([Vladimir C](https://github.com/vdimir)).
* Replace S3 URLs by parameter [#40066](https://github.com/ClickHouse/ClickHouse/pull/40066) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Finally fix `_csv.Error: field larger than field limit` [#40072](https://github.com/ClickHouse/ClickHouse/pull/40072) ([Alexander Tokmakov](https://github.com/tavplubix)).
* tests: fix 00926_adaptive_index_granularity_pk/00489_pk_subexpression flakiness [#40075](https://github.com/ClickHouse/ClickHouse/pull/40075) ([Azat Khuzhin](https://github.com/azat)).
* Changelogs and versions [#40090](https://github.com/ClickHouse/ClickHouse/pull/40090) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* A test for counting resources in subqueries [#40104](https://github.com/ClickHouse/ClickHouse/pull/40104) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Use a job ID as ref text [#40112](https://github.com/ClickHouse/ClickHouse/pull/40112) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Delete files DictionaryJoinAdapter.h/cpp [#40113](https://github.com/ClickHouse/ClickHouse/pull/40113) ([Vladimir C](https://github.com/vdimir)).
* Rework S3Helper a little bit [#40127](https://github.com/ClickHouse/ClickHouse/pull/40127) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* PODArray assign empty array fix [#40129](https://github.com/ClickHouse/ClickHouse/pull/40129) ([Maksim Kita](https://github.com/kitaisreal)).
* Disable 02390_prometheus_ClickHouseStatusInfo_DictionaryStatus with Ordinary database [#40136](https://github.com/ClickHouse/ClickHouse/pull/40136) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add tests with Ordinary database to flaky check [#40137](https://github.com/ClickHouse/ClickHouse/pull/40137) ([Alexander Tokmakov](https://github.com/tavplubix)).
* fs cache: minor change [#40138](https://github.com/ClickHouse/ClickHouse/pull/40138) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix typo [#40139](https://github.com/ClickHouse/ClickHouse/pull/40139) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix keeper-bench in case of error during scheduling a thread [#40147](https://github.com/ClickHouse/ClickHouse/pull/40147) ([Azat Khuzhin](https://github.com/azat)).
* Fix "Cannot quickly remove directory" [#40151](https://github.com/ClickHouse/ClickHouse/pull/40151) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Set sync_request_timeout to 10 to avoid reconnections in tests [#40158](https://github.com/ClickHouse/ClickHouse/pull/40158) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable zero-copy replication by default [#40175](https://github.com/ClickHouse/ClickHouse/pull/40175) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Improve assignment and logging for cherry-pick and backport steps [#40177](https://github.com/ClickHouse/ClickHouse/pull/40177) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* test for Decimal aggregateFunction normalization [#39420](https://github.com/ClickHouse/ClickHouse/issues/39420) [#40178](https://github.com/ClickHouse/ClickHouse/pull/40178) ([Denny Crane](https://github.com/den-crane)).
* Minor build changes [#40182](https://github.com/ClickHouse/ClickHouse/pull/40182) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* clickhouse-test: enable ZooKeeper tests by default [#40191](https://github.com/ClickHouse/ClickHouse/pull/40191) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove old code [#40196](https://github.com/ClickHouse/ClickHouse/pull/40196) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Update README.md [#40198](https://github.com/ClickHouse/ClickHouse/pull/40198) ([clickhouse-robot-curie](https://github.com/clickhouse-robot-curie)).
* Fix a bug with symlinks detection [#40232](https://github.com/ClickHouse/ClickHouse/pull/40232) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Better error message when restoring covered parts [#40234](https://github.com/ClickHouse/ClickHouse/pull/40234) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Try to print stacktraces if query timeouts in integration tests [#40248](https://github.com/ClickHouse/ClickHouse/pull/40248) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Add Unit tests to Mergeable [#40250](https://github.com/ClickHouse/ClickHouse/pull/40250) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Extract common KV storage logic [#40261](https://github.com/ClickHouse/ClickHouse/pull/40261) ([Antonio Andelic](https://github.com/antonio2368)).
* Add update_mergeable_check trigger for Unit tests [#40269](https://github.com/ClickHouse/ClickHouse/pull/40269) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* CVE-2021-3520: (negligible) rdkafka library: update lz4.c from upstream [#40272](https://github.com/ClickHouse/ClickHouse/pull/40272) ([Suzy Wang](https://github.com/SuzyWangIBMer)).
* Fix build [#40297](https://github.com/ClickHouse/ClickHouse/pull/40297) ([Alexander Tokmakov](https://github.com/tavplubix)).
#### Support CTE statement for ANTLR4 syntax file
* ... [#39814](https://github.com/ClickHouse/ClickHouse/pull/39814) ([qianmoQ](https://github.com/qianmoQ)).

View File

@ -192,7 +192,7 @@ ClickHouse fuzzing is implemented both using [libFuzzer](https://llvm.org/docs/L
All the fuzz testing should be performed with sanitizers (Address and Undefined).
LibFuzzer is used for isolated fuzz testing of library code. Fuzzers are implemented as part of test code and have “_fuzzer” name postfixes.
Fuzzer example can be found at `src/Parsers/tests/lexer_fuzzer.cpp`. LibFuzzer-specific configs, dictionaries and corpus are stored at `tests/fuzz`.
Fuzzer example can be found at `src/Parsers/fuzzers/lexer_fuzzer.cpp`. LibFuzzer-specific configs, dictionaries and corpus are stored at `tests/fuzz`.
We encourage you to write fuzz tests for every functionality that handles user input.
Fuzzers are not built by default. To build fuzzers, both the `-DENABLE_FUZZING=1` and `-DENABLE_TESTS=1` options should be set.

View File

@ -51,10 +51,14 @@ SELECT * FROM hdfs_engine_table LIMIT 2
## Implementation Details {#implementation-details}
- Reads and writes can be parallel.
- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is supported.
- Not supported:
- `ALTER` and `SELECT...SAMPLE` operations.
- Indexes.
- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is possible, but not recommended.
:::warning Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
**Globs in path**

View File

@ -50,10 +50,14 @@ For more information about virtual columns see [here](../../../engines/table-eng
## Implementation Details {#implementation-details}
- Reads and writes can be parallel
- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is supported.
- Not supported:
- `ALTER` and `SELECT...SAMPLE` operations.
- Indexes.
- [Zero-copy](../../../operations/storing-data.md#zero-copy) replication is possible, but not recommended.
:::warning Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
## Wildcards In Path {#wildcards-in-path}

View File

@ -1023,6 +1023,10 @@ Other parameters:
Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).
:::warning Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
## Virtual Columns {#virtual-columns}
- `_part` — Name of a part.

View File

@ -39,10 +39,53 @@ Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY K
`ver` — column with the version number. Type `UInt*`, `Date`, `DateTime` or `DateTime64`. Optional parameter.
When merging, `ReplacingMergeTree` leaves only one of all the rows with the same sorting key:
- The last in the selection, if `ver` is not set. A selection is a set of rows in a set of parts participating in the merge. The most recently created part (the last insert) will be the last one in the selection. Thus, after deduplication, the very last row from the most recent insert will remain for each unique sorting key.
- The one with the maximum version, if `ver` is specified. If `ver` is the same for several rows, then the "if `ver` is not specified" rule applies to them, i.e. the most recently inserted row will remain.
Example:
```sql
-- without ver - the last inserted 'wins'
CREATE TABLE myFirstReplacingMT
(
`key` Int64,
`someCol` String,
`eventTime` DateTime
)
ENGINE = ReplacingMergeTree
ORDER BY key;
INSERT INTO myFirstReplacingMT Values (1, 'first', '2020-01-01 01:01:01');
INSERT INTO myFirstReplacingMT Values (1, 'second', '2020-01-01 00:00:00');
SELECT * FROM myFirstReplacingMT FINAL;
┌─key─┬─someCol─┬───────────eventTime─┐
│ 1 │ second │ 2020-01-01 00:00:00 │
└─────┴─────────┴─────────────────────┘
-- with ver - the row with the biggest ver 'wins'
CREATE TABLE mySecondReplacingMT
(
`key` Int64,
`someCol` String,
`eventTime` DateTime
)
ENGINE = ReplacingMergeTree(eventTime)
ORDER BY key;
INSERT INTO mySecondReplacingMT Values (1, 'first', '2020-01-01 01:01:01');
INSERT INTO mySecondReplacingMT Values (1, 'second', '2020-01-01 00:00:00');
SELECT * FROM mySecondReplacingMT FINAL;
┌─key─┬─someCol─┬───────────eventTime─┐
│ 1 │ first │ 2020-01-01 01:01:01 │
└─────┴─────────┴─────────────────────┘
```
## Query clauses

View File

@ -22,7 +22,9 @@ Connected to ClickHouse server version 20.13.1 revision 54442.
Different client and server versions are compatible with one another, but some features may not be available in older clients. We recommend using the same version of the client as the server. When you try to use a client that is older than the server, `clickhouse-client` displays the message:
ClickHouse client version is older than ClickHouse server. It may lack support for new features.
```response
ClickHouse client version is older than ClickHouse server. It may lack support for new features.
```
## Usage {#cli_usage}
@ -80,6 +82,13 @@ You can create a query with parameters and pass values to them from client appli
$ clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}"
```
It is also possible to set parameters from within an interactive session:
``` bash
$ clickhouse-client -nq "
SET param_parName='[1, 2]';
SELECT {parName:Array(UInt16)}"
```
#### Query Syntax {#cli-queries-with-parameters-syntax}
Format a query as usual, then place the values that you want to pass from the app parameters to the query in braces in the following format:
@ -116,6 +125,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
- `--port` The port to connect to. Default value: 9000. Note that the HTTP interface and the native interface use different ports.
- `--user, -u` The username. Default value: default.
- `--password` The password. Default value: empty string.
- `--ask-password` - Prompt the user to enter a password.
- `--query, -q` The query to process when using non-interactive mode. You must specify either `query` or `queries-file` option.
- `--queries-file` file path with queries to execute. You must specify either `query` or `queries-file` option.
- `--database, -d` Select the current default database. Default value: the current database from the server settings (default by default).
@ -182,6 +192,6 @@ This feature can be used to generate URLs to facilitate profiling of queries.
If the configuration above is applied, the ID of a query is shown in the following format:
``` text
```response
speedscope:http://speedscope-host/#profileURL=qp%3Fid%3Dc8ecc783-e753-4b38-97f1-42cddfb98b7d
```

View File

@ -19,6 +19,7 @@ Additional cache types:
- Compiled expressions cache.
- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
- [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
- Schema inference cache (see the sketch after this list).
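For reference, the new schema inference cache can be cleared manually. A minimal sketch, assuming the `SYSTEM DROP SCHEMA CACHE` statement that accompanies the privilege added in this release:

```sql
-- requires the SYSTEM DROP SCHEMA CACHE privilege
SYSTEM DROP SCHEMA CACHE;
```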
Indirectly used:

View File

@ -743,13 +743,24 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa
- [max_server_memory_usage](#max_server_memory_usage)
## concurrent_threads_soft_limit {#concurrent_threads_soft_limit}
The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get one thread to run.
## concurrent_threads_soft_limit_num {#concurrent_threads_soft_limit_num}
The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. If the limit is reached, the query will still get at least one thread to run. The query can scale up to the desired number of threads during execution if more threads become available.
Possible values:
- Positive integer.
- 0 — No limit.
Default value: `0`.
## concurrent_threads_soft_limit_ratio_to_cores {#concurrent_threads_soft_limit_ratio_to_cores}
The maximum number of query processing threads, as a multiple of the number of logical cores.
More details: [concurrent_threads_soft_limit_num](#concurrent_threads_soft_limit_num).
Possible values:
- Positive integer.
- 0 — No limit.
- -1 — The parameter is initialized to the number of logical cores multiplied by 3, which is a good heuristic for CPU-bound tasks.
Default value: `0`.

View File

@ -218,6 +218,10 @@ Default value: 0 (seconds)
When this setting has a value greater than zero, only a single replica starts the merge immediately if the merged part is on shared storage and `allow_remote_fs_zero_copy_replication` is enabled.
:::warning Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::
Possible values:
- Any positive integer.

View File

@ -747,7 +747,14 @@ Default value: 268435456.
Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md).
Sets the time in seconds. If a replica lags more than the set value, this replica is not used.
Sets the time in seconds. If a replica's lag is greater than or equal to the set value, this replica is not used.
Possible values:
- Positive integer.
- 0 — Replica lags are not checked.
To prevent the use of any replica with a non-zero lag, set this parameter to 1.
Default value: 300.
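A minimal sketch of the strictest configuration described above (the setting name `max_replica_delay_for_distributed_queries` is assumed from the surrounding section):

```sql
-- with the >= semantics above, a replica lagging exactly 1 second is already excluded
SET max_replica_delay_for_distributed_queries = 1;
```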
@ -3095,14 +3102,14 @@ Exception: Total regexp lengths too large.
## enable_positional_arguments {#enable-positional-arguments}
Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements. When you want to use column numbers instead of column names in these clauses, set `enable_positional_arguments = 1`.
Enables or disables supporting positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md) statements.
Possible values:
- 0 — Positional arguments aren't supported.
- 1 — Positional arguments are supported: column numbers can be used instead of column names.
Default value: `0`.
Default value: `1`.
**Example**
@ -3113,8 +3120,6 @@ CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory();
INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20);
SET enable_positional_arguments = 1;
SELECT * FROM positional_arguments ORDER BY 2,3;
```
@ -3302,7 +3307,7 @@ Possible values:
Default value: `0`.
## shutdown_wait_unfinished_queries
## shutdown_wait_unfinished_queries {#shutdown_wait_unfinished_queries}
Enables or disables waiting for unfinished queries when the server shuts down.
@ -3313,13 +3318,13 @@ Possible values:
Default value: 0.
## shutdown_wait_unfinished
## shutdown_wait_unfinished {#shutdown_wait_unfinished}
The waiting time in seconds for currently handled connections when the server shuts down.
Default value: 5.
## memory_overcommit_ratio_denominator
## memory_overcommit_ratio_denominator {#memory_overcommit_ratio_denominator}
It represents the soft memory limit when the hard limit is reached on the user level.
This value is used to compute the overcommit ratio for the query.
@ -3328,7 +3333,7 @@ Read more about [memory overcommit](memory-overcommit.md).
Default value: `1GiB`.
## memory_usage_overcommit_max_wait_microseconds
## memory_usage_overcommit_max_wait_microseconds {#memory_usage_overcommit_max_wait_microseconds}
The maximum time a thread will wait for memory to be freed in the case of memory overcommit on the user level.
If the timeout is reached and memory is not freed, an exception is thrown.
@ -3336,7 +3341,7 @@ Read more about [memory overcommit](memory-overcommit.md).
Default value: `5000000`.
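A hedged sketch combining the two settings above for the current session (the values are arbitrary):

```sql
SET memory_overcommit_ratio_denominator = 2147483648;        -- 2 GiB soft-limit basis
SET memory_usage_overcommit_max_wait_microseconds = 200000;  -- wait up to 0.2 s before throwing
```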
## memory_overcommit_ratio_denominator_for_user
## memory_overcommit_ratio_denominator_for_user {#memory_overcommit_ratio_denominator_for_user}
It represents the soft memory limit when the hard limit is reached on the global level.
This value is used to compute the overcommit ratio for the query.
@ -3345,6 +3350,36 @@ Read more about [memory overcommit](memory-overcommit.md).
Default value: `1GiB`.
## schema_inference_use_cache_for_file {#schema_inference_use_cache_for_file}
Enable the schema cache for schema inference in the `file` table function.
Default value: `true`.
## schema_inference_use_cache_for_s3 {#schema_inference_use_cache_for_s3}
Enable the schema cache for schema inference in the `s3` table function.
Default value: `true`.
## schema_inference_use_cache_for_url {#schema_inference_use_cache_for_url}
Enable the schema cache for schema inference in the `url` table function.
Default value: `true`.
## schema_inference_use_cache_for_hdfs {#schema_inference_use_cache_for_hdfs}
Enable the schema cache for schema inference in the `hdfs` table function.
Default value: `true`.
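For example, a single inference can bypass the cache by disabling the corresponding setting (a sketch; the file name is hypothetical):

```sql
DESC file('events.jsonl', JSONEachRow)
SETTINGS schema_inference_use_cache_for_file = 0;
```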
## schema_inference_cache_require_modification_time_for_url {#schema_inference_cache_require_modification_time_for_url}
Use the schema from cache for URLs with last modification time validation (for URLs with a Last-Modified header). If this setting is enabled and the URL doesn't have a Last-Modified header, the schema from cache won't be used.
Default value: `true`.
## compatibility {#compatibility}
This setting changes other settings according to the provided ClickHouse version.
@ -3470,6 +3505,24 @@ Default value: `25'000`.
The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'
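A short sketch in the same style as the example below (the names `id, message` are arbitrary):

```sql
desc format(CSV, '42,hello') settings column_names_for_schema_inference = 'id,message';
```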
## schema_inference_hints {#schema_inference_hints}
The list of column names and types to use as hints in schema inference for formats without schema.
Example:
Query:
```sql
desc format(JSONEachRow, '{"x" : 1, "y" : "String", "z" : "0.0.0.0" }') settings schema_inference_hints='x UInt8, z IPv4';
```
Result:
```text
x UInt8
y Nullable(String)
z IPv4
```
## date_time_input_format {#date_time_input_format}
Allows choosing a parser of the text representation of date and time.

View File

@ -316,4 +316,8 @@ Use [http_max_single_read_retries](../operations/settings/settings.md#http-max-s
## Zero-copy Replication (not ready for production) {#zero-copy}
ClickHouse supports zero-copy replication for `S3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
Zero-copy replication is possible, but not recommended, with `S3` and `HDFS` disks. Zero-copy replication means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
:::warning Zero-copy replication is not ready for production
Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
:::

View File

@ -2,7 +2,7 @@
sidebar_position: 104
---
## anyLast
# anyLast
Selects the last value encountered.
The result is just as indeterminate as for the [any](../../../sql-reference/aggregate-functions/reference/any.md) function.

View File

@ -162,6 +162,10 @@ Creates an array from the function arguments.
The arguments must be constants and have types that share a smallest common type. At least one argument must be passed, because otherwise it isn't clear which type of array to create. That is, you can't use this function to create an empty array (to do that, use the emptyArray\* function described above).
Returns an Array(T) type result, where T is the smallest common type out of the passed arguments.
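A quick illustration (a sketch added here, not part of the original page); the smallest common type of the arguments is visible via `toTypeName`:

```sql
SELECT array(1, 2, 3) AS arr, toTypeName(arr) AS type;
-- [1,2,3]    Array(UInt8)
```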
## arrayWithConstant(length, elem)
Creates an array of length `length` filled with the constant `elem`.
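A one-line sketch of the behavior:

```sql
SELECT arrayWithConstant(3, 1) AS res;
-- [1,1,1]
```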
## arrayConcat
Combines arrays passed as arguments.

View File

@ -9,15 +9,11 @@ This is a very unusual function.
Normal functions do not change a set of rows, but just change the values in each row (map).
Aggregate functions compress a set of rows (fold or reduce).
The arrayJoin function takes each row and generates a set of rows (unfold).
The `arrayJoin` function takes each row and generates a set of rows (unfold).
This function takes an array as an argument and expands the source row into multiple rows, one for each element of the array.
All the values in the columns are simply copied, except the values in the column where this function is applied; it is replaced with the corresponding array value.
A query can use multiple `arrayJoin` functions. In this case, the transformation is performed multiple times.
Note the ARRAY JOIN syntax in the SELECT query, which provides broader possibilities.
Example:
``` sql
@ -32,3 +28,112 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src
└─────┴───────────┴─────────┘
```
The `arrayJoin` function affects all sections of the query, including the `WHERE` section. Notice the result 2, even though the subquery returned 1 row.
Example:
```sql
SELECT sum(1) AS impressions
FROM
(
SELECT ['Istanbul', 'Berlin', 'Bobruisk'] AS cities
)
WHERE arrayJoin(cities) IN ['Istanbul', 'Berlin'];
```
``` text
┌─impressions─┐
│ 2 │
└─────────────┘
```
A query can use multiple `arrayJoin` functions. In this case, the transformation is performed multiple times and the rows are multiplied.
Example:
```sql
SELECT
sum(1) AS impressions,
arrayJoin(cities) AS city,
arrayJoin(browsers) AS browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 2 │ Istanbul │ Chrome │
│ 1 │ Istanbul │ Firefox │
│ 2 │ Berlin │ Chrome │
│ 1 │ Berlin │ Firefox │
│ 2 │ Bobruisk │ Chrome │
│ 1 │ Bobruisk │ Firefox │
└─────────────┴──────────┴─────────┘
```
Note the [ARRAY JOIN](../statements/select/array-join.md) syntax in the SELECT query, which provides broader possibilities.
`ARRAY JOIN` allows you to convert multiple arrays with the same number of elements at a time.
Example:
```sql
SELECT
sum(1) AS impressions,
city,
browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
ARRAY JOIN
cities AS city,
browsers AS browser
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 1 │ Istanbul │ Firefox │
│ 1 │ Berlin │ Chrome │
│ 1 │ Bobruisk │ Chrome │
└─────────────┴──────────┴─────────┘
```
Or you can use a [Tuple](../data-types/tuple.md).
Example:
```sql
SELECT
sum(1) AS impressions,
(arrayJoin(arrayZip(cities, browsers)) AS t).1 AS city,
t.2 AS browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 1 │ Istanbul │ Firefox │
│ 1 │ Berlin │ Chrome │
│ 1 │ Bobruisk │ Chrome │
└─────────────┴──────────┴─────────┘
```

View File

@ -1068,7 +1068,10 @@ Query:
```sql
WITH toDateTime('2021-04-14 11:22:33') AS date_value
SELECT dateName('year', date_value), dateName('month', date_value), dateName('day', date_value);
SELECT
dateName('year', date_value),
dateName('month', date_value),
dateName('day', date_value);
```
Result:
@ -1076,7 +1079,44 @@ Result:
```text
┌─dateName('year', date_value)─┬─dateName('month', date_value)─┬─dateName('day', date_value)─┐
│ 2021 │ April │ 14 │
└──────────────────────────────┴───────────────────────────────┴─────────────────────────────
└──────────────────────────────┴───────────────────────────────┴─────────────────────────────┘
```
## monthName
Returns the name of the month.
**Syntax**
``` sql
monthName(date)
```
**Arguments**
- `date` — Date or date with time. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
- The name of the month.
Type: [String](../../sql-reference/data-types/string.md#string)
**Example**
Query:
```sql
WITH toDateTime('2021-04-14 11:22:33') AS date_value
SELECT monthName(date_value);
```
Result:
```text
┌─monthName(date_value)─┐
│ April │
└───────────────────────┘
```
## FROM\_UNIXTIME

View File

@ -1822,10 +1822,13 @@ Result:
Evaluate external model.
Accepts a model name and model arguments. Returns Float64.
## throwIf(x\[, custom_message\])
## throwIf(x\[, message\[, error_code\]\])
Throws an exception if the argument is non-zero.
custom_message - is an optional parameter: a constant string, provides an error message
`message` - an optional parameter: a constant string providing a custom error message
`error_code` - an optional parameter: a constant integer providing a custom error code
To use the `error_code` argument, the configuration parameter `allow_custom_error_code_in_throwif` must be enabled.
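A sketch of the three-argument form, assuming `allow_custom_error_code_in_throwif` is enabled in the server configuration (the error code value is arbitrary):

```sql
SELECT throwIf(number = 3, 'Too many', 12345) FROM numbers(10);
```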
``` sql
SELECT throwIf(number = 3, 'Too many') FROM numbers(10);

View File

@ -1,6 +1,9 @@
---
sidebar_position: 39
sidebar_label: USER
tags:
- create user
- add user
---
# CREATE USER

View File

@ -28,19 +28,65 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
For a description of query parameters, see the [statement description](../../../engines/table-engines/mergetree-family/replacingmergetree.md).
:::note "Attention"
Row uniqueness is determined by the `ORDER BY` table section, not `PRIMARY KEY`.
:::
**ReplacingMergeTree Parameters**
:::warning "Attention"
Row uniqueness is determined by the `ORDER BY` table section, not `PRIMARY KEY`.
:::
- `ver` — column with the version number. Type `UInt*`, `Date`, `DateTime` or `DateTime64`. Optional parameter.
## ReplacingMergeTree Parameters
When merging, `ReplacingMergeTree` leaves only one row for each unique sorting key:
### ver
`ver` — column with the version number. Type `UInt*`, `Date`, `DateTime` or `DateTime64`. Optional parameter.
When merging, `ReplacingMergeTree` leaves only one row for each unique sorting key:
- The last in the selection, if `ver` is not set. A selection here means the set of rows in the set of data parts participating in the merge. The most recently created part (the last insert) will be the last one in the selection. Thus, after deduplication, the very last row from the most recent insert will remain for each unique sorting key.
- The one with the maximum version, if `ver` is set. If `ver` is the same for several rows, the "if `ver` is not set" rule is used for them, i.e. the merge will keep the most recent row from the most recent insert.
**Query Clauses**
Example:
```sql
-- without ver - the last inserted 'wins'
CREATE TABLE myFirstReplacingMT
(
`key` Int64,
`someCol` String,
`eventTime` DateTime
)
ENGINE = ReplacingMergeTree
ORDER BY key;
INSERT INTO myFirstReplacingMT Values (1, 'first', '2020-01-01 01:01:01');
INSERT INTO myFirstReplacingMT Values (1, 'second', '2020-01-01 00:00:00');
SELECT * FROM myFirstReplacingMT FINAL;
┌─key─┬─someCol─┬───────────eventTime─┐
│ 1 │ second │ 2020-01-01 00:00:00 │
└─────┴─────────┴─────────────────────┘
-- with ver - the row with the biggest ver 'wins'
CREATE TABLE mySecondReplacingMT
(
`key` Int64,
`someCol` String,
`eventTime` DateTime
)
ENGINE = ReplacingMergeTree(eventTime)
ORDER BY key;
INSERT INTO mySecondReplacingMT Values (1, 'first', '2020-01-01 01:01:01');
INSERT INTO mySecondReplacingMT Values (1, 'second', '2020-01-01 00:00:00');
SELECT * FROM mySecondReplacingMT FINAL;
┌─key─┬─someCol─┬───────────eventTime─┐
│ 1 │ first │ 2020-01-01 01:01:01 │
└─────┴─────────┴─────────────────────┘
```
## Query Clauses
When creating a `ReplacingMergeTree` table, the same [clauses](mergetree.md) are used as when creating a `MergeTree` table.
@ -48,9 +94,10 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
<summary>Deprecated method for creating a table</summary>
:::note "Attention"
Do not use this method in new projects and, if possible, switch old projects to the method described above.
:::
:::warning "Attention"
Do not use this method in new projects and, if possible, switch old projects to the method described above.
:::
``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
(

View File

@ -84,6 +84,13 @@ $ cat file.csv | clickhouse-client --database=test --query="INSERT INTO test FOR
clickhouse-client --param_parName="[1, 2]" -q "SELECT * FROM table WHERE a = {parName:Array(UInt16)}"
```
It is also possible to set parameter values from within an interactive session:
``` bash
$ clickhouse-client -nq "
SET param_parName='[1, 2]';
SELECT {parName:Array(UInt16)}"
```
#### Query Syntax {#cli-queries-with-parameters-syntax}
Format the query as usual. Represent the values that you want to pass from the app parameters to the query in the following format:

View File

@ -3790,14 +3790,14 @@ Exception: Total regexp lengths too large.
## enable_positional_arguments {#enable-positional-arguments}
Enables or disables support for positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md). If you want to use column numbers instead of column names in these clauses, set `enable_positional_arguments = 1`.
Enables or disables support for positional arguments for [GROUP BY](../../sql-reference/statements/select/group-by.md), [LIMIT BY](../../sql-reference/statements/select/limit-by.md), [ORDER BY](../../sql-reference/statements/select/order-by.md).
Possible values:
- 0 — Positional arguments are not supported.
- 1 — Positional arguments are supported: column numbers can be used instead of column names.
Default value: `0`.
Default value: `1`.
**Example**
@ -3808,8 +3808,6 @@ CREATE TABLE positional_arguments(one Int, two Int, three Int) ENGINE=Memory();
INSERT INTO positional_arguments VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20);
SET enable_positional_arguments = 1;
SELECT * FROM positional_arguments ORDER BY 2,3;
```

View File

@ -9,15 +9,11 @@ sidebar_label: "Функция ArrayJoin"
Normal functions do not change the set of rows, they just change the values in each row (map).
Aggregate functions fold a set of rows (fold, reduce).
The arrayJoin function expands each row into a set of rows (unfold).
The `arrayJoin` function expands each row into a set of rows (unfold).
The function takes an array as an argument and expands the source row into several rows, one for each element of the array.
All the values in the columns are simply copied, except the value in the column where this function is applied; it is replaced with the corresponding array value.
A query can use several `arrayJoin` functions. In this case, the corresponding transformation is performed several times.
Note the ARRAY JOIN syntax in the SELECT query, which provides broader possibilities.
Example:
``` sql
@ -32,3 +28,112 @@ SELECT arrayJoin([1, 2, 3] AS src) AS dst, 'Hello', src
└─────┴───────────┴─────────┘
```
The `arrayJoin` function affects all sections of the query, including the `WHERE` section. Note the result 2, even though the subquery returned 1 row.
Example:
```sql
SELECT sum(1) AS impressions
FROM
(
SELECT ['Istanbul', 'Berlin', 'Bobruisk'] AS cities
)
WHERE arrayJoin(cities) IN ['Istanbul', 'Berlin'];
```
``` text
┌─impressions─┐
│ 2 │
└─────────────┘
```
A query can use several `arrayJoin` functions. In this case, the corresponding transformation is performed several times and the rows are multiplied.
Example:
```sql
SELECT
sum(1) AS impressions,
arrayJoin(cities) AS city,
arrayJoin(browsers) AS browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 2 │ Istanbul │ Chrome │
│ 1 │ Istanbul │ Firefox │
│ 2 │ Berlin │ Chrome │
│ 1 │ Berlin │ Firefox │
│ 2 │ Bobruisk │ Chrome │
│ 1 │ Bobruisk │ Firefox │
└─────────────┴──────────┴─────────┘
```
Note the [ARRAY JOIN](../statements/select/array-join.md) syntax in the SELECT query, which provides broader possibilities.
`ARRAY JOIN` allows you to convert several arrays with the same number of elements at a time.
Example:
```sql
SELECT
sum(1) AS impressions,
city,
browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
ARRAY JOIN
cities AS city,
browsers AS browser
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 1 │ Istanbul │ Firefox │
│ 1 │ Berlin │ Chrome │
│ 1 │ Bobruisk │ Chrome │
└─────────────┴──────────┴─────────┘
```
Or you can use a [Tuple](../data-types/tuple.md).
Example:
```sql
SELECT
sum(1) AS impressions,
(arrayJoin(arrayZip(cities, browsers)) AS t).1 AS city,
t.2 AS browser
FROM
(
SELECT
['Istanbul', 'Berlin', 'Bobruisk'] AS cities,
['Firefox', 'Chrome', 'Chrome'] AS browsers
)
GROUP BY
2,
3
```
``` text
┌─impressions─┬─city─────┬─browser─┐
│ 1 │ Istanbul │ Firefox │
│ 1 │ Berlin │ Chrome │
│ 1 │ Bobruisk │ Chrome │
└─────────────┴──────────┴─────────┘
```

View File

@ -1727,10 +1727,13 @@ SELECT joinGet(db_test.id_val,'val',toUInt32(number)) from numbers(4) SETTINGS j
Takes a model name and model arguments as input. Returns Float64.
## throwIf(x\[, custom_message\]) {#throwifx-custom-message}
## throwIf(x\[, message\[, error_code\]\]) {#throwifx-custom-message}
Throws an exception if the argument is not equal to zero.
custom_message - optional parameter, a constant string, sets the error message text.
`custom_message` - optional parameter: a constant string that sets the error message text.
`error_code` - optional parameter: a constant number that sets the error code.
To use the `error_code` argument, the configuration parameter `allow_custom_error_code_in_throwif` must be enabled.
``` sql
SELECT throwIf(number = 3, 'Too many') FROM numbers(10);

View File

@ -1,6 +1,7 @@
#include <iostream>
#include <filesystem>
#include <boost/program_options.hpp>
#include <Common/filesystemHelpers.h>
#include <sys/stat.h>
#include <pwd.h>
@ -378,10 +379,10 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
if (fs::exists(symlink_path))
{
bool is_symlink = fs::is_symlink(symlink_path);
bool is_symlink = FS::isSymlink(symlink_path);
fs::path points_to;
if (is_symlink)
points_to = fs::weakly_canonical(fs::read_symlink(symlink_path));
points_to = fs::weakly_canonical(FS::readSymlink(symlink_path));
if (is_symlink && points_to == main_bin_path)
{

View File

@ -1156,22 +1156,20 @@ int Server::main(const std::vector<std::string> & /*args*/)
if (config->has("max_partition_size_to_drop"))
global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop"));
if (config->has("concurrent_threads_soft_limit"))
ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited;
if (config->has("concurrent_threads_soft_limit_num"))
{
auto concurrent_threads_soft_limit = config->getInt("concurrent_threads_soft_limit", 0);
if (concurrent_threads_soft_limit == -1)
{
// Based on tests concurrent_threads_soft_limit has an optimal value when it's about 3 times of logical CPU cores
constexpr size_t thread_factor = 3;
concurrent_threads_soft_limit = std::thread::hardware_concurrency() * thread_factor;
}
if (concurrent_threads_soft_limit)
ConcurrencyControl::instance().setMaxConcurrency(concurrent_threads_soft_limit);
else
ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
auto value = config->getUInt64("concurrent_threads_soft_limit_num", 0);
if (value > 0 && value < concurrent_threads_soft_limit)
concurrent_threads_soft_limit = value;
}
else
ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
if (config->has("concurrent_threads_soft_limit_ratio_to_cores"))
{
auto value = config->getUInt64("concurrent_threads_soft_limit_ratio_to_cores", 0) * std::thread::hardware_concurrency();
if (value > 0 && value < concurrent_threads_soft_limit)
concurrent_threads_soft_limit = value;
}
ConcurrencyControl::instance().setMaxConcurrency(concurrent_threads_soft_limit);
if (config->has("max_concurrent_queries"))
global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0));
@ -1541,7 +1539,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
/// We load temporary database first, because projections need it.
database_catalog.initializeAndLoadTemporaryDatabase();
loadMetadataSystem(global_context);
maybeConvertOrdinaryDatabaseToAtomic(global_context, DatabaseCatalog::instance().getSystemDatabase());
maybeConvertSystemDatabase(global_context);
/// After attaching system databases we can initialize system log.
global_context->initializeSystemLogs();
global_context->setSystemZooKeeperLogAfterInitializationIfNeeded();
@ -1555,6 +1553,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
database_catalog.loadMarkedAsDroppedTables();
/// Then, load remaining databases
loadMetadata(global_context, default_database);
convertDatabasesEnginesIfNeed(global_context);
startupSystemTables();
database_catalog.loadDatabases();
/// After loading validate that default database exists

View File

@ -281,12 +281,12 @@
<http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
-->
<!-- Maximum number of query processing threads to run all queries.
Note that This is not a hard limit. In case if the limit is reached the query will still get one thread to run.
For value equals to -1 this parameter is initialized by number of logical cores multiplies by 3.
Which is a good heuristic for CPU-bound tasks.
<!-- The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries.
This is not a hard limit. If the limit is reached, the query will still get at least one thread to run.
The query can scale up to the desired number of threads during execution if more threads become available.
-->
<concurrent_threads_soft_limit>0</concurrent_threads_soft_limit>
<concurrent_threads_soft_limit_num>0</concurrent_threads_soft_limit_num>
<concurrent_threads_soft_limit_ratio_to_cores>0</concurrent_threads_soft_limit_ratio_to_cores>
<!-- Maximum number of concurrent queries. -->
<max_concurrent_queries>100</max_concurrent_queries>

View File

@ -140,6 +140,7 @@ enum class AccessType
M(SYSTEM_DROP_MMAP_CACHE, "SYSTEM DROP MMAP, DROP MMAP CACHE, DROP MMAP", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \
M(SYSTEM_DROP_CACHE, "DROP CACHE", GROUP, SYSTEM) \
M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \
M(SYSTEM_RELOAD_SYMBOLS, "RELOAD SYMBOLS", GLOBAL, SYSTEM_RELOAD) \

View File

@ -130,6 +130,9 @@ if (TARGET ch_contrib::hdfs)
add_headers_and_sources(dbms Disks/ObjectStorages/HDFS)
endif()
add_headers_and_sources(dbms Disks/ObjectStorages/Cached)
add_headers_and_sources(dbms Disks/ObjectStorages/Web)
add_headers_and_sources(dbms Storages/Cache)
if (TARGET ch_contrib::hivemetastore)
add_headers_and_sources(dbms Storages/Hive)

View File

@ -46,8 +46,10 @@ public:
int compare(const char * str1, size_t length1, const char * str2, size_t length2) const;
const std::string & getLocale() const;
private:
bool operator==(const Collator & other) const { return this->getLocale() == other.getLocale(); }
private:
std::string locale;
UCollator * collator;
};

View File

@ -12,12 +12,14 @@
#include <Common/RadixSort.h>
#include <Common/SipHash.h>
#include <Common/WeakHash.h>
#include <Common/TargetSpecific.h>
#include <Common/assert_cast.h>
#include <base/sort.h>
#include <base/unaligned.h>
#include <base/bit_cast.h>
#include <base/scope_guard.h>
#include <bit>
#include <cmath>
#include <cstring>
@ -25,6 +27,10 @@
# include <emmintrin.h>
#endif
#if USE_MULTITARGET_CODE
# include <immintrin.h>
#endif
#if USE_EMBEDDED_COMPILER
#include <DataTypes/Native.h>
#include <llvm/IR/IRBuilder.h>
@ -471,6 +477,128 @@ void ColumnVector<T>::insertRangeFrom(const IColumn & src, size_t start, size_t
memcpy(data.data() + old_size, &src_vec.data[start], length * sizeof(data[0]));
}
static inline UInt64 blsr(UInt64 mask)
{
#ifdef __BMI__
return _blsr_u64(mask);
#else
return mask & (mask-1);
#endif
}
DECLARE_DEFAULT_CODE(
template <typename T, typename Container, size_t SIMD_BYTES>
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
{
while (filt_pos < filt_end_aligned)
{
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = std::countr_zero(mask);
res_data.push_back(data_pos[index]);
mask = blsr(mask);
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
}
)
namespace
{
template <typename T, typename Container>
void resize(Container & res_data, size_t reserve_size)
{
#if defined(MEMORY_SANITIZER)
res_data.resize_fill(reserve_size, static_cast<T>(0)); // MSan doesn't recognize that all allocated memory is written by AVX-512 intrinsics.
#else
res_data.resize(reserve_size);
#endif
}
}
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
template <size_t ELEMENT_WIDTH>
inline void compressStoreAVX512(const void *src, void *dst, const UInt64 mask)
{
__m512i vsrc = _mm512_loadu_si512(src);
if constexpr (ELEMENT_WIDTH == 1)
_mm512_mask_compressstoreu_epi8(dst, static_cast<__mmask64>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 2)
_mm512_mask_compressstoreu_epi16(dst, static_cast<__mmask32>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 4)
_mm512_mask_compressstoreu_epi32(dst, static_cast<__mmask16>(mask), vsrc);
else if constexpr (ELEMENT_WIDTH == 8)
_mm512_mask_compressstoreu_epi64(dst, static_cast<__mmask8>(mask), vsrc);
}
template <typename T, typename Container, size_t SIMD_BYTES>
inline void doFilterAligned(const UInt8 *& filt_pos, const UInt8 *& filt_end_aligned, const T *& data_pos, Container & res_data)
{
static constexpr size_t VEC_LEN = 64; /// AVX512 vector length - 64 bytes
static constexpr size_t ELEMENT_WIDTH = sizeof(T);
static constexpr size_t ELEMENTS_PER_VEC = VEC_LEN / ELEMENT_WIDTH;
static constexpr UInt64 KMASK = 0xffffffffffffffff >> (64 - ELEMENTS_PER_VEC);
size_t current_offset = res_data.size();
size_t reserve_size = res_data.size();
size_t alloc_size = SIMD_BYTES * 2;
while (filt_pos < filt_end_aligned)
{
/// to avoid calling resize too frequently, resize to reserve buffer.
if (reserve_size - current_offset < SIMD_BYTES)
{
reserve_size += alloc_size;
resize<T>(res_data, reserve_size);
alloc_size *= 2;
}
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
_mm512_storeu_si512(reinterpret_cast<void *>(&res_data[current_offset + i]),
_mm512_loadu_si512(reinterpret_cast<const void *>(data_pos + i)));
current_offset += SIMD_BYTES;
}
else
{
if (mask)
{
for (size_t i = 0; i < SIMD_BYTES; i += ELEMENTS_PER_VEC)
{
compressStoreAVX512<ELEMENT_WIDTH>(reinterpret_cast<const void *>(data_pos + i),
reinterpret_cast<void *>(&res_data[current_offset]), mask & KMASK);
current_offset += std::popcount(mask & KMASK);
/// prepare mask for next iter, if ELEMENTS_PER_VEC = 64, no next iter
if (ELEMENTS_PER_VEC < 64)
{
mask >>= ELEMENTS_PER_VEC;
}
}
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
/// resize to the real size.
res_data.resize(current_offset);
}
)
template <typename T>
ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const
{
@ -496,31 +624,13 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
static constexpr size_t SIMD_BYTES = 64;
const UInt8 * filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
while (filt_pos < filt_end_aligned)
{
UInt64 mask = bytes64MaskToBits64Mask(filt_pos);
if (0xffffffffffffffff == mask)
{
res_data.insert(data_pos, data_pos + SIMD_BYTES);
}
else
{
while (mask)
{
size_t index = std::countr_zero(mask);
res_data.push_back(data_pos[index]);
#ifdef __BMI__
mask = _blsr_u64(mask);
#else
mask = mask & (mask-1);
#endif
}
}
filt_pos += SIMD_BYTES;
data_pos += SIMD_BYTES;
}
#if USE_MULTITARGET_CODE
static constexpr bool VBMI2_CAPABLE = sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8;
if (VBMI2_CAPABLE && isArchSupported(TargetArch::AVX512VBMI2))
TargetSpecific::AVX512VBMI2::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
else
#endif
TargetSpecific::Default::doFilterAligned<T, Container, SIMD_BYTES>(filt_pos, filt_end_aligned, data_pos, res_data);
while (filt_pos < filt_end)
{

View File

@ -0,0 +1,91 @@
#include <typeinfo>
#include <vector>
#include <Columns/ColumnsNumber.h>
#include <Common/randomSeed.h>
#include <gtest/gtest.h>
using namespace DB;
static pcg64 rng(randomSeed());
static constexpr int error_code = 12345;
static constexpr size_t TEST_RUNS = 500;
static constexpr size_t MAX_ROWS = 10000;
static const std::vector<size_t> filter_ratios = {1, 2, 5, 11, 32, 64, 100, 1000};
static const size_t K = filter_ratios.size();
template <typename T>
static MutableColumnPtr createColumn(size_t n)
{
auto column = ColumnVector<T>::create();
auto & values = column->getData();
for (size_t i = 0; i < n; ++i)
{
values.push_back(i);
}
return column;
}
bool checkFilter(const PaddedPODArray<UInt8> &flit, const IColumn & src, const IColumn & dst)
{
size_t n = flit.size();
size_t dst_size = dst.size();
size_t j = 0; /// index of dest
for (size_t i = 0; i < n; ++i)
{
if (flit[i] != 0)
{
if ((dst_size <= j) || (src.compareAt(i, j, dst, 0) != 0))
return false;
j++;
}
}
return dst_size == j; /// filtered size check
}
template <typename T>
static void testFilter()
{
auto test_case = [&](size_t rows, size_t filter_ratio)
{
auto vector_column = createColumn<T>(rows);
PaddedPODArray<UInt8> flit(rows);
for (size_t i = 0; i < rows; ++i)
flit[i] = rng() % filter_ratio == 0;
auto res_column = vector_column->filter(flit, -1);
if (!checkFilter(flit, *vector_column, *res_column))
throw Exception(error_code, "VectorColumn filter failure, type: {}", typeid(T).name());
};
try
{
for (size_t i = 0; i < TEST_RUNS; ++i)
{
size_t rows = rng() % MAX_ROWS + 1;
size_t filter_ratio = filter_ratios[rng() % K];
test_case(rows, filter_ratio);
}
}
catch (const Exception & e)
{
FAIL() << e.displayText();
}
}
TEST(ColumnVector, Filter)
{
testFilter<UInt8>();
testFilter<Int16>();
testFilter<UInt32>();
testFilter<Int64>();
testFilter<UInt128>();
testFilter<Int256>();
testFilter<Float32>();
testFilter<Float64>();
testFilter<UUID>();
}

View File

@ -35,8 +35,13 @@ namespace DB
* See https://github.com/grpc/grpc/blob/master/src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.cc#L1187
* That means it's safe to init it here, but we should be cautious when introducing new code that depends on c-ares and even updates
* to grpc. As discussed in https://github.com/ClickHouse/ClickHouse/pull/37827#discussion_r919189085, c-ares should be adapted to be atomic
*
* Since C++11, static objects are initialized in a thread-safe manner. The static qualifier also makes sure
* it will be called/initialized only once.
* */
if (ares_library_init(ARES_LIB_INIT_ALL) != ARES_SUCCESS || ares_init(&channel) != ARES_SUCCESS)
static const auto library_init_result = ares_library_init(ARES_LIB_INIT_ALL);
if (library_init_result != ARES_SUCCESS || ares_init(&channel) != ARES_SUCCESS)
{
throw DB::Exception("Failed to initialize c-ares", DB::ErrorCodes::DNS_ERROR);
}
@ -45,7 +50,12 @@ namespace DB
CaresPTRResolver::~CaresPTRResolver()
{
ares_destroy(channel);
ares_library_cleanup();
/*
* Library initialization is currently done only once in the constructor. Multiple instances of CaresPTRResolver
* will be used in the lifetime of ClickHouse, thus it's problematic to have de-init here.
* From a practical point of view, it makes little to no sense to de-init a DNS library since DNS requests will happen
* until the end of the program. Hence, ares_library_cleanup() will not be called.
* */
}
std::vector<std::string> CaresPTRResolver::resolve(const std::string & ip)

View File

@ -15,6 +15,6 @@ namespace DB::ConfigHelper
/// The behavior is like `config.getBool(key, default_)`,
/// except when the tag is empty (aka. self-closing), `empty_as` will be used instead of throwing Poco::Exception.
bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_, bool empty_as);
bool getBool(const Poco::Util::AbstractConfiguration & config, const std::string & key, bool default_ = false, bool empty_as = true);
}

View File

@ -82,6 +82,7 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
OP(AVX512BW) \
OP(AVX512VL) \
OP(AVX512VBMI) \
OP(AVX512VBMI2) \
OP(PREFETCHWT1) \
OP(SHA) \
OP(ADX) \
@ -302,6 +303,11 @@ bool haveAVX512VBMI() noexcept
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 1) & 1u);
}
bool haveAVX512VBMI2() noexcept
{
return haveAVX512F() && ((CpuInfo(0x7, 0).registers.ecx >> 6) & 1u);
}
bool haveRDRAND() noexcept
{
return CpuInfo(0x0).registers.eax >= 0x7 && ((CpuInfo(0x1).registers.ecx >> 30) & 1u);

View File

@ -51,15 +51,23 @@ void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)
{
if (current_thread)
{
current_thread->untracked_memory += size;
Int64 will_be = current_thread->untracked_memory + size;
Int64 limit = current_thread->untracked_memory_limit + current_thread->untracked_memory_limit_increase;
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
if (will_be > limit)
{
/// Zero untracked before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
/// Increase limit before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
/// more. It could be useful to enlarge Exception message in rethrow logic.
Int64 tmp = current_thread->untracked_memory;
current_thread->untracked_memory_limit_increase = current_thread->untracked_memory_limit;
memory_tracker->allocImpl(will_be, throw_if_memory_exceeded);
current_thread->untracked_memory_limit_increase = 0;
current_thread->untracked_memory = 0;
memory_tracker->allocImpl(tmp, throw_if_memory_exceeded);
}
else
{
/// Update after successful allocations,
/// since failed allocations should not be taken into account.
current_thread->untracked_memory = will_be;
}
}
/// total_memory_tracker only, ignore untracked_memory
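A distilled sketch of the batching scheme above, assuming a single shared atomic counter instead of the full MemoryTracker: small allocations accumulate in a thread-local counter and only touch the shared counter when a batch would exceed the per-thread limit; the local counter is committed only on the successful path.

#include <atomic>

std::atomic<long long> shared_amount{0};
thread_local long long untracked = 0;
constexpr long long untracked_memory_limit = 4 * 1024 * 1024;

void onAlloc(long long size)
{
    long long will_be = untracked + size;
    if (will_be > untracked_memory_limit)
    {
        /// Flush the whole batch; in the real code this call may throw,
        /// in which case `untracked` is left unchanged.
        shared_amount.fetch_add(will_be, std::memory_order_relaxed);
        untracked = 0;
    }
    else
    {
        /// Commit locally only after the non-throwing path.
        untracked = will_be;
    }
}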

View File

@ -92,6 +92,8 @@
M(FilesystemCacheReadBuffers, "Number of active cache buffers") \
M(CacheFileSegments, "Number of existing cache file segments") \
M(CacheDetachedFileSegments, "Number of existing detached cache file segments") \
M(FilesystemCacheSize, "Filesystem cache size in bytes") \
M(FilesystemCacheElements, "Filesystem cache elements (file segments)") \
M(S3Requests, "S3 requests") \
M(KeeperAliveConnections, "Number of alive connections") \
M(KeeperOutstandingRequets, "Number of outstanding requests") \

View File

@ -5,9 +5,8 @@ namespace DB
{
std::shared_ptr<DNSPTRResolver> DNSPTRResolverProvider::get()
{
static auto cares_resolver = std::make_shared<CaresPTRResolver>(
return std::make_shared<CaresPTRResolver>(
CaresPTRResolver::provider_token {}
);
return cares_resolver;
}
}

View File

@ -3,6 +3,7 @@
#include <Poco/DigestStream.h>
#include <Poco/Exception.h>
#include <Poco/SHA1Engine.h>
#include <Common/filesystemHelpers.h>
#include <filesystem>
#include <fstream>
@ -64,9 +65,9 @@ std::string determineDefaultTimeZone()
/// /etc/localtime -> /usr/share/zoneinfo//UTC
/// /usr/share/zoneinfo//UTC -> UCT
/// But the preferred time zone name is pointed by the first link (UTC), and the second link is just an internal detail.
if (fs::is_symlink(tz_file_path))
if (FS::isSymlink(tz_file_path))
{
tz_file_path = fs::read_symlink(tz_file_path);
tz_file_path = FS::readSymlink(tz_file_path);
/// If it's relative - make it absolute.
if (tz_file_path.is_relative())
tz_file_path = (fs::path("/etc/") / tz_file_path).lexically_normal();

View File

@ -634,6 +634,7 @@
M(663, INCONSISTENT_METADATA_FOR_BACKUP) \
M(664, ACCESS_STORAGE_DOESNT_ALLOW_BACKUP) \
M(665, CANNOT_CONNECT_NATS) \
M(666, CANNOT_USE_CACHE) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -29,13 +29,13 @@ FileCache::FileCache(
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, allow_persistent_files(cache_settings_.do_not_evict_index_and_mark_files)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
, main_priority(std::make_unique<LRUFileCachePriority>())
, stash_priority(std::make_unique<LRUFileCachePriority>())
, max_stash_element_size(cache_settings_.max_elements)
, enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold)
, log(&Poco::Logger::get("FileCache"))
, allow_to_remove_persistent_segments_from_cache_by_default(cache_settings_.allow_to_remove_persistent_segments_from_cache_by_default)
{
}
@ -597,18 +597,20 @@ FileCache::FileSegmentCell * FileCache::addCell(
return &(it->second);
}
FileSegmentsHolder FileCache::setDownloading(
FileSegmentPtr FileCache::createFileSegmentForDownload(
const Key & key,
size_t offset,
size_t size,
bool is_persistent)
bool is_persistent,
std::lock_guard<std::mutex> & cache_lock)
{
std::lock_guard cache_lock(mutex);
#ifndef NDEBUG
assertCacheCorrectness(key, cache_lock);
#endif
if (size > max_file_segment_size)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Requested size exceeds max file segment size");
auto * cell = getCell(key, offset, cache_lock);
if (cell)
throw Exception(
@ -616,8 +618,12 @@ FileSegmentsHolder FileCache::setDownloading(
"Cache cell already exists for key `{}` and offset {}",
key.toString(), offset);
auto file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::DOWNLOADING, is_persistent, cache_lock);
return FileSegmentsHolder(std::move(file_segments));
cell = addCell(key, offset, size, FileSegment::State::EMPTY, is_persistent, cache_lock);
if (!cell)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to add a new cell for download");
return cell->file_segment;
}
bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
@ -691,6 +697,13 @@ bool FileCache::tryReserve(const Key & key, size_t offset, size_t size, std::loc
if (cell->releasable())
{
auto & file_segment = cell->file_segment;
if (file_segment->isPersistent() && allow_persistent_files)
{
LOG_DEBUG(log, "File segment will not be removed, because it is persistent: {}", file_segment->getInfoForLog());
continue;
}
std::lock_guard segment_lock(file_segment->mutex);
switch (file_segment->download_state)
@ -806,6 +819,12 @@ bool FileCache::tryReserveForMainList(
{
auto & file_segment = cell->file_segment;
if (file_segment->isPersistent() && allow_persistent_files)
{
LOG_DEBUG(log, "File segment will not be removed, because it is persistent: {}", file_segment->getInfoForLog());
continue;
}
std::lock_guard segment_lock(file_segment->mutex);
switch (file_segment->download_state)
@ -927,7 +946,7 @@ void FileCache::removeIfExists(const Key & key)
}
}
void FileCache::removeIfReleasable(bool remove_persistent_files)
void FileCache::removeIfReleasable()
{
/// Try remove all cached files by cache_base_path.
/// Only releasable file segments are evicted.
@ -951,10 +970,8 @@ void FileCache::removeIfReleasable(bool remove_persistent_files)
if (cell->releasable())
{
auto file_segment = cell->file_segment;
if (file_segment
&& (!file_segment->isPersistent()
|| remove_persistent_files
|| allow_to_remove_persistent_segments_from_cache_by_default))
if (file_segment)
{
to_remove.emplace_back(file_segment);
}
@ -1088,9 +1105,11 @@ void FileCache::loadCacheInfoIntoMemory(std::lock_guard<std::mutex> & cache_lock
}
else
{
LOG_WARNING(log,
"Cache capacity changed (max size: {}, available: {}), cached file `{}` does not fit in cache anymore (size: {})",
max_size, getAvailableCacheSizeUnlocked(cache_lock), key_it->path().string(), size);
LOG_WARNING(
log,
"Cache capacity changed (max size: {}, available: {}), cached file `{}` does not fit in cache anymore (size: {})",
max_size, getAvailableCacheSizeUnlocked(cache_lock), key_it->path().string(), size);
fs::remove(offset_it->path());
}
}

View File

@ -43,7 +43,7 @@ public:
void removeIfExists(const Key & key);
void removeIfReleasable(bool remove_persistent_files);
void removeIfReleasable();
static bool isReadOnly();
@ -84,7 +84,18 @@ public:
*/
FileSegmentsHolder get(const Key & key, size_t offset, size_t size);
FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent);
/**
* Create a file segment of exactly requested size with EMPTY state.
* Throw exception if requested size exceeds max allowed file segment size.
* This method is for protected usage: file segment range writer uses it
* to dynamically allocate file segments.
*/
FileSegmentPtr createFileSegmentForDownload(
const Key & key,
size_t offset,
size_t size,
bool is_persistent,
std::lock_guard<std::mutex> & cache_lock);
FileSegments getSnapshot() const;
@ -100,6 +111,7 @@ private:
size_t max_size;
size_t max_element_size;
size_t max_file_segment_size;
bool allow_persistent_files;
bool is_initialized = false;
@ -229,7 +241,6 @@ private:
size_t enable_cache_hits_threshold;
Poco::Logger * log;
bool allow_to_remove_persistent_segments_from_cache_by_default;
FileSegments getImpl(const Key & key, const FileSegment::Range & range, std::lock_guard<std::mutex> & cache_lock);

View File

@ -1,20 +1,37 @@
#include "FileCacheSettings.h"
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
max_size = config.getUInt64(config_prefix + ".data_cache_max_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_CACHE_SIZE);
max_elements = config.getUInt64(config_prefix + ".data_cache_max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
if (!config.has(config_prefix + ".max_size"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected cache size (`size`) in configuration");
max_size = config.getUInt64(config_prefix + ".max_size", 0);
if (max_size == 0)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected non-zero size for cache configuration");
auto path = config.getString(config_prefix + ".path", "");
if (path.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk Cache requires non-empty `path` field (cache base path) in config");
max_elements = config.getUInt64(config_prefix + ".max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true);
allow_to_remove_persistent_segments_from_cache_by_default = config.getUInt64(config_prefix + ".allow_to_remove_persistent_segments_from_cache_by_default", true);
}
}
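A hypothetical standalone exercise of the new validation (the keys `path` and `max_size` are the ones required above; `Poco::Util::XMLConfiguration` and the header path are assumptions for the sake of the sketch):

#include <sstream>
#include <Poco/AutoPtr.h>
#include <Poco/Util/XMLConfiguration.h>
#include <Common/FileCacheSettings.h>

int main()
{
    std::istringstream xml(
        "<clickhouse><cache>"
        "<path>/var/lib/clickhouse/cache/</path>"
        "<max_size>10737418240</max_size>" /// 10 GiB; omitting this now throws BAD_ARGUMENTS
        "</cache></clickhouse>");
    Poco::AutoPtr<Poco::Util::XMLConfiguration> config(new Poco::Util::XMLConfiguration(xml));

    DB::FileCacheSettings settings;
    settings.loadFromConfig(*config, "cache"); /// throws if `path` or `max_size` is missing
    return 0;
}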

View File

@ -19,7 +19,6 @@ struct FileCacheSettings
bool enable_filesystem_query_cache_limit = false;
bool do_not_evict_index_and_mark_files = true;
bool allow_to_remove_persistent_segments_from_cache_by_default = true;
void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
};

View File

@ -4,7 +4,6 @@
namespace DB
{
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_CACHE_SIZE = 1024 * 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 100 * 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024;
static constexpr int REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD = 0;

View File

@ -1,11 +1,13 @@
#include "FileSegment.h"
#include <base/getThreadId.h>
#include <Common/hex.h>
#include <base/scope_guard.h>
#include <Common/logger_useful.h>
#include <Common/FileCache.h>
#include <Common/hex.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <filesystem>
#include <Common/FileCache.h>
namespace CurrentMetrics
{
@ -37,7 +39,7 @@ FileSegment::FileSegment(
#else
, log(&Poco::Logger::get("FileSegment"))
#endif
, is_persistent(is_persistent_) /// Not really used for now, see PR 36171
, is_persistent(is_persistent_)
{
/// On creation, file segment state can be EMPTY, DOWNLOADED, DOWNLOADING.
switch (download_state)
@ -55,13 +57,6 @@ FileSegment::FileSegment(
reserved_size = downloaded_size = size_;
break;
}
/// DOWNLOADING is used only for write-through caching (e.g. getOrSetDownloader() is not
/// needed, downloader is set on file segment creation).
case (State::DOWNLOADING):
{
downloader_id = getCallerId();
break;
}
case (State::SKIP_CACHE):
{
break;
@ -91,6 +86,18 @@ size_t FileSegment::getDownloadedSize() const
return getDownloadedSize(segment_lock);
}
size_t FileSegment::getRemainingSizeToDownload() const
{
std::lock_guard segment_lock(mutex);
return range().size() - downloaded_size;
}
bool FileSegment::isDetached() const
{
std::lock_guard segment_lock(mutex);
return is_detached;
}
size_t FileSegment::getDownloadedSize(std::lock_guard<std::mutex> & /* segment_lock */) const
{
if (download_state == State::DOWNLOADED)
@ -184,6 +191,22 @@ FileSegment::RemoteFileReaderPtr FileSegment::getRemoteFileReader()
return remote_file_reader;
}
FileSegment::RemoteFileReaderPtr FileSegment::extractRemoteFileReader()
{
std::lock_guard cache_lock(cache->mutex);
std::lock_guard segment_lock(mutex);
if (!is_detached)
{
bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
if (!downloader_id.empty() || !is_last_holder)
return nullptr;
}
LOG_TRACE(log, "Extracted reader from file segment");
return std::move(remote_file_reader);
}
void FileSegment::setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_)
{
if (!isDownloader())
@ -279,80 +302,6 @@ String FileSegment::getPathInLocalCache() const
return cache->getPathInLocalCache(key(), offset(), isPersistent());
}
void FileSegment::writeInMemory(const char * from, size_t size)
{
if (!size)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Attempt to write zero size cache file");
if (availableSize() < size)
throw Exception(
ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Not enough space is reserved. Available: {}, expected: {}", availableSize(), size);
std::lock_guard segment_lock(mutex);
assertNotDetached(segment_lock);
if (cache_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache writer already initialized");
auto download_path = getPathInLocalCache();
cache_writer = std::make_unique<WriteBufferFromFile>(download_path, size + 1);
try
{
cache_writer->write(from, size);
}
catch (Exception & e)
{
wrapWithCacheInfo(e, "while writing into cache", segment_lock);
setDownloadFailed(segment_lock);
cv.notify_all();
throw;
}
}
size_t FileSegment::finalizeWrite()
{
std::lock_guard segment_lock(mutex);
if (!cache_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache writer not initialized");
size_t size = cache_writer->offset();
if (size == 0)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Writing zero size is not allowed");
assertNotDetached(segment_lock);
try
{
cache_writer->next();
}
catch (Exception & e)
{
wrapWithCacheInfo(e, "while writing into cache", segment_lock);
setDownloadFailed(segment_lock);
cv.notify_all();
throw;
}
downloaded_size += size;
if (downloaded_size != range().size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected downloaded size to equal file segment size ({} == {})", downloaded_size, range().size());
setDownloaded(segment_lock);
return size;
}
FileSegment::State FileSegment::wait()
{
std::unique_lock segment_lock(mutex);
@ -481,7 +430,7 @@ void FileSegment::completeBatchAndResetDownloader()
cv.notify_all();
}
void FileSegment::complete(State state)
void FileSegment::completeWithState(State state, bool auto_resize)
{
std::lock_guard cache_lock(cache->mutex);
std::lock_guard segment_lock(mutex);
@ -506,8 +455,24 @@ void FileSegment::complete(State state)
}
if (state == State::DOWNLOADED)
{
if (auto_resize && downloaded_size != range().size())
{
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), downloaded_size);
assert(downloaded_size <= range().size());
segment_range = Range(segment_range.left, segment_range.left + downloaded_size - 1);
}
/// Update states and finalize cache write buffer.
setDownloaded(segment_lock);
if (downloaded_size != range().size())
throw Exception(
ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Cannot complete file segment as DOWNLOADED, because downloaded size ({}) does not match expected size ({})",
downloaded_size, range().size());
}
download_state = state;
try
@ -526,16 +491,20 @@ void FileSegment::complete(State state)
cv.notify_all();
}
void FileSegment::complete(std::lock_guard<std::mutex> & cache_lock)
void FileSegment::completeBasedOnCurrentState(std::lock_guard<std::mutex> & cache_lock)
{
std::lock_guard segment_lock(mutex);
if (is_detached)
return;
assertNotDetached(segment_lock);
completeUnlocked(cache_lock, segment_lock);
completeBasedOnCurrentStateUnlocked(cache_lock, segment_lock);
}
void FileSegment::completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock)
void FileSegment::completeBasedOnCurrentStateUnlocked(
std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock)
{
bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
@ -607,7 +576,10 @@ void FileSegment::completeImpl(std::lock_guard<std::mutex> & cache_lock, std::lo
* it only when nobody needs it.
*/
download_state = State::PARTIALLY_DOWNLOADED_NO_CONTINUATION;
LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size);
/// Resize this file segment by creating a copy file segment with DOWNLOADED state,
/// but the current file segment should remain PARTIALLY_DOWNLOADED_NO_CONTINUATION and in a detached state,
/// because otherwise the invariant that getOrSet() returns a contiguous range of file segments will be broken
/// (this is crucial for other file segment holders, not for the current one).
cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock);
}
@ -644,8 +616,9 @@ String FileSegment::getInfoForLogImpl(std::lock_guard<std::mutex> & segment_lock
info << "state: " << download_state << ", ";
info << "downloaded size: " << getDownloadedSize(segment_lock) << ", ";
info << "reserved size: " << reserved_size << ", ";
info << "downloader id: " << downloader_id << ", ";
info << "caller id: " << getCallerId();
info << "downloader id: " << (downloader_id.empty() ? "None" : downloader_id) << ", ";
info << "caller id: " << getCallerId() << ", ";
info << "persistent: " << is_persistent;
return info.str();
}
@ -820,7 +793,7 @@ FileSegmentsHolder::~FileSegmentsHolder()
/// under the same mutex, because complete() checks for segment pointers.
std::lock_guard cache_lock(cache->mutex);
file_segment->complete(cache_lock);
file_segment->completeBasedOnCurrentState(cache_lock);
file_segment_it = file_segments.erase(current_file_segment_it);
}
@ -843,4 +816,149 @@ String FileSegmentsHolder::toString()
return ranges;
}
FileSegmentRangeWriter::FileSegmentRangeWriter(
FileCache * cache_,
const FileSegment::Key & key_,
OnCompleteFileSegmentCallback && on_complete_file_segment_func_)
: cache(cache_)
, key(key_)
, current_file_segment_it(file_segments_holder.file_segments.end())
, on_complete_file_segment_func(on_complete_file_segment_func_)
{
}
FileSegments::iterator FileSegmentRangeWriter::allocateFileSegment(size_t offset, bool is_persistent)
{
/**
* Allocate a new file segment starting at `offset`.
* File segment capacity will equal `max_file_segment_size`, but actual size is 0.
*/
std::lock_guard cache_lock(cache->mutex);
/// We set max_file_segment_size as the segment capacity;
/// if there is less data to write, the file segment will be resized in the complete() method.
auto file_segment = cache->createFileSegmentForDownload(
key, offset, cache->max_file_segment_size, is_persistent, cache_lock);
return file_segments_holder.add(std::move(file_segment));
}
void FileSegmentRangeWriter::completeFileSegment(FileSegment & file_segment)
{
/**
* Complete the file segment based on the downloaded size.
*/
/// File segment can be detached if space reservation failed.
if (file_segment.isDetached())
return;
if (file_segment.getDownloadedSize() > 0)
{
/// file_segment->complete(DOWNLOADED) is not enough, because file segment capacity
/// was initially set with a margin as `max_file_segment_size`. => We need to always
/// resize to actual size after download finished.
file_segment.getOrSetDownloader();
file_segment.completeWithState(FileSegment::State::DOWNLOADED, /* auto_resize */true);
on_complete_file_segment_func(file_segment);
}
else
{
std::lock_guard cache_lock(cache->mutex);
file_segment.completeBasedOnCurrentState(cache_lock);
}
}
bool FileSegmentRangeWriter::write(const char * data, size_t size, size_t offset, bool is_persistent)
{
/**
* Write a range of file segments. Allocate a file segment of `max_file_segment_size` and write to
* it until it is full, then allocate the next file segment.
*/
if (finalized)
return false;
auto & file_segments = file_segments_holder.file_segments;
if (current_file_segment_it == file_segments.end())
{
current_file_segment_it = allocateFileSegment(current_file_segment_write_offset, is_persistent);
}
else
{
if (current_file_segment_write_offset != offset)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cannot write file segment at offset {}, because current write offset is: {}",
offset, current_file_segment_write_offset);
}
if ((*current_file_segment_it)->getRemainingSizeToDownload() == 0)
{
completeFileSegment(**current_file_segment_it);
current_file_segment_it = allocateFileSegment(current_file_segment_write_offset, is_persistent);
}
else if ((*current_file_segment_it)->getDownloadOffset() != offset)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cannot file segment download offset {} does not match current write offset {}",
(*current_file_segment_it)->getDownloadOffset(), offset);
}
}
auto & file_segment = *current_file_segment_it;
file_segment->getOrSetDownloader();
SCOPE_EXIT({
file_segment->resetDownloader();
});
bool reserved = file_segment->reserve(size);
if (!reserved)
{
file_segment->completeWithState(FileSegment::State::PARTIALLY_DOWNLOADED_NO_CONTINUATION);
on_complete_file_segment_func(*file_segment);
LOG_DEBUG(
&Poco::Logger::get("FileSegmentRangeWriter"),
"Unsuccessful space reservation attempt (size: {}, file segment info: {}",
size, file_segment->getInfoForLog());
return false;
}
(*current_file_segment_it)->write(data, size, offset);
current_file_segment_write_offset += size;
return true;
}
void FileSegmentRangeWriter::finalize()
{
if (finalized)
return;
auto & file_segments = file_segments_holder.file_segments;
if (file_segments.empty() || current_file_segment_it == file_segments.end())
return;
completeFileSegment(**current_file_segment_it);
finalized = true;
}
FileSegmentRangeWriter::~FileSegmentRangeWriter()
{
try
{
if (!finalized)
finalize();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}

View File

@ -1,7 +1,7 @@
#pragma once
#include <boost/noncopyable.hpp>
#include <Core/Types.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadBufferFromFileBase.h>
#include <list>
@ -18,6 +18,7 @@ namespace DB
{
class FileCache;
class ReadBufferFromFileBase;
class FileSegment;
using FileSegmentPtr = std::shared_ptr<FileSegment>;
@ -113,17 +114,10 @@ public:
void write(const char * from, size_t size, size_t offset_);
/**
* writeInMemory and finalizeWrite are used together to write a single file with delay.
* Both can be called only once, one after another. Used for writing cache via threadpool
* on wrote operations. TODO: this solution is temporary, until adding a separate cache layer.
*/
void writeInMemory(const char * from, size_t size);
size_t finalizeWrite();
RemoteFileReaderPtr getRemoteFileReader();
RemoteFileReaderPtr extractRemoteFileReader();
void setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_);
void resetRemoteFileReader();
@ -144,9 +138,11 @@ public:
size_t getDownloadedSize() const;
size_t getRemainingSizeToDownload() const;
void completeBatchAndResetDownloader();
void complete(State state);
void completeWithState(State state, bool auto_resize = false);
String getInfoForLog() const;
@ -168,6 +164,8 @@ public:
[[noreturn]] void throwIfDetached() const;
bool isDetached() const;
String getPathInLocalCache() const;
private:
@ -197,8 +195,8 @@ private:
/// FileSegmentsHolder. complete() might check if the caller of the method
/// is the last alive holder of the segment. Therefore, complete() and destruction
/// of the file segment pointer must be done under the same cache mutex.
void complete(std::lock_guard<std::mutex> & cache_lock);
void completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock);
void completeBasedOnCurrentState(std::lock_guard<std::mutex> & cache_lock);
void completeBasedOnCurrentStateUnlocked(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock);
void completeImpl(
std::lock_guard<std::mutex> & cache_lock,
@ -206,7 +204,7 @@ private:
void resetDownloaderImpl(std::lock_guard<std::mutex> & segment_lock);
const Range segment_range;
Range segment_range;
State download_state;
@ -246,23 +244,71 @@ private:
std::atomic<size_t> hits_count = 0; /// cache hits.
std::atomic<size_t> ref_count = 0; /// Used for getting snapshot state
/// Currently no-op. (will be added in PR 36171)
/// Defines whether a file complies with the eviction policy.
bool is_persistent;
CurrentMetrics::Increment metric_increment{CurrentMetrics::CacheFileSegments};
};
struct FileSegmentsHolder : private boost::noncopyable
{
FileSegmentsHolder() = default;
explicit FileSegmentsHolder(FileSegments && file_segments_) : file_segments(std::move(file_segments_)) {}
FileSegmentsHolder(FileSegmentsHolder && other) noexcept : file_segments(std::move(other.file_segments)) {}
~FileSegmentsHolder();
FileSegments file_segments{};
String toString();
FileSegments::iterator add(FileSegmentPtr && file_segment)
{
return file_segments.insert(file_segments.end(), file_segment);
}
FileSegments file_segments{};
};
/**
* We want to write data whose total size is not known until the very end.
* Therefore we allocate file segments lazily. Each file segment is assigned a capacity
* of max_file_segment_size, but reserved_size remains 0 until a call to tryReserve().
* Once the current file segment is full (has reached max_file_segment_size), we allocate a
* new file segment. All allocated file segments reside in the file segments holder.
* If at the end of all writes the last file segment is not full, it is resized.
*/
class FileSegmentRangeWriter
{
public:
using OnCompleteFileSegmentCallback = std::function<void(const FileSegment & file_segment)>;
FileSegmentRangeWriter(
FileCache * cache_,
const FileSegment::Key & key_,
/// A callback which is called right after each file segment is completed.
/// It is used to write into filesystem cache log.
OnCompleteFileSegmentCallback && on_complete_file_segment_func_);
~FileSegmentRangeWriter();
bool write(const char * data, size_t size, size_t offset, bool is_persistent);
void finalize();
private:
FileSegments::iterator allocateFileSegment(size_t offset, bool is_persistent);
void completeFileSegment(FileSegment & file_segment);
FileCache * cache;
FileSegment::Key key;
FileSegmentsHolder file_segments_holder;
FileSegments::iterator current_file_segment_it;
size_t current_file_segment_write_offset = 0;
bool finalized = false;
OnCompleteFileSegmentCallback on_complete_file_segment_func;
};
}
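A hypothetical caller sketch (not runnable outside ClickHouse; only the constructor, write() and finalize() declared above are used, and `writeThroughCache` is an illustrative name): chunks are appended at strictly increasing offsets, and finalize() shrinks the last, partially filled segment.

#include <algorithm>
#include <Common/FileSegment.h>

void writeThroughCache(DB::FileCache * cache, const DB::FileSegment::Key & key,
                       const char * data, size_t size)
{
    DB::FileSegmentRangeWriter writer(cache, key,
        [](const DB::FileSegment &) { /* e.g. record into the filesystem cache log */ });

    for (size_t offset = 0; offset < size;)
    {
        size_t chunk = std::min<size_t>(64 * 1024, size - offset);
        if (!writer.write(data + offset, chunk, offset, /* is_persistent */ false))
            break; /// space reservation failed; the segment was completed as NO_CONTINUATION
        offset += chunk;
    }
    writer.finalize(); /// also called by the destructor, but explicit is clearer
}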

View File

@ -1,4 +1,11 @@
#include <Common/LRUFileCachePriority.h>
#include <Common/CurrentMetrics.h>
namespace CurrentMetrics
{
extern const Metric FilesystemCacheSize;
extern const Metric FilesystemCacheElements;
}
namespace DB
{
@ -22,8 +29,13 @@ IFileCachePriority::WriteIterator LRUFileCachePriority::add(const Key & key, siz
entry.size);
}
#endif
auto iter = queue.insert(queue.end(), FileCacheRecord(key, offset, size));
cache_size += size;
CurrentMetrics::add(CurrentMetrics::FilesystemCacheSize, size);
CurrentMetrics::add(CurrentMetrics::FilesystemCacheElements);
return std::make_shared<LRUFileCacheIterator>(this, iter);
}
@ -39,10 +51,19 @@ bool LRUFileCachePriority::contains(const Key & key, size_t offset, std::lock_gu
void LRUFileCachePriority::removeAll(std::lock_guard<std::mutex> &)
{
CurrentMetrics::sub(CurrentMetrics::FilesystemCacheSize, cache_size);
CurrentMetrics::sub(CurrentMetrics::FilesystemCacheElements, queue.size());
queue.clear();
cache_size = 0;
}
LRUFileCachePriority::LRUFileCacheIterator::LRUFileCacheIterator(
LRUFileCachePriority * cache_priority_, LRUFileCachePriority::LRUQueueIterator queue_iter_)
: cache_priority(cache_priority_), queue_iter(queue_iter_)
{
}
IFileCachePriority::ReadIterator LRUFileCachePriority::getLowestPriorityReadIterator(std::lock_guard<std::mutex> &)
{
return std::make_unique<const LRUFileCacheIterator>(this, queue.begin());
@ -58,4 +79,27 @@ size_t LRUFileCachePriority::getElementsNum(std::lock_guard<std::mutex> &) const
return queue.size();
}
void LRUFileCachePriority::LRUFileCacheIterator::removeAndGetNext(std::lock_guard<std::mutex> &)
{
cache_priority->cache_size -= queue_iter->size;
CurrentMetrics::sub(CurrentMetrics::FilesystemCacheSize, queue_iter->size);
CurrentMetrics::sub(CurrentMetrics::FilesystemCacheElements);
queue_iter = cache_priority->queue.erase(queue_iter);
}
void LRUFileCachePriority::LRUFileCacheIterator::incrementSize(size_t size_increment, std::lock_guard<std::mutex> &)
{
cache_priority->cache_size += size_increment;
CurrentMetrics::add(CurrentMetrics::FilesystemCacheSize, size_increment);
queue_iter->size += size_increment;
}
void LRUFileCachePriority::LRUFileCacheIterator::use(std::lock_guard<std::mutex> &)
{
queue_iter->hits++;
cache_priority->queue.splice(cache_priority->queue.end(), cache_priority->queue, queue_iter);
}
};
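The use() implementation above relies on a standard-library guarantee worth spelling out; a minimal sketch:

#include <cassert>
#include <list>

int main()
{
    std::list<int> queue{1, 2, 3};
    auto it = queue.begin(); /// entry 1 is currently the least recently used

    /// O(1) move-to-back: splice() relinks the node, so `it` stays valid
    /// and no element is copied, which is exactly what use() does above.
    queue.splice(queue.end(), queue, it);

    assert(queue.back() == 1);
    assert(*it == 1);
    return 0;
}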

View File

@ -37,14 +37,11 @@ private:
class LRUFileCachePriority::LRUFileCacheIterator : public IFileCachePriority::IIterator
{
public:
LRUFileCacheIterator(LRUFileCachePriority * file_cache_, LRUFileCachePriority::LRUQueueIterator queue_iter_)
: file_cache(file_cache_), queue_iter(queue_iter_)
{
}
LRUFileCacheIterator(LRUFileCachePriority * cache_priority_, LRUFileCachePriority::LRUQueueIterator queue_iter_);
void next() const override { queue_iter++; }
bool valid() const override { return queue_iter != file_cache->queue.end(); }
bool valid() const override { return queue_iter != cache_priority->queue.end(); }
const Key & key() const override { return queue_iter->key; }
@ -54,26 +51,14 @@ public:
size_t hits() const override { return queue_iter->hits; }
void removeAndGetNext(std::lock_guard<std::mutex> &) override
{
file_cache->cache_size -= queue_iter->size;
queue_iter = file_cache->queue.erase(queue_iter);
}
void removeAndGetNext(std::lock_guard<std::mutex> &) override;
void incrementSize(size_t size_increment, std::lock_guard<std::mutex> &) override
{
file_cache->cache_size += size_increment;
queue_iter->size += size_increment;
}
void incrementSize(size_t size_increment, std::lock_guard<std::mutex> &) override;
void use(std::lock_guard<std::mutex> &) override
{
queue_iter->hits++;
file_cache->queue.splice(file_cache->queue.end(), file_cache->queue, queue_iter);
}
void use(std::lock_guard<std::mutex> &) override;
private:
LRUFileCachePriority * file_cache;
LRUFileCachePriority * cache_priority;
mutable LRUFileCachePriority::LRUQueueIterator queue_iter;
};

View File

@ -169,12 +169,14 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
std::bernoulli_distribution fault(fault_probability);
if (unlikely(fault_probability && fault(thread_local_rng)) && memoryTrackerCanThrow(level, true) && throw_if_memory_exceeded)
{
/// Revert
amount.fetch_sub(size, std::memory_order_relaxed);
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
ProfileEvents::increment(ProfileEvents::QueryMemoryLimitExceeded);
const auto * description = description_ptr.load(std::memory_order_relaxed);
amount.fetch_sub(size, std::memory_order_relaxed);
throw DB::Exception(
DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED,
"Memory tracker{}{}: fault injected. Would use {} (attempt to allocate chunk of {} bytes), maximum: {}",
@ -211,6 +213,9 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
if (overcommit_result != OvercommitResult::MEMORY_FREED)
{
/// Revert
amount.fetch_sub(size, std::memory_order_relaxed);
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global);
ProfileEvents::increment(ProfileEvents::QueryMemoryLimitExceeded);
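Both hunks apply the same pattern, sketched here in isolation with a plain atomic (names are illustrative): speculatively bump the counter, and on failure subtract the same amount before throwing, so a rejected allocation never leaves the counter inflated.

#include <atomic>
#include <stdexcept>

std::atomic<long long> amount{0};
constexpr long long hard_limit = 1'000'000;

void alloc(long long size)
{
    long long will_be = size + amount.fetch_add(size, std::memory_order_relaxed);
    if (will_be > hard_limit)
    {
        amount.fetch_sub(size, std::memory_order_relaxed); /// revert first...
        throw std::runtime_error("memory limit exceeded");  /// ...then throw
    }
}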

View File

@ -278,6 +278,8 @@
M(CachedReadBufferReadFromCacheBytes, "Bytes read from filesystem cache") \
M(CachedReadBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache") \
M(CachedReadBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache") \
M(CachedWriteBufferCacheWriteBytes, "Bytes written from source (remote fs, etc) to filesystem cache") \
M(CachedWriteBufferCacheWriteMicroseconds, "Time spent writing data into filesystem cache") \
\
M(RemoteFSSeeks, "Total number of seeks for async buffer") \
M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem") \
@ -347,7 +349,13 @@
\
M(ScalarSubqueriesGlobalCacheHit, "Number of times a read from a scalar subquery was done using the global cache") \
M(ScalarSubqueriesLocalCacheHit, "Number of times a read from a scalar subquery was done using the local cache") \
M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely") \
M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely") \
\
M(SchemaInferenceCacheHits, "Number of times a schema from cache was used for schema inference") \
M(SchemaInferenceCacheMisses, "Number of times a schema was not found in the cache during schema inference") \
M(SchemaInferenceCacheEvictions, "Number of times a schema from cache was evicted due to overflow") \
M(SchemaInferenceCacheInvalidations, "Number of times a schema in cache became invalid due to changes in data") \
\
M(KeeperPacketsSent, "Packets sent by keeper server") \
M(KeeperPacketsReceived, "Packets received by keeper server") \
M(KeeperRequestTotal, "Total requests number on keeper server") \

View File

@ -20,6 +20,8 @@ UInt32 getSupportedArchs()
result |= static_cast<UInt32>(TargetArch::AVX512BW);
if (Cpu::CpuFlagsCache::have_AVX512VBMI)
result |= static_cast<UInt32>(TargetArch::AVX512VBMI);
if (Cpu::CpuFlagsCache::have_AVX512VBMI2)
result |= static_cast<UInt32>(TargetArch::AVX512VBMI2);
return result;
}
@ -38,8 +40,9 @@ String toString(TargetArch arch)
case TargetArch::AVX: return "avx";
case TargetArch::AVX2: return "avx2";
case TargetArch::AVX512F: return "avx512f";
case TargetArch::AVX512BW: return "avx512bw";
case TargetArch::AVX512VBMI: return "avx512vbmi";
case TargetArch::AVX512BW: return "avx512bw";
case TargetArch::AVX512VBMI: return "avx512vbmi";
case TargetArch::AVX512VBMI2: return "avx512vbmi2";
}
__builtin_unreachable();

View File

@ -31,7 +31,7 @@
* int funcImpl() {
* return 2;
* }
* ) // DECLARE_DEFAULT_CODE
* ) // DECLARE_AVX2_SPECIFIC_CODE
*
* int func() {
* #if USE_MULTITARGET_CODE
@ -80,8 +80,9 @@ enum class TargetArch : UInt32
AVX = (1 << 1),
AVX2 = (1 << 2),
AVX512F = (1 << 3),
AVX512BW = (1 << 4),
AVX512VBMI = (1 << 5),
AVX512BW = (1 << 4),
AVX512VBMI = (1 << 5),
AVX512VBMI2 = (1 << 6),
};
/// Runtime detection.
@ -100,6 +101,7 @@ String toString(TargetArch arch);
#if defined(__clang__)
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2")))
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi")))
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f")))
@ -108,6 +110,8 @@ String toString(TargetArch arch);
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2\"))),apply_to=function)")
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
_Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi\"))),apply_to=function)")
# define BEGIN_AVX512BW_SPECIFIC_CODE \
@ -129,6 +133,7 @@ String toString(TargetArch arch);
# define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition();
#else
#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native")))
#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native")))
#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native")))
#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native")))
@ -137,6 +142,9 @@ String toString(TargetArch arch);
#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native)))
#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE
# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native\")")
# define BEGIN_AVX512VBMI_SPECIFIC_CODE \
_Pragma("GCC push_options") \
_Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native\")")
@ -217,6 +225,16 @@ namespace TargetSpecific::AVX512VBMI { \
} \
END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...) \
BEGIN_AVX512VBMI2_SPECIFIC_CODE \
namespace TargetSpecific::AVX512VBMI2 { \
DUMMY_FUNCTION_DEFINITION \
using namespace DB::TargetSpecific::AVX512VBMI2; \
__VA_ARGS__ \
} \
END_TARGET_SPECIFIC_CODE
#else
#define USE_MULTITARGET_CODE 0
@ -229,6 +247,7 @@ END_TARGET_SPECIFIC_CODE
#define DECLARE_AVX512F_SPECIFIC_CODE(...)
#define DECLARE_AVX512BW_SPECIFIC_CODE(...)
#define DECLARE_AVX512VBMI_SPECIFIC_CODE(...)
#define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...)
#endif
@ -245,8 +264,9 @@ DECLARE_SSE42_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX2_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
DECLARE_AVX512BW_SPECIFIC_CODE(__VA_ARGS__) \
DECLARE_AVX512VBMI_SPECIFIC_CODE(__VA_ARGS__)
DECLARE_AVX512BW_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512VBMI_SPECIFIC_CODE (__VA_ARGS__) \
DECLARE_AVX512VBMI2_SPECIFIC_CODE (__VA_ARGS__)
DECLARE_DEFAULT_CODE(
constexpr auto BuildArch = TargetArch::Default; /// NOLINT
@ -276,6 +296,9 @@ DECLARE_AVX512VBMI_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512VBMI; /// NOLINT
) // DECLARE_AVX512VBMI_SPECIFIC_CODE
DECLARE_AVX512VBMI2_SPECIFIC_CODE(
constexpr auto BuildArch = TargetArch::AVX512VBMI2; /// NOLINT
) // DECLARE_AVX512VBMI2_SPECIFIC_CODE
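Following the usage example in this header's own comment, a sketch of how the new AVX512VBMI2 tier plugs into runtime dispatch (`sum`/`sumImpl` are illustrative names; the macro is assumed to be used inside namespace DB, as elsewhere in the codebase):

namespace DB
{

DECLARE_MULTITARGET_CODE(
int sumImpl(const int * data, size_t n)
{
    int s = 0;
    for (size_t i = 0; i < n; ++i)
        s += data[i]; /// the compiler may vectorize this with VBMI2 instructions
    return s;
}
) // DECLARE_MULTITARGET_CODE

int sum(const int * data, size_t n)
{
#if USE_MULTITARGET_CODE
    if (isArchSupported(TargetArch::AVX512VBMI2))
        return TargetSpecific::AVX512VBMI2::sumImpl(data, n);
#endif
    return TargetSpecific::Default::sumImpl(data, n);
}

}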
/** Runtime Dispatch helpers for class members.
*

View File

@ -135,6 +135,8 @@ public:
Int64 untracked_memory = 0;
/// Each thread could new/delete memory in range of (-untracked_memory_limit, untracked_memory_limit) without access to common counters.
Int64 untracked_memory_limit = 4 * 1024 * 1024;
/// Increase limit in case of exception.
Int64 untracked_memory_limit_increase = 0;
/// Statistics of read and write rows/bytes
Progress progress_in;

View File

@ -351,4 +351,24 @@ void setModificationTime(const std::string & path, time_t time)
if (utime(path.c_str(), &tb) != 0)
DB::throwFromErrnoWithPath("Cannot set modification time for file: " + path, path, DB::ErrorCodes::PATH_ACCESS_DENIED);
}
bool isSymlink(const fs::path & path)
{
/// Remove the trailing slash before checking whether the file is a symlink.
/// Suppose /path/to/link is a symlink to the /path/to/target/dir/ directory.
/// In this case is_symlink("/path/to/link") is true,
/// but is_symlink("/path/to/link/") is false (it's a directory).
if (path.filename().empty())
return fs::is_symlink(path.parent_path()); /// STYLE_CHECK_ALLOW_STD_FS_SYMLINK
return fs::is_symlink(path); /// STYLE_CHECK_ALLOW_STD_FS_SYMLINK
}
fs::path readSymlink(const fs::path & path)
{
/// See the comment for isSymlink
if (path.filename().empty())
return fs::read_symlink(path.parent_path()); /// STYLE_CHECK_ALLOW_STD_FS_SYMLINK
return fs::read_symlink(path); /// STYLE_CHECK_ALLOW_STD_FS_SYMLINK
}
}
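A small standalone demonstration of the trailing-slash pitfall these helpers guard against (paths are illustrative):

#include <filesystem>
#include <iostream>

namespace fs = std::filesystem;

int main()
{
    fs::create_directory("target_dir");
    fs::create_directory_symlink("target_dir", "link");

    std::cout << fs::is_symlink("link") << '\n';  /// 1: inspects the link itself
    std::cout << fs::is_symlink("link/") << '\n'; /// 0: the trailing slash resolves the link

    fs::remove("link");
    fs::remove_all("target_dir");
    return 0;
}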

View File

@ -9,6 +9,7 @@
#include <sys/statvfs.h>
#include <Poco/TemporaryFile.h>
namespace fs = std::filesystem;
namespace DB
{
@ -89,4 +90,8 @@ Poco::Timestamp getModificationTimestamp(const std::string & path);
void setModificationTime(const std::string & path, time_t time);
/// st_ctime
time_t getChangeTime(const std::string & path);
bool isSymlink(const fs::path & path);
fs::path readSymlink(const fs::path & path);
}

View File

@ -80,7 +80,7 @@ void complete(const DB::FileSegmentsHolder & holder)
{
ASSERT_TRUE(file_segment->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(file_segment);
file_segment->complete(DB::FileSegment::State::DOWNLOADED);
file_segment->completeWithState(DB::FileSegment::State::DOWNLOADED);
}
}
@ -125,7 +125,7 @@ TEST(FileCache, get)
assertRange(2, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADING);
download(segments[0]);
segments[0]->complete(DB::FileSegment::State::DOWNLOADED);
segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED);
assertRange(3, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED);
}
@ -146,7 +146,7 @@ TEST(FileCache, get)
ASSERT_TRUE(segments[1]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(segments[1]);
segments[1]->complete(DB::FileSegment::State::DOWNLOADED);
segments[1]->completeWithState(DB::FileSegment::State::DOWNLOADED);
assertRange(6, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED);
}
@ -203,7 +203,7 @@ TEST(FileCache, get)
ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(segments[2]);
segments[2]->complete(DB::FileSegment::State::DOWNLOADED);
segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED);
assertRange(14, segments[3], DB::FileSegment::Range(17, 20), DB::FileSegment::State::DOWNLOADED);
@ -244,7 +244,7 @@ TEST(FileCache, get)
ASSERT_TRUE(segments[3]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(segments[3]);
segments[3]->complete(DB::FileSegment::State::DOWNLOADED);
segments[3]->completeWithState(DB::FileSegment::State::DOWNLOADED);
ASSERT_TRUE(segments[3]->state() == DB::FileSegment::State::DOWNLOADED);
}
@ -267,8 +267,8 @@ TEST(FileCache, get)
ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(segments[0]);
prepareAndDownload(segments[2]);
segments[0]->complete(DB::FileSegment::State::DOWNLOADED);
segments[2]->complete(DB::FileSegment::State::DOWNLOADED);
segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED);
segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED);
}
/// Current cache: [____][_] [][___][__]
@ -290,8 +290,8 @@ TEST(FileCache, get)
ASSERT_TRUE(s1[0]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(s5[0]);
prepareAndDownload(s1[0]);
s5[0]->complete(DB::FileSegment::State::DOWNLOADED);
s1[0]->complete(DB::FileSegment::State::DOWNLOADED);
s5[0]->completeWithState(DB::FileSegment::State::DOWNLOADED);
s1[0]->completeWithState(DB::FileSegment::State::DOWNLOADED);
/// Current cache: [___] [_][___][_] [__]
/// ^ ^ ^ ^ ^ ^ ^ ^
@ -393,7 +393,7 @@ TEST(FileCache, get)
}
prepareAndDownload(segments[2]);
segments[2]->complete(DB::FileSegment::State::DOWNLOADED);
segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED);
ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADED);
other_1.join();
@ -458,7 +458,7 @@ TEST(FileCache, get)
ASSERT_TRUE(segments_2[1]->getOrSetDownloader() == DB::FileSegment::getCallerId());
prepareAndDownload(segments_2[1]);
segments_2[1]->complete(DB::FileSegment::State::DOWNLOADED);
segments_2[1]->completeWithState(DB::FileSegment::State::DOWNLOADED);
});
{

View File

@ -73,6 +73,11 @@ bool BackgroundSchedulePoolTaskInfo::activateAndSchedule()
return true;
}
std::unique_lock<std::mutex> BackgroundSchedulePoolTaskInfo::getExecLock()
{
return std::unique_lock{exec_mutex};
}
void BackgroundSchedulePoolTaskInfo::execute()
{
Stopwatch watch;

View File

@ -121,6 +121,10 @@ public:
/// get Coordination::WatchCallback needed for notifications from ZooKeeper watches.
Coordination::WatchCallback getWatchCallback();
/// Returns a lock that protects against concurrent task execution.
/// This lock should not be held for a long time.
std::unique_lock<std::mutex> getExecLock();
private:
friend class TaskNotification;
friend class BackgroundSchedulePool;
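A distilled sketch of the pattern with a stand-in Task class (not the real BackgroundSchedulePoolTaskInfo): returning std::unique_lock gives callers a short, RAII-scoped exclusion window against execute().

#include <mutex>

class Task
{
public:
    /// Same shape as getExecLock(): the caller owns the lock via RAII.
    std::unique_lock<std::mutex> getExecLock() { return std::unique_lock{exec_mutex}; }

    void execute()
    {
        std::lock_guard lock(exec_mutex);
        /// ... task body, excluded while anyone holds the exec lock ...
    }

private:
    std::mutex exec_mutex;
};

int main()
{
    Task task;
    {
        auto lock = task.getExecLock(); /// keep this scope short
        /// ... briefly touch state that must not race with execute() ...
    }
    task.execute();
    return 0;
}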

View File

@ -280,6 +280,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, http_max_fields, 1000000, "Maximum number of fields in HTTP header", 0) \
M(UInt64, http_max_field_name_size, 1048576, "Maximum length of field name in HTTP header", 0) \
M(UInt64, http_max_field_value_size, 1048576, "Maximum length of field value in HTTP header", 0) \
M(UInt64, http_max_chunk_size, 100_GiB, "Maximum value of a chunk size in HTTP chunked transfer encoding", 0) \
M(Bool, http_skip_not_found_url_for_globs, true, "Skip url's for globs with HTTP_NOT_FOUND error", 0) \
M(Bool, optimize_throw_if_noop, false, "If setting is enabled and OPTIMIZE query didn't actually assign a merge then an explanatory exception is thrown", 0) \
M(Bool, use_index_for_in_with_subqueries, true, "Try using an index if there is a subquery or a table expression on the right side of the IN operator.", 0) \
@ -408,6 +409,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, low_cardinality_max_dictionary_size, 8192, "Maximum size (in rows) of shared global dictionary for LowCardinality type.", 0) \
M(Bool, low_cardinality_use_single_dictionary_for_part, false, "LowCardinality type serialization setting. If true, additional keys will be used when the global dictionary overflows. Otherwise, several shared dictionaries will be created.", 0) \
M(Bool, decimal_check_overflow, true, "Check overflow of decimal arithmetic/comparison operations", 0) \
M(Bool, allow_custom_error_code_in_throwif, false, "Enable custom error code in function throwIf(). If true, thrown exceptions may have unexpected error codes.", 0) \
\
M(Bool, prefer_localhost_replica, true, "If it's true then queries will be always sent to local replica (if it exists). If it's false then replica to send a query will be chosen between local and remote ones according to load_balancing", 0) \
M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \
@ -538,11 +540,12 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, engine_file_allow_create_multiple_files, false, "Enables or disables creating a new file on each insert in file engine tables if format has suffix.", 0) \
M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \
M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to process previous DDL queue entries", 0) \
M(Bool, database_replicated_enforce_synchronous_settings, false, "Enforces synchronous waiting for some queries (see also database_atomic_wait_for_drop_and_detach_synchronously, mutation_sync, replication_alter_partitions_sync). Not recommended to enable these settings.", 0) \
M(UInt64, max_distributed_depth, 5, "Maximum distributed query depth", 0) \
M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
M(UInt64, distributed_ddl_entry_format_version, 2, "Version of DDL entry to write into ZooKeeper", 0) \
M(UInt64, distributed_ddl_entry_format_version, 3, "Compatibility version of distributed DDL (ON CLUSTER) queries", 0) \
\
M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
@ -590,6 +593,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \
M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \
M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "", 0) \
M(Bool, enable_filesystem_cache_on_lower_level, true, "If the read buffer supports caching inside the threadpool, allow it to do so; otherwise cache outside of the threadpool. Do not use this setting, it is needed for testing", 0) \
M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if exceeds query cache size", 0) \
M(UInt64, max_query_cache_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be used by a single query", 0) \
\
@ -607,6 +611,12 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, allow_deprecated_database_ordinary, false, "Allow to create databases with deprecated Ordinary engine", 0) \
M(Bool, allow_deprecated_syntax_for_merge_tree, false, "Allow to create *MergeTree tables with deprecated engine definition syntax", 0) \
\
M(Bool, schema_inference_use_cache_for_file, true, "Use cache in schema inference while using file table function", 0) \
M(Bool, schema_inference_use_cache_for_s3, true, "Use cache in schema inference while using s3 table function", 0) \
M(Bool, schema_inference_use_cache_for_hdfs, true, "Use cache in schema inference while using hdfs table function", 0) \
M(Bool, schema_inference_use_cache_for_url, true, "Use cache in schema inference while using url table function", 0) \
M(Bool, schema_inference_cache_require_modification_time_for_url, true, "Use schema from cache for URL with last modification time validation (for urls with Last-Modified header)", 0) \
\
M(String, compatibility, "", "Changes other settings according to provided ClickHouse version. If we know that we changed some behaviour in ClickHouse by changing some settings in some version, this compatibility setting will control these settings", 0) \
\
M(Map, additional_table_filters, "", "Additional filter expression which would be applied after reading from specified table. Syntax: {'table1': 'expression', 'database.table2': 'expression'}", 0) \
@ -626,6 +636,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \
M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
M(Bool, optimize_sorting_by_input_stream_properties, true, "Optimize sorting by sorting properties of input stream", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.
@ -704,6 +715,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \
M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
M(String, schema_inference_hints, "", "The list of column names and types to use in schema inference for formats without column names. The format: 'column_name1 column_type1, column_name2 column_type2, ...'", 0) \
M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
M(Bool, input_format_json_try_infer_numbers_from_strings, true, "Try to infer numbers from string fields while schema inference", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer numbers from string fields while schema inference in text formats", 0) \

View File

@ -42,6 +42,22 @@ void SortColumnDescription::explain(JSONBuilder::JSONMap & map) const
map.add("With Fill", with_fill);
}
bool SortDescription::hasPrefix(const SortDescription & prefix) const
{
if (prefix.empty())
return true;
if (prefix.size() > size())
return false;
for (size_t i = 0; i < prefix.size(); ++i)
{
if ((*this)[i] != prefix[i])
return false;
}
return true;
}
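The same prefix check, restated over plain strings so the contract is easy to test standalone (names are illustrative):

#include <cassert>
#include <string>
#include <vector>

bool hasPrefix(const std::vector<std::string> & self, const std::vector<std::string> & prefix)
{
    if (prefix.size() > self.size())
        return false;
    for (size_t i = 0; i < prefix.size(); ++i)
        if (self[i] != prefix[i])
            return false;
    return true; /// an empty prefix always matches, as above
}

int main()
{
    std::vector<std::string> sort_key{"a", "b", "c"};
    assert(hasPrefix(sort_key, {}));
    assert(hasPrefix(sort_key, {"a", "b"}));
    assert(!hasPrefix(sort_key, {"b"}));
    return 0;
}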
#if USE_EMBEDDED_COMPILER
static CHJIT & getJITInstance()

View File

@ -8,8 +8,7 @@
#include <Core/SettingsEnums.h>
#include <Common/IntervalKind.h>
#include <DataTypes/IDataType.h>
class Collator;
#include <Columns/Collator.h>
namespace DB
{
@ -65,9 +64,18 @@ struct SortColumnDescription
{
}
bool operator == (const SortColumnDescription & other) const
static bool compareCollators(const std::shared_ptr<Collator> & a, const std::shared_ptr<Collator> & b)
{
return column_name == other.column_name && direction == other.direction && nulls_direction == other.nulls_direction;
if (unlikely(a && b))
return *a == *b;
return a == b;
}
bool operator==(const SortColumnDescription & other) const
{
return column_name == other.column_name && direction == other.direction && nulls_direction == other.nulls_direction
&& compareCollators(collator, other.collator);
}
bool operator != (const SortColumnDescription & other) const
@ -89,6 +97,13 @@ struct SortColumnDescriptionWithColumnIndex
: base(std::move(description_)), column_number(column_number_)
{
}
bool operator==(const SortColumnDescriptionWithColumnIndex & other) const
{
return base == other.base && column_number == other.column_number;
}
bool operator!=(const SortColumnDescriptionWithColumnIndex & other) const { return !(*this == other); }
};
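The compareCollators() helper above follows a reusable shape, sketched generically (`equalPtr` is an illustrative name): compare by value only when both sides are non-null; otherwise pointer equality covers the both-null and one-null cases.

#include <cassert>
#include <memory>

template <typename T>
bool equalPtr(const std::shared_ptr<T> & a, const std::shared_ptr<T> & b)
{
    if (a && b)
        return *a == *b; /// deep comparison, like *a == *b for Collator
    return a == b;       /// true only when both are null (or the same object)
}

int main()
{
    auto x = std::make_shared<int>(42);
    auto y = std::make_shared<int>(42);
    assert(equalPtr(x, y));                       /// equal by value
    assert(!equalPtr(x, std::shared_ptr<int>{})); /// one side null
    assert(equalPtr<int>(nullptr, nullptr));      /// both null
    return 0;
}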
class CompiledSortDescriptionFunctionHolder;
@ -104,6 +119,8 @@ public:
std::shared_ptr<CompiledSortDescriptionFunctionHolder> compiled_sort_description_holder;
size_t min_count_to_compile_sort_description = 3;
bool compile_sort_description = false;
bool hasPrefix(const SortDescription & prefix) const;
};
/** Compile sort description for header_types.

View File

@ -6,6 +6,7 @@
#include <IO/ReadBufferFromFile.h>
#include <Parsers/formatAST.h>
#include <Common/atomicRename.h>
#include <Common/filesystemHelpers.h>
#include <Storages/StorageMaterializedView.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExternalDictionariesLoader.h>
@ -117,13 +118,19 @@ void DatabaseAtomic::dropTable(ContextPtr local_context, const String & table_na
if (table)
table->dropInnerTableIfAny(sync, local_context);
else
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist",
backQuote(getDatabaseName()), backQuote(table_name));
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {}.{} doesn't exist", backQuote(getDatabaseName()), backQuote(table_name));
dropTableImpl(local_context, table_name, sync);
}
void DatabaseAtomic::dropTableImpl(ContextPtr local_context, const String & table_name, bool sync)
{
String table_metadata_path = getObjectMetadataPath(table_name);
String table_metadata_path_drop;
StoragePtr table;
{
std::lock_guard lock(mutex);
table = getTableUnlocked(table_name);
table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID());
auto txn = local_context->getZooKeeperMetadataTransaction();
if (txn && !local_context->isInternalSubquery())
@ -416,15 +423,15 @@ UUID DatabaseAtomic::tryGetTableUUID(const String & table_name) const
return UUIDHelpers::Nil;
}
void DatabaseAtomic::beforeLoadingMetadata(ContextMutablePtr /*context*/, bool force_restore, bool /*force_attach*/)
void DatabaseAtomic::beforeLoadingMetadata(ContextMutablePtr /*context*/, LoadingStrictnessLevel mode)
{
if (!force_restore)
if (mode < LoadingStrictnessLevel::FORCE_RESTORE)
return;
/// Recreate symlinks to table data dirs in case of force restore, because some of them may be broken
for (const auto & table_path : fs::directory_iterator(path_to_table_symlinks))
{
if (!fs::is_symlink(table_path))
if (!FS::isSymlink(table_path))
{
throw Exception(ErrorCodes::ABORTED,
"'{}' is not a symlink. Atomic database should contains only symlinks.", std::string(table_path.path()));
@ -435,17 +442,17 @@ void DatabaseAtomic::beforeLoadingMetadata(ContextMutablePtr /*context*/, bool f
}
void DatabaseAtomic::loadStoredObjects(
ContextMutablePtr local_context, bool force_restore, bool force_attach, bool skip_startup_tables)
ContextMutablePtr local_context, LoadingStrictnessLevel mode, bool skip_startup_tables)
{
beforeLoadingMetadata(local_context, force_restore, force_attach);
DatabaseOrdinary::loadStoredObjects(local_context, force_restore, force_attach, skip_startup_tables);
beforeLoadingMetadata(local_context, mode);
DatabaseOrdinary::loadStoredObjects(local_context, mode, skip_startup_tables);
}
void DatabaseAtomic::startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach)
void DatabaseAtomic::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
{
DatabaseOrdinary::startupTables(thread_pool, force_restore, force_attach);
DatabaseOrdinary::startupTables(thread_pool, mode);
if (!force_restore)
if (mode < LoadingStrictnessLevel::FORCE_RESTORE)
return;
NameToPathMap table_names;
@ -495,7 +502,7 @@ void DatabaseAtomic::tryCreateMetadataSymlink()
fs::path metadata_symlink(path_to_metadata_symlink);
if (fs::exists(metadata_symlink))
{
if (!fs::is_symlink(metadata_symlink))
if (!FS::isSymlink(metadata_symlink))
throw Exception(ErrorCodes::FILE_ALREADY_EXISTS, "Directory {} exists", path_to_metadata_symlink);
}
else

View File

@ -36,6 +36,7 @@ public:
bool dictionary) override;
void dropTable(ContextPtr context, const String & table_name, bool sync) override;
void dropTableImpl(ContextPtr context, const String & table_name, bool sync);
void attachTable(ContextPtr context, const String & name, const StoragePtr & table, const String & relative_table_path) override;
StoragePtr detachTable(ContextPtr context, const String & name) override;
@ -47,11 +48,11 @@ public:
DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override;
void loadStoredObjects(ContextMutablePtr context, bool force_restore, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr context, LoadingStrictnessLevel mode, bool skip_startup_tables) override;
void beforeLoadingMetadata(ContextMutablePtr context, bool force_restore, bool force_attach) override;
void beforeLoadingMetadata(ContextMutablePtr context, LoadingStrictnessLevel mode) override;
void startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) override;
void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
/// Atomic database cannot be detached if there is detached table which still in use
void assertCanBeDetached(bool cleanup) override;

View File

@ -16,6 +16,7 @@
#include <Storages/ExternalDataSourceConfiguration.h>
#include <Common/logger_useful.h>
#include <Common/Macros.h>
#include <Common/filesystemHelpers.h>
#include "config_core.h"
@ -60,8 +61,43 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
void cckMetadataPathForOrdinary(const ASTCreateQuery & create, const String & metadata_path)
{
const String & engine_name = create.storage->engine->name;
const String & database_name = create.getDatabase();
if (engine_name != "Ordinary")
return;
if (!FS::isSymlink(metadata_path))
return;
String target_path = FS::readSymlink(metadata_path).string();
fs::path path_to_remove = metadata_path;
if (path_to_remove.filename().empty())
path_to_remove = path_to_remove.parent_path();
/// Before 20.7 metadata/db_name.sql file might be absent and Ordinary database was attached if there's metadata/db_name/ dir.
/// Between 20.7 and 22.7 metadata/db_name.sql was created in this case as well.
/// Since 20.7 `default` database is created with Atomic engine on the very first server run.
/// The problem is that if server crashed during the very first run and metadata/db_name/ -> store/whatever symlink was created
/// then it's considered as Ordinary database. And it even works somehow
/// until background task tries to remove unused dir from store/...
throw Exception(ErrorCodes::CANNOT_CREATE_DATABASE,
"Metadata directory {} for Ordinary database {} is a symbolic link to {}. "
"It may be a result of manual intervention, crash on very first server start or a bug. "
"Database cannot be attached (it's kind of protection from potential data loss). "
"Metadata directory must not be a symlink and must contain tables metadata files itself. "
"You have to resolve this manually. It can be done like this: rm {}; sudo -u clickhouse mv {} {};",
metadata_path, database_name, target_path,
quoteString(path_to_remove.string()), quoteString(target_path), quoteString(path_to_remove.string()));
}
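The check above turns a previously late and vague failure into an early, explicit one with a concrete repair command. A hedged standalone sketch of just the detection step, using std::filesystem directly (the function name is made up; the real code goes through the FS:: helpers and throws CANNOT_CREATE_DATABASE):

#include <filesystem>
#include <iostream>

namespace fs = std::filesystem;

/// Refuse to attach an Ordinary database whose metadata directory is a symlink:
/// that layout is what a half-created Atomic database leaves behind
/// (metadata/db_name -> store/...), so attaching it as Ordinary risks data loss.
bool metadataPathLooksDangerous(const fs::path & metadata_path)
{
    if (!fs::is_symlink(metadata_path))
        return false;
    std::cerr << metadata_path << " is a symlink to "
              << fs::read_symlink(metadata_path) << ", refusing to attach\n";
    return true;
}

int main()
{
    return metadataPathLooksDangerous("metadata/db_name") ? 1 : 0;
}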
DatabasePtr DatabaseFactory::get(const ASTCreateQuery & create, const String & metadata_path, ContextPtr context)
{
cckMetadataPathForOrdinary(create, metadata_path);
/// Creates store/xxx/ for Atomic
fs::create_directories(fs::path(metadata_path).parent_path());
@ -127,19 +163,6 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE,
"Ordinary database engine is deprecated (see also allow_deprecated_database_ordinary setting)");
/// Before 20.7 metadata/db_name.sql file might absent and Ordinary database was attached if there's metadata/db_name/ dir.
/// Between 20.7 and 22.7 metadata/db_name.sql was created in this case as well.
/// Since 20.7 `default` database is created with Atomic engine on the very first server run.
/// The problem is that if server crashed during the very first run and metadata/db_name/ -> store/whatever symlink was created
/// then it's considered as Ordinary database. And it even works somehow
/// until background task tries to remove onused dir from store/...
if (fs::is_symlink(metadata_path))
throw Exception(ErrorCodes::CANNOT_CREATE_DATABASE, "Metadata directory {} for Ordinary database {} is a symbolic link to {}. "
"It may be a result of manual intervention, crash on very first server start or a bug. "
"Database cannot be attached (it's kind of protection from potential data loss). "
"Metadata directory must not be a symlink and must contain tables metadata files itself. "
"You have to resolve this manually.",
metadata_path, database_name, fs::read_symlink(metadata_path).string());
return std::make_shared<DatabaseOrdinary>(database_name, metadata_path, context);
}

View File

@ -38,7 +38,7 @@ DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_,
void DatabaseLazy::loadStoredObjects(
ContextMutablePtr local_context, bool /* force_restore */, bool /*force_attach*/, bool /* skip_startup_tables */)
ContextMutablePtr local_context, LoadingStrictnessLevel /*mode*/, bool /* skip_startup_tables */)
{
iterateMetadataFiles(local_context, [this, &local_context](const String & file_name)
{

View File

@ -26,7 +26,7 @@ public:
bool canContainDistributedTables() const override { return false; }
void loadStoredObjects(ContextMutablePtr context, bool force_restore, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr context, LoadingStrictnessLevel /*mode*/, bool skip_startup_tables) override;
void createTable(
ContextPtr context,

View File

@ -230,7 +230,7 @@ void DatabaseOnDisk::createTable(
/// If the table was detached permanently we will have a flag file with
/// .sql.detached extension, is not needed anymore since we attached the table back
void DatabaseOnDisk::removeDetachedPermanentlyFlag(ContextPtr, const String & table_name, const String & table_metadata_path, bool) const
void DatabaseOnDisk::removeDetachedPermanentlyFlag(ContextPtr, const String & table_name, const String & table_metadata_path, bool)
{
try
{

View File

@ -94,7 +94,7 @@ protected:
virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
const String & table_metadata_tmp_path, const String & table_metadata_path, ContextPtr query_context);
virtual void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach) const;
virtual void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach);
virtual void setDetachedTableNotInUseForce(const UUID & /*uuid*/) {}
const String metadata_path;

View File

@ -81,7 +81,7 @@ DatabaseOrdinary::DatabaseOrdinary(
}
void DatabaseOrdinary::loadStoredObjects(
ContextMutablePtr local_context, bool force_restore, bool force_attach, bool skip_startup_tables)
ContextMutablePtr local_context, LoadingStrictnessLevel mode, bool skip_startup_tables)
{
/** Tables load faster if they are loaded in sorted (by name) order.
* Otherwise (for the ext4 filesystem), `DirectoryIterator` iterates through them in some order,
@ -89,6 +89,7 @@ void DatabaseOrdinary::loadStoredObjects(
*/
ParsedTablesMetadata metadata;
bool force_attach = LoadingStrictnessLevel::FORCE_ATTACH <= mode;
loadTablesMetadata(local_context, metadata, force_attach);
size_t total_tables = metadata.parsed_tables.size() - metadata.total_dictionaries;
@ -118,7 +119,7 @@ void DatabaseOrdinary::loadStoredObjects(
{
pool.scheduleOrThrowOnError([&]()
{
loadTableFromMetadata(local_context, path, name, ast, force_restore);
loadTableFromMetadata(local_context, path, name, ast, mode);
/// Messages, so that it's not boring to wait for the server to load for a long time.
logAboutProgress(log, ++dictionaries_processed, metadata.total_dictionaries, watch);
@ -140,7 +141,7 @@ void DatabaseOrdinary::loadStoredObjects(
{
pool.scheduleOrThrowOnError([&]()
{
loadTableFromMetadata(local_context, path, name, ast, force_restore);
loadTableFromMetadata(local_context, path, name, ast, mode);
/// Messages, so that it's not boring to wait for the server to load for a long time.
logAboutProgress(log, ++tables_processed, total_tables, watch);
@ -153,7 +154,7 @@ void DatabaseOrdinary::loadStoredObjects(
if (!skip_startup_tables)
{
/// After all tables were basically initialized, start them up.
startupTables(pool, force_restore, force_attach);
startupTables(pool, mode);
}
}
@ -198,6 +199,7 @@ void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTables
if (fs::exists(full_path.string() + detached_suffix))
{
const std::string table_name = unescapeForFileName(file_name.substr(0, file_name.size() - 4));
permanently_detached_tables.push_back(table_name);
LOG_DEBUG(log, "Skipping permanently detached table {}.", backQuote(table_name));
return;
}
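Permanent detachment is tracked purely on disk: a sibling flag file next to the table's metadata. A simplified sketch of how such tables could be collected at startup (findPermanentlyDetached is a hypothetical name; the real code also unescapes file names):

#include <filesystem>
#include <iostream>
#include <string>
#include <vector>

namespace fs = std::filesystem;

/// For a metadata file t.sql, the flag file t.sql.detached marks the table
/// as permanently detached; such tables are skipped at startup and remembered.
std::vector<std::string> findPermanentlyDetached(const fs::path & metadata_dir)
{
    std::vector<std::string> result;
    if (!fs::is_directory(metadata_dir))
        return result;
    for (const auto & entry : fs::directory_iterator(metadata_dir))
    {
        const fs::path & p = entry.path();
        if (p.extension() != ".sql")
            continue;
        if (fs::exists(p.string() + ".detached"))
            result.push_back(p.stem().string());  /// simplified: no unescaping here
    }
    return result;
}

int main()
{
    for (const auto & name : findPermanentlyDetached("metadata/db_name"))
        std::cout << "permanently detached: " << name << '\n';
}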
@ -238,7 +240,8 @@ void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTables
TSA_SUPPRESS_WARNING_FOR_READ(database_name), tables_in_database, dictionaries_in_database);
}
void DatabaseOrdinary::loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast, bool force_restore)
void DatabaseOrdinary::loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast,
LoadingStrictnessLevel mode)
{
assert(name.database == TSA_SUPPRESS_WARNING_FOR_READ(database_name));
const auto & create_query = ast->as<const ASTCreateQuery &>();
@ -248,11 +251,10 @@ void DatabaseOrdinary::loadTableFromMetadata(ContextMutablePtr local_context, co
create_query,
*this,
name.database,
file_path,
force_restore);
file_path, LoadingStrictnessLevel::FORCE_RESTORE <= mode);
}
void DatabaseOrdinary::startupTables(ThreadPool & thread_pool, bool /*force_restore*/, bool /*force_attach*/)
void DatabaseOrdinary::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel /*mode*/)
{
LOG_INFO(log, "Starting up tables.");

View File

@ -21,21 +21,24 @@ public:
String getEngineName() const override { return "Ordinary"; }
void loadStoredObjects(ContextMutablePtr context, bool force_restore, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr context, LoadingStrictnessLevel mode, bool skip_startup_tables) override;
bool supportsLoadingInTopologicalOrder() const override { return true; }
void loadTablesMetadata(ContextPtr context, ParsedTablesMetadata & metadata, bool is_startup) override;
void loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast, bool force_restore) override;
void loadTableFromMetadata(ContextMutablePtr local_context, const String & file_path, const QualifiedTableName & name, const ASTPtr & ast,
LoadingStrictnessLevel mode) override;
void startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) override;
void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
void alterTable(
ContextPtr context,
const StorageID & table_id,
const StorageInMemoryMetadata & metadata) override;
Strings getNamesOfPermanentlyDetachedTables() const override { return permanently_detached_tables; }
protected:
virtual void commitAlterTable(
const StorageID & table_id,
@ -43,6 +46,8 @@ protected:
const String & table_metadata_path,
const String & statement,
ContextPtr query_context);
Strings permanently_detached_tables;
};
}

View File

@ -48,10 +48,12 @@ namespace ErrorCodes
extern const int CANNOT_RESTORE_TABLE;
}
static constexpr const char * REPLICATED_DATABASE_MARK = "DatabaseReplicated";
static constexpr const char * DROPPED_MARK = "DROPPED";
static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables";
static constexpr const char * BROKEN_REPLICATED_TABLES_SUFFIX = "_broken_replicated_tables";
static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;
zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const
{
@ -63,6 +65,13 @@ static inline String getHostID(ContextPtr global_context, const UUID & db_uuid)
return Cluster::Address::toString(getFQDNOrHostName(), global_context->getTCPPort()) + ':' + toString(db_uuid);
}
static inline UInt64 getMetadataHash(const String & table_name, const String & metadata)
{
SipHash hash;
hash.update(table_name);
hash.update(metadata);
return hash.get64();
}
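This per-table hash feeds an additive digest: the database-level value is the plain sum of getMetadataHash over all tables, so any single CREATE, DROP, RENAME, or ALTER can update it with one subtraction and one addition instead of a full rescan, and unsigned wraparound keeps the arithmetic safe in any order. A self-contained sketch of the idea, with std::hash standing in for SipHash:

#include <cstdint>
#include <functional>
#include <map>
#include <string>

uint64_t metadataHash(const std::string & table_name, const std::string & metadata)
{
    /// Stand-in for SipHash over (table_name, metadata); only additivity matters here.
    return std::hash<std::string>{}(table_name + '\0' + metadata);
}

int main()
{
    std::map<std::string, std::string> tables{
        {"t1", "CREATE TABLE t1 ..."},
        {"t2", "CREATE TABLE t2 ..."}};

    uint64_t digest = 0;
    for (const auto & [name, metadata] : tables)
        digest += metadataHash(name, metadata);   /// full pass, done once at startup

    /// ALTER TABLE t1: swap one term instead of rescanning all tables.
    std::string altered = "CREATE TABLE t1 ... /* altered */";
    digest -= metadataHash("t1", tables["t1"]);
    digest += metadataHash("t1", altered);
    tables["t1"] = altered;
}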
DatabaseReplicated::~DatabaseReplicated() = default;
@ -80,6 +89,7 @@ DatabaseReplicated::DatabaseReplicated(
, shard_name(shard_name_)
, replica_name(replica_name_)
, db_settings(std::move(db_settings_))
, tables_metadata_digest(0)
{
if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty())
throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS);
@ -116,13 +126,30 @@ std::pair<String, String> DatabaseReplicated::parseFullReplicaName(const String
return {shard, replica};
}
ClusterPtr DatabaseReplicated::getCluster() const
ClusterPtr DatabaseReplicated::tryGetCluster() const
{
std::lock_guard lock{mutex};
if (cluster)
return cluster;
cluster = getClusterImpl();
/// Database is probably not created or not initialized yet, it's ok to return nullptr
if (is_readonly)
return cluster;
try
{
/// A quick fix for stateless tests with DatabaseReplicated. Its ZK
/// node can be destroyed at any time. If another test lists
/// system.clusters to get client command line suggestions, it will
/// get an error when trying to get the info about DB from ZK.
/// Just ignore these inaccessible databases. A good example of a
/// failing test is `01526_client_start_and_exit`.
cluster = getClusterImpl();
}
catch (...)
{
tryLogCurrentException(log);
}
return cluster;
}
@ -232,7 +259,7 @@ void DatabaseReplicated::fillClusterAuthInfo(String collection_name, const Poco:
cluster_auth_info.cluster_secure_connection = config_ref.getBool(config_prefix + ".cluster_secure_connection", false);
}
void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach)
void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessLevel mode)
{
try
{
@ -250,27 +277,48 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach)
}
replica_path = fs::path(zookeeper_path) / "replicas" / getFullReplicaName();
bool is_create_query = mode == LoadingStrictnessLevel::CREATE;
String replica_host_id;
if (current_zookeeper->tryGet(replica_path, replica_host_id))
{
if (replica_host_id == DROPPED_MARK && !is_create_query)
{
LOG_WARNING(log, "Database {} exists locally, but marked dropped in ZooKeeper ({}). "
"Will not try to start it up", getDatabaseName(), replica_path);
is_probably_dropped = true;
return;
}
String host_id = getHostID(getContext(), db_uuid);
if (replica_host_id != host_id)
throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST,
"Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
replica_name, shard_name, zookeeper_path, replica_host_id, host_id);
if (is_create_query || replica_host_id != host_id)
{
throw Exception(
ErrorCodes::REPLICA_IS_ALREADY_EXIST,
"Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
replica_name, shard_name, zookeeper_path, replica_host_id, host_id);
}
}
else if (is_create_query)
{
/// Create new replica. Throws if replica with the same name already exists
createReplicaNodesInZooKeeper(current_zookeeper);
}
else
{
/// Throws if replica with the same name already exists
createReplicaNodesInZooKeeper(current_zookeeper);
/// It's not a CREATE query, but the replica does not exist. Probably it was dropped.
/// Do not create anything, continue as readonly.
LOG_WARNING(log, "Database {} exists locally, but its replica does not exist in ZooKeeper ({}). "
"Assuming it was dropped, will not try to start it up", getDatabaseName(), replica_path);
is_probably_dropped = true;
return;
}
is_readonly = false;
}
catch (...)
{
if (!force_attach)
if (mode < LoadingStrictnessLevel::FORCE_ATTACH)
throw;
/// It's server startup, ignore error.
@ -284,7 +332,7 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP
current_zookeeper->createAncestors(zookeeper_path);
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, REPLICATED_DATABASE_MARK, zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent));
@ -306,10 +354,42 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP
/// Other codes are unexpected, will throw
zkutil::KeeperMultiException::check(res, ops, responses);
assert(false);
chassert(false);
__builtin_unreachable();
}
bool DatabaseReplicated::looksLikeReplicatedDatabasePath(const ZooKeeperPtr & current_zookeeper, const String & path)
{
Coordination::Stat stat;
String maybe_database_mark;
if (!current_zookeeper->tryGet(path, maybe_database_mark, &stat))
return false;
if (maybe_database_mark.starts_with(REPLICATED_DATABASE_MARK))
return true;
if (maybe_database_mark.empty())
return false;
/// Old versions did not have REPLICATED_DATABASE_MARK. Check specific nodes exist and add mark.
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCheckRequest(path + "/log", -1));
ops.emplace_back(zkutil::makeCheckRequest(path + "/replicas", -1));
ops.emplace_back(zkutil::makeCheckRequest(path + "/counter", -1));
ops.emplace_back(zkutil::makeCheckRequest(path + "/metadata", -1));
ops.emplace_back(zkutil::makeCheckRequest(path + "/max_log_ptr", -1));
ops.emplace_back(zkutil::makeCheckRequest(path + "/logs_to_keep", -1));
ops.emplace_back(zkutil::makeSetRequest(path, REPLICATED_DATABASE_MARK, stat.version));
Coordination::Responses responses;
auto res = current_zookeeper->tryMulti(ops, responses);
if (res == Coordination::Error::ZOK)
return true;
/// Recheck database mark (just in case of concurrent update).
if (!current_zookeeper->tryGet(path, maybe_database_mark, &stat))
return false;
return maybe_database_mark.starts_with(REPLICATED_DATABASE_MARK);
}
void DatabaseReplicated::createEmptyLogEntry(const ZooKeeperPtr & current_zookeeper)
{
/// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info).
@ -319,11 +399,17 @@ void DatabaseReplicated::createEmptyLogEntry(const ZooKeeperPtr & current_zookee
bool DatabaseReplicated::waitForReplicaToProcessAllEntries(UInt64 timeout_ms)
{
if (!ddl_worker)
return false;
return ddl_worker->waitForReplicaToProcessAllEntries(timeout_ms);
}
void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
{
if (!looksLikeReplicatedDatabasePath(current_zookeeper, zookeeper_path))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot add new database replica: provided path {} "
"already contains some data and it does not look like Replicated database path.", zookeeper_path);
/// Write host name to replica_path, it will protect from multiple replicas with the same name
auto host_id = getHostID(getContext(), db_uuid);
@ -334,6 +420,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/digest", "0", zkutil::CreateMode::Persistent));
/// In addition to creating the replica nodes, we record the max_log_ptr at the instant where
/// we declared ourselves as an existing replica. We'll need this during recoverLostReplica to
/// notify other nodes that issued new queries while this node was recovering.
@ -354,25 +441,89 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt
createEmptyLogEntry(current_zookeeper);
}
void DatabaseReplicated::beforeLoadingMetadata(ContextMutablePtr /*context*/, bool /*force_restore*/, bool force_attach)
void DatabaseReplicated::beforeLoadingMetadata(ContextMutablePtr /*context*/, LoadingStrictnessLevel mode)
{
tryConnectToZooKeeperAndInitDatabase(force_attach);
tryConnectToZooKeeperAndInitDatabase(mode);
}
void DatabaseReplicated::loadStoredObjects(
ContextMutablePtr local_context, bool force_restore, bool force_attach, bool skip_startup_tables)
ContextMutablePtr local_context, LoadingStrictnessLevel mode, bool skip_startup_tables)
{
beforeLoadingMetadata(local_context, force_restore, force_attach);
DatabaseAtomic::loadStoredObjects(local_context, force_restore, force_attach, skip_startup_tables);
beforeLoadingMetadata(local_context, mode);
DatabaseAtomic::loadStoredObjects(local_context, mode, skip_startup_tables);
}
void DatabaseReplicated::startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach)
UInt64 DatabaseReplicated::getMetadataHash(const String & table_name) const
{
DatabaseAtomic::startupTables(thread_pool, force_restore, force_attach);
return DB::getMetadataHash(table_name, readMetadataFile(table_name));
}
void DatabaseReplicated::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
{
DatabaseAtomic::startupTables(thread_pool, mode);
/// TSA: No concurrent writes are possible during loading
UInt64 digest = 0;
for (const auto & table : TSA_SUPPRESS_WARNING_FOR_READ(tables))
digest += getMetadataHash(table.first);
LOG_DEBUG(log, "Calculated metadata digest of {} tables: {}", TSA_SUPPRESS_WARNING_FOR_READ(tables).size(), digest);
chassert(!TSA_SUPPRESS_WARNING_FOR_READ(tables_metadata_digest));
TSA_SUPPRESS_WARNING_FOR_WRITE(tables_metadata_digest) = digest;
ddl_worker = std::make_unique<DatabaseReplicatedDDLWorker>(this, getContext());
if (is_probably_dropped)
return;
ddl_worker->startup();
}
bool DatabaseReplicated::checkDigestValid(const ContextPtr & local_context, bool debug_check /* = true */) const
{
if (debug_check)
{
/// Reduce number of debug checks
if (thread_local_rng() % 16)
return true;
}
LOG_TEST(log, "Current in-memory metadata digest: {}", tables_metadata_digest);
/// Database is probably being dropped
if (!local_context->getZooKeeperMetadataTransaction() && !ddl_worker->isCurrentlyActive())
return true;
UInt64 local_digest = 0;
{
std::lock_guard lock{mutex};
for (const auto & table : TSA_SUPPRESS_WARNING_FOR_READ(tables))
local_digest += getMetadataHash(table.first);
}
if (local_digest != tables_metadata_digest)
{
LOG_ERROR(log, "Digest of local metadata ({}) is not equal to in-memory digest ({})", local_digest, tables_metadata_digest);
return false;
}
/// Do not check digest in Keeper after internal subquery, it's probably not committed yet
if (local_context->isInternalSubquery())
return true;
/// It does not make sense to check the digest in Keeper while recovering
if (is_recovering)
return true;
String zk_digest = getZooKeeper()->get(replica_path + "/digest");
String local_digest_str = toString(local_digest);
if (zk_digest != local_digest_str)
{
LOG_ERROR(log, "Digest of local metadata ({}) is not equal to digest in Keeper ({})", local_digest_str, zk_digest);
return false;
}
return true;
}
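The `thread_local_rng() % 16` guard above samples the expensive recheck instead of running it on every call. A standalone sketch of the same trick, with std::mt19937 standing in for thread_local_rng:

#include <iostream>
#include <random>

/// Run the expensive validation on only a ~1/16 random sample of calls.
bool shouldRunExpensiveCheck()
{
    static thread_local std::mt19937 rng{std::random_device{}()};
    return rng() % 16 == 0;
}

int main()
{
    int runs = 0;
    for (int i = 0; i < 16000; ++i)
        runs += shouldRunExpensiveCheck();
    std::cout << runs << " of 16000 calls did the full check\n";  /// ~1000 expected
}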
void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_context) const
{
/// Replicas will set the correct name of the current database in the query context (the database name can differ between replicas)
@ -512,7 +663,7 @@ static UUID getTableUUIDIfReplicated(const String & metadata, ContextPtr context
return UUIDHelpers::Nil;
if (!startsWith(create.storage->engine->name, "Replicated") || !endsWith(create.storage->engine->name, "MergeTree"))
return UUIDHelpers::Nil;
assert(create.uuid != UUIDHelpers::Nil);
chassert(create.uuid != UUIDHelpers::Nil);
return create.uuid;
}
@ -532,9 +683,6 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
else
LOG_WARNING(log, "Will recover replica with staled log pointer {} from log pointer {}", our_log_ptr, max_log_ptr);
if (new_replica && !empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty");
auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr);
/// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table.
@ -578,7 +726,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
auto in_zk = table_name_to_metadata.find(name);
if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name))
{
/// Local table does not exits in ZooKeeper or has different metadata
/// Local table does not exist in ZooKeeper or has different metadata
tables_to_detach.emplace_back(std::move(name));
}
}
@ -640,7 +788,13 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
assert(db_name < to_database_name);
DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_database_name, to_name);
auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_database_name);
DatabaseAtomic::renameTable(make_query_context(), broken_table_name, *to_db_ptr, to_name, false, false);
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= getMetadataHash(broken_table_name);
DatabaseAtomic::renameTable(make_query_context(), broken_table_name, *to_db_ptr, to_name, /* exchange */ false, /* dictionary */ false);
tables_metadata_digest = new_digest;
assert(checkDigestValid(getContext()));
++moved_tables;
};
@ -649,9 +803,24 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name));
dropped_tables.push_back(tryGetTableUUID(table_name));
dropped_dictionaries += table->isDictionary();
table->flushAndShutdown();
DatabaseAtomic::dropTable(make_query_context(), table_name, true);
if (table->getName() == "MaterializedView" || table->getName() == "WindowView")
{
/// We have to drop the MV inner table, so the MV will not try to do it implicitly, breaking some invariants.
/// Also we have to commit the metadata transaction, because it's not committed by default for inner tables of MVs.
/// Yep, I hate inner tables of materialized views.
auto mv_drop_inner_table_context = make_query_context();
table->dropInnerTableIfAny(sync, mv_drop_inner_table_context);
mv_drop_inner_table_context->getZooKeeperMetadataTransaction()->commit();
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= getMetadataHash(table_name);
DatabaseAtomic::dropTableImpl(make_query_context(), table_name, /* sync */ true);
tables_metadata_digest = new_digest;
assert(checkDigestValid(getContext()));
}
else if (!table->supportsReplication())
{
@ -677,7 +846,15 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
/// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names?
DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to));
DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to));
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
String statement = readMetadataFile(from);
new_digest -= DB::getMetadataHash(from, statement);
new_digest += DB::getMetadataHash(to, statement);
DatabaseAtomic::renameTable(make_query_context(), from, *this, to, false, false);
tables_metadata_digest = new_digest;
assert(checkDigestValid(getContext()));
}
for (const auto & id : dropped_tables)
@ -712,6 +889,10 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
LOG_INFO(log, "Marked recovered {} as finished", entry_name);
}
}
std::lock_guard lock{metadata_mutex};
chassert(checkDigestValid(getContext()));
current_zookeeper->set(replica_path + "/digest", toString(tables_metadata_digest));
}
std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr)
@ -749,8 +930,8 @@ std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(co
}
else
{
assert(max_log_ptr == new_max_log_ptr);
assert(table_names.size() != table_name_to_metadata.size());
chassert(max_log_ptr == new_max_log_ptr);
chassert(table_names.size() != table_name_to_metadata.size());
LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry");
}
}
@ -801,6 +982,8 @@ void DatabaseReplicated::drop(ContextPtr context_)
void DatabaseReplicated::stopReplication()
{
if (is_probably_dropped)
return;
if (ddl_worker)
ddl_worker->shutdown();
}
@ -817,12 +1000,29 @@ void DatabaseReplicated::dropTable(ContextPtr local_context, const String & tabl
{
auto txn = local_context->getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn || startsWith(table_name, ".inner_id."));
if (txn && txn->isInitialQuery())
if (txn && txn->isInitialQuery() && !txn->isCreateOrReplaceQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
}
DatabaseAtomic::dropTable(local_context, table_name, sync);
auto table = tryGetTable(table_name, getContext());
if (table->getName() == "MaterializedView" || table->getName() == "WindowView")
{
/// Avoid recursive locking of metadata_mutex
table->dropInnerTableIfAny(sync, local_context);
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= getMetadataHash(table_name);
if (txn && !txn->isCreateOrReplaceQuery())
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
DatabaseAtomic::dropTableImpl(local_context, table_name, sync);
tables_metadata_digest = new_digest;
assert(checkDigestValid(local_context));
}
void DatabaseReplicated::renameTable(ContextPtr local_context, const String & table_name, IDatabase & to_database,
@ -831,31 +1031,51 @@ void DatabaseReplicated::renameTable(ContextPtr local_context, const String & ta
auto txn = local_context->getZooKeeperMetadataTransaction();
assert(txn);
if (this != &to_database)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine");
if (table_name == to_table_name)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself");
if (!isTableExist(table_name, local_context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name);
if (exchange && !to_database.isTableExist(to_table_name, local_context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name);
String statement = readMetadataFile(table_name);
String statement_to;
if (exchange)
statement_to = readMetadataFile(to_table_name);
if (txn->isInitialQuery())
{
if (this != &to_database)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine");
if (table_name == to_table_name)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself");
if (!isTableExist(table_name, local_context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name);
if (exchange && !to_database.isTableExist(to_table_name, local_context))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name);
String statement = readMetadataFile(table_name);
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
String metadata_zk_path_to = zookeeper_path + "/metadata/" + escapeForFileName(to_table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
if (!txn->isCreateOrReplaceQuery())
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
if (exchange)
{
String statement_to = readMetadataFile(to_table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path_to, -1));
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent));
if (!txn->isCreateOrReplaceQuery())
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent));
}
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent));
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= DB::getMetadataHash(table_name, statement);
new_digest += DB::getMetadataHash(to_table_name, statement);
if (exchange)
{
new_digest -= DB::getMetadataHash(to_table_name, statement_to);
new_digest += DB::getMetadataHash(table_name, statement_to);
}
if (txn)
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
DatabaseAtomic::renameTable(local_context, table_name, to_database, to_table_name, exchange, dictionary);
tables_metadata_digest = new_digest;
assert(checkDigestValid(local_context));
}
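The digest bookkeeping in renameTable reduces to replacing hash terms: two for a plain rename, four for an exchange. A small sketch under the same stand-in hash as before (table names a and b are placeholders):

#include <cstdint>
#include <functional>
#include <string>

/// Stand-in for the SipHash-based getMetadataHash(name, statement).
uint64_t h(const std::string & name, const std::string & stmt)
{
    return std::hash<std::string>{}(name + '\0' + stmt);
}

/// RENAME a TO b: the statement text keeps its identity, only the name term changes.
uint64_t digestAfterRename(uint64_t d, const std::string & stmt)
{
    return d - h("a", stmt) + h("b", stmt);
}

/// EXCHANGE a AND b: both statements swap names, so four terms are replaced.
uint64_t digestAfterExchange(uint64_t d, const std::string & stmt_a, const std::string & stmt_b)
{
    return d - h("a", stmt_a) + h("b", stmt_a)
             - h("b", stmt_b) + h("a", stmt_b);
}

int main()
{
    uint64_t d = h("a", "stmt_a") + h("b", "stmt_b");
    d = digestAfterExchange(d, "stmt_a", "stmt_b");
    /// After the exchange the digest equals the sum computed from scratch:
    return d == h("b", "stmt_a") + h("a", "stmt_b") ? 0 : 1;
}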
void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
@ -864,14 +1084,24 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S
{
auto txn = query_context->getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
String statement = getObjectDefinitionFromCreateQuery(query.clone());
if (txn && txn->isInitialQuery() && !txn->isCreateOrReplaceQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(query.getTable());
String statement = getObjectDefinitionFromCreateQuery(query.clone());
/// zk::multi(...) will throw if `metadata_zk_path` exists
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent));
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest += DB::getMetadataHash(query.getTable(), statement);
if (txn && !txn->isCreateOrReplaceQuery())
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context);
tables_metadata_digest = new_digest;
assert(checkDigestValid(query_context));
}
void DatabaseReplicated::commitAlterTable(const StorageID & table_id,
@ -879,12 +1109,23 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id,
const String & statement, ContextPtr query_context)
{
auto txn = query_context->getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
if (txn && txn->isInitialQuery())
{
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name);
txn->addOp(zkutil::makeSetRequest(metadata_zk_path, statement, -1));
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= getMetadataHash(table_id.table_name);
new_digest += DB::getMetadataHash(table_id.table_name, statement);
if (txn)
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context);
tables_metadata_digest = new_digest;
assert(checkDigestValid(query_context));
}
void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const String & table_name)
@ -898,10 +1139,19 @@ void DatabaseReplicated::detachTablePermanently(ContextPtr local_context, const
String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1));
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
new_digest -= getMetadataHash(table_name);
if (txn)
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
DatabaseAtomic::detachTablePermanently(local_context, table_name);
tables_metadata_digest = new_digest;
assert(checkDigestValid(local_context));
}
void DatabaseReplicated::removeDetachedPermanentlyFlag(ContextPtr local_context, const String & table_name, const String & table_metadata_path, bool attach) const
void DatabaseReplicated::removeDetachedPermanentlyFlag(ContextPtr local_context, const String & table_name, const String & table_metadata_path, bool attach)
{
auto txn = local_context->getZooKeeperMetadataTransaction();
assert(!ddl_worker->isCurrentlyActive() || txn);
@ -911,14 +1161,26 @@ void DatabaseReplicated::removeDetachedPermanentlyFlag(ContextPtr local_context,
String statement = readMetadataFile(table_name);
txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent));
}
std::lock_guard lock{metadata_mutex};
UInt64 new_digest = tables_metadata_digest;
if (attach)
{
new_digest += getMetadataHash(table_name);
if (txn)
txn->addOp(zkutil::makeSetRequest(replica_path + "/digest", toString(new_digest), -1));
}
DatabaseAtomic::removeDetachedPermanentlyFlag(local_context, table_name, table_metadata_path, attach);
tables_metadata_digest = new_digest;
assert(checkDigestValid(local_context));
}
String DatabaseReplicated::readMetadataFile(const String & table_name) const
{
String statement;
ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096);
ReadBufferFromFile in(getObjectMetadataPath(table_name), METADATA_FILE_BUFFER_SIZE);
readStringUntilEOF(statement, in);
return statement;
}

View File

@ -40,7 +40,7 @@ public:
const String & table_metadata_tmp_path, const String & table_metadata_path,
const String & statement, ContextPtr query_context) override;
void detachTablePermanently(ContextPtr context, const String & table_name) override;
void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach) const override;
void removeDetachedPermanentlyFlag(ContextPtr context, const String & table_name, const String & table_metadata_path, bool attach) override;
bool waitForReplicaToProcessAllEntries(UInt64 timeout_ms);
@ -60,15 +60,15 @@ public:
const String & getZooKeeperPath() const { return zookeeper_path; }
/// Returns cluster consisting of database replicas
ClusterPtr getCluster() const;
ClusterPtr tryGetCluster() const;
void drop(ContextPtr /*context*/) override;
void loadStoredObjects(ContextMutablePtr context, bool force_restore, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr context, LoadingStrictnessLevel mode, bool skip_startup_tables) override;
void beforeLoadingMetadata(ContextMutablePtr context, bool force_restore, bool force_attach) override;
void beforeLoadingMetadata(ContextMutablePtr context, LoadingStrictnessLevel mode) override;
void startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) override;
void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
void shutdown() override;
@ -78,8 +78,9 @@ public:
friend struct DatabaseReplicatedTask;
friend class DatabaseReplicatedDDLWorker;
private:
void tryConnectToZooKeeperAndInitDatabase(bool force_attach);
void tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessLevel mode);
bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
static bool looksLikeReplicatedDatabasePath(const ZooKeeperPtr & current_zookeeper, const String & path);
void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
struct
@ -110,6 +111,9 @@ private:
return is_recovering && typeid_cast<DatabaseAtomic *>(&to_database);
}
UInt64 getMetadataHash(const String & table_name) const;
bool checkDigestValid(const ContextPtr & local_context, bool debug_check = true) const TSA_REQUIRES(metadata_mutex);
String zookeeper_path;
String shard_name;
String replica_name;
@ -119,10 +123,20 @@ private:
zkutil::ZooKeeperPtr getZooKeeper() const;
std::atomic_bool is_readonly = true;
std::atomic_bool is_probably_dropped = false;
std::atomic_bool is_recovering = false;
std::unique_ptr<DatabaseReplicatedDDLWorker> ddl_worker;
UInt32 max_log_ptr_at_creation = 0;
/// Usually operations with metadata are single-threaded because of the way replication works,
/// but StorageReplicatedMergeTree may call alterTable outside of DatabaseReplicatedDDLWorker, causing race conditions.
std::mutex metadata_mutex;
/// Sum of hashes of pairs (table_name, table_create_statement).
/// We calculate this sum from local metadata files and compare it with the value in ZooKeeper.
/// It allows us to detect broken metadata and recover the replica.
UInt64 tables_metadata_digest TSA_GUARDED_BY(metadata_mutex);
mutable ClusterPtr cluster;
};

View File

@ -12,6 +12,7 @@ class ASTStorage;
M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag is greater", 0) \
M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if the timeout is exceeded, but the initiator host has not executed it yet", 0) \
M(String, collection_name, "", "A name of a collection defined in server's config where all info for cluster authentication is defined", 0) \
M(Bool, check_consistency, true, "Check consistency of local metadata and metadata in Keeper, do replica recovery on inconsistency", 0) \
DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS)

View File

@ -32,9 +32,10 @@ bool DatabaseReplicatedDDLWorker::initializeMainThread()
{
try
{
chassert(!database->is_probably_dropped);
auto zookeeper = getAndSetZooKeeper();
if (database->is_readonly)
database->tryConnectToZooKeeperAndInitDatabase(false);
database->tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessLevel::ATTACH);
initializeReplication();
initialized = true;
return true;
@ -65,8 +66,34 @@ void DatabaseReplicatedDDLWorker::initializeReplication()
UInt32 our_log_ptr = parse<UInt32>(log_ptr_str);
UInt32 max_log_ptr = parse<UInt32>(zookeeper->get(database->zookeeper_path + "/max_log_ptr"));
logs_to_keep = parse<UInt32>(zookeeper->get(database->zookeeper_path + "/logs_to_keep"));
if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr)
UInt64 digest;
String digest_str;
UInt64 local_digest;
if (zookeeper->tryGet(database->replica_path + "/digest", digest_str))
{
digest = parse<UInt64>(digest_str);
std::lock_guard lock{database->metadata_mutex};
local_digest = database->tables_metadata_digest;
}
else
{
/// Database was created by old ClickHouse versions, let's create the node
std::lock_guard lock{database->metadata_mutex};
digest = local_digest = database->tables_metadata_digest;
digest_str = toString(digest);
zookeeper->create(database->replica_path + "/digest", digest_str, zkutil::CreateMode::Persistent);
}
bool is_new_replica = our_log_ptr == 0;
bool lost_according_to_log_ptr = our_log_ptr + logs_to_keep < max_log_ptr;
bool lost_according_to_digest = database->db_settings.check_consistency && local_digest != digest;
if (is_new_replica || lost_according_to_log_ptr || lost_according_to_digest)
{
if (!is_new_replica)
LOG_WARNING(log, "Replica seems to be lost: our_log_ptr={}, max_log_ptr={}, local_digest={}, zk_digest={}",
our_log_ptr, max_log_ptr, local_digest, digest);
database->recoverLostReplica(zookeeper, our_log_ptr, max_log_ptr);
zookeeper->set(database->replica_path + "/log_ptr", toString(max_log_ptr));
initializeLogPointer(DDLTaskBase::getLogEntryName(max_log_ptr));
@ -77,6 +104,10 @@ void DatabaseReplicatedDDLWorker::initializeReplication()
last_skipped_entry_name.emplace(log_entry_name);
initializeLogPointer(log_entry_name);
}
std::lock_guard lock{database->metadata_mutex};
if (!database->checkDigestValid(context))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent database metadata after reconnection to ZooKeeper");
}
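Recovery is now triggered by any of three independent signals: a brand-new replica, a log pointer that lags behind the retained log window, or a digest mismatch. A self-contained sketch of just that predicate (the ReplicaState wrapper is hypothetical; field names mirror the diff):

#include <cstdint>

struct ReplicaState
{
    uint32_t our_log_ptr;      /// last log entry this replica has executed
    uint32_t max_log_ptr;      /// last log entry committed on the initiator
    uint32_t logs_to_keep;     /// how many log entries are retained in ZooKeeper
    uint64_t local_digest;     /// sum of metadata hashes computed from local .sql files
    uint64_t zk_digest;        /// digest stored in ZooKeeper for this replica
    bool check_consistency;    /// database setting, true by default
};

bool needsRecovery(const ReplicaState & s)
{
    bool is_new_replica = s.our_log_ptr == 0;
    /// Entries older than max_log_ptr - logs_to_keep are already rotated away,
    /// so a replica that lags further than that cannot replay the log.
    bool lost_according_to_log_ptr = s.our_log_ptr + s.logs_to_keep < s.max_log_ptr;
    /// Local metadata does not match what this replica last reported to Keeper.
    bool lost_according_to_digest = s.check_consistency && s.local_digest != s.zk_digest;
    return is_new_replica || lost_according_to_log_ptr || lost_according_to_digest;
}

int main()
{
    ReplicaState fresh{0, 100, 10, 0, 0, true};
    ReplicaState lagging{5, 100, 10, 42, 42, true};     /// 5 + 10 < 100: log already rotated
    ReplicaState diverged{99, 100, 10, 42, 43, true};   /// digests differ

    return needsRecovery(fresh) && needsRecovery(lagging) && needsRecovery(diverged) ? 0 : 1;
}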
String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)
@ -93,7 +124,7 @@ bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeo
const auto max_log_ptr_path = database->zookeeper_path + "/max_log_ptr";
UInt32 our_log_ptr = parse<UInt32>(zookeeper->get(our_log_ptr_path));
UInt32 max_log_ptr = parse<UInt32>(zookeeper->get(max_log_ptr_path));
assert(our_log_ptr <= max_log_ptr);
chassert(our_log_ptr <= max_log_ptr);
/// max_log_ptr is the number of the last successfully executed request on the initiator
/// The log could contain other entries which are not committed yet
@ -108,7 +139,6 @@ bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeo
std::unique_lock lock{mutex};
bool processed = wait_current_task_change.wait_for(lock, std::chrono::milliseconds(timeout_ms), [&]()
{
assert(zookeeper->expired() || current_task <= max_log);
return zookeeper->expired() || current_task == max_log || stop_flag;
});
@ -181,6 +211,7 @@ String DatabaseReplicatedDDLWorker::enqueueQueryImpl(const ZooKeeperPtr & zookee
/// Create status dirs
ops.emplace_back(zkutil::makeCreateRequest(node_path + "/active", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(node_path + "/finished", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(node_path + "/synced", "", zkutil::CreateMode::Persistent));
zookeeper->multi(ops);
@ -206,7 +237,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr
auto task = std::make_unique<DatabaseReplicatedTask>(entry_name, entry_path, database);
task->entry = entry;
task->parseQueryFromEntry(context);
assert(!task->entry.query.empty());
chassert(!task->entry.query.empty());
assert(!zookeeper->exists(task->getFinishedNodePath()));
task->is_initial_query = true;

View File

@ -1,12 +1,13 @@
#pragma once
#include <base/types.h>
#include <Core/UUID.h>
#include <Databases/LoadingStrictnessLevel.h>
#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>
#include <Interpreters/Context_fwd.h>
#include <base/types.h>
#include <Common/Exception.h>
#include <Common/ThreadPool.h>
#include <Core/UUID.h>
#include <ctime>
#include <functional>
@ -132,18 +133,15 @@ public:
/// You can call only once, right after the object is created.
virtual void loadStoredObjects( /// NOLINT
ContextMutablePtr /*context*/,
bool /*force_restore*/,
bool /*force_attach*/ = false,
bool /* skip_startup_tables */ = false)
LoadingStrictnessLevel /*mode*/,
bool /* skip_startup_tables */)
{
}
virtual bool supportsLoadingInTopologicalOrder() const { return false; }
virtual void beforeLoadingMetadata(
ContextMutablePtr /*context*/,
bool /*force_restore*/,
bool /*force_attach*/)
ContextMutablePtr /*context*/, LoadingStrictnessLevel /*mode*/)
{
}
@ -152,12 +150,13 @@ public:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented");
}
virtual void loadTableFromMetadata(ContextMutablePtr /*local_context*/, const String & /*file_path*/, const QualifiedTableName & /*name*/, const ASTPtr & /*ast*/, bool /*force_restore*/)
virtual void loadTableFromMetadata(ContextMutablePtr /*local_context*/, const String & /*file_path*/, const QualifiedTableName & /*name*/, const ASTPtr & /*ast*/,
LoadingStrictnessLevel /*mode*/)
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented");
}
virtual void startupTables(ThreadPool & /*thread_pool*/, bool /*force_restore*/, bool /*force_attach*/) {}
virtual void startupTables(ThreadPool & /*thread_pool*/, LoadingStrictnessLevel /*mode*/) {}
/// Check the existence of the table in memory (attached).
virtual bool isTableExist(const String & name, ContextPtr context) const = 0;
@ -223,6 +222,13 @@ public:
throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Returns list of table names that were permanently detached.
/// This list may not be updated at runtime and may be filled only on server startup
virtual Strings getNamesOfPermanentlyDetachedTables() const
{
throw Exception("Cannot get names of permanently detached tables for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Rename the table and possibly move the table to another database.
virtual void renameTable(
ContextPtr /*context*/,

View File

@ -0,0 +1,28 @@
#include <Databases/LoadingStrictnessLevel.h>
#include <cassert>
namespace DB
{
LoadingStrictnessLevel getLoadingStrictnessLevel(bool attach, bool force_attach, bool force_restore)
{
if (force_restore)
{
assert(attach);
assert(force_attach);
return LoadingStrictnessLevel::FORCE_RESTORE;
}
if (force_attach)
{
assert(attach);
return LoadingStrictnessLevel::FORCE_ATTACH;
}
if (attach)
return LoadingStrictnessLevel::ATTACH;
return LoadingStrictnessLevel::CREATE;
}
}

View File

@ -0,0 +1,21 @@
#pragma once
namespace DB
{
/// Strictness mode for loading a table or database
enum class LoadingStrictnessLevel
{
/// Do all possible sanity checks
CREATE = 0,
/// Expect existing paths on FS and in ZK for ATTACH query
ATTACH = 1,
/// We ignore some errors on server startup
FORCE_ATTACH = 2,
/// Skip all sanity checks (if force_restore_data flag exists)
FORCE_RESTORE = 3,
};
LoadingStrictnessLevel getLoadingStrictnessLevel(bool attach, bool force_attach, bool force_restore);
}
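The enum replaces the old (attach, force_attach, force_restore) boolean triple with one ordered value, so call sites can use comparisons instead of flag combinations. A standalone copy of the mapping plus a typical guard (the asserts from the real implementation are omitted):

#include <cassert>

enum class LoadingStrictnessLevel
{
    CREATE = 0,        /// do all possible sanity checks
    ATTACH = 1,        /// expect existing paths on FS and in ZK
    FORCE_ATTACH = 2,  /// ignore some errors on server startup
    FORCE_RESTORE = 3, /// skip all sanity checks
};

LoadingStrictnessLevel getLoadingStrictnessLevel(bool attach, bool force_attach, bool force_restore)
{
    if (force_restore)
        return LoadingStrictnessLevel::FORCE_RESTORE;
    if (force_attach)
        return LoadingStrictnessLevel::FORCE_ATTACH;
    return attach ? LoadingStrictnessLevel::ATTACH : LoadingStrictnessLevel::CREATE;
}

int main()
{
    assert(getLoadingStrictnessLevel(true, false, false) == LoadingStrictnessLevel::ATTACH);

    auto mode = getLoadingStrictnessLevel(/*attach*/ true, /*force_attach*/ true, /*force_restore*/ false);

    /// The old `if (!force_restore) return;` guards become ordered comparisons:
    if (mode < LoadingStrictnessLevel::FORCE_RESTORE)
        return 0;   /// normal startup, no symlink repair needed

    /// force-restore path would go here
    return 0;
}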

View File

@ -63,11 +63,11 @@ void DatabaseMaterializedMySQL::setException(const std::exception_ptr & exceptio
exception = exception_;
}
void DatabaseMaterializedMySQL::startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach)
void DatabaseMaterializedMySQL::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
{
DatabaseAtomic::startupTables(thread_pool, force_restore, force_attach);
DatabaseAtomic::startupTables(thread_pool, mode);
if (!force_attach)
if (mode < LoadingStrictnessLevel::FORCE_ATTACH)
materialize_thread.assertMySQLAvailable();
materialize_thread.startSynchronization();

View File

@ -48,7 +48,7 @@ protected:
public:
String getEngineName() const override { return "MaterializedMySQL"; }
void startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) override;
void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
void createTable(ContextPtr context_, const String & name, const StoragePtr & table, const ASTPtr & query) override;

View File

@ -398,7 +398,7 @@ String DatabaseMySQL::getMetadataPath() const
return metadata_path;
}
void DatabaseMySQL::loadStoredObjects(ContextMutablePtr, bool, bool /*force_attach*/, bool /* skip_startup_tables */)
void DatabaseMySQL::loadStoredObjects(ContextMutablePtr, LoadingStrictnessLevel /*mode*/, bool /* skip_startup_tables */)
{
std::lock_guard<std::mutex> lock{mutex};

View File

@ -76,7 +76,7 @@ public:
void createTable(ContextPtr, const String & table_name, const StoragePtr & storage, const ASTPtr & create_query) override;
void loadStoredObjects(ContextMutablePtr, bool, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr, LoadingStrictnessLevel /*mode*/, bool skip_startup_tables) override;
StoragePtr detachTable(ContextPtr context, const String & table_name) override;

View File

@ -125,9 +125,9 @@ void DatabaseMaterializedPostgreSQL::startSynchronization()
}
void DatabaseMaterializedPostgreSQL::startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach)
void DatabaseMaterializedPostgreSQL::startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode)
{
DatabaseAtomic::startupTables(thread_pool, force_restore, force_attach);
DatabaseAtomic::startupTables(thread_pool, mode);
startup_task->activateAndSchedule();
}

View File

@ -40,7 +40,7 @@ public:
String getMetadataPath() const override { return metadata_path; }
void startupTables(ThreadPool & thread_pool, bool force_restore, bool force_attach) override;
void startupTables(ThreadPool & thread_pool, LoadingStrictnessLevel mode) override;
DatabaseTablesIteratorPtr
getTablesIterator(ContextPtr context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) const override;

View File

@ -290,7 +290,7 @@ void DatabasePostgreSQL::drop(ContextPtr /*context*/)
}
void DatabasePostgreSQL::loadStoredObjects(ContextMutablePtr /* context */, bool, bool /*force_attach*/, bool /* skip_startup_tables */)
void DatabasePostgreSQL::loadStoredObjects(ContextMutablePtr /* context */, LoadingStrictnessLevel /*mode*/, bool /* skip_startup_tables */)
{
{
std::lock_guard<std::mutex> lock{mutex};

View File

@ -45,7 +45,7 @@ public:
bool empty() const override;
void loadStoredObjects(ContextMutablePtr, bool, bool force_attach, bool skip_startup_tables) override;
void loadStoredObjects(ContextMutablePtr, LoadingStrictnessLevel /*mode*/, bool skip_startup_tables) override;
DatabaseTablesIteratorPtr getTablesIterator(ContextPtr context, const FilterByNameFunction & filter_by_table_name) const override;

View File

@ -62,11 +62,10 @@ void logAboutProgress(Poco::Logger * log, size_t processed, size_t total, Atomic
}
}
TablesLoader::TablesLoader(ContextMutablePtr global_context_, Databases databases_, bool force_restore_, bool force_attach_)
TablesLoader::TablesLoader(ContextMutablePtr global_context_, Databases databases_, LoadingStrictnessLevel strictness_mode_)
: global_context(global_context_)
, databases(std::move(databases_))
, force_restore(force_restore_)
, force_attach(force_attach_)
, strictness_mode(strictness_mode_)
{
metadata.default_database = global_context->getCurrentDatabase();
log = &Poco::Logger::get("TablesLoader");
@ -83,7 +82,7 @@ void TablesLoader::loadTables()
if (need_resolve_dependencies && database.second->supportsLoadingInTopologicalOrder())
databases_to_load.push_back(database.first);
else
database.second->loadStoredObjects(global_context, force_restore, force_attach, true);
database.second->loadStoredObjects(global_context, strictness_mode, /* skip_startup_tables */ true);
}
if (databases_to_load.empty())
@ -92,8 +91,9 @@ void TablesLoader::loadTables()
/// Read and parse metadata from Ordinary, Atomic, Materialized*, Replicated, etc databases. Build dependency graph.
for (auto & database_name : databases_to_load)
{
databases[database_name]->beforeLoadingMetadata(global_context, force_restore, force_attach);
databases[database_name]->loadTablesMetadata(global_context, metadata, force_attach);
databases[database_name]->beforeLoadingMetadata(global_context, strictness_mode);
bool is_startup = LoadingStrictnessLevel::FORCE_ATTACH <= strictness_mode;
databases[database_name]->loadTablesMetadata(global_context, metadata, is_startup);
}
LOG_INFO(log, "Parsed metadata of {} tables in {} databases in {} sec",
@ -119,7 +119,7 @@ void TablesLoader::startupTables()
{
/// Startup tables after all tables are loaded. Background tasks (merges, mutations, etc) may slow down data parts loading.
for (auto & database : databases)
database.second->startupTables(pool, force_restore, force_attach);
database.second->startupTables(pool, strictness_mode);
}
@ -253,7 +253,7 @@ void TablesLoader::startLoadingIndependentTables(ThreadPool & pool, size_t level
pool.scheduleOrThrowOnError([this, load_context, total_tables, &table_name]()
{
const auto & path_and_query = metadata.parsed_tables[table_name];
databases[table_name.database]->loadTableFromMetadata(load_context, path_and_query.path, table_name, path_and_query.ast, force_restore);
databases[table_name.database]->loadTableFromMetadata(load_context, path_and_query.path, table_name, path_and_query.ast, strictness_mode);
logAboutProgress(log, ++tables_processed, total_tables, stopwatch);
});
}
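For call sites migrating from the old constructor, the two booleans collapse into one mode value. A hypothetical migration sketch, using the enum values assumed above (the flag-to-mode mapping is illustrative, not from this diff):

LoadingStrictnessLevel mode = force_restore
    ? LoadingStrictnessLevel::FORCE_RESTORE
    : (force_attach ? LoadingStrictnessLevel::FORCE_ATTACH : LoadingStrictnessLevel::CREATE);

TablesLoader loader(global_context, std::move(databases), mode);
loader.loadTables();
loader.startupTables();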

View File

@ -1,14 +1,15 @@
#pragma once
#include <Core/Types.h>
#include <Core/QualifiedTableName.h>
#include <Parsers/IAST_fwd.h>
#include <Interpreters/Context_fwd.h>
#include <Common/ThreadPool.h>
#include <Common/Stopwatch.h>
#include <map>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <mutex>
#include <Core/QualifiedTableName.h>
#include <Core/Types.h>
#include <Databases/LoadingStrictnessLevel.h>
#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool.h>
namespace Poco
{
@ -78,7 +79,7 @@ class TablesLoader
public:
using Databases = std::map<String, DatabasePtr>;
TablesLoader(ContextMutablePtr global_context_, Databases databases_, bool force_restore_ = false, bool force_attach_ = false);
TablesLoader(ContextMutablePtr global_context_, Databases databases_, LoadingStrictnessLevel strictness_mode_);
TablesLoader() = delete;
void loadTables();
@ -87,8 +88,7 @@ public:
private:
ContextMutablePtr global_context;
Databases databases;
bool force_restore;
bool force_attach;
LoadingStrictnessLevel strictness_mode;
Strings databases_to_load;
ParsedTablesMetadata metadata;

View File

@ -1,371 +0,0 @@
#include "DiskCacheWrapper.h"
#include <IO/copyData.h>
#include <IO/ReadBufferFromFileDecorator.h>
#include <IO/WriteBufferFromFileDecorator.h>
#include <Common/quoteString.h>
#include <condition_variable>
namespace DB
{
/**
* This buffer writes to the cache; after finalize(), the written file is copied from the cache to the disk.
*/
class WritingToCacheWriteBuffer final : public WriteBufferFromFileDecorator
{
public:
WritingToCacheWriteBuffer(
std::unique_ptr<WriteBufferFromFileBase> impl_,
std::function<std::unique_ptr<ReadBuffer>()> create_read_buffer_,
std::function<std::unique_ptr<WriteBuffer>()> create_write_buffer_)
: WriteBufferFromFileDecorator(std::move(impl_))
, create_read_buffer(std::move(create_read_buffer_))
, create_write_buffer(std::move(create_write_buffer_))
{
}
~WritingToCacheWriteBuffer() override
{
try
{
finalize();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void preFinalize() override
{
impl->next();
impl->preFinalize();
impl->finalize();
read_buffer = create_read_buffer();
write_buffer = create_write_buffer();
copyData(*read_buffer, *write_buffer);
write_buffer->next();
write_buffer->preFinalize();
is_prefinalized = true;
}
void finalizeImpl() override
{
if (!is_prefinalized)
preFinalize();
write_buffer->finalize();
}
private:
std::function<std::unique_ptr<ReadBuffer>()> create_read_buffer;
std::function<std::unique_ptr<WriteBuffer>()> create_write_buffer;
std::unique_ptr<ReadBuffer> read_buffer;
std::unique_ptr<WriteBuffer> write_buffer;
bool is_prefinalized = false;
};
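/// Lifecycle of the buffer above: preFinalize() finalizes the cache-side buffer, then copies the
/// finished file into a newly created disk-side buffer; finalizeImpl() only completes that
/// disk-side write. Since the destructor calls finalize(), a buffer destroyed without an explicit
/// finalize() still flushes its data through to the underlying disk.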
enum FileDownloadStatus
{
NONE,
DOWNLOADING,
DOWNLOADED,
ERROR
};
struct FileDownloadMetadata
{
/// A thread waits on this condition while the download is in progress.
std::condition_variable condition;
FileDownloadStatus status = NONE;
};
DiskCacheWrapper::DiskCacheWrapper(
std::shared_ptr<IDisk> delegate_, std::shared_ptr<DiskLocal> cache_disk_, std::function<bool(const String &)> cache_file_predicate_)
: DiskDecorator(delegate_), cache_disk(cache_disk_), cache_file_predicate(cache_file_predicate_)
{
}
std::shared_ptr<FileDownloadMetadata> DiskCacheWrapper::acquireDownloadMetadata(const String & path) const
{
std::lock_guard lock{mutex};
auto it = file_downloads.find(path);
if (it != file_downloads.end())
if (auto x = it->second.lock())
return x;
std::shared_ptr<FileDownloadMetadata> metadata(
new FileDownloadMetadata,
[this, path] (FileDownloadMetadata * p)
{
std::lock_guard erase_lock{mutex};
file_downloads.erase(path);
delete p;
});
file_downloads.emplace(path, metadata);
return metadata;
}
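/// The custom deleter above makes the registry self-cleaning: file_downloads stores weak_ptrs,
/// and the last shared_ptr to a metadata object erases its own map entry on destruction.
/// A generic sketch of the same idiom (illustrative; Key, Value, and the member names are
/// placeholders, not from this file):
///
///     std::shared_ptr<Value> acquire(const Key & key)
///     {
///         std::lock_guard lock{mutex};
///         if (auto it = registry.find(key); it != registry.end())
///             if (auto existing = it->second.lock())
///                 return existing;
///         std::shared_ptr<Value> value(new Value, [this, key](Value * p)
///         {
///             std::lock_guard erase_lock{mutex};
///             registry.erase(key);
///             delete p;
///         });
///         registry[key] = value;   /// overwrite any expired entry
///         return value;
///     }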
std::unique_ptr<ReadBufferFromFileBase>
DiskCacheWrapper::readFile(
const String & path,
const ReadSettings & settings,
std::optional<size_t> read_hint,
std::optional<size_t> file_size) const
{
if (!cache_file_predicate(path))
return DiskDecorator::readFile(path, settings, read_hint, file_size);
LOG_TEST(log, "Read file {} from cache", backQuote(path));
if (cache_disk->exists(path))
return cache_disk->readFile(path, settings, read_hint, file_size);
auto metadata = acquireDownloadMetadata(path);
{
std::unique_lock<std::mutex> lock{mutex};
if (metadata->status == NONE)
{
/// This thread will be responsible for downloading the file to the cache.
metadata->status = DOWNLOADING;
LOG_TEST(log, "File {} doesn't exist in cache. Will download it", backQuote(path));
}
else if (metadata->status == DOWNLOADING)
{
LOG_TEST(log, "Waiting for file {} download to cache", backQuote(path));
metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; });
}
}
auto current_read_settings = settings;
/// Do not use RemoteFSReadMethod::threadpool for index and mark files.
/// Here it does not make sense since the files are small.
/// Note: enabling `threadpool` read requires calling setReadUntilEnd().
current_read_settings.remote_fs_method = RemoteFSReadMethod::read;
/// Disable data cache.
current_read_settings.enable_filesystem_cache = false;
if (metadata->status == DOWNLOADING)
{
FileDownloadStatus result_status = DOWNLOADED;
if (!cache_disk->exists(path))
{
try
{
auto dir_path = directoryPath(path);
if (!cache_disk->exists(dir_path))
cache_disk->createDirectories(dir_path);
auto tmp_path = path + ".tmp";
{
auto src_buffer = DiskDecorator::readFile(path, current_read_settings, read_hint, file_size);
WriteSettings write_settings;
write_settings.enable_filesystem_cache_on_write_operations = false;
auto dst_buffer = cache_disk->writeFile(tmp_path, settings.local_fs_buffer_size, WriteMode::Rewrite, write_settings);
copyData(*src_buffer, *dst_buffer);
}
cache_disk->moveFile(tmp_path, path);
LOG_TEST(log, "File {} downloaded to cache", backQuote(path));
}
catch (...)
{
tryLogCurrentException("DiskCache", "Failed to download file + " + backQuote(path) + " to cache");
result_status = ERROR;
}
}
/// Notify all waiters that file download is finished.
std::unique_lock<std::mutex> lock{mutex};
metadata->status = result_status;
lock.unlock();
metadata->condition.notify_all();
}
if (metadata->status == DOWNLOADED)
return cache_disk->readFile(path, settings, read_hint, file_size);
return DiskDecorator::readFile(path, current_read_settings, read_hint, file_size);
}
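/// In short, readFile() implements a single-downloader protocol: the first thread moves the
/// status from NONE to DOWNLOADING and copies the file into the cache through a ".tmp" file
/// followed by a rename; concurrent readers block on the condition variable until the status
/// becomes DOWNLOADED or ERROR, then read from the cache on success or fall back to the
/// wrapped disk on failure.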
std::unique_ptr<WriteBufferFromFileBase>
DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings & settings)
{
if (!cache_file_predicate(path))
return DiskDecorator::writeFile(path, buf_size, mode, settings);
WriteSettings current_settings = settings;
/// There are two different cache implementations. Disable the second one if the first is enabled.
/// The first will soon be removed; this disabling is temporary.
current_settings.enable_filesystem_cache_on_write_operations = false;
LOG_TEST(log, "Write file {} to cache", backQuote(path));
auto dir_path = directoryPath(path);
if (!cache_disk->exists(dir_path))
cache_disk->createDirectories(dir_path);
return std::make_unique<WritingToCacheWriteBuffer>(
cache_disk->writeFile(path, buf_size, mode, current_settings),
[this, path]()
{
/// Copy file from cache to actual disk when cached buffer is finalized.
return cache_disk->readFile(path, ReadSettings(), /* read_hint= */ {}, /* file_size= */ {});
},
[this, path, buf_size, mode, current_settings]()
{
return DiskDecorator::writeFile(path, buf_size, mode, current_settings);
});
}
void DiskCacheWrapper::clearDirectory(const String & path)
{
if (cache_disk->exists(path))
cache_disk->clearDirectory(path);
DiskDecorator::clearDirectory(path);
}
void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path)
{
if (cache_disk->exists(from_path))
{
/// The destination directory may not be empty if a previous directory move attempt failed.
if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path))
cache_disk->clearDirectory(to_path);
cache_disk->moveDirectory(from_path, to_path);
}
DiskDecorator::moveDirectory(from_path, to_path);
}
void DiskCacheWrapper::moveFile(const String & from_path, const String & to_path)
{
if (cache_disk->exists(from_path))
{
auto dir_path = directoryPath(to_path);
if (!cache_disk->exists(dir_path))
cache_disk->createDirectories(dir_path);
cache_disk->moveFile(from_path, to_path);
}
DiskDecorator::moveFile(from_path, to_path);
}
void DiskCacheWrapper::replaceFile(const String & from_path, const String & to_path)
{
if (cache_disk->exists(from_path))
{
auto dir_path = directoryPath(to_path);
if (!cache_disk->exists(dir_path))
cache_disk->createDirectories(dir_path);
cache_disk->replaceFile(from_path, to_path);
}
DiskDecorator::replaceFile(from_path, to_path);
}
void DiskCacheWrapper::removeFile(const String & path)
{
cache_disk->removeFileIfExists(path);
DiskDecorator::removeFile(path);
}
void DiskCacheWrapper::removeFileIfExists(const String & path)
{
cache_disk->removeFileIfExists(path);
DiskDecorator::removeFileIfExists(path);
}
void DiskCacheWrapper::removeDirectory(const String & path)
{
if (cache_disk->exists(path))
cache_disk->removeDirectory(path);
DiskDecorator::removeDirectory(path);
}
void DiskCacheWrapper::removeRecursive(const String & path)
{
if (cache_disk->exists(path))
cache_disk->removeRecursive(path);
DiskDecorator::removeRecursive(path);
}
void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3)
{
if (cache_disk->exists(path))
cache_disk->removeSharedFile(path, keep_s3);
DiskDecorator::removeSharedFile(path, keep_s3);
}
void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep)
{
if (cache_disk->exists(path))
cache_disk->removeSharedRecursive(path, keep_all, files_to_keep);
DiskDecorator::removeSharedRecursive(path, keep_all, files_to_keep);
}
void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep)
{
for (const auto & file : files)
{
if (cache_disk->exists(file.path))
{
bool keep_file = keep_all || files_to_keep.contains(fs::path(file.path).filename());
cache_disk->removeSharedFile(file.path, keep_file);
}
}
DiskDecorator::removeSharedFiles(files, keep_all, files_to_keep);
}
void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path)
{
/// Don't create hardlinks for cache files in the shadow directory, as that just wastes cache disk space.
if (cache_disk->exists(src_path) && !dst_path.starts_with("shadow/"))
{
auto dir_path = directoryPath(dst_path);
if (!cache_disk->exists(dir_path))
cache_disk->createDirectories(dir_path);
cache_disk->createHardLink(src_path, dst_path);
}
DiskDecorator::createHardLink(src_path, dst_path);
}
void DiskCacheWrapper::createDirectory(const String & path)
{
cache_disk->createDirectory(path);
DiskDecorator::createDirectory(path);
}
void DiskCacheWrapper::createDirectories(const String & path)
{
cache_disk->createDirectories(path);
DiskDecorator::createDirectories(path);
}
ReservationPtr DiskCacheWrapper::reserve(UInt64 bytes)
{
auto ptr = DiskDecorator::reserve(bytes);
if (ptr)
{
auto disk_ptr = std::static_pointer_cast<DiskCacheWrapper>(shared_from_this());
return std::make_unique<ReservationDelegate>(std::move(ptr), disk_ptr);
}
return ptr;
}
}

Some files were not shown because too many files have changed in this diff.