Merge branch 'master' of github.com:ClickHouse/ClickHouse into fix_issue_22028

This commit is contained in:
Slach 2021-03-28 18:41:22 +01:00
commit 9761db7efb
241 changed files with 3956 additions and 957 deletions

2
.gitmodules vendored
View File

@ -93,7 +93,7 @@
url = https://github.com/ClickHouse-Extras/libunwind.git
[submodule "contrib/simdjson"]
path = contrib/simdjson
url = https://github.com/ClickHouse-Extras/simdjson.git
url = https://github.com/simdjson/simdjson.git
[submodule "contrib/rapidjson"]
path = contrib/rapidjson
url = https://github.com/ClickHouse-Extras/rapidjson

View File

@ -1069,11 +1069,11 @@ public:
}
template <typename DateOrTime>
inline LUTIndex addMonthsIndex(DateOrTime v, Int64 delta) const
inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const
{
const Values & values = lut[toLUTIndex(v)];
Int64 month = static_cast<Int64>(values.month) + delta;
Int64 month = values.month + delta;
if (month > 0)
{

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 3d3683e77753cfe015a05fae95ddf418e19f59e1
Subproject commit 70468326ad5d72e9497944838484c591dae054ea

2
contrib/replxx vendored

@ -1 +1 @@
Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc
Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7

2
contrib/simdjson vendored

@ -1 +1 @@
Subproject commit 3190d66a49059092a1753dc35595923debfc1698
Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1

View File

@ -18,6 +18,7 @@ RUN apt-get update \
clickhouse-client=$version \
clickhouse-common-static=$version \
locales \
tzdata \
&& rm -rf /var/lib/apt/lists/* /var/cache/debconf \
&& apt-get clean

View File

@ -32,6 +32,7 @@ RUN groupadd -r clickhouse --gid=101 \
clickhouse-server=$version \
locales \
wget \
tzdata \
&& rm -rf \
/var/lib/apt/lists/* \
/var/cache/debconf \

View File

@ -21,7 +21,9 @@ RUN addgroup -S -g 101 clickhouse \
&& chown clickhouse:clickhouse /var/lib/clickhouse \
&& chown root:clickhouse /var/log/clickhouse-server \
&& chmod +x /entrypoint.sh \
&& apk add --no-cache su-exec bash \
&& apk add --no-cache su-exec bash tzdata \
&& cp /usr/share/zoneinfo/UTC /etc/localtime \
&& echo "UTC" > /etc/timezone \
&& chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client
# we need to allow "others" access to clickhouse folder, because docker container

View File

@ -46,9 +46,11 @@ DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --
TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)"
USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)"
LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)"
LOG_DIR="$(dirname "$LOG_PATH" || true)"
LOG_DIR=""
if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi
ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)"
ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)"
ERROR_LOG_DIR=""
if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi
FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)"
CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"

View File

@ -292,6 +292,7 @@ function run_tests
01318_decrypt # Depends on OpenSSL
01663_aes_msan # Depends on OpenSSL
01667_aes_args_check # Depends on OpenSSL
01776_decrypt_aead_size_check # Depends on OpenSSL
01281_unsucceeded_insert_select_queries_counter
01292_create_user
01294_lazy_database_concurrent

View File

@ -266,14 +266,13 @@ for query_index in queries_to_run:
try:
# Will also detect too long queries during warmup stage
res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10})
res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds})
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (prewarm_id, *e.args)
e.message = prewarm_id + ': ' + e.message
raise
print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}')
except KeyboardInterrupt:
raise
@ -320,7 +319,7 @@ for query_index in queries_to_run:
for conn_index, c in enumerate(this_query_connections):
try:
res = c.execute(q, query_id = run_id)
res = c.execute(q, query_id = run_id, settings = {'max_execution_time': args.max_query_seconds})
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (run_id, *e.args)

View File

@ -2,7 +2,6 @@
FROM ubuntu:20.04
RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends
RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip
RUN mkdir /sqlancer && \
cd /sqlancer && \

View File

@ -26,6 +26,7 @@ def process_result(result_folder):
with open(err_path, 'r') as f:
if 'AssertionError' in f.read():
summary.append((test, "FAIL"))
status = 'failure'
else:
summary.append((test, "OK"))

View File

@ -11,7 +11,7 @@ service clickhouse-server start && sleep 5
cd /sqlancer/sqlancer-master
export TIMEOUT=60
export TIMEOUT=300
export NUM_QUERIES=1000
( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPWhere | tee /test_output/TLPWhere.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPWhere.err

View File

@ -3,7 +3,7 @@ toc_priority: 8
toc_title: PostgreSQL
---
# PosgtreSQL {#postgresql}
# PostgreSQL {#postgresql}
The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server.

View File

@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test
Using [CLI interface](../../interfaces/cli.md):
``` bash
$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow"
$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow"
```
Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead.

View File

@ -50,7 +50,7 @@ The supported formats are:
| [Parquet](#data-format-parquet) | ✔ | ✔ |
| [Arrow](#data-format-arrow) | ✔ | ✔ |
| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
| [ORC](#data-format-orc) | ✔ | |
| [ORC](#data-format-orc) | ✔ | |
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
@ -1284,32 +1284,33 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e
## ORC {#data-format-orc}
[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse.
[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem.
### Data Types Matching {#data_types-matching-3}
The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries.
The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
| ORC data type (`INSERT`) | ClickHouse data type |
|--------------------------|-----------------------------------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
| `DATE32` | [Date](../sql-reference/data-types/date.md) |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) |
|--------------------------|-----------------------------------------------------|--------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |
ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
The data types of ClickHouse table columns dont have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
@ -1321,6 +1322,14 @@ You can insert ORC data from a file into ClickHouse table by the following comma
$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
```
### Selecting Data {#selecting-data-2}
You can select data from a ClickHouse table and save them into some file in the ORC format by the following command:
``` bash
$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
```
To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md).
## LineAsString {#lineasstring}

View File

@ -9,7 +9,7 @@ Columns:
- `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened.
- `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened.
- `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error.
- `last_error_stacktrace` ([String](../../sql-reference/data-types/string.md)) — stacktrace for the last error.
- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored.
- `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query).
**Example**
@ -25,3 +25,12 @@ LIMIT 1
│ CANNOT_OPEN_FILE │ 76 │ 1 │
└──────────────────┴──────┴───────┘
```
``` sql
WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all
SELECT name, arrayStringConcat(all, '\n') AS res
FROM system.errors
LIMIT 1
SETTINGS allow_introspection_functions=1\G
```

View File

@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM.
<write_buffer_size>1048576</write_buffer_size>
<!-- Path where cache file will be stored. -->
<path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
<!-- Max number on stored keys in the cache. Rounded up to a power of two. -->
<max_stored_keys>1048576</max_stored_keys>
</ssd_cache>
</layout>
```
@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM.
or
``` sql
LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
```
### complex_key_ssd_cache {#complex-key-ssd-cache}

View File

@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio
| `GROUPS` frame | not supported |
| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
| `rank()`, `dense_rank()`, `row_number()` | supported |
| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
| `lag/lead(value, offset)` | Not supported. Workarounds: |
| | 1) replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` |
## References

View File

@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
| [Parquet](#data-format-parquet) | ✔ | ✔ |
| [Arrow](#data-format-arrow) | ✔ | ✔ |
| [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
| [ORC](#data-format-orc) | ✔ | |
| [ORC](#data-format-orc) | ✔ | |
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_
## ORC {#data-format-orc}
[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse.
[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/).
### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1}
Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`.
Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`.
| Тип данных ORC (`INSERT`) | Тип данных ClickHouse |
|---------------------------|-----------------------------------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) |
| `DATE32` | [Date](../sql-reference/data-types/date.md) |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) |
| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) |
|---------------------------|-----------------------------------------------------|---------------------------|
| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` |
| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` |
| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` |
| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` |
| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` |
| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` |
| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` |
| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` |
| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` |
| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` |
| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` |
| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` |
| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` |
| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |
ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`.
ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`.
Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.
Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse.
### Вставка данных {#vstavka-dannykh-1}
Данные ORC можно вставить в таблицу ClickHouse командой:
Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида:
``` bash
$ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
```
Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).
### Вывод данных {#vyvod-dannykh-1}
Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида:
``` bash
$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
```
Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md).
## LineAsString {#lineasstring}

View File

@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
<write_buffer_size>1048576</write_buffer_size>
<!-- Path where cache file will be stored. -->
<path>/var/lib/clickhouse/clickhouse_dictionaries/test_dict</path>
<!-- Max number on stored keys in the cache. Rounded up to a power of two. -->
<max_stored_keys>1048576</max_stored_keys>
</ssd_cache>
</layout>
```
@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))
или
``` sql
LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576))
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576
PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict))
```
### complex_key_ssd_cache {#complex-key-ssd-cache}

View File

@ -672,7 +672,7 @@ neighbor(column, offset[, default_value])
Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных.
Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю.
Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию изне подзапроса.
Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса.
**Аргументы**

View File

@ -26,7 +26,7 @@ numpy==1.19.2
Pygments==2.5.2
pymdown-extensions==8.0
python-slugify==4.0.1
PyYAML==5.3.1
PyYAML==5.4.1
repackage==0.7.3
requests==2.24.0
singledispatch==3.4.0.3

View File

@ -8,10 +8,10 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <IO/TimeoutSetter.h>
#include <DataStreams/NativeBlockInputStream.h>
#include <DataStreams/NativeBlockOutputStream.h>
#include <Client/Connection.h>
#include <Client/TimeoutSetter.h>
#include <Common/ClickHouseRevision.h>
#include <Common/Exception.h>
#include <Common/NetException.h>

View File

@ -16,7 +16,6 @@ SRCS(
HedgedConnections.cpp
HedgedConnectionsFactory.cpp
MultiplexedConnections.cpp
TimeoutSetter.cpp
)

View File

@ -560,7 +560,7 @@ namespace DB
{
namespace ErrorCodes
{
#define M(VALUE, NAME) extern const Value NAME = VALUE;
#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE;
APPLY_FOR_ERROR_CODES(M)
#undef M
@ -587,7 +587,7 @@ namespace ErrorCodes
ErrorCode end() { return END + 1; }
void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace)
void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace)
{
if (error_code >= end())
{
@ -596,10 +596,10 @@ namespace ErrorCodes
error_code = end() - 1;
}
values[error_code].increment(remote, message, stacktrace);
values[error_code].increment(remote, message, trace);
}
void ErrorPairHolder::increment(bool remote, const std::string & message, const std::string & stacktrace)
void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace)
{
const auto now = std::chrono::system_clock::now();
@ -609,7 +609,7 @@ namespace ErrorCodes
++error.count;
error.message = message;
error.stacktrace = stacktrace;
error.trace = trace;
error.error_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
}
ErrorPair ErrorPairHolder::get()

View File

@ -1,11 +1,12 @@
#pragma once
#include <stddef.h>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <mutex>
#include <common/types.h>
#include <string_view>
#include <vector>
#include <common/types.h>
/** Allows to count number of simultaneously happening error codes.
* See also Exception.cpp for incrementing part.
@ -19,6 +20,7 @@ namespace ErrorCodes
/// ErrorCode identifier (index in array).
using ErrorCode = int;
using Value = size_t;
using FramePointers = std::vector<void *>;
/// Get name of error_code by identifier.
/// Returns statically allocated string.
@ -33,7 +35,7 @@ namespace ErrorCodes
/// Message for the last error.
std::string message;
/// Stacktrace for the last error.
std::string stacktrace;
FramePointers trace;
};
struct ErrorPair
{
@ -46,7 +48,7 @@ namespace ErrorCodes
{
public:
ErrorPair get();
void increment(bool remote, const std::string & message, const std::string & stacktrace);
void increment(bool remote, const std::string & message, const FramePointers & trace);
private:
ErrorPair value;
@ -60,7 +62,7 @@ namespace ErrorCodes
ErrorCode end();
/// Add value for specified error_code.
void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace);
void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace);
}
}

View File

@ -36,7 +36,7 @@ namespace ErrorCodes
/// - Aborts the process if error code is LOGICAL_ERROR.
/// - Increments error codes statistics.
void handle_error_code([[maybe_unused]] const std::string & msg, const std::string & stacktrace, int code, bool remote)
void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace)
{
// In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure.
// Log the message before we fail.
@ -47,20 +47,21 @@ void handle_error_code([[maybe_unused]] const std::string & msg, const std::stri
abort();
}
#endif
ErrorCodes::increment(code, remote, msg, stacktrace);
ErrorCodes::increment(code, remote, msg, trace);
}
Exception::Exception(const std::string & msg, int code, bool remote_)
: Poco::Exception(msg, code)
, remote(remote_)
{
handle_error_code(msg, getStackTraceString(), code, remote);
handle_error_code(msg, code, remote, getStackFramePointers());
}
Exception::Exception(const std::string & msg, const Exception & nested, int code)
: Poco::Exception(msg, nested, code)
{
handle_error_code(msg, getStackTraceString(), code, remote);
handle_error_code(msg, code, remote, getStackFramePointers());
}
Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc)
@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const
#endif
}
Exception::FramePointers Exception::getStackFramePointers() const
{
FramePointers frame_pointers;
#ifdef STD_EXCEPTION_HAS_STACK_TRACE
{
frame_pointers.resize(get_stack_trace_size());
for (size_t i = 0; i < frame_pointers.size(); ++i)
{
frame_pointers[i] = get_stack_trace_frames()[i];
}
}
#else
{
size_t stack_trace_size = trace.getSize();
size_t stack_trace_offset = trace.getOffset();
frame_pointers.reserve(stack_trace_size - stack_trace_offset);
for (size_t i = stack_trace_offset; i < stack_trace_size; ++i)
{
frame_pointers.push_back(trace.getFramePointers()[i]);
}
}
#endif
return frame_pointers;
}
void throwFromErrno(const std::string & s, int code, int the_errno)
{

View File

@ -24,6 +24,8 @@ namespace DB
class Exception : public Poco::Exception
{
public:
using FramePointers = std::vector<void *>;
Exception() = default;
Exception(const std::string & msg, int code, bool remote_ = false);
Exception(const std::string & msg, const Exception & nested, int code);
@ -66,6 +68,8 @@ public:
bool isRemoteException() const { return remote; }
std::string getStackTraceString() const;
/// Used for system.errors
FramePointers getStackFramePointers() const;
private:
#ifndef STD_EXCEPTION_HAS_STACK_TRACE

View File

@ -271,13 +271,13 @@ private:
};
template <typename Key, typename Mapped>
struct DefaultCellDisposer
struct DefaultLRUHashMapCellDisposer
{
void operator()(const Key &, const Mapped &) const {}
};
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMap = LRUHashMapImpl<Key, Value, Disposer, Hash, false>;
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
template <typename Key, typename Value, typename Disposer = DefaultLRUHashMapCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMapWithSavedHash = LRUHashMapImpl<Key, Value, Disposer, Hash, true>;

View File

@ -692,6 +692,30 @@ public:
assign(from.begin(), from.end());
}
void erase(const_iterator first, const_iterator last)
{
iterator first_no_const = const_cast<iterator>(first);
iterator last_no_const = const_cast<iterator>(last);
size_t items_to_move = end() - last;
while (items_to_move != 0)
{
*first_no_const = *last_no_const;
++first_no_const;
++last_no_const;
--items_to_move;
}
this->c_end = reinterpret_cast<char *>(first_no_const);
}
void erase(const_iterator pos)
{
this->erase(pos, pos + 1);
}
bool operator== (const PODArray & rhs) const
{

View File

@ -92,3 +92,57 @@ TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding)
EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size);
}
TEST(Common, PODErase)
{
{
PaddedPODArray<UInt64> items {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {0,1,2,3,4,5,6,7,8,9};
items.erase(items.begin(), items.begin());
EXPECT_EQ(items, expected);
items.erase(items.end(), items.end());
EXPECT_EQ(items, expected);
}
{
PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {0,1,4,5,6,7,8,9};
actual.erase(actual.begin() + 2, actual.begin() + 4);
EXPECT_EQ(actual, expected);
expected = {0,1,4};
actual.erase(actual.begin() + 3, actual.end());
EXPECT_EQ(actual, expected);
expected = {};
actual.erase(actual.begin(), actual.end());
EXPECT_EQ(actual, expected);
for (size_t i = 0; i < 10; ++i)
actual.emplace_back(static_cast<UInt64>(i));
expected = {0,1,4,5,6,7,8,9};
actual.erase(actual.begin() + 2, actual.begin() + 4);
EXPECT_EQ(actual, expected);
expected = {0,1,4};
actual.erase(actual.begin() + 3, actual.end());
EXPECT_EQ(actual, expected);
expected = {};
actual.erase(actual.begin(), actual.end());
EXPECT_EQ(actual, expected);
}
{
PaddedPODArray<UInt64> actual {0,1,2,3,4,5,6,7,8,9};
PaddedPODArray<UInt64> expected;
expected = {1,2,3,4,5,6,7,8,9};
actual.erase(actual.begin());
EXPECT_EQ(actual, expected);
}
}

View File

@ -14,7 +14,7 @@ PEERDIR(
clickhouse/base/common
clickhouse/base/pcg-random
clickhouse/base/widechar_width
contrib/libs/libcpuid/libcpuid
contrib/libs/libcpuid
contrib/libs/openssl
contrib/libs/poco/NetSSL_OpenSSL
contrib/libs/re2

View File

@ -13,7 +13,7 @@ PEERDIR(
clickhouse/base/common
clickhouse/base/pcg-random
clickhouse/base/widechar_width
contrib/libs/libcpuid/libcpuid
contrib/libs/libcpuid
contrib/libs/openssl
contrib/libs/poco/NetSSL_OpenSSL
contrib/libs/re2

View File

@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl()
{
owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer();
owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes);
decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum);
}

View File

@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl()
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
return true;
}
@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
/// If the decompressed block fits entirely where it needs to be copied.
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
{
decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
bytes_read += size_decompressed;
bytes += size_decompressed;
}
@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
pos = working_buffer.begin();
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
pos = working_buffer.begin();
bytes_read += read(to + bytes_read, n - bytes_read);
break;

View File

@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed,
}
void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs)
{
ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks);
ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed);
@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, s
ErrorCodes::CANNOT_DECOMPRESS);
}
}
}
/// Decompress the current compressed block into caller-provided memory at `to`.
/// `to` must have room for at least `size_decompressed` bytes.
/// Unlike decompress(BufferBase::Buffer &, ...), this variant always writes into
/// `to`, even for the NONE codec (no zero-copy shortcut), so it is suitable when
/// the caller needs the data at a specific location.
void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
{
/// Parses the codec header of `compressed_buffer` and initializes `codec` accordingly.
readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
codec->decompress(compressed_buffer, size_compressed_without_checksum, to);
}
/// Decompress the current compressed block, avoiding a copy when possible.
/// For the NONE codec, `to` is re-pointed at the uncompressed payload that already
/// resides inside `compressed_buffer` (zero-copy); for any other codec, data is
/// decompressed into the memory `to` currently points to.
/// Because `to` may be relocated, this overload is NOT suitable when the caller
/// needs the result at a fixed address -- use decompressTo() for that.
void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum)
{
readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
if (codec->isNone())
{
/// Shortcut for NONE codec to avoid extra memcpy.
/// We are doing it by changing the buffer `to` to point to existing uncompressed data.
UInt8 header_size = ICompressionCodec::getHeaderSize();
/// Sanity check: the payload must at least contain the codec header.
if (size_compressed_without_checksum < header_size)
throw Exception(ErrorCodes::CORRUPTED_DATA,
"Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})",
size_compressed_without_checksum, static_cast<size_t>(header_size));
/// Point `to` directly at the raw bytes following the header.
to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum);
}
else
codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin());
}
/// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_)
: compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_)

View File

@ -3,6 +3,7 @@
#include <Common/PODArray.h>
#include <Compression/LZ4_decompress_faster.h>
#include <Compression/ICompressionCodec.h>
#include <IO/BufferBase.h>
namespace DB
@ -37,7 +38,12 @@ protected:
/// Returns number of compressed bytes read.
size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy);
void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
/// Decompress into memory pointed by `to`
void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum);
/// This method can change location of `to` to avoid unnecessary copy if data is uncompressed.
/// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location.
void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum);
public:
/// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.

View File

@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl()
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
return true;
}
@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
/// If the decompressed block fits entirely where it needs to be copied.
if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
{
decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
bytes_read += size_decompressed;
bytes += size_decompressed;
}
@ -122,9 +122,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n)
memory.resize(size_decompressed + additional_size_at_the_end_of_buffer);
working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
pos = working_buffer.begin();
decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
decompress(working_buffer, size_decompressed, size_compressed_without_checksum);
pos = working_buffer.begin();
bytes_read += read(to + bytes_read, n - bytes_read);
break;

View File

@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch
UInt8 header_size = getHeaderSize();
if (source_size < header_size)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size));
throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast<size_t>(header_size));
uint8_t our_method = getMethodByte();
uint8_t method = source[0];

View File

@ -31,6 +31,8 @@ struct Settings;
M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consensus with similar speed", 0) \
M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -30,6 +30,8 @@ NuKeeperServer::NuKeeperServer(
, state_manager(nuraft::cs_new<NuKeeperStateManager>(server_id, "test_keeper_server", config, coordination_settings))
, responses_queue(responses_queue_)
{
if (coordination_settings->quorum_reads)
LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Quorum reads enabled, NuKeeper will work slower.");
}
void NuKeeperServer::startup()
@ -59,6 +61,7 @@ void NuKeeperServer::startup()
params.reserved_log_items_ = coordination_settings->reserved_log_items;
params.snapshot_distance_ = coordination_settings->snapshot_distance;
params.stale_log_gap_ = coordination_settings->stale_log_gap;
params.fresh_log_gap_ = coordination_settings->fresh_log_gap;
params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
params.auto_forwarding_ = coordination_settings->auto_forwarding;
params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@ -106,7 +109,7 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coord
void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session)
{
auto [session_id, request] = request_for_session;
if (isLeaderAlive() && request->isReadRequest())
if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest())
{
state_machine->processReadRequest(request_for_session);
}
@ -185,6 +188,9 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
if (next_index < last_commited || next_index - last_commited <= 1)
commited_store = true;
if (initialized_flag)
return nuraft::cb_func::ReturnCode::Ok;
auto set_initialized = [this] ()
{
std::unique_lock lock(initialized_mutex);
@ -196,10 +202,27 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
{
case nuraft::cb_func::BecomeLeader:
{
if (commited_store) /// We become leader and store is empty, ready to serve requests
/// We become leader and store is empty or we already committed it
if (commited_store || initial_batch_committed)
set_initialized();
return nuraft::cb_func::ReturnCode::Ok;
}
case nuraft::cb_func::BecomeFollower:
case nuraft::cb_func::GotAppendEntryReqFromLeader:
{
if (isLeaderAlive())
{
auto leader_index = raft_instance->get_leader_committed_log_idx();
auto our_index = raft_instance->get_committed_log_idx();
/// This may happen when we start RAFT cluster from scratch.
/// Node first became leader, and after that some other node became leader.
/// BecameFresh for this node will not be called because it was already fresh
/// when it was leader.
if (leader_index < our_index + coordination_settings->fresh_log_gap)
set_initialized();
}
return nuraft::cb_func::ReturnCode::Ok;
}
case nuraft::cb_func::BecomeFresh:
{
set_initialized(); /// We are fresh follower, ready to serve requests.
@ -209,6 +232,7 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
{
if (isLeader()) /// We have committed our log store and we are leader, ready to serve requests.
set_initialized();
initial_batch_committed = true;
return nuraft::cb_func::ReturnCode::Ok;
}
default: /// ignore other events
@ -220,7 +244,7 @@ void NuKeeperServer::waitInit()
{
std::unique_lock lock(initialized_mutex);
int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds();
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; }))
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); }))
throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization");
}

View File

@ -31,8 +31,9 @@ private:
ResponsesQueue & responses_queue;
std::mutex initialized_mutex;
bool initialized_flag = false;
std::atomic<bool> initialized_flag = false;
std::condition_variable initialized_cv;
std::atomic<bool> initial_batch_committed = false;
nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);

View File

@ -241,9 +241,10 @@ NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot()
storage->disableSnapshotMode();
}
NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_)
NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_)
: snapshots_path(snapshots_path_)
, snapshots_to_keep(snapshots_to_keep_)
, storage_tick_time(storage_tick_time_)
{
namespace fs = std::filesystem;
@ -325,22 +326,24 @@ nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::serializeSnapshotToBuffer(c
return writer.getBuffer();
}
SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer)
SnapshotMetaAndStorage NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const
{
ReadBufferFromNuraftBuffer reader(buffer);
CompressedReadBuffer compressed_reader(reader);
return NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
auto storage = std::make_unique<NuKeeperStorage>(storage_tick_time);
auto snapshot_metadata = NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader);
return std::make_pair(snapshot_metadata, std::move(storage));
}
SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage)
SnapshotMetaAndStorage NuKeeperSnapshotManager::restoreFromLatestSnapshot()
{
if (existing_snapshots.empty())
return nullptr;
return {};
auto buffer = deserializeLatestSnapshotBufferFromDisk();
if (!buffer)
return nullptr;
return deserializeSnapshotFromBuffer(storage, buffer);
return {};
return deserializeSnapshotFromBuffer(buffer);
}
void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()

View File

@ -40,17 +40,20 @@ public:
using NuKeeperStorageSnapshotPtr = std::shared_ptr<NuKeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<void(NuKeeperStorageSnapshotPtr &&)>;
using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, NuKeeperStoragePtr>;
class NuKeeperSnapshotManager
{
public:
NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_);
NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500);
SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage);
SnapshotMetaAndStorage restoreFromLatestSnapshot();
static nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot);
std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx);
static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer);
SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
nuraft::ptr<nuraft::buffer> deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const;
nuraft::ptr<nuraft::buffer> deserializeLatestSnapshotBufferFromDisk();
@ -74,6 +77,7 @@ private:
const std::string snapshots_path;
const size_t snapshots_to_keep;
std::map<size_t, std::string> existing_snapshots;
size_t storage_tick_time;
};
struct CreateSnapshotTask

View File

@ -4,6 +4,7 @@
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/NuKeeperSnapshotManager.h>
#include <future>
namespace DB
{
@ -37,8 +38,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_)
: coordination_settings(coordination_settings_)
, storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep)
, snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, last_committed_idx(0)
@ -60,7 +60,7 @@ void NuKeeperStateMachine::init()
try
{
latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf);
std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf);
last_committed_idx = latest_snapshot_meta->get_last_log_idx();
loaded = true;
break;
@ -83,6 +83,9 @@ void NuKeeperStateMachine::init()
{
LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx);
}
if (!storage)
storage = std::make_unique<NuKeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds());
}
nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
@ -96,7 +99,7 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
nuraft::buffer_serializer bs(response);
{
std::lock_guard lock(storage_lock);
session_id = storage.getSessionID(session_timeout_ms);
session_id = storage->getSessionID(session_timeout_ms);
bs.put_i64(session_id);
}
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms);
@ -109,7 +112,7 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
NuKeeperStorage::ResponsesForSessions responses_for_sessions;
{
std::lock_guard lock(storage_lock);
responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx);
responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx);
for (auto & response_for_session : responses_for_sessions)
responses_queue.push(response_for_session);
}
@ -133,7 +136,7 @@ bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{
std::lock_guard lock(storage_lock);
snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr);
std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
}
last_committed_idx = s.get_last_log_idx();
return true;
@ -157,7 +160,7 @@ void NuKeeperStateMachine::create_snapshot(
CreateSnapshotTask snapshot_task;
{
std::lock_guard lock(storage_lock);
snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(&storage, snapshot_meta_copy);
snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(storage.get(), snapshot_meta_copy);
}
snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot)
@ -179,7 +182,7 @@ void NuKeeperStateMachine::create_snapshot(
{
/// Must do it with lock (clearing elements from list)
std::lock_guard lock(storage_lock);
storage.clearGarbageAfterSnapshot();
storage->clearGarbageAfterSnapshot();
/// Destroy snapshot with lock
snapshot.reset();
LOG_TRACE(log, "Cleared garbage after snapshot");
@ -214,7 +217,7 @@ void NuKeeperStateMachine::save_logical_snp_obj(
if (obj_id == 0)
{
std::lock_guard lock(storage_lock);
NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx());
NuKeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx());
cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot);
}
else
@ -225,7 +228,28 @@ void NuKeeperStateMachine::save_logical_snp_obj(
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
/// Sometimes NuRaft can call save and create snapshots from different threads
/// at once. To avoid race conditions we serialize snapshots through snapshots_queue
/// TODO: make something better
CreateSnapshotTask snapshot_task;
std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>();
auto future = waiter->get_future();
snapshot_task.snapshot = nullptr;
snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (NuKeeperStorageSnapshotPtr &&)
{
try
{
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx);
LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path);
}
catch (...)
{
tryLogCurrentException(log);
}
waiter->set_value();
};
snapshots_queue.push(std::move(snapshot_task));
future.wait();
{
std::lock_guard lock(snapshots_lock);
@ -233,7 +257,6 @@ void NuKeeperStateMachine::save_logical_snp_obj(
latest_snapshot_meta = cloned_meta;
}
LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path);
obj_id++;
}
@ -271,7 +294,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
NuKeeperStorage::ResponsesForSessions responses;
{
std::lock_guard lock(storage_lock);
responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
}
for (const auto & response : responses)
responses_queue.push(response);
@ -280,13 +303,13 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
std::unordered_set<int64_t> NuKeeperStateMachine::getDeadSessions()
{
std::lock_guard lock(storage_lock);
return storage.getDeadSessions();
return storage->getDeadSessions();
}
void NuKeeperStateMachine::shutdownStorage()
{
std::lock_guard lock(storage_lock);
storage.finalize();
storage->finalize();
}
}

View File

@ -52,7 +52,7 @@ public:
NuKeeperStorage & getStorage()
{
return storage;
return *storage;
}
void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session);
@ -68,7 +68,7 @@ private:
CoordinationSettingsPtr coordination_settings;
NuKeeperStorage storage;
NuKeeperStoragePtr storage;
NuKeeperSnapshotManager snapshot_manager;

View File

@ -233,7 +233,7 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest
struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t /*session_id*/) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperRemoveResponse & response = dynamic_cast<Coordination::ZooKeeperRemoveResponse &>(*response_ptr);
@ -257,7 +257,12 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
auto prev_node = it->value;
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].erase(request.path);
{
auto ephemerals_it = ephemerals.find(prev_node.stat.ephemeralOwner);
ephemerals_it->second.erase(request.path);
if (ephemerals_it->second.empty())
ephemerals.erase(ephemerals_it);
}
auto child_basename = getBaseName(it->key);
container.updateValue(parentPath(request.path), [&child_basename] (NuKeeperStorage::Node & parent)
@ -271,10 +276,10 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
container.erase(request.path);
undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename]
undo = [prev_node, &container, &ephemerals, path = request.path, child_basename]
{
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].emplace(path);
ephemerals[prev_node.stat.ephemeralOwner].emplace(path);
container.insert(path, prev_node);
container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent)
@ -377,7 +382,6 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest
{
return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED);
}
};
struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest
@ -641,6 +645,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor
for (const auto & ephemeral_path : it->second)
{
container.erase(ephemeral_path);
container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (NuKeeperStorage::Node & parent)
{
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(getBaseName(ephemeral_path));
});
auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED);
results.insert(results.end(), responses.begin(), responses.end());
}

View File

@ -131,4 +131,6 @@ public:
}
};
using NuKeeperStoragePtr = std::unique_ptr<NuKeeperStorage>;
}

View File

@ -132,6 +132,10 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config);
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); });
server = std::make_unique<NuKeeperServer>(myid, coordination_settings, config, responses_queue, snapshots_queue);
try
{
@ -148,10 +152,8 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
throw;
}
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); });
snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); });
LOG_DEBUG(log, "Dispatcher initialized");
}

View File

@ -897,25 +897,25 @@ TEST(CoordinationTest, TestStorageSnapshotSimple)
manager.serializeSnapshotBufferToDisk(*buf, 2);
EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin"));
DB::NuKeeperStorage restored_storage(500);
auto debuf = manager.deserializeSnapshotBufferFromDisk(2);
manager.deserializeSnapshotFromBuffer(&restored_storage, debuf);
EXPECT_EQ(restored_storage.container.size(), 3);
EXPECT_EQ(restored_storage.container.getValue("/").children.size(), 1);
EXPECT_EQ(restored_storage.container.getValue("/hello").children.size(), 1);
EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").children.size(), 0);
auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage.container.getValue("/").data, "");
EXPECT_EQ(restored_storage.container.getValue("/hello").data, "world");
EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").data, "somedata");
EXPECT_EQ(restored_storage.session_id_counter, 7);
EXPECT_EQ(restored_storage.zxid, 2);
EXPECT_EQ(restored_storage.ephemerals.size(), 2);
EXPECT_EQ(restored_storage.ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage.ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage.session_and_timeout.size(), 2);
EXPECT_EQ(restored_storage->container.size(), 3);
EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1);
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0);
EXPECT_EQ(restored_storage->container.getValue("/").data, "");
EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world");
EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata");
EXPECT_EQ(restored_storage->session_id_counter, 7);
EXPECT_EQ(restored_storage->zxid, 2);
EXPECT_EQ(restored_storage->ephemerals.size(), 2);
EXPECT_EQ(restored_storage->ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->session_and_timeout.size(), 2);
}
TEST(CoordinationTest, TestStorageSnapshotMoreWrites)
@ -946,15 +946,14 @@ TEST(CoordinationTest, TestStorageSnapshotMoreWrites)
manager.serializeSnapshotBufferToDisk(*buf, 50);
EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin"));
DB::NuKeeperStorage restored_storage(500);
auto debuf = manager.deserializeSnapshotBufferFromDisk(50);
manager.deserializeSnapshotFromBuffer(&restored_storage, debuf);
auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf);
EXPECT_EQ(restored_storage.container.size(), 51);
EXPECT_EQ(restored_storage->container.size(), 51);
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -987,14 +986,13 @@ TEST(CoordinationTest, TestStorageSnapshotManySnapshots)
EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin"));
DB::NuKeeperStorage restored_storage(500);
manager.restoreFromLatestSnapshot(&restored_storage);
auto [meta, restored_storage] = manager.restoreFromLatestSnapshot();
EXPECT_EQ(restored_storage.container.size(), 251);
EXPECT_EQ(restored_storage->container.size(), 251);
for (size_t i = 0; i < 250; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -1040,12 +1038,11 @@ TEST(CoordinationTest, TestStorageSnapshotMode)
EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i)));
}
DB::NuKeeperStorage restored_storage(500);
manager.restoreFromLatestSnapshot(&restored_storage);
auto [meta, restored_storage] = manager.restoreFromLatestSnapshot();
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
@ -1071,8 +1068,7 @@ TEST(CoordinationTest, TestStorageSnapshotBroken)
plain_buf.truncate(34);
plain_buf.sync();
DB::NuKeeperStorage restored_storage(500);
EXPECT_THROW(manager.restoreFromLatestSnapshot(&restored_storage), DB::Exception);
EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception);
}
nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
@ -1236,6 +1232,37 @@ TEST(CoordinationTest, TestStateMachineAndLogStore)
}
}
/// Regression test: removing an ephemeral node must erase it from the storage's
/// `ephemerals` bookkeeping keyed by the node's *owner* session, even when the
/// remove request is issued by a different session.
TEST(CoordinationTest, TestEphemeralNodeRemove)
{
using namespace Coordination;
using namespace DB;
/// Temporary snapshot directory, cleaned up by the RAII helper.
ChangelogDirTest snapshots("./snapshots");
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
ResponsesQueue queue;
SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<NuKeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
state_machine->init();
/// Create an ephemeral node "/hello" (presumably owned by session 1 -- the
/// second argument of getLogEntryFromZKRequest; confirm against the helper).
std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
request_c->path = "/hello";
request_c->is_ephemeral = true;
auto entry_c = getLogEntryFromZKRequest(0, 1, request_c);
state_machine->commit(1, entry_c->get_buf());
const auto & storage = state_machine->getStorage();
/// The ephemerals map must now track exactly one owner session.
EXPECT_EQ(storage.ephemerals.size(), 1);
std::shared_ptr<ZooKeeperRemoveRequest> request_d = std::make_shared<ZooKeeperRemoveRequest>();
request_d->path = "/hello";
/// Delete from other session
auto entry_d = getLogEntryFromZKRequest(0, 2, request_d);
state_machine->commit(2, entry_d->get_buf());
/// The entry must be gone from the owner's ephemeral set (and the now-empty
/// set removed from the map), not merely from session 2's set.
EXPECT_EQ(storage.ephemerals.size(), 0);
}
int main(int argc, char ** argv)
{
Poco::AutoPtr<Poco::ConsoleChannel> channel(new Poco::ConsoleChannel(std::cerr));

View File

@ -953,3 +953,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf);
String toString(const Field & x);
}
/// fmt specialization so DB::Field can be used directly in format strings,
/// e.g. fmt::format("{}", field). Delegates rendering to DB::toString().
template <>
struct fmt::formatter<DB::Field>
{
    constexpr auto parse(format_parse_context & ctx)
    {
        /// Only the plain replacement field "{}" is accepted; any format spec
        /// other than the closing brace is rejected.
        const auto * pos = ctx.begin();
        if (pos != ctx.end() && *pos != '}')
            throw format_error("invalid format");
        return pos;
    }

    template <typename FormatContext>
    auto format(const DB::Field & x, FormatContext & ctx)
    {
        return format_to(ctx.out(), "{}", toString(x));
    }
};

View File

@ -101,7 +101,7 @@ template <DictionaryKeyType dictionary_key_type>
double CacheDictionary<dictionary_key_type>::getLoadFactor() const
{
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};
return static_cast<double>(cache_storage_ptr->getSize()) / cache_storage_ptr->getMaxSize();
return cache_storage_ptr->getLoadFactor();
}
template <DictionaryKeyType dictionary_key_type>
@ -333,9 +333,7 @@ Columns CacheDictionary<dictionary_key_type>::getColumnsImpl(
FetchResult result_of_fetch_from_storage;
{
/// Write lock on storage
const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request);
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <chrono>
#include <variant>
#include <pcg_random.hpp>
@ -30,28 +31,31 @@ struct CacheDictionaryStorageConfiguration
const DictionaryLifetime lifetime;
};
/** Keys are stored in LRUCache and column values are serialized into arena.
Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored.
Columns are serialized by rows.
When cell is removed from LRUCache data associated with it is also removed from arena.
In case of complex key we also store key data in arena and it is removed from arena.
*/
/** ICacheDictionaryStorage implementation that keeps key in hash table with fixed collision length.
* Value in hash table point to index in attributes arrays.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionaryStorage final : public ICacheDictionaryStorage
{
static constexpr size_t max_collision_length = 10;
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage");
explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_)
explicit CacheDictionaryStorage(
const DictionaryStructure & dictionary_structure,
CacheDictionaryStorageConfiguration & configuration_)
: configuration(configuration_)
, rnd_engine(randomSeed())
, cache(configuration.max_size_in_cells, false, { arena })
{
size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length));
cells.resize_fill(cells_size);
size_overlap_mask = cells_size - 1;
setup(dictionary_structure);
}
bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; }
@ -71,9 +75,7 @@ public:
const DictionaryStorageFetchRequest & fetch_request) override
{
if constexpr (dictionary_key_type == DictionaryKeyType::simple)
{
return fetchColumnsForKeysImpl<SimpleKeysStorageFetchResult>(keys, fetch_request);
}
else
throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
}
@ -109,9 +111,7 @@ public:
const DictionaryStorageFetchRequest & column_fetch_requests) override
{
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
{
return fetchColumnsForKeysImpl<ComplexKeysStorageFetchResult>(keys, column_fetch_requests);
}
else
throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
}
@ -140,79 +140,162 @@ public:
throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
}
size_t getSize() const override { return cache.size(); }
size_t getSize() const override { return size; }
size_t getMaxSize() const override { return cache.getMaxSize(); }
double getLoadFactor() const override { return static_cast<double>(size) / configuration.max_size_in_cells; }
size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); }
size_t getBytesAllocated() const override
{
size_t attributes_size_in_bytes = 0;
size_t attributes_size = attributes.size();
for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index)
{
getAttributeContainer(attribute_index, [&](const auto & container)
{
attributes_size_in_bytes += container.capacity() * sizeof(container[0]);
});
}
return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes;
}
private:
struct FetchedKey
{
FetchedKey(size_t element_index_, bool is_default_)
: element_index(element_index_)
, is_default(is_default_)
{}
size_t element_index;
bool is_default;
};
template <typename KeysStorageFetchResult>
ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl(
KeysStorageFetchResult fetchColumnsForKeysImpl(
const PaddedPODArray<KeyType> & keys,
const DictionaryStorageFetchRequest & fetch_request)
{
KeysStorageFetchResult result;
result.fetched_columns = fetch_request.makeAttributesResultColumns();
result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found});
result.key_index_to_state.resize_fill(keys.size());
const auto now = std::chrono::system_clock::now();
const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
size_t fetched_columns_index = 0;
size_t keys_size = keys.size();
std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds);
size_t keys_size = keys.size();
PaddedPODArray<FetchedKey> fetched_keys;
fetched_keys.resize_fill(keys_size);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys[key_index];
auto * it = cache.find(key);
auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now);
if (it)
if (unlikely(key_state == KeyState::not_found))
{
/// Columns values for key are serialized in cache now deserialize them
const auto & cell = it->getMapped();
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
continue;
}
bool has_deadline = cellHasDeadline(cell);
auto & cell = cells[cell_index];
if (has_deadline && now > cell.deadline + max_lifetime_seconds)
{
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
continue;
}
else if (has_deadline && now > cell.deadline)
{
result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index};
++result.expired_keys_size;
}
else
{
result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index};
++result.found_keys_size;
}
result.expired_keys_size += static_cast<size_t>(key_state == KeyState::expired);
++fetched_columns_index;
result.key_index_to_state[key_index] = {key_state, fetched_columns_index};
fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default);
if (cell.isDefault())
++fetched_columns_index;
result.key_index_to_state[key_index].setDefaultValue(cell.is_default);
result.default_keys_size += cell.is_default;
}
result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size);
for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index)
{
if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index))
continue;
auto & attribute = attributes[attribute_index];
const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index);
size_t fetched_keys_size = fetched_keys.size();
auto & fetched_column = *result.fetched_columns[attribute_index];
fetched_column.reserve(fetched_keys_size);
if (unlikely(attribute.is_complex_type))
{
auto & container = std::get<std::vector<Field>>(attribute.attribute_container);
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
result.key_index_to_state[key_index].setDefault();
++result.default_keys_size;
insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index);
}
else
{
const char * place_for_serialized_columns = cell.place_for_serialized_columns;
deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns);
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
fetched_column.insert(container[fetched_key.element_index]);
}
}
else
{
result.key_index_to_state[key_index] = {KeyState::not_found};
++result.not_found_keys_size;
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
using ColumnType =
std::conditional_t<std::is_same_v<AttributeType, String>, ColumnString,
std::conditional_t<IsDecimalNumber<AttributeType>, ColumnDecimal<ValueType>,
ColumnVector<AttributeType>>>;
auto & container = std::get<PaddedPODArray<ValueType>>(attribute.attribute_container);
ColumnType & column_typed = static_cast<ColumnType &>(fetched_column);
if constexpr (std::is_same_v<ColumnType, ColumnString>)
{
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
{
auto item = container[fetched_key.element_index];
column_typed.insertData(item.data, item.size);
}
}
}
else
{
auto & data = column_typed.getData();
for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index)
{
auto fetched_key = fetched_keys[fetched_key_index];
if (unlikely(fetched_key.is_default))
column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index));
else
{
auto item = container[fetched_key.element_index];
data.push_back(item);
}
}
}
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
}
@ -221,58 +304,108 @@ private:
void insertColumnsForKeysImpl(const PaddedPODArray<KeyType> & keys, Columns columns)
{
Arena temporary_values_pool;
size_t columns_to_serialize_size = columns.size();
PaddedPODArray<StringRef> temporary_column_data(columns_to_serialize_size);
const auto now = std::chrono::system_clock::now();
size_t keys_size = keys.size();
Field column_value;
for (size_t key_index = 0; key_index < keys_size; ++key_index)
for (size_t key_index = 0; key_index < keys.size(); ++key_index)
{
size_t allocated_size_for_columns = 0;
const char * block_start = nullptr;
auto key = keys[key_index];
auto * it = cache.find(key);
for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index)
size_t cell_index = getCellIndexForInsert(key);
auto & cell = cells[cell_index];
bool cell_was_default = cell.is_default;
cell.is_default = false;
bool was_inserted = cell.deadline == 0;
if (was_inserted)
{
auto & column = columns[column_index];
temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start);
allocated_size_for_columns += temporary_column_data[column_index].size;
}
if constexpr (std::is_same_v<KeyType, StringRef>)
cell.key = copyStringInArena(key);
else
cell.key = key;
char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns);
memcpy(reinterpret_cast<void*>(place_for_serialized_columns), reinterpret_cast<const void*>(block_start), allocated_size_for_columns);
for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index)
{
auto & column = columns[attribute_index];
if (it)
{
/// Cell exists need to free previous serialized place and update deadline
auto & cell = it->getMapped();
getAttributeContainer(attribute_index, [&](auto & container)
{
container.emplace_back();
cell.element_index = container.size() - 1;
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
using ElementType = std::decay_t<decltype(container[0])>;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
column->get(key_index, column_value);
if constexpr (std::is_same_v<ElementType, Field>)
container.back() = column_value;
else if constexpr (std::is_same_v<ElementType, StringRef>)
{
const String & string_value = column_value.get<String>();
StringRef string_value_ref = StringRef {string_value.data(), string_value.size()};
StringRef inserted_value = copyStringInArena(string_value_ref);
container.back() = inserted_value;
}
else
container.back() = column_value.get<NearestFieldType<ElementType>>();
});
}
++size;
}
else
{
/// No cell exists so create and put in cache
Cell cell;
if (cell.key != key)
{
if constexpr (std::is_same_v<KeyType, StringRef>)
{
char * data = const_cast<char *>(cell.key.data);
arena.free(data, cell.key.size);
cell.key = copyStringInArena(key);
}
else
cell.key = key;
}
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
/// Put values into existing index
size_t index_to_use = cell.element_index;
insertCellInCache(key, cell);
for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index)
{
auto & column = columns[attribute_index];
getAttributeContainer(attribute_index, [&](auto & container)
{
using ElementType = std::decay_t<decltype(container[0])>;
column->get(key_index, column_value);
if constexpr (std::is_same_v<ElementType, Field>)
container[index_to_use] = column_value;
else if constexpr (std::is_same_v<ElementType, StringRef>)
{
const String & string_value = column_value.get<String>();
StringRef string_ref_value = StringRef {string_value.data(), string_value.size()};
StringRef inserted_value = copyStringInArena(string_ref_value);
if (!cell_was_default)
{
StringRef previous_value = container[index_to_use];
arena.free(const_cast<char *>(previous_value.data), previous_value.size);
}
container[index_to_use] = inserted_value;
}
else
container[index_to_use] = column_value.get<NearestFieldType<ElementType>>();
});
}
}
temporary_values_pool.rollback(allocated_size_for_columns);
setCellDeadline(cell, now);
}
}
@ -280,94 +413,224 @@ private:
{
const auto now = std::chrono::system_clock::now();
for (auto key : keys)
size_t keys_size = keys.size();
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto * it = cache.find(key);
auto key = keys[key_index];
if (it)
size_t cell_index = getCellIndexForInsert(key);
auto & cell = cells[cell_index];
bool was_inserted = cell.deadline == 0;
bool cell_was_default = cell.is_default;
cell.is_default = true;
if (was_inserted)
{
auto & cell = it->getMapped();
if constexpr (std::is_same_v<KeyType, StringRef>)
cell.key = copyStringInArena(key);
else
cell.key = key;
setCellDeadline(cell, now);
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
getAttributeContainer(attribute_index, [&](auto & container)
{
container.emplace_back();
cell.element_index = container.size() - 1;
});
}
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
++size;
}
else
{
Cell cell;
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
getAttributeContainer(attribute_index, [&](const auto & container)
{
using ElementType = std::decay_t<decltype(container[0])>;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
if constexpr (std::is_same_v<ElementType, StringRef>)
{
if (!cell_was_default)
{
StringRef previous_value = container[cell.element_index];
arena.free(const_cast<char *>(previous_value.data), previous_value.size);
}
}
});
}
insertCellInCache(key, cell);
if (cell.key != key)
{
if constexpr (std::is_same_v<KeyType, StringRef>)
{
char * data = const_cast<char *>(cell.key.data);
arena.free(data, cell.key.size);
cell.key = copyStringInArena(key);
}
else
cell.key = key;
}
}
setCellDeadline(cell, now);
}
}
PaddedPODArray<KeyType> getCachedKeysImpl() const
{
PaddedPODArray<KeyType> result;
result.reserve(cache.size());
result.reserve(size);
for (auto & node : cache)
for (auto & cell : cells)
{
auto & cell = node.getMapped();
if (cell.isDefault())
if (cell.deadline == 0)
continue;
result.emplace_back(node.getKey());
if (cell.is_default)
continue;
result.emplace_back(cell.key);
}
return result;
}
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && func)
{
auto & attribute = attributes[attribute_index];
auto & attribute_type = attribute.type;
if (unlikely(attribute.is_complex_type))
{
auto & container = std::get<std::vector<Field>>(attribute.attribute_container);
std::forward<GetContainerFunc>(func)(container);
}
else
{
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
auto & container = std::get<PaddedPODArray<ValueType>>(attribute.attribute_container);
std::forward<GetContainerFunc>(func)(container);
};
callOnDictionaryAttributeType(attribute_type, type_call);
}
}
template <typename GetContainerFunc>
void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const
{
return const_cast<std::decay_t<decltype(*this)> *>(this)->template getAttributeContainer(attribute_index, std::forward<GetContainerFunc>(func));
}
StringRef copyStringInArena(StringRef value_to_copy)
{
size_t value_to_copy_size = value_to_copy.size;
char * place_for_key = arena.alloc(value_to_copy_size);
memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(value_to_copy.data), value_to_copy_size);
StringRef updated_value{place_for_key, value_to_copy_size};
return updated_value;
}
void setup(const DictionaryStructure & dictionary_structure)
{
/// For each dictionary attribute create storage attribute
/// For simple attributes create PODArray, for complex vector of Fields
attributes.reserve(dictionary_structure.attributes.size());
for (const auto & dictionary_attribute : dictionary_structure.attributes)
{
auto attribute_type = dictionary_attribute.underlying_type;
auto type_call = [&](const auto & dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
attributes.emplace_back();
auto & last_attribute = attributes.back();
last_attribute.type = attribute_type;
last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array;
if (dictionary_attribute.is_nullable)
last_attribute.attribute_container = std::vector<Field>();
else
last_attribute.attribute_container = PaddedPODArray<ValueType>();
};
callOnDictionaryAttributeType(attribute_type, type_call);
}
}
using TimePoint = std::chrono::system_clock::time_point;
struct Cell
{
TimePoint deadline;
size_t allocated_size_for_columns;
char * place_for_serialized_columns;
inline bool isDefault() const { return place_for_serialized_columns == nullptr; }
inline void setDefault()
{
place_for_serialized_columns = nullptr;
allocated_size_for_columns = 0;
}
KeyType key;
size_t element_index;
bool is_default;
time_t deadline;
};
void insertCellInCache(KeyType & key, const Cell & cell)
struct Attribute
{
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
{
/// Copy complex key into arena and put in cache
size_t key_size = key.size;
char * place_for_key = arena.alloc(key_size);
memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(key.data), key_size);
KeyType updated_key{place_for_key, key_size};
key = updated_key;
}
AttributeUnderlyingType type;
bool is_complex_type;
cache.insert(key, cell);
}
std::variant<
PaddedPODArray<UInt8>,
PaddedPODArray<UInt16>,
PaddedPODArray<UInt32>,
PaddedPODArray<UInt64>,
PaddedPODArray<UInt128>,
PaddedPODArray<Int8>,
PaddedPODArray<Int16>,
PaddedPODArray<Int32>,
PaddedPODArray<Int64>,
PaddedPODArray<Decimal32>,
PaddedPODArray<Decimal64>,
PaddedPODArray<Decimal128>,
PaddedPODArray<Float32>,
PaddedPODArray<Float64>,
PaddedPODArray<StringRef>,
std::vector<Field>> attribute_container;
};
inline static bool cellHasDeadline(const Cell & cell)
{
return cell.deadline != std::chrono::system_clock::from_time_t(0);
}
CacheDictionaryStorageConfiguration configuration;
pcg64 rnd_engine;
size_t size_overlap_mask = 0;
size_t size = 0;
PaddedPODArray<Cell> cells;
ArenaWithFreeLists arena;
std::vector<Attribute> attributes;
inline void setCellDeadline(Cell & cell, TimePoint now)
{
if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0)
{
cell.deadline = std::chrono::system_clock::from_time_t(0);
/// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds
/// to the expiration time. And it overflows pretty well.
auto deadline = std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds);
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
return;
}
@ -375,44 +638,75 @@ private:
size_t max_sec_lifetime = configuration.lifetime.max_sec;
std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
cell.deadline = now + std::chrono::seconds(distribution(rnd_engine));
auto deadline = now + std::chrono::seconds(distribution(rnd_engine));
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
}
template <typename>
friend class ArenaCellDisposer;
CacheDictionaryStorageConfiguration configuration;
ArenaWithFreeLists arena;
pcg64 rnd_engine;
class ArenaCellDisposer
inline size_t getCellIndex(const KeyType key) const
{
public:
ArenaWithFreeLists & arena;
const size_t hash = DefaultHash<KeyType>()(key);
const size_t index = hash & size_overlap_mask;
return index;
}
template <typename Key, typename Value>
void operator()(const Key & key, const Value & value) const
using KeyStateAndCellIndex = std::pair<KeyState::State, size_t>;
inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const
{
size_t place_value = getCellIndex(key);
const size_t place_value_end = place_value + max_collision_length;
time_t max_lifetime_seconds = static_cast<time_t>(configuration.strict_max_lifetime_seconds);
for (; place_value < place_value_end; ++place_value)
{
/// In case of complex key we keep it in arena
if constexpr (std::is_same_v<Key, StringRef>)
arena.free(const_cast<char *>(key.data), key.size);
const auto cell_place_value = place_value & size_overlap_mask;
const auto & cell = cells[cell_place_value];
if (value.place_for_serialized_columns)
arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns);
if (cell.key != key)
continue;
if (unlikely(now > cell.deadline + max_lifetime_seconds))
return std::make_pair(KeyState::not_found, cell_place_value);
if (unlikely(now > cell.deadline))
return std::make_pair(KeyState::expired, cell_place_value);
return std::make_pair(KeyState::found, cell_place_value);
}
};
using SimpleKeyLRUHashMap = LRUHashMap<UInt64, Cell, ArenaCellDisposer>;
using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash<StringRef, Cell, ArenaCellDisposer>;
return std::make_pair(KeyState::not_found, place_value & size_overlap_mask);
}
using CacheLRUHashMap = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SimpleKeyLRUHashMap,
ComplexKeyLRUHashMap>;
inline size_t getCellIndexForInsert(const KeyType & key) const
{
size_t place_value = getCellIndex(key);
const size_t place_value_end = place_value + max_collision_length;
size_t oldest_place_value = place_value;
CacheLRUHashMap cache;
time_t oldest_time = std::numeric_limits<time_t>::max();
for (; place_value < place_value_end; ++place_value)
{
const size_t cell_place_value = place_value & size_overlap_mask;
const Cell cell = cells[cell_place_value];
if (cell.deadline == 0)
return cell_place_value;
if (cell.key == key)
return cell_place_value;
if (cell.deadline < oldest_time)
{
oldest_time = cell.deadline;
oldest_place_value = cell_place_value;
}
}
return oldest_place_value;
}
};
}

View File

@ -12,9 +12,9 @@ struct KeyState
{
enum State: uint8_t
{
not_found = 2,
expired = 4,
found = 8,
not_found = 0,
expired = 1,
found = 2,
};
KeyState(State state_, size_t fetched_column_index_)
@ -31,9 +31,10 @@ struct KeyState
inline bool isNotFound() const { return state == State::not_found; }
inline bool isDefault() const { return is_default; }
inline void setDefault() { is_default = true; }
inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; }
/// Valid only if keyState is found or expired
inline size_t getFetchedColumnIndex() const { return fetched_column_index; }
inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; }
private:
State state = not_found;
size_t fetched_column_index = 0;
@ -111,8 +112,8 @@ public:
/// Return size of keys in storage
virtual size_t getSize() const = 0;
/// Return maximum size of keys in storage
virtual size_t getMaxSize() const = 0;
/// Returns storage load factor
virtual double getLoadFactor() const = 0;
/// Return bytes allocated in storage
virtual size_t getBytesAllocated() const = 0;

View File

@ -17,7 +17,7 @@
#include <Common/Arena.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/MemorySanitizer.h>
#include <Common/HashTable/LRUHashMap.h>
#include <Common/HashTable/HashMap.h>
#include <IO/AIO.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
@ -56,7 +56,6 @@ struct SSDCacheDictionaryStorageConfiguration
const std::string file_path;
const size_t max_partitions_count;
const size_t max_stored_keys;
const size_t block_size;
const size_t file_blocks_size;
const size_t read_buffer_blocks_size;
@ -127,7 +126,7 @@ public:
/// Reset block with new block_data
/// block_data must be filled with zeroes if it is new block
ALWAYS_INLINE inline void reset(char * new_block_data)
inline void reset(char * new_block_data)
{
block_data = new_block_data;
current_block_offset = block_header_size;
@ -135,13 +134,13 @@ public:
}
/// Check if it is enough place to write key in block
ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const
inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const
{
return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size;
}
/// Check if it is enough place to write key in block
ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const
inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const
{
const StringRef & key = cache_key.key;
size_t complex_key_size = sizeof(key.size) + key.size;
@ -152,7 +151,7 @@ public:
/// Write key and returns offset in ssd cache block where data is written
/// It is client responsibility to check if there is enough place in block to write key
/// Returns true if key was written and false if there was not enough place to write key
ALWAYS_INLINE inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block)
inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block)
{
assert(cache_key.size > 0);
@ -181,7 +180,7 @@ public:
return true;
}
ALWAYS_INLINE inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block)
inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block)
{
assert(cache_key.size > 0);
@ -216,20 +215,20 @@ public:
return true;
}
ALWAYS_INLINE inline size_t getKeysSize() const { return keys_size; }
inline size_t getKeysSize() const { return keys_size; }
/// Write keys size into block header
ALWAYS_INLINE inline void writeKeysSize()
inline void writeKeysSize()
{
char * keys_size_offset_data = block_data + block_header_check_sum_size;
std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t));
}
/// Get check sum from block header
ALWAYS_INLINE inline size_t getCheckSum() const { return unalignedLoad<size_t>(block_data); }
inline size_t getCheckSum() const { return unalignedLoad<size_t>(block_data); }
/// Calculate check sum in block
ALWAYS_INLINE inline size_t calculateCheckSum() const
inline size_t calculateCheckSum() const
{
size_t calculated_check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));
@ -237,7 +236,7 @@ public:
}
/// Check if check sum from block header matched calculated check sum in block
ALWAYS_INLINE inline bool checkCheckSum() const
inline bool checkCheckSum() const
{
size_t calculated_check_sum = calculateCheckSum();
size_t check_sum = getCheckSum();
@ -246,16 +245,16 @@ public:
}
/// Write check sum in block header
ALWAYS_INLINE inline void writeCheckSum()
inline void writeCheckSum()
{
size_t check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));
std::memcpy(block_data, &check_sum, sizeof(size_t));
}
ALWAYS_INLINE inline size_t getBlockSize() const { return block_size; }
inline size_t getBlockSize() const { return block_size; }
/// Returns block data
ALWAYS_INLINE inline char * getBlockData() const { return block_data; }
inline char * getBlockData() const { return block_data; }
/// Read keys that were serialized in block
/// It is client responsibility to ensure that simple or complex keys were written in block
@ -337,9 +336,7 @@ inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs)
return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block;
}
/** SSDCacheMemoryBuffer initialized with block size and memory buffer blocks size.
* Allocate block_size * memory_buffer_blocks_size bytes with page alignment.
* Logically represents multiple memory_buffer_blocks_size blocks and current write block.
/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block.
* If key cannot be written into current_write_block, current block keys size and check summ is written
* and buffer increase index of current_write_block_index.
* If current_write_block_index == memory_buffer_blocks_size write key will always returns true.
@ -444,7 +441,7 @@ private:
size_t current_block_index = 0;
};
/// TODO: Add documentation
/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system
template <typename SSDCacheKeyType>
class SSDCacheFileBuffer : private boost::noncopyable
{
@ -614,11 +611,13 @@ public:
}
template <typename FetchBlockFunc>
ALWAYS_INLINE void fetchBlocks(char * read_buffer, size_t read_from_file_buffer_blocks_size, const PaddedPODArray<size_t> & blocks_to_fetch, FetchBlockFunc && func) const
void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray<size_t> & blocks_to_fetch, FetchBlockFunc && func) const
{
if (blocks_to_fetch.empty())
return;
Memory<Allocator<true>> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096);
size_t blocks_to_fetch_size = blocks_to_fetch.size();
PaddedPODArray<iocb> requests;
@ -631,7 +630,7 @@ public:
{
iocb request{};
char * buffer_place = read_buffer + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);
char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);
#if defined(__FreeBSD__)
request.aio.aio_lio_opcode = LIO_READ;
@ -751,7 +750,7 @@ private:
int fd = -1;
};
ALWAYS_INLINE inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
{
#if defined(__FreeBSD__)
return posix_fallocate(fd, offset, len);
@ -760,7 +759,7 @@ private:
#endif
}
ALWAYS_INLINE inline static char * getRequestBuffer(const iocb & request)
inline static char * getRequestBuffer(const iocb & request)
{
char * result = nullptr;
@ -773,7 +772,7 @@ private:
return result;
}
ALWAYS_INLINE inline static ssize_t eventResult(io_event & event)
inline static ssize_t eventResult(io_event & event)
{
ssize_t bytes_written;
@ -795,7 +794,13 @@ private:
size_t current_blocks_size = 0;
};
/// TODO: Add documentation
/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions.
* Data is first written in memory buffer.
* If memory buffer is full then buffer is flushed to disk partition.
* If memory buffer cannot be flushed to associated disk partition, then if partition
* can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused.
* Index maps key to partition block and offset.
*/
template <DictionaryKeyType dictionary_key_type>
class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage
{
@ -806,9 +811,7 @@ public:
explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_)
: configuration(configuration_)
, file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size)
, read_from_file_buffer(configuration_.block_size * configuration_.read_buffer_blocks_size, 4096)
, rnd_engine(randomSeed())
, index(configuration.max_stored_keys, false, { complex_key_arena })
{
memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size);
}
@ -897,14 +900,31 @@ public:
size_t getSize() const override { return index.size(); }
size_t getMaxSize() const override {return index.getMaxSize(); }
double getLoadFactor() const override
{
size_t partitions_size = memory_buffer_partitions.size();
if (partitions_size == configuration.max_partitions_count)
return 1.0;
auto & current_memory_partition = memory_buffer_partitions[current_partition_index];
size_t full_partitions = partitions_size - 1;
size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex();
size_t blocks_on_disk = file_buffer.getCurrentBlockIndex();
size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count;
double load_factor = static_cast<double>(blocks_in_memory + blocks_on_disk) / max_blocks_size;
return load_factor;
}
size_t getBytesAllocated() const override
{
size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size;
size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size;
return index.getSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size;
return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size;
}
private:
@ -920,8 +940,7 @@ private:
default_value
};
TimePoint deadline;
time_t deadline;
SSDCacheIndex index;
size_t in_memory_partition_index;
CellState state;
@ -933,13 +952,12 @@ private:
struct KeyToBlockOffset
{
KeyToBlockOffset(size_t key_index_, size_t offset_in_block_, bool is_expired_)
: key_index(key_index_), offset_in_block(offset_in_block_), is_expired(is_expired_)
KeyToBlockOffset(size_t key_index_, size_t offset_in_block_)
: key_index(key_index_), offset_in_block(offset_in_block_)
{}
size_t key_index = 0;
size_t offset_in_block = 0;
bool is_expired = false;
};
template <typename Result>
@ -950,20 +968,24 @@ private:
Result result;
result.fetched_columns = fetch_request.makeAttributesResultColumns();
result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found});
result.key_index_to_state.resize_fill(keys.size());
const auto now = std::chrono::system_clock::now();
const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
size_t fetched_columns_index = 0;
using BlockIndexToKeysMap = std::unordered_map<size_t, std::vector<KeyToBlockOffset>, DefaultHash<size_t>>;
using BlockIndexToKeysMap = absl::flat_hash_map<size_t, PaddedPODArray<KeyToBlockOffset>, DefaultHash<size_t>>;
BlockIndexToKeysMap block_to_keys_map;
absl::flat_hash_set<size_t, DefaultHash<size_t>> unique_blocks_to_request;
PaddedPODArray<size_t> blocks_to_request;
std::chrono::seconds strict_max_lifetime_seconds(configuration.strict_max_lifetime_seconds);
time_t strict_max_lifetime_seconds = static_cast<time_t>(configuration.strict_max_lifetime_seconds);
size_t keys_size = keys.size();
for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size)
if (fetch_request.shouldFillResultColumnWithIndex(attribute_size))
result.fetched_columns[attribute_size]->reserve(keys_size);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys[key_index];
@ -978,9 +1000,7 @@ private:
const auto & cell = it->getMapped();
bool has_deadline = cellHasDeadline(cell);
if (has_deadline && now > cell.deadline + strict_max_lifetime_seconds)
if (unlikely(now > cell.deadline + strict_max_lifetime_seconds))
{
++result.not_found_keys_size;
continue;
@ -989,14 +1009,14 @@ private:
bool cell_is_expired = false;
KeyState::State key_state = KeyState::found;
if (has_deadline && now > cell.deadline)
if (now > cell.deadline)
{
cell_is_expired = true;
key_state = KeyState::expired;
}
result.expired_keys_size += cell_is_expired;
result.found_keys_size += !cell_is_expired;
result.expired_keys_size += static_cast<size_t>(cell_is_expired);
result.found_keys_size += static_cast<size_t>(!cell_is_expired);
switch (cell.state)
{
@ -1012,13 +1032,20 @@ private:
}
case Cell::on_disk:
{
block_to_keys_map[cell.index.block_index].emplace_back(key_index, cell.index.offset_in_block, cell_is_expired);
PaddedPODArray<KeyToBlockOffset> & keys_block = block_to_keys_map[cell.index.block_index];
keys_block.emplace_back(key_index, cell.index.offset_in_block);
if (!unique_blocks_to_request.contains(cell.index.block_index))
{
KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found;
/// Fetched column index will be set later during fetch blocks
result.key_index_to_state[key_index] = {state, 0};
auto insert_result = unique_blocks_to_request.insert(cell.index.block_index);
bool was_inserted = insert_result.second;
if (was_inserted)
blocks_to_request.emplace_back(cell.index.block_index);
unique_blocks_to_request.insert(cell.index.block_index);
}
break;
}
case Cell::default_value:
@ -1037,7 +1064,7 @@ private:
/// Sort blocks by offset before start async io requests
std::sort(blocks_to_request.begin(), blocks_to_request.end());
file_buffer.fetchBlocks(read_from_file_buffer.m_data, configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data)
file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data)
{
auto & keys_in_block = block_to_keys_map[block_index];
@ -1046,10 +1073,7 @@ private:
char * key_data = block_data + key_in_block.offset_in_block;
deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data);
if (key_in_block.is_expired)
result.key_index_to_state[key_in_block.key_index] = {KeyState::expired, fetched_columns_index};
else
result.key_index_to_state[key_in_block.key_index] = {KeyState::found, fetched_columns_index};
result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index);
++fetched_columns_index;
}
@ -1087,7 +1111,7 @@ private:
throw Exception("Serialized columns size is greater than allowed block size and metadata", ErrorCodes::UNSUPPORTED_METHOD);
/// We cannot reuse place that is already allocated in file or memory cache so we erase key from index
index.erase(key);
eraseKeyFromIndex(key);
Cell cell;
setCellDeadline(cell, now);
@ -1114,8 +1138,7 @@ private:
for (auto key : keys)
{
/// We cannot reuse place that is already allocated in file or memory cache so we erase key from index
index.erase(key);
eraseKeyFromIndex(key);
Cell cell;
@ -1135,7 +1158,7 @@ private:
key = updated_key;
}
index.insert(key, cell);
index[key] = cell;
}
}
@ -1188,7 +1211,7 @@ private:
cell.index = cache_index;
cell.in_memory_partition_index = current_partition_index;
index.insert(ssd_cache_key.key, cell);
index[ssd_cache_key.key] = cell;
break;
}
else
@ -1218,7 +1241,7 @@ private:
if (old_key_cell.isOnDisk() &&
old_key_block >= block_index_in_file_before_write &&
old_key_block < file_read_end_block_index)
index.erase(old_key);
eraseKeyFromIndex(old_key);
}
}
}
@ -1271,7 +1294,7 @@ private:
cell.index = cache_index;
cell.in_memory_partition_index = current_partition_index;
index.insert(ssd_cache_key.key, cell);
index[ssd_cache_key.key] = cell;
break;
}
else
@ -1296,16 +1319,12 @@ private:
}
}
inline static bool cellHasDeadline(const Cell & cell)
{
return cell.deadline != std::chrono::system_clock::from_time_t(0);
}
inline void setCellDeadline(Cell & cell, TimePoint now)
{
if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0)
{
cell.deadline = std::chrono::system_clock::from_time_t(0);
auto deadline = std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds);
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
return;
}
@ -1313,47 +1332,45 @@ private:
size_t max_sec_lifetime = configuration.lifetime.max_sec;
std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)};
auto deadline = now + std::chrono::seconds(distribution(rnd_engine));
cell.deadline = std::chrono::system_clock::to_time_t(deadline);
}
template <typename>
friend class ArenaCellKeyDisposer;
inline void eraseKeyFromIndex(KeyType key)
{
auto it = index.find(key);
if (it == nullptr)
return;
/// In case of complex key in arena key is serialized from hash table
KeyType key_copy = it->getKey();
index.erase(key);
if constexpr (std::is_same_v<KeyType, StringRef>)
complex_key_arena.free(const_cast<char *>(key_copy.data), key_copy.size);
}
SSDCacheDictionaryStorageConfiguration configuration;
SSDCacheFileBuffer<SSDCacheKeyType> file_buffer;
Memory<Allocator<true>> read_from_file_buffer;
std::vector<SSDCacheMemoryBuffer<SSDCacheKeyType>> memory_buffer_partitions;
pcg64 rnd_engine;
class ArenaCellKeyDisposer
{
public:
ArenaWithFreeLists & arena;
using SimpleKeyHashMap = HashMap<UInt64, Cell>;
using ComplexKeyHashMap = HashMapWithSavedHash<StringRef, Cell>;
template <typename Key, typename Value>
void operator()(const Key & key, const Value &) const
{
/// In case of complex key we keep it in arena
if constexpr (std::is_same_v<Key, StringRef>)
arena.free(const_cast<char *>(key.data), key.size);
}
};
using SimpleKeyLRUHashMap = LRUHashMap<UInt64, Cell, ArenaCellKeyDisposer>;
using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash<StringRef, Cell, ArenaCellKeyDisposer>;
using CacheLRUHashMap = std::conditional_t<
using CacheMap = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SimpleKeyLRUHashMap,
ComplexKeyLRUHashMap>;
SimpleKeyHashMap,
ComplexKeyHashMap>;
ArenaWithFreeLists complex_key_arena;
CacheLRUHashMap index;
CacheMap index;
size_t current_partition_index = 0;

View File

@ -1,154 +0,0 @@
clickhouse-client --query="DROP TABLE IF EXISTS simple_cache_dictionary_table_source";
clickhouse-client --query="CREATE TABLE simple_cache_dictionary_table_source (id UInt64, value1 String, value2 UInt64, value3 String, value4 Float64, value5 Decimal64(4)) ENGINE=TinyLog;"
clickhouse-client --query="INSERT INTO simple_cache_dictionary_table_source SELECT number, concat('Value1 ', toString(number)), number, concat('Value3 ', toString(number)), toFloat64(number), cast(number, 'Decimal64(4)') FROM system.numbers LIMIT 1000000;"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(CACHE(SIZE_IN_CELLS 100000));"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_ssd_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 WRITE_BUFFER_SIZE 327680 MAX_STORED_KEYS 1048576 PATH '/opt/mkita/ClickHouse/build_release/programs/ssd_cache'));"
clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_dummy_simple_cache_dictionary (
id UInt64,
value1 String,
value2 UInt64,
value3 String,
value4 Float64,
value5 Decimal64(4)
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default'))
LIFETIME(MIN 300 MAX 300)
LAYOUT(DUMMY_SIMPLE());"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null"
./clickhouse-benchmark --query="SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null"
./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null"
SELECT
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT Null
SELECT dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 FORMAT Null
SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000
FORMAT Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number)
FROM system.numbers
LIMIT 10000
FORMAT
Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 10000
FORMAT
Null
SELECT
dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number)
FROM system.numbers
LIMIT 10000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value1', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value2', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value3', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value4', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT
dictGet('clickhouse_simple_cache_dictionary', 'value1', number),
dictGet('clickhouse_simple_cache_dictionary', 'value2', number),
dictGet('clickhouse_simple_cache_dictionary', 'value3', number),
dictGet('clickhouse_simple_cache_dictionary', 'value4', number),
dictGet('clickhouse_simple_cache_dictionary', 'value5', number)
FROM system.numbers
LIMIT 100000
FORMAT Null
SELECT * FROM clickhouse_simple_cache_dictionary_table;

View File

@ -1,6 +1,6 @@
#include "CacheDictionary.h"
#include "SSDCacheDictionaryStorage.h"
#include "CacheDictionaryStorage.h"
#include "SSDCacheDictionaryStorage.h"
#include <Dictionaries/DictionaryFactory.h>
namespace DB
@ -20,13 +20,13 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration(
const DictionaryLifetime & dict_lifetime,
DictionaryKeyType dictionary_key_type)
{
String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache.";
String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." : ".cache.";
String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix;
const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells");
if (size == 0)
throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE,
"({}: cache dictionary cannot have 0 cells",
"({}): cache dictionary cannot have 0 cells",
full_name);
size_t dict_lifetime_seconds = static_cast<size_t>(dict_lifetime.max_sec);
@ -59,7 +59,6 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration(
static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES;
static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES;
static constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000;
static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16;
const size_t max_partitions_count
@ -94,16 +93,11 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration(
if (directory_path.at(0) != '/')
directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string();
const size_t max_stored_keys_in_partition
= config.getInt64(dictionary_configuration_prefix + "max_stored_keys", DEFAULT_MAX_STORED_KEYS);
const size_t rounded_size = roundUpToPowerOfTwoOrZero(max_stored_keys_in_partition);
SSDCacheDictionaryStorageConfiguration configuration{
strict_max_lifetime_seconds,
dict_lifetime,
directory_path,
max_partitions_count,
rounded_size,
block_size,
file_size / block_size,
read_buffer_size / block_size,
@ -194,7 +188,8 @@ DictionaryPtr createCacheDictionaryLayout(
const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false);
auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type);
auto storage = std::make_shared<CacheDictionaryStorage<dictionary_key_type>>(storage_configuration);
std::shared_ptr<ICacheDictionaryStorage> storage = std::make_shared<CacheDictionaryStorage<dictionary_key_type>>(dict_struct, storage_configuration);
auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type);

View File

@ -209,7 +209,13 @@ void DiskCacheWrapper::clearDirectory(const String & path)
void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path)
{
if (cache_disk->exists(from_path))
{
/// Destination directory may not be empty if previous directory move attempt was failed.
if (cache_disk->exists(to_path) && cache_disk->isDirectory(to_path))
cache_disk->clearDirectory(to_path);
cache_disk->moveDirectory(from_path, to_path);
}
DiskDecorator::moveDirectory(from_path, to_path);
}

View File

@ -538,8 +538,9 @@ private:
[[maybe_unused]] const auto block_size = static_cast<size_t>(EVP_CIPHER_block_size(evp_cipher));
[[maybe_unused]] const auto iv_size = static_cast<size_t>(EVP_CIPHER_iv_length(evp_cipher));
const auto key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
const size_t key_size = static_cast<size_t>(EVP_CIPHER_key_length(evp_cipher));
static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1
auto decrypted_result_column = ColumnString::create();
auto & decrypted_result_column_data = decrypted_result_column->getChars();
@ -549,9 +550,17 @@ private:
size_t resulting_size = 0;
for (size_t r = 0; r < input_rows_count; ++r)
{
resulting_size += input_column->getDataAt(r).size + 1;
size_t string_size = input_column->getDataAt(r).size;
resulting_size += string_size + 1; /// With terminating zero.
if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM)
{
if (string_size < tag_size)
throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.",
ErrorCodes::BAD_ARGUMENTS);
resulting_size -= tag_size;
}
}
#if defined(MEMORY_SANITIZER)
@ -565,6 +574,7 @@ private:
decrypted_result_column_data.resize(resulting_size);
#endif
}
auto * decrypted = decrypted_result_column_data.data();
KeyHolder<mode> key_holder;
@ -631,7 +641,7 @@ private:
// 1.a.2: Set AAD if present
if (aad_column)
{
const auto aad_data = aad_column->getDataAt(r);
StringRef aad_data = aad_column->getDataAt(r);
int tmp_len = 0;
if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len,
reinterpret_cast<const unsigned char *>(aad_data.data), aad_data.size) != 1)

View File

@ -42,11 +42,11 @@ struct SimdJSONParser
ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; }
ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; }
ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().first; }
ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().first; }
ALWAYS_INLINE double getDouble() const { return element.get_double().first; }
ALWAYS_INLINE bool getBool() const { return element.get_bool().first; }
ALWAYS_INLINE std::string_view getString() const { return element.get_string().first; }
ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); }
ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); }
ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); }
ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); }
ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); }
ALWAYS_INLINE Array getArray() const;
ALWAYS_INLINE Object getObject() const;
@ -75,7 +75,7 @@ struct SimdJSONParser
ALWAYS_INLINE Iterator begin() const { return array.begin(); }
ALWAYS_INLINE Iterator end() const { return array.end(); }
ALWAYS_INLINE size_t size() const { return array.size(); }
ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).first; }
ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); }
private:
simdjson::dom::array array;
@ -111,7 +111,7 @@ struct SimdJSONParser
if (x.error())
return false;
result = x.first;
result = x.value_unsafe();
return true;
}
@ -137,7 +137,7 @@ struct SimdJSONParser
if (document.error())
return false;
result = document.first;
result = document.value_unsafe();
return true;
}
@ -155,12 +155,12 @@ private:
inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const
{
return element.get_array().first;
return element.get_array().value_unsafe();
}
inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const
{
return element.get_object().first;
return element.get_object().value_unsafe();
}
}

View File

@ -49,8 +49,11 @@ public:
{}
template <typename ... Args>
inline auto execute(const DateTime64 & t, Args && ... args) const
inline auto NO_SANITIZE_UNDEFINED execute(const DateTime64 & t, Args && ... args) const
{
/// Type conversion from float to integer may be required.
/// We are Ok with implementation specific result for out of range and denormals conversion.
if constexpr (TransformHasExecuteOverload_v<DateTime64, decltype(scale_multiplier), Args...>)
{
return wrapped_transform.execute(t, scale_multiplier, std::forward<Args>(args)...);

View File

@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain
res_data += last_3_periods[1] + 1 - begin;
res_size = last_3_periods[0] - last_3_periods[1] - 1;
}
}
}
/// The difference with execute() is due to custom TLD list can have records of any level,
/// not only 2-nd level (like non-custom variant), so it requires more lookups.
template <class Lookup>
static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr)
{
res_data = data;
res_size = 0;
Pos tmp;
size_t domain_length;
ExtractDomain<without_www>::execute(data, size, tmp, domain_length);
if (domain_length == 0)
return;
if (out_domain_end)
*out_domain_end = tmp + domain_length;
/// cut useless dot
if (tmp[domain_length - 1] == '.')
--domain_length;
res_data = tmp;
res_size = domain_length;
auto begin = tmp;
auto end = begin + domain_length;
const char * last_2_periods[2]{};
const char * prev = begin - 1;
auto pos = find_first_symbols<'.'>(begin, end);
while (pos < end)
{
if (lookup(pos + 1, end - pos - 1))
{
res_data += prev + 1 - begin;
res_size = end - 1 - prev;
return;
}
last_2_periods[1] = last_2_periods[0];
last_2_periods[0] = pos;
prev = pos;
pos = find_first_symbols<'.'>(pos + 1, end);
}
/// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing
if (!last_2_periods[0])
return;
/// if there is domain of the second level -> always return itself
if (!last_2_periods[1])
{
res_size = last_2_periods[0] - begin;
return;
}
/// if there is domain of the 3+ level, and zero records in TLD list ->
/// fallback to domain of the second level
res_data += last_2_periods[1] + 1 - begin;
res_size = last_2_periods[0] - last_2_periods[1] - 1;
}
};
}

View File

@ -17,10 +17,10 @@ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
struct FirstSignificantSubdomainCustomtLookup
struct FirstSignificantSubdomainCustomLookup
{
const TLDList & tld_list;
FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name)
: tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
{
}
@ -63,7 +63,7 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue<String>());
FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue<String>());
/// FIXME: convertToFullColumnIfConst() is suboptimal
auto column = arguments[0].column->convertToFullColumnIfConst();
@ -79,7 +79,7 @@ public:
ErrorCodes::ILLEGAL_COLUMN);
}
static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup,
static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup,
const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
{

View File

@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom
{
static size_t getReserveLengthForElement() { return 15; }
static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
res_size = 0;
@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom
Pos tmp_data;
size_t tmp_length;
Pos domain_end;
ExtractFirstSignificantSubdomain<without_www>::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
ExtractFirstSignificantSubdomain<without_www>::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end);
if (tmp_length == 0)
return;

View File

@ -190,7 +190,7 @@ private:
}
static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30;
if (static_cast<size_t>(max_key - min_key) > MAX_ARRAY_SIZE)
if (static_cast<size_t>(max_key) - static_cast<size_t>(min_key) > MAX_ARRAY_SIZE)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName());
/* fill the result arrays */

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int BAD_ARGUMENTS;
}
namespace
@ -110,6 +111,9 @@ public:
arguments[2].column->getFloat64(i),
max_width);
if (!isFinite(width))
throw Exception("Value of width must not be NaN and Inf", ErrorCodes::BAD_ARGUMENTS);
size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1;
dst_chars.resize(next_size);
UnicodeBar::render(width, reinterpret_cast<char *>(&dst_chars[current_offset]));

View File

@ -41,7 +41,8 @@ void registerFunctionThrowIf(FunctionFactory &);
void registerFunctionVersion(FunctionFactory &);
void registerFunctionBuildId(FunctionFactory &);
void registerFunctionUptime(FunctionFactory &);
void registerFunctionTimeZone(FunctionFactory &);
void registerFunctionTimezone(FunctionFactory &);
void registerFunctionTimezoneOf(FunctionFactory &);
void registerFunctionRunningAccumulate(FunctionFactory &);
void registerFunctionRunningDifference(FunctionFactory &);
void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &);
@ -111,7 +112,8 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionVersion(factory);
registerFunctionBuildId(factory);
registerFunctionUptime(factory);
registerFunctionTimeZone(factory);
registerFunctionTimezone(factory);
registerFunctionTimezoneOf(factory);
registerFunctionRunningAccumulate(factory);
registerFunctionRunningDifference(factory);
registerFunctionRunningDifferenceStartingWithFirstValue(factory);

View File

@ -12,13 +12,13 @@ namespace
/** Returns the server time zone.
*/
class FunctionTimeZone : public IFunction
class FunctionTimezone : public IFunction
{
public:
static constexpr auto name = "timezone";
static FunctionPtr create(const Context &)
{
return std::make_shared<FunctionTimeZone>();
return std::make_shared<FunctionTimezone>();
}
String getName() const override
@ -45,9 +45,10 @@ public:
}
void registerFunctionTimeZone(FunctionFactory & factory)
void registerFunctionTimezone(FunctionFactory & factory)
{
factory.registerFunction<FunctionTimeZone>();
factory.registerFunction<FunctionTimezone>();
factory.registerAlias("timeZone", "timezone");
}
}

View File

@ -0,0 +1,118 @@
#include <Functions/IFunctionImpl.h>
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeDateTime.h>
#include <common/DateLUTImpl.h>
#include <Core/Field.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
namespace
{
/** timezoneOf(x) - get the name of the timezone of DateTime data type.
* Example: Europe/Moscow.
*/
class ExecutableFunctionTimezoneOf : public IExecutableFunctionImpl
{
public:
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
/// Execute the function on the columns.
ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
DataTypePtr type_no_nullable = removeNullable(arguments[0].type);
return DataTypeString().createColumnConst(input_rows_count,
dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone());
}
};
/// "Prepared" (type-resolved) form of timezoneOf(): stores the resolved argument and
/// result types and creates the executable implementation on demand.
class BaseFunctionTimezoneOf : public IFunctionBaseImpl
{
public:
BaseFunctionTimezoneOf(DataTypes argument_types_, DataTypePtr return_type_)
: argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {}
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
/// The result is fully determined by the argument's type, so the function is deterministic.
bool isDeterministic() const override { return true; }
bool isDeterministicInScopeOfQuery() const override { return true; }
const DataTypes & getArgumentTypes() const override { return argument_types; }
const DataTypePtr & getResultType() const override { return return_type; }
ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override
{
return std::make_unique<ExecutableFunctionTimezoneOf>();
}
/// Allows constant folding: since the answer depends only on the argument's type,
/// it can be computed once (as a single-row constant) without executing per block.
ColumnPtr getResultIfAlwaysReturnsConstantAndHasArguments(const ColumnsWithTypeAndName & arguments) const override
{
/// Same extraction as in execute(): unwrap Nullable, then read the zone from TimezoneMixin.
DataTypePtr type_no_nullable = removeNullable(arguments[0].type);
return DataTypeString().createColumnConst(1,
dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone());
}
private:
DataTypes argument_types;
DataTypePtr return_type;
};
/// Overload resolver for timezoneOf(): validates that the single argument is
/// DateTime or DateTime64 (possibly Nullable) and builds the prepared function.
class FunctionTimezoneOfBuilder : public IFunctionOverloadResolverImpl
{
public:
static constexpr auto name = "timezoneOf";
String getName() const override { return name; }
static FunctionOverloadResolverImplPtr create(const Context &) { return std::make_unique<FunctionTimezoneOfBuilder>(); }
size_t getNumberOfArguments() const override { return 1; }
/// Returns String for DateTime/DateTime64 arguments; throws BAD_ARGUMENTS otherwise.
/// Note: the Nullable wrapper is stripped before the check, so Nullable(DateTime) is accepted.
DataTypePtr getReturnType(const DataTypes & types) const override
{
DataTypePtr type_no_nullable = removeNullable(types[0]);
if (isDateTime(type_no_nullable) || isDateTime64(type_no_nullable))
return std::make_shared<DataTypeString>();
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad argument for function {}, should be DateTime or DateTime64", name);
}
FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
{
return std::make_unique<BaseFunctionTimezoneOf>(DataTypes{arguments[0].type}, return_type);
}
/// Nulls/LowCardinality are handled by the implementation itself, and a Nullable
/// argument does not imply a Nullable result (the zone name is always known from the type).
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; }
};
}
/// Registers timezoneOf() in the function factory, with the camel-case
/// spelling "timeZoneOf" as an alias.
void registerFunctionTimezoneOf(FunctionFactory & factory)
{
factory.registerFunction<FunctionTimezoneOfBuilder>();
factory.registerAlias("timeZoneOf", "timezoneOf");
}
}

View File

@ -21,11 +21,11 @@ namespace
{
/// Just changes time zone information for data type. The calculation is free.
class FunctionToTimeZone : public IFunction
class FunctionToTimezone : public IFunction
{
public:
static constexpr auto name = "toTimeZone";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimeZone>(); }
static constexpr auto name = "toTimezone";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimezone>(); }
String getName() const override
{
@ -64,7 +64,8 @@ public:
void registerFunctionToTimeZone(FunctionFactory & factory)
{
factory.registerFunction<FunctionToTimeZone>();
factory.registerFunction<FunctionToTimezone>();
factory.registerAlias("toTimeZone", "toTimezone");
}
}

View File

@ -467,6 +467,7 @@ SRCS(
timeSlot.cpp
timeSlots.cpp
timezone.cpp
timezoneOf.cpp
timezoneOffset.cpp
toColumnTypeName.cpp
toCustomWeek.cpp
@ -506,7 +507,7 @@ SRCS(
toStartOfTenMinutes.cpp
toStartOfYear.cpp
toTime.cpp
toTimeZone.cpp
toTimezone.cpp
toTypeName.cpp
toUnixTimestamp64Micro.cpp
toUnixTimestamp64Milli.cpp

View File

@ -106,7 +106,7 @@ void BrotliWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -105,7 +105,7 @@ void LZMADeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -82,6 +82,7 @@ bool PeekableReadBuffer::peekNext()
checkpoint.emplace(memory.data());
checkpoint_in_own_memory = true;
}
if (currentlyReadFromOwnMemory())
{
/// Update buffer size
@ -99,7 +100,6 @@ bool PeekableReadBuffer::peekNext()
pos_offset = 0;
}
BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset);
}
peeked_size += bytes_to_copy;
@ -113,12 +113,21 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop)
{
checkStateCorrect();
if (!checkpoint)
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
assert(checkpoint);
if (checkpointInOwnMemory() == currentlyReadFromOwnMemory())
{
/// Both checkpoint and position are in the same buffer.
pos = *checkpoint;
else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory
}
else
{
/// Checkpoint is in own memory and position is not.
assert(checkpointInOwnMemory());
/// Switch to reading from own memory.
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data());
}
if (drop)
dropCheckpoint();
@ -134,6 +143,7 @@ bool PeekableReadBuffer::nextImpl()
checkStateCorrect();
bool res;
bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory();
if (checkpoint)
{
@ -163,6 +173,13 @@ bool PeekableReadBuffer::nextImpl()
BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset());
nextimpl_working_buffer_offset = sub_buf.offset();
if (checkpoint_at_end)
{
checkpoint.emplace(working_buffer.begin());
peeked_size = 0;
checkpoint_in_own_memory = false;
}
checkStateCorrect();
return res;
}

View File

@ -43,10 +43,7 @@ public:
/// Forget checkpoint and all data between checkpoint and position
ALWAYS_INLINE inline void dropCheckpoint()
{
#ifndef NDEBUG
if (!checkpoint)
throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
#endif
assert(checkpoint);
if (!currentlyReadFromOwnMemory())
{
/// Don't need to store unread data anymore

View File

@ -1,6 +1,7 @@
#include <Poco/Net/NetException.h>
#include <IO/ReadBufferFromPocoSocket.h>
#include <IO/TimeoutSetter.h>
#include <Common/Exception.h>
#include <Common/NetException.h>
#include <Common/Stopwatch.h>
@ -27,23 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
ssize_t bytes_read = 0;
Stopwatch watch;
int flags = 0;
if (async_callback)
flags |= MSG_DONTWAIT;
/// Add more details to exceptions.
try
{
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
/// If async_callback is specified, and read is blocking, run async_callback and try again later.
/// If async_callback is specified, and read will block, run async_callback and try again later.
/// It is expected that file descriptor may be polled externally.
/// Note that receive timeout is not checked here. External code should check it while polling.
while (bytes_read < 0 && async_callback && errno == EAGAIN)
{
while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ))
async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description);
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
}
/// receiveBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
/// but we want to get this exception exactly after receive_timeout. So, set send_timeout = receive_timeout
/// before receiveBytes.
std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
if (socket.secure())
timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getReceiveTimeout(), socket.getReceiveTimeout());
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
}
catch (const Poco::Net::NetException & e)
{

View File

@ -1,4 +1,4 @@
#include "TimeoutSetter.h"
#include <IO/TimeoutSetter.h>
#include <common/logger_useful.h>

View File

@ -1,6 +1,7 @@
#include <Poco/Net/NetException.h>
#include <IO/WriteBufferFromPocoSocket.h>
#include <IO/TimeoutSetter.h>
#include <Common/Exception.h>
#include <Common/NetException.h>
@ -40,6 +41,13 @@ void WriteBufferFromPocoSocket::nextImpl()
/// Add more details to exceptions.
try
{
/// sendBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
/// but we want to get this exception exactly after send_timeout. So, set receive_timeout = send_timeout
/// before sendBytes.
std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
if (socket.secure())
timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getSendTimeout(), socket.getSendTimeout());
res = socket.impl()->sendBytes(working_buffer.begin() + bytes_written, offset() - bytes_written);
}
catch (const Poco::Net::NetException & e)

View File

@ -120,7 +120,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
tryLogCurrentException(log);
}
}

View File

@ -107,7 +107,7 @@ void ZlibDeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -94,7 +94,7 @@ void ZstdDeflatingWriteBuffer::finish()
try
{
finishImpl();
out->next();
out->finalize();
finished = true;
}
catch (...)

View File

@ -6,11 +6,6 @@
#include <IO/ConcatReadBuffer.h>
#include <IO/PeekableReadBuffer.h>
namespace DB::ErrorCodes
{
extern const int LOGICAL_ERROR;
}
static void readAndAssert(DB::ReadBuffer & buf, const char * str)
{
size_t n = strlen(str);
@ -48,20 +43,6 @@ try
readAndAssert(peekable, "01234");
}
#ifndef ABORT_ON_LOGICAL_ERROR
bool exception = false;
try
{
peekable.rollbackToCheckpoint();
}
catch (DB::Exception & e)
{
if (e.code() != DB::ErrorCodes::LOGICAL_ERROR)
throw;
exception = true;
}
ASSERT_TRUE(exception);
#endif
assertAvailable(peekable, "56789");
readAndAssert(peekable, "56");

View File

@ -50,6 +50,7 @@ SRCS(
ReadBufferFromPocoSocket.cpp
ReadHelpers.cpp
SeekAvoidingReadBuffer.cpp
TimeoutSetter.cpp
UseSSL.cpp
WriteBufferFromFile.cpp
WriteBufferFromFileBase.cpp

View File

@ -818,13 +818,10 @@ private:
if (!min_id)
min_id = getMinIDToFinishLoading(forced_to_reload);
if (info->state_id >= min_id)
return true; /// stop
if (info->loading_id < min_id)
startLoading(*info, forced_to_reload, *min_id);
/// Wait for the next event if loading wasn't completed, and stop otherwise.
/// Wait for the next event if loading wasn't completed, or stop otherwise.
return (info->state_id >= min_id);
};
@ -850,9 +847,6 @@ private:
if (filter && !filter(name))
continue;
if (info.state_id >= min_id)
continue;
if (info.loading_id < min_id)
startLoading(info, forced_to_reload, *min_id);

View File

@ -260,7 +260,8 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
renamed = true;
}
database->loadStoredObjects(context, has_force_restore_data_flag, create.attach && force_attach);
/// We use global context here, because storages lifetime is bigger than query context lifetime
database->loadStoredObjects(context.getGlobalContext(), has_force_restore_data_flag, create.attach && force_attach);
}
catch (...)
{
@ -970,7 +971,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
if (create.as_table_function)
{
const auto & factory = TableFunctionFactory::instance();
res = factory.get(create.as_table_function, context)->execute(create.as_table_function, context, create.table, properties.columns);
auto table_func = factory.get(create.as_table_function, context);
res = table_func->execute(create.as_table_function, context, create.table, properties.columns);
res->renameInMemory({create.database, create.table, create.uuid});
}
else

View File

@ -393,7 +393,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
view = nullptr;
}
if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final())
if (try_move_to_prewhere && storage && query.where() && !query.prewhere())
{
/// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable
if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty())

View File

@ -9,8 +9,6 @@
#include <Common/ActionBlocker.h>
#include <common/types.h>
#include <Poco/Net/HTMLForm.h>
#include <atomic>
#include <map>
#include <shared_mutex>

View File

@ -1,5 +1,6 @@
#include <Interpreters/WindowDescription.h>
#include <Core/Field.h>
#include <IO/Operators.h>
#include <Parsers/ASTFunction.h>
@ -60,7 +61,7 @@ void WindowFrame::toString(WriteBuffer & buf) const
}
else
{
buf << abs(begin_offset);
buf << applyVisitor(FieldVisitorToString(), begin_offset);
buf << " "
<< (begin_preceding ? "PRECEDING" : "FOLLOWING");
}
@ -77,7 +78,7 @@ void WindowFrame::toString(WriteBuffer & buf) const
}
else
{
buf << abs(end_offset);
buf << applyVisitor(FieldVisitorToString(), end_offset);
buf << " "
<< (end_preceding ? "PRECEDING" : "FOLLOWING");
}
@ -121,23 +122,33 @@ void WindowFrame::checkValid() const
if (end_type == BoundaryType::Offset
&& begin_type == BoundaryType::Offset)
{
// Frame starting with following rows can't have preceding rows.
if (!(end_preceding && !begin_preceding))
// Frame start offset must be less or equal that the frame end offset.
bool begin_less_equal_end;
if (begin_preceding && end_preceding)
{
// Frame start offset must be less or equal that the frame end offset.
const bool begin_before_end
= begin_offset * (begin_preceding ? -1 : 1)
<= end_offset * (end_preceding ? -1 : 1);
if (!begin_before_end)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset {} {} does not precede the frame end offset {} {}",
begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING",
end_offset, end_preceding ? "PRECEDING" : "FOLLOWING");
}
return;
begin_less_equal_end = begin_offset >= end_offset;
}
else if (begin_preceding && !end_preceding)
{
begin_less_equal_end = true;
}
else if (!begin_preceding && end_preceding)
{
begin_less_equal_end = false;
}
else /* if (!begin_preceding && !end_preceding) */
{
begin_less_equal_end = begin_offset <= end_offset;
}
if (!begin_less_equal_end)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset {} {} does not precede the frame end offset {} {}",
begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING",
end_offset, end_preceding ? "PRECEDING" : "FOLLOWING");
}
return;
}
throw Exception(ErrorCodes::BAD_ARGUMENTS,

View File

@ -44,14 +44,13 @@ struct WindowFrame
// Offset might be both preceding and following, controlled by begin_preceding,
// but the offset value must be positive.
BoundaryType begin_type = BoundaryType::Unbounded;
// This should have been a Field but I'm getting some crazy linker errors.
int64_t begin_offset = 0;
Field begin_offset = 0;
bool begin_preceding = true;
// Here as well, Unbounded can only be UNBOUNDED FOLLOWING, and end_preceding
// must be false.
BoundaryType end_type = BoundaryType::Current;
int64_t end_offset = 0;
Field end_offset = 0;
bool end_preceding = false;

View File

@ -377,6 +377,11 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co
else if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(&to_type))
{
const IDataType & nested_type = *nullable_type->getNestedType();
/// NULL remains NULL after any conversion.
if (WhichDataType(nested_type).isNothing())
return {};
if (from_type_hint && from_type_hint->equals(nested_type))
return from_value;
return convertFieldToTypeImpl(from_value, nested_type, from_type_hint);

View File

@ -290,8 +290,6 @@ std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & nod
{
Blocks result;
// TODO: `node` may be always-false literal.
if (const auto * fn = node->as<ASTFunction>())
{
const auto dnf = analyzeFunction(fn, target_expr);
@ -350,6 +348,14 @@ std::optional<Blocks> evaluateExpressionOverConstantCondition(const ASTPtr & nod
}
}
}
else if (const auto * literal = node->as<ASTLiteral>())
{
// Check if it's always true or false.
if (literal->value.getType() == Field::Types::UInt64 && literal->value.get<UInt64>() == 0)
return {result};
else
return {};
}
return {result};
}

View File

@ -137,8 +137,8 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F
if (window())
{
s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str <<
"WINDOW " << (s.hilite ? hilite_none : "");
window()->formatImpl(s, state, frame);
"WINDOW" << (s.hilite ? hilite_none : "");
window()->as<ASTExpressionList &>().formatImplMultiline(s, state, frame);
}
if (orderBy())

View File

@ -35,6 +35,8 @@ String ASTWindowDefinition::getID(char) const
void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
FormatState & state, FormatStateStacked format_frame) const
{
format_frame.expression_list_prepend_whitespace = false;
if (partition_by)
{
settings.ostr << "PARTITION BY ";
@ -70,7 +72,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
}
else
{
settings.ostr << abs(frame.begin_offset);
settings.ostr << applyVisitor(FieldVisitorToString(),
frame.begin_offset);
settings.ostr << " "
<< (!frame.begin_preceding ? "FOLLOWING" : "PRECEDING");
}
@ -85,7 +88,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings,
}
else
{
settings.ostr << abs(frame.end_offset);
settings.ostr << applyVisitor(FieldVisitorToString(),
frame.end_offset);
settings.ostr << " "
<< (!frame.end_preceding ? "FOLLOWING" : "PRECEDING");
}

View File

@ -581,30 +581,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p
else if (parser_literal.parse(pos, ast_literal, expected))
{
const Field & value = ast_literal->as<ASTLiteral &>().value;
if (!isInt64FieldType(value.getType()))
if ((node->frame.type == WindowFrame::FrameType::Rows
|| node->frame.type == WindowFrame::FrameType::Groups)
&& !(value.getType() == Field::Types::UInt64
|| (value.getType() == Field::Types::Int64
&& value.get<Int64>() >= 0)))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Only integer frame offsets are supported, '{}' is not supported.",
"Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.",
WindowFrame::toString(node->frame.type),
applyVisitor(FieldVisitorToString(), value),
Field::Types::toString(value.getType()));
}
node->frame.begin_offset = value.get<Int64>();
node->frame.begin_offset = value;
node->frame.begin_type = WindowFrame::BoundaryType::Offset;
// We can easily get a UINT64_MAX here, which doesn't even fit into
// int64_t. Not sure what checks we are going to need here after we
// support floats and dates.
if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame offset must be between {} and {}, but {} is given",
INT_MAX, INT_MIN, node->frame.begin_offset);
}
if (node->frame.begin_offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame start offset must be greater than zero, {} given",
node->frame.begin_offset);
}
}
else
{
@ -652,28 +642,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p
else if (parser_literal.parse(pos, ast_literal, expected))
{
const Field & value = ast_literal->as<ASTLiteral &>().value;
if (!isInt64FieldType(value.getType()))
if ((node->frame.type == WindowFrame::FrameType::Rows
|| node->frame.type == WindowFrame::FrameType::Groups)
&& !(value.getType() == Field::Types::UInt64
|| (value.getType() == Field::Types::Int64
&& value.get<Int64>() >= 0)))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Only integer frame offsets are supported, '{}' is not supported.",
"Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.",
WindowFrame::toString(node->frame.type),
applyVisitor(FieldVisitorToString(), value),
Field::Types::toString(value.getType()));
}
node->frame.end_offset = value.get<Int64>();
node->frame.end_offset = value;
node->frame.end_type = WindowFrame::BoundaryType::Offset;
if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame offset must be between {} and {}, but {} is given",
INT_MAX, INT_MIN, node->frame.end_offset);
}
if (node->frame.end_offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Frame end offset must be greater than zero, {} given",
node->frame.end_offset);
}
}
else
{

View File

@ -275,7 +275,8 @@ Token Lexer::nextTokenImpl()
else
++pos;
}
return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end);
pos = end;
return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, pos);
}
}
return Token(TokenType::Slash, token_begin, pos);

View File

@ -3,6 +3,7 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Common/Arena.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/getLeastSupertype.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/convertFieldToType.h>
@ -27,7 +28,8 @@ public:
virtual ~IWindowFunction() = default;
// Must insert the result for current_row.
virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0;
virtual void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) = 0;
};
// Compares ORDER BY column values at given rows to find the boundaries of frame:
@ -37,7 +39,7 @@ template <typename ColumnType>
static int compareValuesWithOffset(const IColumn * _compared_column,
size_t compared_row, const IColumn * _reference_column,
size_t reference_row,
uint64_t _offset,
const Field & _offset,
bool offset_is_preceding)
{
// Casting the columns to the known type here makes it faster, probably
@ -46,7 +48,8 @@ static int compareValuesWithOffset(const IColumn * _compared_column,
_compared_column);
const auto * reference_column = assert_cast<const ColumnType *>(
_reference_column);
const auto offset = static_cast<typename ColumnType::ValueType>(_offset);
const auto offset = _offset.get<typename ColumnType::ValueType>();
assert(offset >= 0);
const auto compared_value_data = compared_column->getDataAt(compared_row);
assert(compared_value_data.size == sizeof(typename ColumnType::ValueType));
@ -101,6 +104,53 @@ static int compareValuesWithOffset(const IColumn * _compared_column,
}
}
// A specialization of compareValuesWithOffset for floats.
template <typename ColumnType>
static int compareValuesWithOffsetFloat(const IColumn * _compared_column,
size_t compared_row, const IColumn * _reference_column,
size_t reference_row,
const Field & _offset,
bool offset_is_preceding)
{
/// Compares compared_column[compared_row] against reference_column[reference_row]
/// shifted by _offset (subtracted for PRECEDING, added for FOLLOWING).
/// Returns -1 / 0 / 1 like a three-way comparison.
// Casting the columns to the known type here makes it faster, probably
// because the getData call can be devirtualized.
const auto * compared_column = assert_cast<const ColumnType *>(
_compared_column);
const auto * reference_column = assert_cast<const ColumnType *>(
_reference_column);
/// The offset was converted to the ORDER BY column type upstream; it must be nonnegative.
const auto offset = _offset.get<typename ColumnType::ValueType>();
assert(offset >= 0);
const auto compared_value_data = compared_column->getDataAt(compared_row);
assert(compared_value_data.size == sizeof(typename ColumnType::ValueType));
auto compared_value = unalignedLoad<typename ColumnType::ValueType>(
compared_value_data.data);
const auto reference_value_data = reference_column->getDataAt(reference_row);
assert(reference_value_data.size == sizeof(typename ColumnType::ValueType));
auto reference_value = unalignedLoad<typename ColumnType::ValueType>(
reference_value_data.data);
// Floats overflow to Inf and the comparison will work normally, so we don't
// have to do anything.
if (offset_is_preceding)
{
reference_value -= offset;
}
else
{
reference_value += offset;
}
const auto result = compared_value < reference_value ? -1
: compared_value == reference_value ? 0 : 1;
// fmt::print(stderr, "compared {}, offset {}, reference {}, result {}\n",
// compared_value, offset, reference_value, result);
return result;
}
// Helper macros to dispatch on type of the ORDER BY column
#define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \
else if (typeid_cast<const TYPE *>(column)) \
@ -114,14 +164,20 @@ if (false) /* NOLINT */ \
{ \
/* Do nothing, a starter condition. */ \
} \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int64>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt64>) \
\
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int8>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int16>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int32>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int64>) \
APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int128>) \
\
APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector<Float32>) \
APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector<Float64>) \
\
else \
{ \
throw Exception(ErrorCodes::NOT_IMPLEMENTED, \
@ -193,9 +249,28 @@ WindowTransform::WindowTransform(const Block & input_header_,
== WindowFrame::BoundaryType::Offset))
{
assert(order_by_indices.size() == 1);
const IColumn * column = input_header.getByPosition(
order_by_indices[0]).column.get();
const auto & entry = input_header.getByPosition(order_by_indices[0]);
const IColumn * column = entry.column.get();
APPLY_FOR_TYPES(compareValuesWithOffset)
// Check that the offset type matches the window type.
// Convert the offsets to the ORDER BY column type. We can't just check
// that it matches, because e.g. the int literals are always (U)Int64,
// but the column might be Int8 and so on.
if (window_description.frame.begin_type
== WindowFrame::BoundaryType::Offset)
{
window_description.frame.begin_offset = convertFieldToTypeOrThrow(
window_description.frame.begin_offset,
*entry.type);
}
if (window_description.frame.end_type
== WindowFrame::BoundaryType::Offset)
{
window_description.frame.end_offset = convertFieldToTypeOrThrow(
window_description.frame.end_offset,
*entry.type);
}
}
}
@ -391,7 +466,7 @@ void WindowTransform::advanceFrameStartRowsOffset()
{
// Just recalculate it each time by walking blocks.
const auto [moved_row, offset_left] = moveRowNumber(current_row,
window_description.frame.begin_offset
window_description.frame.begin_offset.get<UInt64>()
* (window_description.frame.begin_preceding ? -1 : 1));
frame_start = moved_row;
@ -638,7 +713,7 @@ void WindowTransform::advanceFrameEndRowsOffset()
// Walk the specified offset from the current row. The "+1" is needed
// because the frame_end is a past-the-end pointer.
const auto [moved_row, offset_left] = moveRowNumber(current_row,
window_description.frame.end_offset
window_description.frame.end_offset.get<UInt64>()
* (window_description.frame.end_preceding ? -1 : 1)
+ 1);
@ -852,14 +927,14 @@ void WindowTransform::writeOutCurrentRow()
for (size_t wi = 0; wi < workspaces.size(); ++wi)
{
auto & ws = workspaces[wi];
IColumn * result_column = block.output_columns[wi].get();
if (ws.window_function_impl)
{
ws.window_function_impl->windowInsertResultInto(*result_column, this);
ws.window_function_impl->windowInsertResultInto(this, wi);
}
else
{
IColumn * result_column = block.output_columns[wi].get();
const auto * a = ws.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
// FIXME does it also allocate the result on the arena?
@ -1280,8 +1355,11 @@ struct WindowFunctionRank final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->peer_group_start_row_number);
}
@ -1297,8 +1375,11 @@ struct WindowFunctionDenseRank final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->peer_group_number);
}
@ -1314,13 +1395,123 @@ struct WindowFunctionRowNumber final : public WindowFunction
DataTypePtr getReturnType() const override
{ return std::make_shared<DataTypeUInt64>(); }
void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
IColumn & to = *transform->blockAt(transform->current_row)
.output_columns[function_index];
assert_cast<ColumnUInt64 &>(to).getData().push_back(
transform->current_row_number);
}
};
// ClickHouse-specific variant of lag/lead that respects the window frame.
template <bool is_lead>
struct WindowFunctionLagLeadInFrame final : public WindowFunction
{
/// Validates the signature: value[, offset[, default]].
/// - no function parameters allowed;
/// - at least 1 and at most 3 arguments;
/// - the offset argument (2nd) must be an integer type;
/// - the default argument (3rd) must share a least supertype with the value argument.
WindowFunctionLagLeadInFrame(const std::string & name_,
const DataTypes & argument_types_, const Array & parameters_)
: WindowFunction(name_, argument_types_, parameters_)
{
if (!parameters.empty())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} cannot be parameterized", name_);
}
if (argument_types.empty())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} takes at least one argument", name_);
}
if (argument_types.size() == 1)
{
return;
}
/// The type's default value is used as a cheap probe for the underlying Field type.
if (!isInt64FieldType(argument_types[1]->getDefault().getType()))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Offset must be an integer, '{}' given",
argument_types[1]->getName());
}
if (argument_types.size() == 2)
{
return;
}
if (!getLeastSupertype({argument_types[0], argument_types[2]}))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"The default value type '{}' is not convertible to the argument type '{}'",
argument_types[2]->getName(),
argument_types[0]->getName());
}
if (argument_types.size() > 3)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function '{}' accepts at most 3 arguments, {} given",
name, argument_types.size());
}
}
/// The result has the same type as the value argument.
DataTypePtr getReturnType() const override
{ return argument_types[0]; }
/// Writes the lag/lead value for the current row into this function's output column.
/// Unlike standard lag/lead, the looked-up row must lie inside the window frame;
/// otherwise the default (3rd argument) or the type's default value is inserted.
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
const auto & current_block = transform->blockAt(transform->current_row);
IColumn & to = *current_block.output_columns[function_index];
const auto & workspace = transform->workspaces[function_index];
/// Offset defaults to 1; when given, it is read per-row and must be nonnegative.
int offset = 1;
if (argument_types.size() > 1)
{
offset = (*current_block.input_columns[
workspace.argument_column_indices[1]])[
transform->current_row.row].get<Int64>();
if (offset < 0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"The offset for function {} must be nonnegative, {} given",
getName(), offset);
}
}
/// Walk forward for lead, backward for lag; offset_left != 0 means we ran off the data.
const auto [target_row, offset_left] = transform->moveRowNumber(
transform->current_row, offset * (is_lead ? 1 : -1));
if (offset_left != 0
|| target_row < transform->frame_start
|| transform->frame_end <= target_row)
{
// Offset is outside the frame.
if (argument_types.size() > 2)
{
// Column with default values is specified.
to.insertFrom(*current_block.input_columns[
workspace.argument_column_indices[2]],
transform->current_row.row);
}
else
{
to.insertDefault();
}
}
else
{
// Offset is inside the frame.
to.insertFrom(*transform->blockAt(target_row).input_columns[
workspace.argument_column_indices[0]],
target_row.row);
}
}
};
void registerWindowFunctions(AggregateFunctionFactory & factory)
{
// Why didn't I implement lag/lead yet? Because they are a mess. I imagine
@ -1332,9 +1523,10 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
// the whole partition like Postgres does, because using a linear amount
// of additional memory is not an option when we have a lot of data. We must
// be able to process at least the lag/lead in streaming fashion.
// Our best bet is probably rewriting, say `lag(value, offset)` to
// `any(value) over (rows between offset preceding and offset preceding)`,
// at the query planning stage.
// A partial solution for constant offsets is rewriting, say `lag(value, offset)
// to `any(value) over (rows between offset preceding and offset preceding)`.
// We also implement non-standard functions `lag/leadInFrame`, that are
// analogous to `lag/lead`, but respect the frame.
// Functions like cume_dist() do require materializing the entire
// partition, but it's probably also simpler to implement them by rewriting
// to a (rows between unbounded preceding and unbounded following) frame,
@ -1360,6 +1552,20 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
return std::make_shared<WindowFunctionRowNumber>(name, argument_types,
parameters);
});
factory.registerFunction("lagInFrame", [](const std::string & name,
const DataTypes & argument_types, const Array & parameters)
{
return std::make_shared<WindowFunctionLagLeadInFrame<false>>(
name, argument_types, parameters);
});
factory.registerFunction("leadInFrame", [](const std::string & name,
const DataTypes & argument_types, const Array & parameters)
{
return std::make_shared<WindowFunctionLagLeadInFrame<true>>(
name, argument_types, parameters);
});
}
}

View File

@ -110,7 +110,9 @@ public:
Status prepare() override;
void work() override;
private:
/*
* Implementation details.
*/
void advancePartitionEnd();
bool arePeers(const RowNumber & x, const RowNumber & y) const;
@ -321,10 +323,7 @@ public:
int (* compare_values_with_offset) (
const IColumn * compared_column, size_t compared_row,
const IColumn * reference_column, size_t reference_row,
// We can make it a Field later if we need the Decimals. Now we only
// have ints and datetime, and the underlying Field type for them is
// uint64_t anyway.
uint64_t offset,
const Field & offset,
bool offset_is_preceding);
};

Some files were not shown because too many files have changed in this diff Show More