Merge upstream/master into global-merge-executor (using imerge)

Nikita Mikhaylov 2021-09-03 12:29:48 +00:00
commit 218f0adca7
24 changed files with 285 additions and 152 deletions

View File

@ -29,6 +29,9 @@ if (NOT USE_INTERNAL_ZLIB_LIBRARY)
endif ()
if (NOT ZLIB_FOUND AND NOT MISSING_INTERNAL_ZLIB_LIBRARY)
# https://github.com/zlib-ng/zlib-ng/pull/733
# This is disabled by default
add_compile_definitions(Z_TLS=__thread)
set (USE_INTERNAL_ZLIB_LIBRARY 1)
set (ZLIB_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/${INTERNAL_ZLIB_NAME}" "${ClickHouse_BINARY_DIR}/contrib/${INTERNAL_ZLIB_NAME}" CACHE INTERNAL "") # generated zconf.h
set (ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR}) # for poco
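For context on the `Z_TLS=__thread` definition above: per the linked zlib-ng pull request, the library can keep some of its internal state in thread-local storage, and `Z_TLS` selects the storage specifier. A minimal sketch of what such a definition amounts to in C++ (the variable name below is illustrative, not zlib-ng's):

```cpp
// Hypothetical illustration of a Z_TLS=__thread build definition.
#define Z_TLS __thread

// Without Z_TLS this buffer would be shared by every thread; with it, each
// thread gets its own copy, avoiding data races inside the library.
static Z_TLS unsigned char scratch_buffer[1024];
```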

View File

@ -26,17 +26,17 @@ The supported formats are:
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -464,7 +464,7 @@ ClickHouse supports [NULL](../sql-reference/syntax.md), which is displayed as `n
- [JSONEachRow](#jsoneachrow) format
- [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows) setting
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
Differs from JSON only in that data fields are output in strings, not in typed JSON values.
@ -541,7 +541,7 @@ Result:
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
Differs from JSON only in that data rows are output in arrays, not in objects.
@ -580,7 +580,7 @@ Example:
```
```
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -614,7 +614,7 @@ Example:
## JSONEachRow {#jsoneachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
When using these formats, ClickHouse outputs rows as separated, newline-delimited JSON values, but the data as a whole is not valid JSON.
@ -639,9 +639,9 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
Differs from `JSONCompactEachRow`/`JSONCompactStringEachRow` in that the column names and types are written as the first two rows.
Differs from `JSONCompactEachRow`/`JSONCompactStringsEachRow` in that the column names and types are written as the first two rows.
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -390,12 +390,12 @@ This section contains the following parameters:
## keep_alive_timeout {#keep-alive-timeout}
The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 3 seconds.
The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 10 seconds.
**Example**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}
@ -535,7 +535,7 @@ Possible values:
- Positive double.
- 0 — The ClickHouse server can use all available RAM.
Default value: `0`.
Default value: `0.9`.
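Assuming this ratio is multiplied by the total RAM of the host to cap server memory usage (the setting name is not visible in this hunk), the new `0.9` default works out as in the sketch below:

```cpp
// Hypothetical arithmetic for the new 0.9 default on a 64 GiB host.
constexpr double total_ram_gib = 64.0;
constexpr double ratio = 0.9;                             // new default
constexpr double memory_cap_gib = total_ram_gib * ratio;  // 57.6 GiB
```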
**Usage**

View File

@ -284,12 +284,12 @@ ClickHouseサーバー間でデータを交換するポート。
## keep_alive_timeout {#keep-alive-timeout}
ClickHouseが接続を閉じる前に受信要求を待機する秒数。 既定値は3秒です。
ClickHouseが接続を閉じる前に受信要求を待機する秒数。 既定値は10秒です。
**例**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}

View File

@ -25,17 +25,17 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringEachRow](#jsonstringeachrow) | ✔ | ✔ |
| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -442,7 +442,7 @@ ClickHouse поддерживает [NULL](../sql-reference/syntax.md), кото
- Формат [JSONEachRow](#jsoneachrow)
- Настройка [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows)
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
Отличается от JSON только тем, что поля данных выводятся в строках, а не в типизированных значениях JSON.
@ -519,7 +519,7 @@ SELECT * FROM json_as_string;
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
Отличается от JSON только тем, что строчки данных выводятся в массивах, а не в object-ах.
@ -558,7 +558,7 @@ SELECT * FROM json_as_string;
```
```json
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -590,9 +590,9 @@ SELECT * FROM json_as_string;
```
## JSONEachRow {#jsoneachrow}
## JSONStringEachRow {#jsonstringeachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
При использовании этих форматов ClickHouse выводит каждую запись как значения JSON (каждое значение отдельной строкой), при этом данные в целом — невалидный JSON.
@ -605,9 +605,9 @@ SELECT * FROM json_as_string;
При вставке данных вы должны предоставить отдельное значение JSON для каждой строки.
## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
Отличается от `JSONEachRow`/`JSONStringEachRow` тем, что ClickHouse будет выдавать информацию о ходе выполнения в виде значений JSON.
Отличается от `JSONEachRow`/`JSONStringsEachRow` тем, что ClickHouse будет выдавать информацию о ходе выполнения в виде значений JSON.
```json
{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
@ -617,9 +617,9 @@ SELECT * FROM json_as_string;
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
Отличается от `JSONCompactEachRow`/`JSONCompactStringEachRow` тем, что имена и типы столбцов записываются как первые две строки.
Отличается от `JSONCompactEachRow`/`JSONCompactStringsEachRow` тем, что имена и типы столбцов записываются как первые две строки.
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -371,12 +371,12 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
## keep_alive_timeout {#keep-alive-timeout}
Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение.
Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение. Значение по умолчанию: 10 секунд.
**Пример**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}
@ -516,7 +516,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
- Положительное число с плавающей запятой.
- 0 — сервер ClickHouse может использовать всю оперативную память.
Значение по умолчанию: `0`.
Значение по умолчанию: `0.9`.
**Использование**

View File

@ -26,17 +26,17 @@ ClickHouse可以接受和返回各种格式的数据。受支持的输入格式
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -465,7 +465,7 @@ ClickHouse支持[NULL](../sql-reference/syntax.md), 在JSON输出中显示为`nu
- [JSONEachRow](#jsoneachrow)格式
- [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows)设置
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
与JSON的不同之处在于数据字段以字符串输出而不是以类型化JSON值输出。
@ -543,7 +543,7 @@ SELECT * FROM json_as_string;
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
与JSON格式不同的是它以数组的方式输出结果而不是以结构体。
@ -582,7 +582,7 @@ SELECT * FROM json_as_string;
```
```json
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -614,9 +614,9 @@ SELECT * FROM json_as_string;
```
## JSONEachRow {#jsoneachrow}
## JSONStringEachRow {#jsonstringeachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
使用这些格式时ClickHouse会将行输出为用换行符分隔的JSON值这些输出数据作为一个整体时由于没有分隔符(,)因而不是有效的JSON文档。
@ -629,9 +629,9 @@ SELECT * FROM json_as_string;
在插入数据时应该为每一行提供一个单独的JSON值。
## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
与`JSONEachRow`/`JSONStringEachRow`不同的是ClickHouse还将生成作为JSON值的进度信息。
与`JSONEachRow`/`JSONStringsEachRow`不同的是ClickHouse还将生成作为JSON值的进度信息。
```json
{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
@ -641,9 +641,9 @@ SELECT * FROM json_as_string;
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
与`JSONCompactEachRow`/`JSONCompactStringEachRow`不同的是,列名和类型被写入前两行。
与`JSONCompactEachRow`/`JSONCompactStringsEachRow`不同的是,列名和类型被写入前两行。
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -282,12 +282,12 @@ ClickHouse每x秒重新加载内置字典。 这使得编辑字典 “on the fly
## keep_alive_timeout {#keep-alive-timeout}
ClickHouse在关闭连接之前等待传入请求的秒数。 默认为3秒。
ClickHouse在关闭连接之前等待传入请求的秒数。默认为10秒。
**示例**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}

View File

@ -142,8 +142,22 @@ private:
struct ChangelogReadResult
{
uint64_t entries_read;
/// Total entries read from log, including skipped ones.
/// Useful when we decide to continue to write into the same log and want to know
/// how many entries were already written to it.
uint64_t total_entries_read_from_log;
/// First index in log
uint64_t log_start_index;
/// First entry actually read from the log (not including skipped entries)
uint64_t first_read_index;
/// Last entry read from the log (the last entry in the log).
/// When nothing is skipped, last_read_index - first_read_index + 1 == total_entries_read_from_log,
/// but entries from the start of the log may be skipped when they are not required.
uint64_t last_read_index;
/// Last offset we were able to read from the log
off_t last_position;
bool error;
};
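A minimal sketch of how the distinction between in-memory entries and `total_entries_read_from_log` might be used when deciding whether the last log file still has free space (a simplified stand-in, not the real ClickHouse types):

```cpp
#include <cstdint>

// Simplified stand-in for the ChangelogReadResult described above.
struct ReadResultSketch
{
    uint64_t total_entries_read_from_log = 0; // includes skipped entries
    uint64_t first_read_index = 0;            // first entry kept in memory
    uint64_t last_read_index = 0;             // last entry read from the file
    bool error = false;
};

// A changelog file is complete once it holds as many entries as the rotate
// interval allows; otherwise the writer may keep appending to it.
bool lastLogIsComplete(const ReadResultSketch & result, uint64_t expected_entries_in_log)
{
    return result.total_entries_read_from_log >= expected_entries_in_log;
}
```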
@ -156,6 +170,7 @@ public:
, read_buf(filepath)
{}
/// start_log_index -- all entries with index < start_log_index will be skipped, but still accounted for in total_entries_read_from_log
ChangelogReadResult readChangelog(IndexToLogEntry & logs, uint64_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log)
{
uint64_t previous_index = 0;
@ -210,7 +225,7 @@ public:
if (logs.count(record.header.index) != 0)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath);
result.entries_read += 1;
result.total_entries_read_from_log += 1;
/// Read but skip this entry because our state is already fresher
if (record.header.index < start_log_index)
@ -224,9 +239,10 @@ public:
/// Put it into in memory structure
logs.emplace(record.header.index, log_entry);
index_to_offset[record.header.index] = result.last_position;
result.last_read_index = record.header.index;
if (result.entries_read % 50000 == 0)
LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.entries_read);
if (result.total_entries_read_from_log % 50000 == 0)
LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.total_entries_read_from_log);
}
}
catch (const Exception & ex)
@ -243,7 +259,7 @@ public:
tryLogCurrentException(log);
}
LOG_TRACE(log, "Totally read from changelog {} {} entries", filepath, result.entries_read);
LOG_TRACE(log, "Totally read from changelog {} {} entries", filepath, result.total_entries_read_from_log);
return result;
}
@ -280,16 +296,10 @@ Changelog::Changelog(
void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep)
{
uint64_t total_read = 0;
std::optional<ChangelogReadResult> last_log_read_result;
/// Amount of entries in last log index
uint64_t entries_in_last = 0;
/// Log idx of the first incomplete log (key in existing_changelogs)
int64_t first_incomplete_log_start_index = -1; /// if -1 then no incomplete log exists
ChangelogReadResult result{};
/// First log index which was read from all changelogs
uint64_t first_read_index = 0;
/// Last log has some free space to write
bool last_log_is_not_complete = false;
/// We must start to read from this log index
uint64_t start_to_read_from = last_commited_log_index;
@ -300,19 +310,14 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
else
start_to_read_from = 1;
/// At least we read something
bool started = false;
/// Go through changelog files in order of start_index
for (const auto & [changelog_start_index, changelog_description] : existing_changelogs)
{
/// How many entries we have in the last changelog
entries_in_last = changelog_description.expectedEntriesCountInLog();
/// [from_log_index.>=.......start_to_read_from.....<=.to_log_index]
if (changelog_description.to_log_index >= start_to_read_from)
{
if (!started) /// still nothing was read
if (!last_log_read_result) /// still nothing was read
{
/// Our first log starts from a fresher log_id than the one we were required to read, and this changelog is not an empty log.
/// So we are missing something in our logs, but it's not data loss: we will receive a snapshot and the required
@ -320,8 +325,12 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
if (changelog_description.from_log_index > last_commited_log_index && (changelog_description.from_log_index - last_commited_log_index) > 1)
{
LOG_ERROR(log, "Some records was lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index);
first_incomplete_log_start_index = changelog_start_index;
break;
/// Nothing to do with our fresher logs: the leader will overwrite them, so remove everything and just start from last_commited_index
removeAllLogs();
min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
rotate(max_log_id + 1);
return;
}
else if (changelog_description.from_log_index > start_to_read_from)
{
@ -332,69 +341,100 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
ChangelogReader reader(changelog_description.path);
result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log);
started = true;
last_log_read_result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log);
/// Otherwise we have already initialized it
if (first_read_index == 0)
first_read_index = result.first_read_index;
if (min_log_id == 0)
min_log_id = last_log_read_result->first_read_index;
total_read += result.entries_read;
if (last_log_read_result->last_read_index != 0)
max_log_id = last_log_read_result->last_read_index;
last_log_read_result->log_start_index = changelog_description.from_log_index;
/// How many entries we have in the last changelog
uint64_t expected_entries_in_log = changelog_description.expectedEntriesCountInLog();
/// May happen after truncate, crash or simply unfinished log
if (result.entries_read < entries_in_last)
if (last_log_read_result->total_entries_read_from_log < expected_entries_in_log)
{
first_incomplete_log_start_index = changelog_start_index;
last_log_is_not_complete = true;
break;
}
}
}
if (first_read_index != 0)
start_index = first_read_index;
else /// We just may have no logs (only snapshot)
start_index = last_commited_log_index;
/// Found some broken or unfinished logs.
/// We have to remove the broken data and continue to write into the incomplete log.
if (first_incomplete_log_start_index != -1) /// otherwise all logs completed so just start a new one
/// We can have an empty log (with zero entries), and last_log_read_result will still be initialized
if (!last_log_read_result || min_log_id == 0) /// We just may have no logs (only snapshot or nothing)
{
auto start_remove_from = existing_changelogs.begin();
if (started)
start_remove_from = existing_changelogs.upper_bound(first_incomplete_log_start_index);
/// Just to be sure they don't exist
removeAllLogs();
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = start_remove_from; itr != existing_changelogs.end();)
min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
}
else if (last_log_is_not_complete) /// if it's complete just start new one
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
assert(last_log_read_result != std::nullopt);
/// Actually they shouldn't exist, but to be sure we remove them
removeAllLogsAfter(last_log_read_result->log_start_index);
assert(!existing_changelogs.empty());
assert(existing_changelogs.find(last_log_read_result->log_start_index)->first == existing_changelogs.rbegin()->first);
/// Continue to write into incomplete existing log
auto description = existing_changelogs[last_log_read_result->log_start_index];
if (last_log_read_result->error)
initWriter(description, last_log_read_result->total_entries_read_from_log, /* truncate_to_offset = */ last_log_read_result->last_position);
else
initWriter(description, last_log_read_result->total_entries_read_from_log);
}
/// Continue to write into existing log
if (!existing_changelogs.empty())
/// Start new log if we don't initialize writer from previous log. All logs can be "complete".
if (!current_writer)
rotate(max_log_id + 1);
}
void Changelog::initWriter(const ChangelogFileDescription & description, uint64_t entries_already_written, std::optional<uint64_t> truncate_to_offset)
{
auto description = existing_changelogs.rbegin()->second;
if (description.expectedEntriesCountInLog() != rotate_interval)
LOG_TRACE(log, "Looks like rotate_logs_interval was changed, current {}, expected entries in last log {}", rotate_interval, description.expectedEntriesCountInLog());
LOG_TRACE(log, "Continue to write into {}", description.path);
current_writer = std::make_unique<ChangelogWriter>(description.path, WriteMode::Append, description.from_log_index);
current_writer->setEntriesWritten(result.entries_read);
current_writer->setEntriesWritten(entries_already_written);
/// Truncate all broken entries from log
if (result.error)
if (truncate_to_offset)
{
LOG_WARNING(log, "Read finished with error, truncating all broken log entries");
current_writer->truncateToLength(result.last_position);
}
LOG_WARNING(log, "Changelog {} contain broken enties, truncating all broken log entries", description.path);
current_writer->truncateToLength(*truncate_to_offset);
}
}
/// Start new log if we don't initialize writer from previous log
if (!current_writer)
rotate(start_index + total_read);
void Changelog::removeAllLogsAfter(uint64_t start_to_remove_from_id)
{
auto start_to_remove_from = existing_changelogs.upper_bound(start_to_remove_from_id);
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = start_to_remove_from; itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
}
void Changelog::removeAllLogs()
{
LOG_WARNING(log, "Removing all changelogs");
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
}
void Changelog::rotate(uint64_t new_start_log_index)
@ -439,7 +479,7 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records");
if (logs.empty())
start_index = index;
min_log_id = index;
const auto & current_changelog_description = existing_changelogs[current_writer->getStartIndex()];
const bool log_is_complete = current_writer->getEntriesWritten() == current_changelog_description.expectedEntriesCountInLog();
@ -452,6 +492,7 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index);
logs[index] = makeClone(log_entry);
max_log_id = index;
}
void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
@ -513,11 +554,30 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
void Changelog::compact(uint64_t up_to_log_index)
{
LOG_INFO(log, "Compact logs up to log index {}, our max log id is {}", up_to_log_index, max_log_id);
bool remove_all_logs = false;
if (up_to_log_index > max_log_id)
{
LOG_INFO(log, "Seems like this node recovers from leaders snapshot, removing all logs");
/// If we received snapshot from leader we may compact up to more fresh log
max_log_id = up_to_log_index;
remove_all_logs = true;
}
bool need_rotate = false;
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
/// Remove all completely outdated changelog files
if (itr->second.to_log_index <= up_to_log_index)
if (remove_all_logs || itr->second.to_log_index <= up_to_log_index)
{
if (current_writer && itr->second.from_log_index == current_writer->getStartIndex())
{
LOG_INFO(log, "Trying to remove log {} which is current active log for write. Possibly this node recovers from snapshot", itr->second.path);
need_rotate = true;
current_writer.reset();
}
LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path);
std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; });
std::filesystem::remove(itr->second.path);
@ -526,8 +586,14 @@ void Changelog::compact(uint64_t up_to_log_index)
else /// Files are ordered, so all subsequent should exist
break;
}
start_index = up_to_log_index + 1;
/// Compaction from the past is possible, so don't make our min_log_id smaller.
min_log_id = std::max(min_log_id, up_to_log_index + 1);
std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; });
if (need_rotate)
rotate(up_to_log_index + 1);
LOG_INFO(log, "Compaction up to {} finished new min index {}, new max index {}", up_to_log_index, min_log_id, max_log_id);
}
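The index bookkeeping in `compact` can be summarized by a small sketch (an assumed simplification of the logic above, not the real implementation): compaction past `max_log_id`, e.g. after installing a leader snapshot, empties the range, and compaction from the past never moves `min_log_id` backwards.

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of how compaction moves the [min_log_id, max_log_id] range.
void compactRange(uint64_t & min_log_id, uint64_t & max_log_id, uint64_t up_to_log_index)
{
    if (up_to_log_index > max_log_id)
        max_log_id = up_to_log_index;  // leader snapshot covers newer entries than we have

    // Compaction from the past must not make the range start smaller.
    min_log_id = std::max(min_log_id, up_to_log_index + 1);
}
```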
LogEntryPtr Changelog::getLastEntry() const
@ -535,10 +601,11 @@ LogEntryPtr Changelog::getLastEntry() const
/// This entry is treated in a special way by NuRaft
static LogEntryPtr fake_entry = nuraft::cs_new<nuraft::log_entry>(0, nuraft::buffer::alloc(sizeof(uint64_t)));
const uint64_t next_index = getNextEntryIndex() - 1;
auto entry = logs.find(next_index);
auto entry = logs.find(max_log_id);
if (entry == logs.end())
{
return fake_entry;
}
return entry->second;
}

View File

@ -2,6 +2,7 @@
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <city.h>
#include <optional>
#include <IO/WriteBufferFromFile.h>
#include <IO/HashingWriteBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
@ -87,12 +88,12 @@ public:
uint64_t getNextEntryIndex() const
{
return start_index + logs.size();
return max_log_id + 1;
}
uint64_t getStartIndex() const
{
return start_index;
return min_log_id;
}
/// Last entry in log, or fake entry with term 0 if log is empty
@ -128,6 +129,13 @@ private:
/// Starts new file [new_start_log_index, new_start_log_index + rotate_interval]
void rotate(uint64_t new_start_log_index);
/// Remove all changelogs from disk with start_index bigger than start_to_remove_from_id
void removeAllLogsAfter(uint64_t start_to_remove_from_id);
/// Remove all logs from disk
void removeAllLogs();
/// Init writer for existing log with some entries already written
void initWriter(const ChangelogFileDescription & description, uint64_t entries_already_written, std::optional<uint64_t> truncate_to_offset = {});
private:
const std::string changelogs_dir;
const uint64_t rotate_interval;
@ -144,7 +152,9 @@ private:
/// Mapping log_id -> log_entry
IndexToLogEntry logs;
/// Start log_id which exists in all "active" logs
uint64_t start_index = 0;
/// max_log_id + 1 == min_log_id means empty log storage for NuRaft
uint64_t min_log_id = 0;
uint64_t max_log_id = 0;
};
}
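The accessors above imply a simple arithmetic over `min_log_id`/`max_log_id`; a self-contained sketch of that invariant (the names mirror the header, the helper type itself is illustrative):

```cpp
#include <cstdint>

// Illustrative helper showing the index arithmetic implied by the header above.
struct LogRangeSketch
{
    uint64_t min_log_id = 0;  // first entry kept in storage
    uint64_t max_log_id = 0;  // last entry kept in storage

    uint64_t getStartIndex() const { return min_log_id; }
    uint64_t getNextEntryIndex() const { return max_log_id + 1; }

    // NuRaft treats the store as empty when the next slot equals the start index.
    bool empty() const { return getNextEntryIndex() == getStartIndex(); }
    uint64_t size() const { return empty() ? 0 : max_log_id - min_log_id + 1; }
};
```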

View File

@ -293,10 +293,12 @@ void KeeperDispatcher::shutdown()
if (session_cleaner_thread.joinable())
session_cleaner_thread.join();
/// FIXME not the best way to notify
if (requests_queue)
{
requests_queue->push({});
if (request_thread.joinable())
request_thread.join();
}
responses_queue.push({});
if (responses_thread.joinable())
@ -313,7 +315,7 @@ void KeeperDispatcher::shutdown()
KeeperStorage::RequestForSession request_for_session;
/// Set session expired for all pending requests
while (requests_queue->tryPop(request_for_session))
while (requests_queue && requests_queue->tryPop(request_for_session))
{
if (request_for_session.request)
{

View File

@ -404,6 +404,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
/// And we are able to read it
DB::KeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(7, 0);
EXPECT_EQ(changelog_reader.size(), 1);
EXPECT_EQ(changelog_reader.start_index(), 7);
EXPECT_EQ(changelog_reader.next_slot(), 8);
@ -1317,7 +1318,7 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
}
EXPECT_TRUE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin"));
DB::KeeperLogStore changelog_1("./logs", 10, true);
changelog_1.init(0, 50);
@ -1330,8 +1331,8 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
changelog_1.end_of_append_batch(0, 0);
}
EXPECT_TRUE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_101_110.bin"));
DB::KeeperLogStore changelog_2("./logs", 7, true);
changelog_2.init(98, 55);
@ -1346,11 +1347,11 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
changelog_2.compact(105);
EXPECT_FALSE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_110_116.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_117_123.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_124_130.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_1_100.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_101_110.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_111_117.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_118_124.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_125_131.bin"));
DB::KeeperLogStore changelog_3("./logs", 5, true);
changelog_3.init(116, 3);
@ -1364,14 +1365,14 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
changelog_3.compact(125);
EXPECT_FALSE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_110_116.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_117_123.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_101_110.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_111_117.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_118_124.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_124_130.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_131_135.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_136_140.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_141_145.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_125_131.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_132_136.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_137_141.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_142_146.bin"));
}
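The renamed files in the expectations above follow a 1-based naming scheme: a changelog starting at index `start` with `rotate_interval` entries appears to cover `[start, start + rotate_interval - 1]`. A small sketch of that assumption:

```cpp
#include <cstdint>
#include <string>

// Hypothetical helper reproducing the file-name pattern used in the test.
std::string changelogName(uint64_t start, uint64_t rotate_interval)
{
    return "changelog_" + std::to_string(start) + "_"
        + std::to_string(start + rotate_interval - 1) + ".bin";
}

// changelogName(1, 100)  -> "changelog_1_100.bin"   (previously changelog_0_99.bin)
// changelogName(101, 10) -> "changelog_101_110.bin" (previously changelog_100_109.bin)
```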

View File

@ -12,6 +12,7 @@ namespace DB
namespace ErrorCodes
{
extern const int BZIP2_STREAM_DECODER_FAILED;
extern const int UNEXPECTED_END_OF_FILE;
}
@ -90,6 +91,12 @@ bool Bzip2ReadBuffer::nextImpl()
"bzip2 stream decoder failed: error code: {}",
ret);
if (in->eof())
{
eof = true;
throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of bzip2 archive");
}
return true;
}
}
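The new `UNEXPECTED_END_OF_FILE` branch guards against a truncated archive: the decoder keeps returning `BZ_OK` while the underlying stream is already exhausted. A hedged sketch of the same detection using the low-level bzlib API directly (not the ClickHouse buffer classes):

```cpp
#include <bzlib.h>
#include <stdexcept>
#include <vector>

// Decompress a whole bz2 buffer; throw if the archive ends before BZ_STREAM_END.
std::vector<char> decompressBz2(const std::vector<char> & compressed)
{
    bz_stream strm{};
    if (BZ2_bzDecompressInit(&strm, /*verbosity=*/0, /*small=*/0) != BZ_OK)
        throw std::runtime_error("bzip2 decoder init failed");

    std::vector<char> out;
    char chunk[4096];
    strm.next_in = const_cast<char *>(compressed.data());
    strm.avail_in = static_cast<unsigned>(compressed.size());

    int ret = BZ_OK;
    while (ret != BZ_STREAM_END)
    {
        strm.next_out = chunk;
        strm.avail_out = sizeof(chunk);
        ret = BZ2_bzDecompress(&strm);
        if (ret != BZ_OK && ret != BZ_STREAM_END)
        {
            BZ2_bzDecompressEnd(&strm);
            throw std::runtime_error("bzip2 stream decoder failed");
        }
        out.insert(out.end(), chunk, chunk + (sizeof(chunk) - strm.avail_out));

        // The stream has not ended, the decoder cannot continue without new
        // input, and the input is gone: the archive is truncated -- the case
        // the diff above now reports instead of spinning forever.
        if (ret == BZ_OK && strm.avail_in == 0 && strm.avail_out != 0)
        {
            BZ2_bzDecompressEnd(&strm);
            throw std::runtime_error("Unexpected end of bzip2 archive");
        }
    }

    BZ2_bzDecompressEnd(&strm);
    return out;
}
```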

View File

@ -2,7 +2,7 @@
# include <Common/config.h>
#endif
#if USE_BROTLI
#if USE_BZIP2
# include <IO/Bzip2WriteBuffer.h>
# include <bzlib.h> // Y_IGNORE

View File

@ -4,6 +4,7 @@
#include <IO/BufferWithOwnMemory.h>
#include <IO/CompressionMethod.h>
#include <zlib.h>

View File

@ -4,6 +4,7 @@
#include <IO/BufferWithOwnMemory.h>
#include <IO/CompressionMethod.h>
#include <zlib.h>

View File

@ -1,6 +1,7 @@
#include <unistd.h>
#include <vector>
#include <stdexcept>
#include <zlib.h>
#pragma GCC diagnostic ignored "-Wold-style-cast"

View File

@ -192,7 +192,7 @@ struct SocketInterruptablePollWrapper
KeeperTCPHandler::KeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_)
: Poco::Net::TCPServerConnection(socket_)
, server(server_)
, log(&Poco::Logger::get("NuKeeperTCPHandler"))
, log(&Poco::Logger::get("KeeperTCPHandler"))
, global_context(Context::createCopy(server.context()))
, keeper_dispatcher(global_context->getKeeperDispatcher())
, operation_timeout(0, global_context->getConfigRef().getUInt("keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)

View File

@ -46,7 +46,15 @@ ReplicatedMergeTreeTableMetadata::ReplicatedMergeTreeTableMetadata(const MergeTr
primary_key = formattedAST(metadata_snapshot->getPrimaryKey().expression_list_ast);
if (metadata_snapshot->isPrimaryKeyDefined())
sorting_key = formattedAST(metadata_snapshot->getSortingKey().expression_list_ast);
{
/// We don't use the preparsed AST `sorting_key.expression_list_ast` because
/// it contains the version column for VersionedCollapsingMergeTree, which
/// is not stored in ZooKeeper for compatibility reasons. So the most
/// compatible way is just to convert definition_ast to a list and
/// serialize it. In all other places key.expression_list_ast should be
/// used.
sorting_key = formattedAST(extractKeyExpressionList(metadata_snapshot->getSortingKey().definition_ast));
}
data_format_version = data.format_version;
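A hedged illustration of the compatibility issue described in the comment above (the function below is a made-up helper, not a ClickHouse API): for VersionedCollapsingMergeTree the in-memory sorting key ends with the version column, while the key serialized to ZooKeeper must stay the user-visible column list only.

```cpp
#include <string>
#include <vector>

// Strip the implicitly appended version column before serializing the key.
std::vector<std::string> userVisibleSortingKey(std::vector<std::string> in_memory_key,
                                               const std::string & version_column)
{
    if (!in_memory_key.empty() && in_memory_key.back() == version_column)
        in_memory_key.pop_back();
    return in_memory_key;
}

// userVisibleSortingKey({"d", "order", "version"}, "version") -> {"d", "order"}
```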

View File

@ -7,6 +7,7 @@ import time
import logging
import io
import string
import ast
import avro.schema
import avro.io
@ -2792,7 +2793,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster):
# broken message
"(0,'BAD','AM',0.5,1)",
],
'expected':r'''{"raw_message":"(0,'BAD','AM',0.5,1)","error":"Cannot parse string 'BAD' as UInt16: syntax error at begin of string. Note: there are toUInt16OrZero and toUInt16OrNull functions, which returns zero\/NULL instead of throwing exception.: while executing 'FUNCTION _CAST(assumeNotNull(_dummy_0) :: 2, 'UInt16' :: 1) -> _CAST(assumeNotNull(_dummy_0), 'UInt16') UInt16 : 4'"}''',
'expected':r'''{"raw_message":"(0,'BAD','AM',0.5,1)","error":"Cannot parse string 'BAD' as UInt16: syntax error at begin of string. Note: there are toUInt16OrZero and toUInt16OrNull functions, which returns zero\/NULL instead of throwing exception"}''',
'supports_empty_value': True,
'printable':True,
},
@ -2934,11 +2935,13 @@ def test_kafka_formats_with_broken_message(kafka_cluster):
'''.format(topic_name=topic_name, offset_0=offsets[0], offset_1=offsets[1], offset_2=offsets[2])
# print(('Checking result\n {result} \n expected \n {expected}\n'.format(result=str(result), expected=str(expected))))
assert TSV(result) == TSV(expected), 'Proper result for format: {}'.format(format_name)
errors_result = instance.query('SELECT raw_message, error FROM test.kafka_errors_{format_name}_mv format JSONEachRow'.format(format_name=format_name))
errors_expected = format_opts['expected']
errors_result = ast.literal_eval(instance.query('SELECT raw_message, error FROM test.kafka_errors_{format_name}_mv format JSONEachRow'.format(format_name=format_name)))
errors_expected = ast.literal_eval(format_opts['expected'])
# print(errors_result.strip())
# print(errors_expected.strip())
assert errors_result.strip() == errors_expected.strip(), 'Proper errors for format: {}'.format(format_name)
assert errors_result['raw_message'] == errors_expected['raw_message'], 'Proper raw_message for format: {}'.format(format_name)
# Error text can change, so just check that the expected message is contained in the actual one
assert errors_expected['error'] in errors_result['error'], 'Proper error for format: {}'.format(format_name)
kafka_delete_topic(admin_client, topic_name)
def wait_for_new_data(table_name, prev_count = 0, max_retries = 120):

View File

@ -1,6 +1,6 @@
2019-10-01 a 1 aa 1 1 1
2019-10-01 a 1 aa 1 1 1 0
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order)\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/default/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order)\nSETTINGS index_granularity = 8192
2019-10-01 a 1 aa 1 1 1 0 0
2019-10-02 b 2 bb 2 2 2 1 2
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32,\n `datum` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order, datum)\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32,\n `datum` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/default/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order, datum)\nSETTINGS index_granularity = 8192

View File

@ -12,17 +12,27 @@ CREATE TABLE table_for_alter
`version` UInt64,
`sign` Int8 DEFAULT 1
)
ENGINE = ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/01526_alter_add/t1', '1', sign, version)
ENGINE = ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/{database}/01526_alter_add/t1', '1', sign, version)
PARTITION BY y
ORDER BY d
SETTINGS index_granularity = 8192;
INSERT INTO table_for_alter VALUES(toDate('2019-10-01'), 'a', 1, 'aa', 1, 1, 1);
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
SELECT * FROM table_for_alter;
ALTER TABLE table_for_alter ADD COLUMN order UInt32, MODIFY ORDER BY (d, order);
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
SELECT * FROM table_for_alter;
SHOW CREATE TABLE table_for_alter;
@ -35,4 +45,8 @@ SELECT * FROM table_for_alter ORDER BY d;
SHOW CREATE TABLE table_for_alter;
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
DROP TABLE IF EXISTS table_for_alter;

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
tmp_path=$(mktemp "$CURDIR/02022_bzip2_truncate.XXXXXX.bz2")
trap 'rm -f $tmp_path' EXIT
${CLICKHOUSE_LOCAL} -q "SELECT * FROM numbers(1e6) FORMAT TSV" | bzip2 > "$tmp_path"
truncate -s10000 "$tmp_path"
# just ensure that it will exit eventually
${CLICKHOUSE_LOCAL} -q "SELECT count() FROM file('$tmp_path', 'TSV', 'n UInt64') FORMAT Null" >& /dev/null
exit 0