Merge upstream/master into global-merge-executor (using imerge)

Nikita Mikhaylov 2021-09-03 12:29:48 +00:00
commit 218f0adca7
24 changed files with 285 additions and 152 deletions

View File

@ -29,6 +29,9 @@ if (NOT USE_INTERNAL_ZLIB_LIBRARY)
endif ()
if (NOT ZLIB_FOUND AND NOT MISSING_INTERNAL_ZLIB_LIBRARY)
# https://github.com/zlib-ng/zlib-ng/pull/733
# This is disabled by default
add_compile_definitions(Z_TLS=__thread)
set (USE_INTERNAL_ZLIB_LIBRARY 1)
set (ZLIB_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/${INTERNAL_ZLIB_NAME}" "${ClickHouse_BINARY_DIR}/contrib/${INTERNAL_ZLIB_NAME}" CACHE INTERNAL "") # generated zconf.h
set (ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR}) # for poco
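For context on the `Z_TLS=__thread` definition above: per the linked zlib-ng pull request, the library can keep some of its internal state in thread-local storage, and `Z_TLS` selects the storage specifier. A minimal sketch of what such a definition amounts to in C++ (the variable name below is illustrative, not zlib-ng's):

```cpp
// Hypothetical illustration of a Z_TLS=__thread build definition.
#define Z_TLS __thread

// Without Z_TLS this buffer would be shared by every thread; with it, each
// thread gets its own copy, avoiding data races inside the library.
static Z_TLS unsigned char scratch_buffer[1024];
```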

View File

@ -26,17 +26,17 @@ The supported formats are:
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -464,7 +464,7 @@ ClickHouse supports [NULL](../sql-reference/syntax.md), which is displayed as `n
- [JSONEachRow](#jsoneachrow) format
- [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows) setting
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
Differs from JSON only in that data fields are output in strings, not in typed JSON values.
@ -541,7 +541,7 @@ Result:
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
Differs from JSON only in that data rows are output in arrays, not in objects.
@ -580,7 +580,7 @@ Example:
```
```
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -614,7 +614,7 @@ Example:
## JSONEachRow {#jsoneachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
When using these formats, ClickHouse outputs rows as separated, newline-delimited JSON values, but the data as a whole is not valid JSON.
@ -639,9 +639,9 @@ Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yie
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
Differs from `JSONCompactEachRow`/`JSONCompactStringEachRow` in that the column names and types are written as the first two rows.
Differs from `JSONCompactEachRow`/`JSONCompactStringsEachRow` in that the column names and types are written as the first two rows.
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -390,12 +390,12 @@ This section contains the following parameters:
## keep_alive_timeout {#keep-alive-timeout}
The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 3 seconds.
The number of seconds that ClickHouse waits for incoming requests before closing the connection. Defaults to 10 seconds.
**Example**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}
@ -535,7 +535,7 @@ Possible values:
- Positive double.
- 0 — The ClickHouse server can use all available RAM.
Default value: `0`.
Default value: `0.9`.
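Assuming this ratio is multiplied by the total RAM of the host to cap server memory usage (the setting name is not visible in this hunk), the new `0.9` default works out as in the sketch below:

```cpp
// Hypothetical arithmetic for the new 0.9 default on a 64 GiB host.
constexpr double total_ram_gib = 64.0;
constexpr double ratio = 0.9;                             // new default
constexpr double memory_cap_gib = total_ram_gib * ratio;  // 57.6 GiB
```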
**Usage**

View File

@ -284,12 +284,12 @@ ClickHouseサーバー間でデータを交換するポート。
## keep_alive_timeout {#keep-alive-timeout}
ClickHouseが接続を閉じる前に受信要求を待機する秒数。 既定値は3秒です。
ClickHouseが接続を閉じる前に受信要求を待機する秒数。 既定値は10秒です。
**例**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}

View File

@ -25,17 +25,17 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringEachRow](#jsonstringeachrow) | ✔ | ✔ |
| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -442,7 +442,7 @@ ClickHouse поддерживает [NULL](../sql-reference/syntax.md), кото
- Формат [JSONEachRow](#jsoneachrow)
- Настройка [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows)
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
Отличается от JSON только тем, что поля данных выводятся в строках, а не в типизированных значениях JSON.
@ -519,7 +519,7 @@ SELECT * FROM json_as_string;
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
Отличается от JSON только тем, что строчки данных выводятся в массивах, а не в object-ах.
@ -558,7 +558,7 @@ SELECT * FROM json_as_string;
```
```json
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -590,9 +590,9 @@ SELECT * FROM json_as_string;
```
## JSONEachRow {#jsoneachrow}
## JSONStringEachRow {#jsonstringeachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
При использовании этих форматов ClickHouse выводит каждую запись как значения JSON (каждое значение отдельной строкой), при этом данные в целом — невалидный JSON.
@ -605,9 +605,9 @@ SELECT * FROM json_as_string;
При вставке данных вы должны предоставить отдельное значение JSON для каждой строки.
## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
Отличается от `JSONEachRow`/`JSONStringEachRow` тем, что ClickHouse будет выдавать информацию о ходе выполнения в виде значений JSON.
Отличается от `JSONEachRow`/`JSONStringsEachRow` тем, что ClickHouse будет выдавать информацию о ходе выполнения в виде значений JSON.
```json
{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
@ -617,9 +617,9 @@ SELECT * FROM json_as_string;
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
Отличается от `JSONCompactEachRow`/`JSONCompactStringEachRow` тем, что имена и типы столбцов записываются как первые две строки.
Отличается от `JSONCompactEachRow`/`JSONCompactStringsEachRow` тем, что имена и типы столбцов записываются как первые две строки.
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -371,12 +371,12 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
## keep_alive_timeout {#keep-alive-timeout}
Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение.
Время в секундах, в течение которого ClickHouse ожидает входящих запросов прежде, чем закрыть соединение. Значение по умолчанию: 10 секунд.
**Пример**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}
@ -516,7 +516,7 @@ ClickHouse проверяет условия для `min_part_size` и `min_part
- Положительное число с плавающей запятой.
- 0 — сервер ClickHouse может использовать всю оперативную память.
Значение по умолчанию: `0`.
Значение по умолчанию: `0.9`.
**Использование**

View File

@ -26,17 +26,17 @@ ClickHouse可以接受和返回各种格式的数据。受支持的输入格式
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONString](#jsonstring) | ✗ | ✔ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONCompactStrings](#jsoncompactstrings) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
| [JSONCompactStringEachRowWithNamesAndTypes](#jsoncompactstringeachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringsEachRow](#jsoncompactstringseachrow) | ✔ | ✔ |
| [JSONCompactStringsEachRowWithNamesAndTypes](#jsoncompactstringseachrowwithnamesandtypes) | ✔ | ✔ |
| [TSKV](#tskv) | ✔ | ✔ |
| [Pretty](#pretty) | ✗ | ✔ |
| [PrettyCompact](#prettycompact) | ✗ | ✔ |
@ -465,7 +465,7 @@ ClickHouse支持[NULL](../sql-reference/syntax.md), 在JSON输出中显示为`nu
- [JSONEachRow](#jsoneachrow)格式
- [output_format_json_array_of_rows](../operations/settings/settings.md#output-format-json-array-of-rows)设置
## JSONString {#jsonstring}
## JSONStrings {#jsonstrings}
与JSON的不同之处在于数据字段以字符串输出而不是以类型化JSON值输出。
@ -543,7 +543,7 @@ SELECT * FROM json_as_string;
```
## JSONCompact {#jsoncompact}
## JSONCompactString {#jsoncompactstring}
## JSONCompactStrings {#jsoncompactstrings}
与JSON格式不同的是它以数组的方式输出结果而不是以结构体。
@ -582,7 +582,7 @@ SELECT * FROM json_as_string;
```
```json
// JSONCompactString
// JSONCompactStrings
{
"meta":
[
@ -614,9 +614,9 @@ SELECT * FROM json_as_string;
```
## JSONEachRow {#jsoneachrow}
## JSONStringEachRow {#jsonstringeachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
## JSONCompactStringsEachRow {#jsoncompactstringseachrow}
使用这些格式时ClickHouse会将行输出为用换行符分隔的JSON值这些输出数据作为一个整体时由于没有分隔符(,)因而不是有效的JSON文档。
@ -629,9 +629,9 @@ SELECT * FROM json_as_string;
在插入数据时应该为每一行提供一个单独的JSON值。
## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
与`JSONEachRow`/`JSONStringEachRow`不同的是ClickHouse还将生成作为JSON值的进度信息。
与`JSONEachRow`/`JSONStringsEachRow`不同的是ClickHouse还将生成作为JSON值的进度信息。
```json
{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
@ -641,9 +641,9 @@ SELECT * FROM json_as_string;
```
## JSONCompactEachRowWithNamesAndTypes {#jsoncompacteachrowwithnamesandtypes}
## JSONCompactStringEachRowWithNamesAndTypes {#jsoncompactstringeachrowwithnamesandtypes}
## JSONCompactStringsEachRowWithNamesAndTypes {#jsoncompactstringseachrowwithnamesandtypes}
与`JSONCompactEachRow`/`JSONCompactStringEachRow`不同的是,列名和类型被写入前两行。
与`JSONCompactEachRow`/`JSONCompactStringsEachRow`不同的是,列名和类型被写入前两行。
```json
["'hello'", "multiply(42, number)", "range(5)"]

View File

@ -282,12 +282,12 @@ ClickHouse每x秒重新加载内置字典。 这使得编辑字典 “on the fly
## keep_alive_timeout {#keep-alive-timeout}
ClickHouse在关闭连接之前等待传入请求的秒数。 默认为3秒。
ClickHouse在关闭连接之前等待传入请求的秒数。默认为10秒。
**示例**
``` xml
<keep_alive_timeout>3</keep_alive_timeout>
<keep_alive_timeout>10</keep_alive_timeout>
```
## listen_host {#server_configuration_parameters-listen_host}

View File

@ -142,8 +142,22 @@ private:
struct ChangelogReadResult
{
uint64_t entries_read;
/// Total entries read from log, including skipped ones.
/// Useful when we decide to continue to write into the same log and want to know
/// how many entries were already written to it.
uint64_t total_entries_read_from_log;
/// First index in log
uint64_t log_start_index;
/// First entry actually read from the log (not including skipped entries)
uint64_t first_read_index;
/// Last entry read from the log (the last entry in the log).
/// When nothing is skipped, last_read_index - first_read_index + 1 == total_entries_read_from_log,
/// but entries from the start of the log may be skipped when they are not required.
uint64_t last_read_index;
/// Last offset we were able to read from the log
off_t last_position;
bool error;
};
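A minimal sketch of how the distinction between in-memory entries and `total_entries_read_from_log` might be used when deciding whether the last log file still has free space (a simplified stand-in, not the real ClickHouse types):

```cpp
#include <cstdint>

// Simplified stand-in for the ChangelogReadResult described above.
struct ReadResultSketch
{
    uint64_t total_entries_read_from_log = 0; // includes skipped entries
    uint64_t first_read_index = 0;            // first entry kept in memory
    uint64_t last_read_index = 0;             // last entry read from the file
    bool error = false;
};

// A changelog file is complete once it holds as many entries as the rotate
// interval allows; otherwise the writer may keep appending to it.
bool lastLogIsComplete(const ReadResultSketch & result, uint64_t expected_entries_in_log)
{
    return result.total_entries_read_from_log >= expected_entries_in_log;
}
```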
@ -156,6 +170,7 @@ public:
, read_buf(filepath)
{}
/// start_log_index -- all entries with index < start_log_index will be skipped, but still accounted for in total_entries_read_from_log
ChangelogReadResult readChangelog(IndexToLogEntry & logs, uint64_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log)
{
uint64_t previous_index = 0;
@ -210,7 +225,7 @@ public:
if (logs.count(record.header.index) != 0)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath);
result.entries_read += 1;
result.total_entries_read_from_log += 1;
/// Read but skip this entry because our state is already fresher
if (record.header.index < start_log_index)
@ -224,9 +239,10 @@ public:
/// Put it into in memory structure
logs.emplace(record.header.index, log_entry);
index_to_offset[record.header.index] = result.last_position;
result.last_read_index = record.header.index;
if (result.entries_read % 50000 == 0)
LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.entries_read);
if (result.total_entries_read_from_log % 50000 == 0)
LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.total_entries_read_from_log);
}
}
catch (const Exception & ex)
@ -243,7 +259,7 @@ public:
tryLogCurrentException(log);
}
LOG_TRACE(log, "Totally read from changelog {} {} entries", filepath, result.entries_read);
LOG_TRACE(log, "Totally read from changelog {} {} entries", filepath, result.total_entries_read_from_log);
return result;
}
@ -280,16 +296,10 @@ Changelog::Changelog(
void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep)
{
uint64_t total_read = 0;
std::optional<ChangelogReadResult> last_log_read_result;
/// Amount of entries in last log index
uint64_t entries_in_last = 0;
/// Log idx of the first incomplete log (key in existing_changelogs)
int64_t first_incomplete_log_start_index = -1; /// if -1 then no incomplete log exists
ChangelogReadResult result{};
/// First log index which was read from all changelogs
uint64_t first_read_index = 0;
/// Last log has some free space to write
bool last_log_is_not_complete = false;
/// We must start to read from this log index
uint64_t start_to_read_from = last_commited_log_index;
@ -300,19 +310,14 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
else
start_to_read_from = 1;
/// At least we read something
bool started = false;
/// Go through changelog files in order of start_index
for (const auto & [changelog_start_index, changelog_description] : existing_changelogs)
{
/// How many entries we have in the last changelog
entries_in_last = changelog_description.expectedEntriesCountInLog();
/// [from_log_index.>=.......start_to_read_from.....<=.to_log_index]
if (changelog_description.to_log_index >= start_to_read_from)
{
if (!started) /// still nothing was read
if (!last_log_read_result) /// still nothing was read
{
/// Our first log starts from a fresher log_id than the one we were required to read, and this changelog is not an empty log.
/// So we are missing something in our logs, but it's not data loss: we will receive a snapshot and the required
@ -320,8 +325,12 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
if (changelog_description.from_log_index > last_commited_log_index && (changelog_description.from_log_index - last_commited_log_index) > 1)
{
LOG_ERROR(log, "Some records was lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index);
first_incomplete_log_start_index = changelog_start_index;
break;
/// Nothing to do with our fresher logs: the leader will overwrite them, so remove everything and just start from last_commited_index
removeAllLogs();
min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
rotate(max_log_id + 1);
return;
}
else if (changelog_description.from_log_index > start_to_read_from)
{
@ -332,69 +341,100 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
ChangelogReader reader(changelog_description.path);
result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log);
started = true;
last_log_read_result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log);
/// Otherwise we have already initialized it
if (first_read_index == 0)
first_read_index = result.first_read_index;
if (min_log_id == 0)
min_log_id = last_log_read_result->first_read_index;
total_read += result.entries_read;
if (last_log_read_result->last_read_index != 0)
max_log_id = last_log_read_result->last_read_index;
last_log_read_result->log_start_index = changelog_description.from_log_index;
/// How many entries we have in the last changelog
uint64_t expected_entries_in_log = changelog_description.expectedEntriesCountInLog();
/// May happen after truncate, crash or simply unfinished log
if (result.entries_read < entries_in_last)
if (last_log_read_result->total_entries_read_from_log < expected_entries_in_log)
{
first_incomplete_log_start_index = changelog_start_index;
last_log_is_not_complete = true;
break;
}
}
}
if (first_read_index != 0)
start_index = first_read_index;
else /// We just may have no logs (only snapshot)
start_index = last_commited_log_index;
/// Found some broken or unfinished logs.
/// We have to remove the broken data and continue to write into the incomplete log.
if (first_incomplete_log_start_index != -1) /// otherwise all logs completed so just start a new one
/// We can have an empty log (with zero entries), and last_log_read_result will still be initialized
if (!last_log_read_result || min_log_id == 0) /// We just may have no logs (only snapshot or nothing)
{
auto start_remove_from = existing_changelogs.begin();
if (started)
start_remove_from = existing_changelogs.upper_bound(first_incomplete_log_start_index);
/// Just to be sure they don't exist
removeAllLogs();
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = start_remove_from; itr != existing_changelogs.end();)
min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
}
else if (last_log_is_not_complete) /// if it's complete just start new one
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
assert(last_log_read_result != std::nullopt);
/// Actually they shouldn't exist, but to be sure we remove them
removeAllLogsAfter(last_log_read_result->log_start_index);
assert(!existing_changelogs.empty());
assert(existing_changelogs.find(last_log_read_result->log_start_index)->first == existing_changelogs.rbegin()->first);
/// Continue to write into incomplete existing log
auto description = existing_changelogs[last_log_read_result->log_start_index];
if (last_log_read_result->error)
initWriter(description, last_log_read_result->total_entries_read_from_log, /* truncate_to_offset = */ last_log_read_result->last_position);
else
initWriter(description, last_log_read_result->total_entries_read_from_log);
}
/// Continue to write into existing log
if (!existing_changelogs.empty())
/// Start new log if we don't initialize writer from previous log. All logs can be "complete".
if (!current_writer)
rotate(max_log_id + 1);
}
void Changelog::initWriter(const ChangelogFileDescription & description, uint64_t entries_already_written, std::optional<uint64_t> truncate_to_offset)
{
auto description = existing_changelogs.rbegin()->second;
if (description.expectedEntriesCountInLog() != rotate_interval)
LOG_TRACE(log, "Looks like rotate_logs_interval was changed, current {}, expected entries in last log {}", rotate_interval, description.expectedEntriesCountInLog());
LOG_TRACE(log, "Continue to write into {}", description.path);
current_writer = std::make_unique<ChangelogWriter>(description.path, WriteMode::Append, description.from_log_index);
current_writer->setEntriesWritten(result.entries_read);
current_writer->setEntriesWritten(entries_already_written);
/// Truncate all broken entries from log
if (result.error)
if (truncate_to_offset)
{
LOG_WARNING(log, "Read finished with error, truncating all broken log entries");
current_writer->truncateToLength(result.last_position);
}
LOG_WARNING(log, "Changelog {} contain broken enties, truncating all broken log entries", description.path);
current_writer->truncateToLength(*truncate_to_offset);
}
}
/// Start new log if we don't initialize writer from previous log
if (!current_writer)
rotate(start_index + total_read);
void Changelog::removeAllLogsAfter(uint64_t start_to_remove_from_id)
{
auto start_to_remove_from = existing_changelogs.upper_bound(start_to_remove_from_id);
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = start_to_remove_from; itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
}
void Changelog::removeAllLogs()
{
LOG_WARNING(log, "Removing all changelogs");
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
}
void Changelog::rotate(uint64_t new_start_log_index)
@ -439,7 +479,7 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records");
if (logs.empty())
start_index = index;
min_log_id = index;
const auto & current_changelog_description = existing_changelogs[current_writer->getStartIndex()];
const bool log_is_complete = current_writer->getEntriesWritten() == current_changelog_description.expectedEntriesCountInLog();
@ -452,6 +492,7 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index);
logs[index] = makeClone(log_entry);
max_log_id = index;
}
void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
@ -513,11 +554,30 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
void Changelog::compact(uint64_t up_to_log_index)
{
LOG_INFO(log, "Compact logs up to log index {}, our max log id is {}", up_to_log_index, max_log_id);
bool remove_all_logs = false;
if (up_to_log_index > max_log_id)
{
LOG_INFO(log, "Seems like this node recovers from leaders snapshot, removing all logs");
/// If we received snapshot from leader we may compact up to more fresh log
max_log_id = up_to_log_index;
remove_all_logs = true;
}
bool need_rotate = false;
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
/// Remove all completely outdated changelog files
if (itr->second.to_log_index <= up_to_log_index)
if (remove_all_logs || itr->second.to_log_index <= up_to_log_index)
{
if (current_writer && itr->second.from_log_index == current_writer->getStartIndex())
{
LOG_INFO(log, "Trying to remove log {} which is current active log for write. Possibly this node recovers from snapshot", itr->second.path);
need_rotate = true;
current_writer.reset();
}
LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path);
std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; });
std::filesystem::remove(itr->second.path);
@ -526,8 +586,14 @@ void Changelog::compact(uint64_t up_to_log_index)
else /// Files are ordered, so all subsequent should exist
break;
}
start_index = up_to_log_index + 1;
/// Compaction from the past is possible, so don't make our min_log_id smaller.
min_log_id = std::max(min_log_id, up_to_log_index + 1);
std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; });
if (need_rotate)
rotate(up_to_log_index + 1);
LOG_INFO(log, "Compaction up to {} finished new min index {}, new max index {}", up_to_log_index, min_log_id, max_log_id);
}
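The index bookkeeping in `compact` can be summarized by a small sketch (an assumed simplification of the logic above, not the real implementation): compaction past `max_log_id`, e.g. after installing a leader snapshot, empties the range, and compaction from the past never moves `min_log_id` backwards.

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of how compaction moves the [min_log_id, max_log_id] range.
void compactRange(uint64_t & min_log_id, uint64_t & max_log_id, uint64_t up_to_log_index)
{
    if (up_to_log_index > max_log_id)
        max_log_id = up_to_log_index;  // leader snapshot covers newer entries than we have

    // Compaction from the past must not make the range start smaller.
    min_log_id = std::max(min_log_id, up_to_log_index + 1);
}
```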
LogEntryPtr Changelog::getLastEntry() const
@ -535,10 +601,11 @@ LogEntryPtr Changelog::getLastEntry() const
/// This entry is treated in a special way by NuRaft
static LogEntryPtr fake_entry = nuraft::cs_new<nuraft::log_entry>(0, nuraft::buffer::alloc(sizeof(uint64_t)));
const uint64_t next_index = getNextEntryIndex() - 1;
auto entry = logs.find(next_index);
auto entry = logs.find(max_log_id);
if (entry == logs.end())
{
return fake_entry;
}
return entry->second;
}

View File

@ -2,6 +2,7 @@
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <city.h>
#include <optional>
#include <IO/WriteBufferFromFile.h>
#include <IO/HashingWriteBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
@ -87,12 +88,12 @@ public:
uint64_t getNextEntryIndex() const
{
return start_index + logs.size();
return max_log_id + 1;
}
uint64_t getStartIndex() const
{
return start_index;
return min_log_id;
}
/// Last entry in log, or fake entry with term 0 if log is empty
@ -128,6 +129,13 @@ private:
/// Starts new file [new_start_log_index, new_start_log_index + rotate_interval]
void rotate(uint64_t new_start_log_index);
/// Remove all changelogs from disk with start_index bigger than start_to_remove_from_id
void removeAllLogsAfter(uint64_t start_to_remove_from_id);
/// Remove all logs from disk
void removeAllLogs();
/// Init writer for existing log with some entries already written
void initWriter(const ChangelogFileDescription & description, uint64_t entries_already_written, std::optional<uint64_t> truncate_to_offset = {});
private:
const std::string changelogs_dir;
const uint64_t rotate_interval;
@ -144,7 +152,9 @@ private:
/// Mapping log_id -> log_entry
IndexToLogEntry logs;
/// Start log_id which exists in all "active" logs
uint64_t start_index = 0;
/// max_log_id + 1 == min_log_id means empty log storage for NuRaft
uint64_t min_log_id = 0;
uint64_t max_log_id = 0;
};
}
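The accessors above imply a simple arithmetic over `min_log_id`/`max_log_id`; a self-contained sketch of that invariant (the names mirror the header, the helper type itself is illustrative):

```cpp
#include <cstdint>

// Illustrative helper showing the index arithmetic implied by the header above.
struct LogRangeSketch
{
    uint64_t min_log_id = 0;  // first entry kept in storage
    uint64_t max_log_id = 0;  // last entry kept in storage

    uint64_t getStartIndex() const { return min_log_id; }
    uint64_t getNextEntryIndex() const { return max_log_id + 1; }

    // NuRaft treats the store as empty when the next slot equals the start index.
    bool empty() const { return getNextEntryIndex() == getStartIndex(); }
    uint64_t size() const { return empty() ? 0 : max_log_id - min_log_id + 1; }
};
```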

View File

@ -293,10 +293,12 @@ void KeeperDispatcher::shutdown()
if (session_cleaner_thread.joinable())
session_cleaner_thread.join();
/// FIXME not the best way to notify
if (requests_queue)
{
requests_queue->push({});
if (request_thread.joinable())
request_thread.join();
}
responses_queue.push({});
if (responses_thread.joinable())
@ -313,7 +315,7 @@ void KeeperDispatcher::shutdown()
KeeperStorage::RequestForSession request_for_session;
/// Set session expired for all pending requests
while (requests_queue->tryPop(request_for_session))
while (requests_queue && requests_queue->tryPop(request_for_session))
{
if (request_for_session.request)
{

View File

@ -404,6 +404,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
/// And we are able to read it
DB::KeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(7, 0);
EXPECT_EQ(changelog_reader.size(), 1);
EXPECT_EQ(changelog_reader.start_index(), 7);
EXPECT_EQ(changelog_reader.next_slot(), 8);
@ -1317,7 +1318,7 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
}
EXPECT_TRUE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin"));
DB::KeeperLogStore changelog_1("./logs", 10, true);
changelog_1.init(0, 50);
@ -1330,8 +1331,8 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
changelog_1.end_of_append_batch(0, 0);
}
EXPECT_TRUE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_1_100.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_101_110.bin"));
DB::KeeperLogStore changelog_2("./logs", 7, true);
changelog_2.init(98, 55);
@ -1346,11 +1347,11 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
changelog_2.compact(105);
EXPECT_FALSE(fs::exists("./logs/changelog_0_99.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_110_116.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_117_123.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_124_130.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_1_100.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_101_110.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_111_117.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_118_124.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_125_131.bin"));
DB::KeeperLogStore changelog_3("./logs", 5, true);
changelog_3.init(116, 3);
@ -1364,14 +1365,14 @@ TEST(CoordinationTest, TestRotateIntervalChanges)
}
changelog_3.compact(125);
EXPECT_FALSE(fs::exists("./logs/changelog_100_109.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_110_116.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_117_123.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_101_110.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_111_117.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_118_124.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_124_130.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_131_135.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_136_140.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_141_145.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_125_131.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_132_136.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_137_141.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_142_146.bin"));
}
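The renamed files in the expectations above follow a 1-based naming scheme: a changelog starting at index `start` with `rotate_interval` entries appears to cover `[start, start + rotate_interval - 1]`. A small sketch of that assumption:

```cpp
#include <cstdint>
#include <string>

// Hypothetical helper reproducing the file-name pattern used in the test.
std::string changelogName(uint64_t start, uint64_t rotate_interval)
{
    return "changelog_" + std::to_string(start) + "_"
        + std::to_string(start + rotate_interval - 1) + ".bin";
}

// changelogName(1, 100)  -> "changelog_1_100.bin"   (previously changelog_0_99.bin)
// changelogName(101, 10) -> "changelog_101_110.bin" (previously changelog_100_109.bin)
```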

View File

@ -12,6 +12,7 @@ namespace DB
namespace ErrorCodes
{
extern const int BZIP2_STREAM_DECODER_FAILED;
extern const int UNEXPECTED_END_OF_FILE;
}
@ -90,6 +91,12 @@ bool Bzip2ReadBuffer::nextImpl()
"bzip2 stream decoder failed: error code: {}",
ret);
if (in->eof())
{
eof = true;
throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "Unexpected end of bzip2 archive");
}
return true;
}
}
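The new `UNEXPECTED_END_OF_FILE` branch guards against a truncated archive: the decoder keeps returning `BZ_OK` while the underlying stream is already exhausted. A hedged sketch of the same detection using the low-level bzlib API directly (not the ClickHouse buffer classes):

```cpp
#include <bzlib.h>
#include <stdexcept>
#include <vector>

// Decompress a whole bz2 buffer; throw if the archive ends before BZ_STREAM_END.
std::vector<char> decompressBz2(const std::vector<char> & compressed)
{
    bz_stream strm{};
    if (BZ2_bzDecompressInit(&strm, /*verbosity=*/0, /*small=*/0) != BZ_OK)
        throw std::runtime_error("bzip2 decoder init failed");

    std::vector<char> out;
    char chunk[4096];
    strm.next_in = const_cast<char *>(compressed.data());
    strm.avail_in = static_cast<unsigned>(compressed.size());

    int ret = BZ_OK;
    while (ret != BZ_STREAM_END)
    {
        strm.next_out = chunk;
        strm.avail_out = sizeof(chunk);
        ret = BZ2_bzDecompress(&strm);
        if (ret != BZ_OK && ret != BZ_STREAM_END)
        {
            BZ2_bzDecompressEnd(&strm);
            throw std::runtime_error("bzip2 stream decoder failed");
        }
        out.insert(out.end(), chunk, chunk + (sizeof(chunk) - strm.avail_out));

        // The stream has not ended, the decoder cannot continue without new
        // input, and the input is gone: the archive is truncated -- the case
        // the diff above now reports instead of spinning forever.
        if (ret == BZ_OK && strm.avail_in == 0 && strm.avail_out != 0)
        {
            BZ2_bzDecompressEnd(&strm);
            throw std::runtime_error("Unexpected end of bzip2 archive");
        }
    }

    BZ2_bzDecompressEnd(&strm);
    return out;
}
```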

View File

@ -2,7 +2,7 @@
# include <Common/config.h>
#endif
#if USE_BROTLI
#if USE_BZIP2
# include <IO/Bzip2WriteBuffer.h>
# include <bzlib.h> // Y_IGNORE

View File

@ -4,6 +4,7 @@
#include <IO/BufferWithOwnMemory.h>
#include <IO/CompressionMethod.h>
#include <zlib.h>

View File

@ -4,6 +4,7 @@
#include <IO/BufferWithOwnMemory.h>
#include <IO/CompressionMethod.h>
#include <zlib.h>

View File

@ -1,6 +1,7 @@
#include <unistd.h>
#include <vector>
#include <stdexcept>
#include <zlib.h>
#pragma GCC diagnostic ignored "-Wold-style-cast"

View File

@ -192,7 +192,7 @@ struct SocketInterruptablePollWrapper
KeeperTCPHandler::KeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_)
: Poco::Net::TCPServerConnection(socket_)
, server(server_)
, log(&Poco::Logger::get("NuKeeperTCPHandler"))
, log(&Poco::Logger::get("KeeperTCPHandler"))
, global_context(Context::createCopy(server.context()))
, keeper_dispatcher(global_context->getKeeperDispatcher())
, operation_timeout(0, global_context->getConfigRef().getUInt("keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)

View File

@ -46,7 +46,15 @@ ReplicatedMergeTreeTableMetadata::ReplicatedMergeTreeTableMetadata(const MergeTr
primary_key = formattedAST(metadata_snapshot->getPrimaryKey().expression_list_ast);
if (metadata_snapshot->isPrimaryKeyDefined())
sorting_key = formattedAST(metadata_snapshot->getSortingKey().expression_list_ast);
{
/// We don't use the preparsed AST `sorting_key.expression_list_ast` because
/// it contains the version column for VersionedCollapsingMergeTree, which
/// is not stored in ZooKeeper for compatibility reasons. So the most
/// compatible way is just to convert definition_ast to a list and
/// serialize it. In all other places key.expression_list_ast should be
/// used.
sorting_key = formattedAST(extractKeyExpressionList(metadata_snapshot->getSortingKey().definition_ast));
}
data_format_version = data.format_version;
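A hedged illustration of the compatibility issue described in the comment above (the function below is a made-up helper, not a ClickHouse API): for VersionedCollapsingMergeTree the in-memory sorting key ends with the version column, while the key serialized to ZooKeeper must stay the user-visible column list only.

```cpp
#include <string>
#include <vector>

// Strip the implicitly appended version column before serializing the key.
std::vector<std::string> userVisibleSortingKey(std::vector<std::string> in_memory_key,
                                               const std::string & version_column)
{
    if (!in_memory_key.empty() && in_memory_key.back() == version_column)
        in_memory_key.pop_back();
    return in_memory_key;
}

// userVisibleSortingKey({"d", "order", "version"}, "version") -> {"d", "order"}
```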

View File

@ -7,6 +7,7 @@ import time
import logging
import io
import string
import ast
import avro.schema
import avro.io
@ -2792,7 +2793,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster):
# broken message
"(0,'BAD','AM',0.5,1)",
],
'expected':r'''{"raw_message":"(0,'BAD','AM',0.5,1)","error":"Cannot parse string 'BAD' as UInt16: syntax error at begin of string. Note: there are toUInt16OrZero and toUInt16OrNull functions, which returns zero\/NULL instead of throwing exception.: while executing 'FUNCTION _CAST(assumeNotNull(_dummy_0) :: 2, 'UInt16' :: 1) -> _CAST(assumeNotNull(_dummy_0), 'UInt16') UInt16 : 4'"}''',
'expected':r'''{"raw_message":"(0,'BAD','AM',0.5,1)","error":"Cannot parse string 'BAD' as UInt16: syntax error at begin of string. Note: there are toUInt16OrZero and toUInt16OrNull functions, which returns zero\/NULL instead of throwing exception"}''',
'supports_empty_value': True,
'printable':True,
},
@ -2934,11 +2935,13 @@ def test_kafka_formats_with_broken_message(kafka_cluster):
'''.format(topic_name=topic_name, offset_0=offsets[0], offset_1=offsets[1], offset_2=offsets[2])
# print(('Checking result\n {result} \n expected \n {expected}\n'.format(result=str(result), expected=str(expected))))
assert TSV(result) == TSV(expected), 'Proper result for format: {}'.format(format_name)
errors_result = instance.query('SELECT raw_message, error FROM test.kafka_errors_{format_name}_mv format JSONEachRow'.format(format_name=format_name))
errors_expected = format_opts['expected']
errors_result = ast.literal_eval(instance.query('SELECT raw_message, error FROM test.kafka_errors_{format_name}_mv format JSONEachRow'.format(format_name=format_name)))
errors_expected = ast.literal_eval(format_opts['expected'])
# print(errors_result.strip())
# print(errors_expected.strip())
assert errors_result.strip() == errors_expected.strip(), 'Proper errors for format: {}'.format(format_name)
assert errors_result['raw_message'] == errors_expected['raw_message'], 'Proper raw_message for format: {}'.format(format_name)
# Error text can change, so just check that the expected message is contained in the actual one
assert errors_expected['error'] in errors_result['error'], 'Proper error for format: {}'.format(format_name)
kafka_delete_topic(admin_client, topic_name)
def wait_for_new_data(table_name, prev_count = 0, max_retries = 120):

View File

@ -1,6 +1,6 @@
2019-10-01 a 1 aa 1 1 1
2019-10-01 a 1 aa 1 1 1 0
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order)\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/default/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order)\nSETTINGS index_granularity = 8192
2019-10-01 a 1 aa 1 1 1 0 0
2019-10-02 b 2 bb 2 2 2 1 2
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32,\n `datum` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order, datum)\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_alter\n(\n `d` Date,\n `a` String,\n `b` UInt8,\n `x` String,\n `y` Int8,\n `version` UInt64,\n `sign` Int8 DEFAULT 1,\n `order` UInt32,\n `datum` UInt32\n)\nENGINE = ReplicatedVersionedCollapsingMergeTree(\'/clickhouse/tables/default/01526_alter_add/t1\', \'1\', sign, version)\nPARTITION BY y\nPRIMARY KEY d\nORDER BY (d, order, datum)\nSETTINGS index_granularity = 8192

View File

@ -12,17 +12,27 @@ CREATE TABLE table_for_alter
`version` UInt64,
`sign` Int8 DEFAULT 1
)
ENGINE = ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/01526_alter_add/t1', '1', sign, version)
ENGINE = ReplicatedVersionedCollapsingMergeTree('/clickhouse/tables/{database}/01526_alter_add/t1', '1', sign, version)
PARTITION BY y
ORDER BY d
SETTINGS index_granularity = 8192;
INSERT INTO table_for_alter VALUES(toDate('2019-10-01'), 'a', 1, 'aa', 1, 1, 1);
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
SELECT * FROM table_for_alter;
ALTER TABLE table_for_alter ADD COLUMN order UInt32, MODIFY ORDER BY (d, order);
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
SELECT * FROM table_for_alter;
SHOW CREATE TABLE table_for_alter;
@ -35,4 +45,8 @@ SELECT * FROM table_for_alter ORDER BY d;
SHOW CREATE TABLE table_for_alter;
DETACH TABLE table_for_alter;
ATTACH TABLE table_for_alter;
DROP TABLE IF EXISTS table_for_alter;

View File

@ -0,0 +1,15 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
tmp_path=$(mktemp "$CURDIR/02022_bzip2_truncate.XXXXXX.bz2")
trap 'rm -f $tmp_path' EXIT
${CLICKHOUSE_LOCAL} -q "SELECT * FROM numbers(1e6) FORMAT TSV" | bzip2 > "$tmp_path"
truncate -s10000 "$tmp_path"
# just ensure that it will exit eventually
${CLICKHOUSE_LOCAL} -q "SELECT count() FROM file('$tmp_path', 'TSV', 'n UInt64') FORMAT Null" >& /dev/null
exit 0