Merge branch 'master' into refactor-actions-dag

This commit is contained in:
Nikolai Kochetov 2021-03-10 12:40:55 +03:00
commit 4f133ae036
260 changed files with 11934 additions and 8794 deletions

View File

@ -1,9 +1,9 @@
# This strings autochanged from release_lib.sh:
SET(VERSION_REVISION 54448)
SET(VERSION_REVISION 54449)
SET(VERSION_MAJOR 21)
SET(VERSION_MINOR 3)
SET(VERSION_MINOR 4)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH ef72ba7349f230321750c13ee63b49a11a7c0adc)
SET(VERSION_DESCRIBE v21.3.1.1-prestable)
SET(VERSION_STRING 21.3.1.1)
SET(VERSION_GITHASH af2135ef9dc72f16fa4f229b731262c3f0a8bbdc)
SET(VERSION_DESCRIBE v21.4.1.1-prestable)
SET(VERSION_STRING 21.4.1.1)
# end of autochange

View File

@ -32,6 +32,7 @@ endif()
set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1)
add_subdirectory (abseil-cpp-cmake)
add_subdirectory (antlr4-runtime-cmake)
add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 9a0d78de4b90546368d954b6434f0e9a823e8d80
Subproject commit ff9049bcc8ea6a02276ccdc8629d764e9e5de853

View File

@ -0,0 +1,18 @@
# Builds the bundled abseil-cpp and exposes its Swiss-table containers
# (absl::flat_hash_map and absl::flat_hash_set) through a single INTERFACE
# target, `abseil_swiss_tables`.
set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp")

# Fail at configure time when the submodule has not been checked out.
# The message names the directory that is actually probed (contrib/abseil-cpp).
if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt")
  message(FATAL_ERROR " submodule contrib/abseil-cpp is missing. To fix try run: \n git submodule update --init --recursive")
endif()

add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp")

add_library(abseil_swiss_tables INTERFACE)
target_link_libraries(abseil_swiss_tables INTERFACE
  absl::flat_hash_map
  absl::flat_hash_set
)

# Re-export the include directories of both abseil targets on the wrapper
# target, marked SYSTEM and placed BEFORE any other interface include paths.
get_target_property(FLAT_HASH_MAP_INCLUDE_DIR absl::flat_hash_map INTERFACE_INCLUDE_DIRECTORIES)
target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_MAP_INCLUDE_DIR})

get_target_property(FLAT_HASH_SET_INCLUDE_DIR absl::flat_hash_set INTERFACE_INCLUDE_DIRECTORIES)
target_include_directories (abseil_swiss_tables SYSTEM BEFORE INTERFACE ${FLAT_HASH_SET_INCLUDE_DIR})

View File

@ -39,11 +39,6 @@ set(_gRPC_SSL_LIBRARIES ${OPENSSL_LIBRARIES})
# Use abseil-cpp from ClickHouse contrib, not from gRPC third_party.
set(gRPC_ABSL_PROVIDER "clickhouse" CACHE STRING "" FORCE)
set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp")
if(NOT EXISTS "${ABSL_ROOT_DIR}/CMakeLists.txt")
message(FATAL_ERROR " grpc: submodule third_party/abseil-cpp is missing. To fix try run: \n git submodule update --init --recursive")
endif()
add_subdirectory("${ABSL_ROOT_DIR}" "${ClickHouse_BINARY_DIR}/contrib/abseil-cpp")
# Choose to build static or shared library for c-ares.
if (MAKE_STATIC_LIBRARIES)

4
debian/changelog vendored
View File

@ -1,5 +1,5 @@
clickhouse (21.3.1.1) unstable; urgency=low
clickhouse (21.4.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Mon, 01 Feb 2021 12:50:53 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Sat, 06 Mar 2021 14:43:27 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
ARG gosu_ver=1.10
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -151,6 +151,7 @@ function clone_submodules
cd "$FASTTEST_SOURCE"
SUBMODULES_TO_UPDATE=(
contrib/abseil-cpp
contrib/antlr4-runtime
contrib/boost
contrib/zlib-ng

View File

@ -5,5 +5,6 @@
<!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
<users_config>users.xml</users_config>
<access_control_path>/var/lib/clickhouse/access/</access_control_path>
<!-- Path to directory where users created by SQL commands are stored. -->
<access_control_path>access/</access_control_path>
</yandex>

View File

@ -81,8 +81,8 @@ clickhouse-client --query "SHOW TABLES FROM test"
./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" && echo "OK" > /test_output/script_exit_code.txt || echo "FAIL" > /test_output/script_exit_code.txt
stop
# TODO: remove this once persistent snapshots are ready
rm -fr /var/lib/clickhouse/coordination ||:
start
clickhouse-client --query "SELECT 'Server successfuly started'" > /test_output/alive_check.txt || echo 'Server failed to start' > /test_output/alive_check.txt
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:

View File

@ -73,19 +73,18 @@ Clusters are set like this:
``` xml
<remote_servers>
<logs>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
-->
<!-- <secret></secret> -->
<shard>
<!-- Inter-server per-cluster secret for Distributed queries
default: no secret (no authentication will be performed)
If set, then Distributed queries will be validated on shards, so at least:
- such cluster should exist on the shard,
- such cluster should have the same secret.
And also (and which is more important), the initial_user will
be used as current user for the query.
-->
<!-- <secret></secret> -->
<!-- Optional. Shard weight when writing data. Default: 1. -->
<weight>1</weight>
<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->

View File

@ -7,9 +7,9 @@ toc_title: Distinctive Features
## True Column-Oriented Database Management System {#true-column-oriented-dbms}
In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. As an example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you would get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second.
It's also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases at runtime, loading data, and running queries without reconfiguring and restarting the server.

View File

@ -17,7 +17,6 @@ The `system.part_log` table contains the following columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.

View File

@ -538,7 +538,7 @@ Result:
## date\_sub {#date_sub}
Subtracts a time/date interval from the provided date.
Subtracts the time interval or date interval from the provided date or date with time.
**Syntax**
@ -546,19 +546,30 @@ Subtracts a time/date interval from the provided date.
date_sub(unit, value, date)
```
Aliases: `dateSub`, `DATE_SUB`.
Aliases: `dateSub`, `DATE_SUB`.
**Arguments**
- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md).
Possible values:
Supported values: second, minute, hour, day, week, month, quarter, year.
- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md)
- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md) to subtract value from.
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — The date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
Returns Date or DateTime with `value` expressed in `unit` subtracted from `date`.
Returns the date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`.
Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Example**

View File

@ -121,24 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
**Syntax**
``` sql
SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0);
IPv6StringToNum(string)
```
``` text
┌─cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0)─┐
│ ::ffff:127.0.0.1 │
└─────────────────────────────────────────────┘
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- IPv6 address in binary format.
Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Example**
Query:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Result:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**See also**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
@ -232,43 +261,56 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../sql-reference/data-types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned.
**Syntax**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md)
**Returned value**
- IP address.
Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Examples**
Query:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Result:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Query:
``` sql
SELECT toIPv6('127.0.0.1')
SELECT toIPv6('127.0.0.1');
```
Result:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │

View File

@ -16,6 +16,7 @@
- `MOVE_PART` — перемещение куска между дисками.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — дата события.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время события.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — время события с точностью до микросекунд.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — длительность.
- `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных, в которой находится кусок.
- `table` ([String](../../sql-reference/data-types/string.md)) — имя таблицы, в которой находится кусок.
@ -47,6 +48,7 @@ query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31
event_type: NewPart
event_date: 2021-02-02
event_time: 2021-02-02 11:14:28
event_time_microseconds: 2021-02-02 11:14:28.861919
duration_ms: 35
database: default
table: log_mt_2

View File

@ -444,9 +444,9 @@ date_trunc(unit, value[, timezone])
Синоним: `dateTrunc`.
**Параметры**
**Аргументы**
- `unit` — Название части даты или времени. [String](../syntax.md#syntax-string-literal).
- `unit` — название части даты или времени. [String Literal](../syntax.md#syntax-string-literal).
Возможные значения:
- `second`
@ -458,8 +458,8 @@ date_trunc(unit, value[, timezone])
- `quarter`
- `year`
- `value` — Дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [Часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
- `value` — дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
**Возвращаемое значение**
@ -616,6 +616,57 @@ SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-0
└────────────────────────────────────────────────────────────────────────────────────────┘
```
## date\_sub {#date_sub}
Вычитает интервал времени или даты из указанной даты или даты со временем.
**Синтаксис**
``` sql
date_sub(unit, value, date)
```
Синонимы: `dateSub`, `DATE_SUB`.
**Аргументы**
- `unit` — единица измерения времени, в которой задан интервал для вычитания. [String](../../sql-reference/data-types/string.md).
Возможные значения:
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — значение интервала для вычитания. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — дата или дата со временем, из которой вычитается `value`. [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md).
**Возвращаемое значение**
Возвращает дату или дату со временем, полученную в результате вычитания `value`, выраженного в `unit`, из `date`.
Тип: [Date](../../sql-reference/data-types/date.md) или [DateTime](../../sql-reference/data-types/datetime.md).
**Пример**
Запрос:
``` sql
SELECT date_sub(YEAR, 3, toDate('2018-01-01'));
```
Результат:
``` text
┌─minus(toDate('2018-01-01'), toIntervalYear(3))─┐
│ 2015-01-01 │
└────────────────────────────────────────────────┘
```
## timeSlot {#timeslot}
Округляет время до получаса.

View File

@ -121,12 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
Функция, обратная к [IPv6NumToString](#ipv6numtostringx). Если IPv6 адрес передан в неправильном формате, то возвращает строку из нулевых байт.
Если IP адрес является корректным IPv4 адресом, функция возвращает его IPv6 эквивалент.
Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт.
HEX может быть в любом регистре.
Alias: `INET6_ATON`.
Синоним: `INET6_ATON`.
**Синтаксис**
``` sql
IPv6StringToNum(string)
```
**Аргумент**
- `string` — IP адрес. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- Адрес IPv6 в двоичном представлении.
Тип: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Пример**
Запрос:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Результат:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**Смотрите также**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
@ -221,38 +262,62 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
Псевдоним функции `IPv6StringToNum()` которая принимает строку с адресом IPv6 и возвращает значение типа [IPv6](../../sql-reference/functions/ip-address-functions.md), которое равно значению, возвращаемому функцией `IPv6StringToNum()`.
Приводит строку с адресом в формате IPv6 к типу [IPv6](../../sql-reference/data-types/domains/ipv6.md). Возвращает пустое значение, если входящая строка не является корректным IP адресом.
Похоже на функцию [IPv6StringToNum](#ipv6stringtonums), которая представляет адрес IPv6 в двоичном виде.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
Если входящая строка содержит корректный IPv4 адрес, функция возвращает его IPv6 эквивалент.
**Синтаксис**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Аргумент**
- `string` — IP адрес. [String](../../sql-reference/data-types/string.md)
**Возвращаемое значение**
- IP адрес.
Тип: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Примеры**
Запрос:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Результат:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Запрос:
``` sql
SELECT toIPv6('127.0.0.1');
```
Результат:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │
└─────────────────────┘
```
## isIPv4String {#isipv4string}
Определяет, является ли строка адресом IPv4 или нет. Также вернет `0`, если `string` — адрес IPv6.

View File

@ -102,7 +102,7 @@ USING (equi_column1, ... equi_columnN, asof_column)
- При использовании обычного `JOIN` , запрос отправляется на удалённые серверы. На каждом из них выполняются подзапросы для формирования «правой» таблицы, и с этой таблицей выполняется соединение. То есть, «правая» таблица формируется на каждом сервере отдельно.
- При использовании `GLOBAL ... JOIN`, сначала сервер-инициатор запроса запускает подзапрос для вычисления правой таблицы. Эта временная таблица передаётся на каждый удалённый сервер, и на них выполняются запросы с использованием переданных временных данных.
Будьте аккуратны при использовании `GLOBAL`. За дополнительной информацией обращайтесь в раздел [Распределенные подзапросы](#select-distributed-subqueries).
Будьте аккуратны при использовании `GLOBAL`. За дополнительной информацией обращайтесь в раздел [Распределенные подзапросы](../../../sql-reference/operators/in.md#select-distributed-subqueries).
## Рекомендации по использованию {#usage-recommendations}

View File

@ -47,7 +47,7 @@
如果配置文件中没有设置 ZooKeeper ,则无法创建复制表,并且任何现有的复制表都将变为只读。
`SELECT` 查询并不需要借助 ZooKeeper 本并不影响 `SELECT` 的性能查询复制表与非复制表速度是一样的。查询分布式表时ClickHouse的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) 修改。
`SELECT` 查询并不需要借助 ZooKeeper ，副本并不影响 `SELECT` 的性能，查询复制表与非复制表速度是一样的。查询分布式表时，ClickHouse的处理方式可通过设置 [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) 和 [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) 修改。
对于每个 `INSERT` 语句，会通过几个事务将十来个记录添加到 ZooKeeper。（确切地说，这是针对每个插入的数据块; 每个 INSERT 语句的每 `max_insert_block_size = 1048576` 行和最后剩余的都各算作一个块。）相比非复制表，写 zk 会导致 `INSERT` 的延迟略长一些。但只要你按照建议每秒不超过一个 `INSERT` 地批量插入数据，不会有任何问题。一个 ZooKeeper 集群能给整个 ClickHouse 集群支撑协调每秒几百个 `INSERT`。数据插入的吞吐量（每秒的行数）可以跟不用复制的数据一样高。

View File

@ -45,7 +45,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which
用于在单个服务器上运行用户查询的最大RAM量。
默认值定义在 [设置。h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L244). 默认情况下,额不受限制 (`max_memory_usage_for_user = 0`).
默认值定义在 [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L244). 默认情况下,额不受限制 (`max_memory_usage_for_user = 0`).
另请参阅说明 [max_memory_usage](#settings_max_memory_usage).
@ -53,7 +53,7 @@ Restrictions on the «maximum amount of something» can take the value 0, which
用于在单个服务器上运行所有查询的最大RAM数量。
默认值定义在 [设置。h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L245). 默认情况下,额不受限制 (`max_memory_usage_for_all_queries = 0`).
默认值定义在 [Settings.h](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Settings.h#L245). 默认情况下,额不受限制 (`max_memory_usage_for_all_queries = 0`).
另请参阅说明 [max_memory_usage](#settings_max_memory_usage).

View File

@ -7,10 +7,12 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromOStream.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/formatAST.h>
#include <Parsers/obfuscateQueries.h>
#include <Parsers/parseQuery.h>
#include <Common/ErrorCodes.h>
#include <Common/TerminalSize.h>
#include <Interpreters/Context.h>
@ -28,6 +30,14 @@
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
namespace DB
{
namespace ErrorCodes
{
extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA;
}
}
int mainEntryClickHouseFormat(int argc, char ** argv)
{
using namespace DB;
@ -128,6 +138,14 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
do
{
ASTPtr res = parseQueryAndMovePosition(parser, pos, end, "query", multiple, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
/// An INSERT query with inline data (INSERT INTO ... VALUES ...) cannot be formatted without losing the data,
/// so throw early with a readable exception message.
if (const auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
{
throw Exception(
"Can't format ASTInsertQuery with data, since data will be lost",
DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA);
}
if (!quiet)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
@ -137,6 +155,26 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
std::cout << "\n;\n";
std::cout << std::endl;
}
do
{
/// skip spaces to avoid throw exception after last query
while (pos != end && std::isspace(*pos))
++pos;
/// for skip comment after the last query and to not throw exception
if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-')
{
pos += 2;
/// skip until the end of the line
while (pos != end && *pos != '\n')
++pos;
}
/// need to parse next sql
else
break;
} while (pos != end);
} while (multiple && pos != end);
}
}

View File

@ -716,7 +716,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
config().getString("path", ""),
std::move(main_config_zk_node_cache),
main_config_zk_changed_event,
[&](ConfigurationPtr config)
[&](ConfigurationPtr config, bool initial_loading)
{
Settings::checkNoSettingNamesAtTopLevel(*config, config_path);
@ -766,14 +766,19 @@ int Server::main(const std::vector<std::string> & /*args*/)
if (config->has("max_partition_size_to_drop"))
global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop"));
if (config->has("zookeeper"))
global_context->reloadZooKeeperIfChanged(config);
if (!initial_loading)
{
/// We do not load ZooKeeper configuration on the first config loading
/// because TestKeeper server is not started yet.
if (config->has("zookeeper"))
global_context->reloadZooKeeperIfChanged(config);
global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config);
global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config);
}
global_context->updateStorageConfiguration(*config);
},
/* already_loaded = */ true);
/* already_loaded = */ false); /// Reload it right now (initial loading)
auto & access_control = global_context->getAccessControlManager();
if (config().has("custom_settings_prefixes"))

View File

@ -518,7 +518,7 @@ void UsersConfigAccessStorage::load(
preprocessed_dir,
zkutil::ZooKeeperNodeCache(get_zookeeper_function),
std::make_shared<Poco::Event>(),
[&](Poco::AutoPtr<Poco::Util::AbstractConfiguration> new_config)
[&](Poco::AutoPtr<Poco::Util::AbstractConfiguration> new_config, bool /*initial_loading*/)
{
parseFromConfig(*new_config);
Settings::checkNoSettingNamesAtTopLevel(*new_config, users_config_path);

View File

@ -24,6 +24,7 @@ namespace ErrorCodes
extern const int PARAMETER_OUT_OF_BOUND;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
}
@ -553,6 +554,11 @@ const char * ColumnAggregateFunction::deserializeAndInsertFromArena(const char *
return read_buffer.position();
}
/// Skipping a serialized value is not supported for this column type: always throws NOT_IMPLEMENTED.
const char * ColumnAggregateFunction::skipSerializedInArena(const char *) const
{
    const auto message = "Method skipSerializedInArena is not supported for " + getName();
    throw Exception(message, ErrorCodes::NOT_IMPLEMENTED);
}
void ColumnAggregateFunction::popBack(size_t n)
{
size_t size = data.size();

View File

@ -155,6 +155,8 @@ public:
const char * deserializeAndInsertFromArena(const char * src_arena) override;
const char * skipSerializedInArena(const char *) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;

View File

@ -239,6 +239,16 @@ const char * ColumnArray::deserializeAndInsertFromArena(const char * pos)
return pos;
}
/// Steps over one serialized array without inserting it.
/// Reads the element count (a size_t prefix) and then lets the nested column
/// skip each element in turn. Returns the position right after the last element.
const char * ColumnArray::skipSerializedInArena(const char * pos) const
{
    const size_t elements_count = unalignedLoad<size_t>(pos);
    const char * cursor = pos + sizeof(elements_count);

    const auto & nested_column = getData();
    for (size_t remaining = elements_count; remaining != 0; --remaining)
        cursor = nested_column.skipSerializedInArena(cursor);

    return cursor;
}
void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
{

View File

@ -61,6 +61,7 @@ public:
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -85,6 +85,7 @@ public:
void popBack(size_t) override { throwMustBeDecompressed(); }
StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeDecompressed(); }
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeDecompressed(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }

View File

@ -163,6 +163,11 @@ public:
return res;
}
/// Delegates to the wrapped `data` column and returns the position it reports.
const char * skipSerializedInArena(const char * pos) const override
{
    const char * after_value = data->skipSerializedInArena(pos);
    return after_value;
}
void updateHashWithValue(size_t, SipHash & hash) const override
{
data->updateHashWithValue(0, hash);

View File

@ -79,6 +79,12 @@ const char * ColumnDecimal<T>::deserializeAndInsertFromArena(const char * pos)
return pos + sizeof(T);
}
/// Steps over one serialized value: advances the position by sizeof(T) bytes.
template <typename T>
const char * ColumnDecimal<T>::skipSerializedInArena(const char * pos) const
{
    constexpr size_t value_size = sizeof(T);
    return pos + value_size;
}
template <typename T>
UInt64 ColumnDecimal<T>::get64([[maybe_unused]] size_t n) const
{

View File

@ -129,6 +129,7 @@ public:
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -100,6 +100,11 @@ const char * ColumnFixedString::deserializeAndInsertFromArena(const char * pos)
return pos + n;
}
/// Steps over one serialized value: advances the position by `n` bytes.
const char * ColumnFixedString::skipSerializedInArena(const char * pos) const
{
    const char * next = pos;
    next += n;
    return next;
}
void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
{
hash.update(reinterpret_cast<const char *>(&chars[n * index]), n);

View File

@ -112,6 +112,8 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t index, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;

View File

@ -98,6 +98,11 @@ public:
throw Exception("Cannot deserialize to " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/// Skipping a serialized value is not supported for this column: always throws NOT_IMPLEMENTED.
const char * skipSerializedInArena(const char*) const override
{
throw Exception("Cannot skip serialized " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void updateHashWithValue(size_t, SipHash &) const override
{
throw Exception("updateHashWithValue is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);

View File

@ -247,6 +247,11 @@ const char * ColumnLowCardinality::deserializeAndInsertFromArena(const char * po
return new_pos;
}
/// Skips one serialized value by delegating to the dictionary column.
/// NOTE(review): ColumnUnique<ColumnType>::skipSerializedInArena throws NOT_IMPLEMENTED unconditionally;
/// if getDictionary() resolves to that implementation, this call always throws - confirm this is intended.
const char * ColumnLowCardinality::skipSerializedInArena(const char * pos) const
{
return getDictionary().skipSerializedInArena(pos);
}
void ColumnLowCardinality::updateWeakHash32(WeakHash32 & hash) const
{
auto s = size();

View File

@ -94,6 +94,8 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override
{
return getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);

View File

@ -116,6 +116,11 @@ const char * ColumnMap::deserializeAndInsertFromArena(const char * pos)
return nested->deserializeAndInsertFromArena(pos);
}
/// Skips one serialized value by delegating to the `nested` column,
/// mirroring how deserializeAndInsertFromArena delegates to it.
const char * ColumnMap::skipSerializedInArena(const char * pos) const
{
return nested->skipSerializedInArena(pos);
}
void ColumnMap::updateHashWithValue(size_t n, SipHash & hash) const
{
nested->updateHashWithValue(n, hash);

View File

@ -58,6 +58,7 @@ public:
void popBack(size_t n) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -152,6 +152,17 @@ const char * ColumnNullable::deserializeAndInsertFromArena(const char * pos)
return pos;
}
const char * ColumnNullable::skipSerializedInArena(const char * pos) const
{
UInt8 val = unalignedLoad<UInt8>(pos);
pos += sizeof(val);
if (val == 0)
return getNestedColumn().skipSerializedInArena(pos);
return pos;
}
void ColumnNullable::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
const ColumnNullable & nullable_col = assert_cast<const ColumnNullable &>(src);

View File

@ -71,6 +71,7 @@ public:
void insertData(const char * pos, size_t length) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insert(const Field & x) override;
void insertFrom(const IColumn & src, size_t n) override;

View File

@ -237,6 +237,12 @@ const char * ColumnString::deserializeAndInsertFromArena(const char * pos)
return pos + string_size;
}
/// Skips one serialized value without inserting it.
/// The value is a size_t length prefix followed by that many bytes.
/// Returns the position right after those bytes.
const char * ColumnString::skipSerializedInArena(const char * pos) const
{
    const size_t payload_size = unalignedLoad<size_t>(pos);
    const char * payload_begin = pos + sizeof(payload_size);
    return payload_begin + payload_size;
}
ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const
{

View File

@ -189,6 +189,8 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override
{
size_t string_size = sizeAt(n);

View File

@ -180,6 +180,14 @@ const char * ColumnTuple::deserializeAndInsertFromArena(const char * pos)
return pos;
}
const char * ColumnTuple::skipSerializedInArena(const char * pos) const
{
for (const auto & column : columns)
pos = column->skipSerializedInArena(pos);
return pos;
}
void ColumnTuple::updateHashWithValue(size_t n, SipHash & hash) const
{
for (const auto & column : columns)

View File

@ -61,6 +61,7 @@ public:
void popBack(size_t n) override;
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -26,6 +26,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
extern const int NOT_IMPLEMENTED;
}
/** Stores another column with unique values
@ -78,6 +79,7 @@ public:
bool getBool(size_t n) const override { return getNestedColumn()->getBool(n); }
bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); }
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash_func) const override
{
return getNestedColumn()->updateHashWithValue(n, hash_func);
@ -373,6 +375,12 @@ size_t ColumnUnique<ColumnType>::uniqueDeserializeAndInsertFromArena(const char
return uniqueInsertData(pos, string_size - 1);
}
/// Skipping a serialized value is not supported for ColumnUnique: always throws NOT_IMPLEMENTED.
template <typename ColumnType>
const char * ColumnUnique<ColumnType>::skipSerializedInArena(const char *) const
{
throw Exception("Method skipSerializedInArena is not supported for " + this->getName(), ErrorCodes::NOT_IMPLEMENTED);
}
template <typename ColumnType>
int ColumnUnique<ColumnType>::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
{

View File

@ -50,6 +50,12 @@ const char * ColumnVector<T>::deserializeAndInsertFromArena(const char * pos)
return pos + sizeof(T);
}
/// Skips one serialized value without inserting it.
/// Advances by sizeof(T) bytes, the same step deserializeAndInsertFromArena takes.
template <typename T>
const char * ColumnVector<T>::skipSerializedInArena(const char * pos) const
{
return pos + sizeof(T);
}
template <typename T>
void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
{

View File

@ -154,6 +154,8 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;

View File

@ -207,6 +207,10 @@ public:
/// Returns pointer to the position after the read data.
virtual const char * deserializeAndInsertFromArena(const char * pos) = 0;
/// Skip a value that was previously serialized using the IColumn::serializeValueIntoArena method,
/// without inserting it into the column.
/// Returns a pointer to the position after the skipped data.
virtual const char * skipSerializedInArena(const char *) const = 0;
/// Update state of hash function with value of n-th element.
/// On subsequent calls of this method for sequence of column values of arbitrary types,
/// passed bytes to hash must identify sequence of values unambiguously.

View File

@ -67,6 +67,11 @@ public:
return pos;
}
/// Consumes no bytes: returns `pos` unchanged.
const char * skipSerializedInArena(const char * pos) const override
{
return pos;
}
void updateHashWithValue(size_t /*n*/, SipHash & /*hash*/) const override
{
}

View File

@ -0,0 +1,157 @@
#pragma once
#include <cstdint>
#include <vector>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <common/defines.h>
#include <Common/MoveOrCopyIfThrow.h>
/** Pool of a limited number of objects that cannot be used from different threads simultaneously.
  * The main use case is to have a fixed number of objects that can be reused in different threads
  * during their lifetime and have to be initialized on demand.
  * Two main properties of the pool are the allocated objects size and the borrowed objects size.
  * Allocated objects size is the number of objects that have been created by the pool so far.
  * Borrowed objects size is the number of objects that are currently borrowed by clients.
  * If max_size == 0 then the pool has unlimited size and objects will be allocated without limit.
  *
  * The pool uses the following strategy for borrowing an object:
  * 1. If the pool has objects that can be borrowed, increase borrowed objects size and return one.
  * 2. If allocated objects size is lower than max_size, or the pool has unlimited size,
  *    allocate a new object, increase borrowed objects size and return it.
  * 3. If the pool is full, wait on a condition variable, with or without timeout, until some object
  *    is returned to the pool.
  */
template <typename T>
class BorrowedObjectPool final
{
public:
    explicit BorrowedObjectPool(size_t max_size_) : max_size(max_size_) {}

    /// Borrow object from pool. If pool is full and all objects were borrowed
    /// then calling thread will wait until some object will be returned into pool.
    /// `func` is called (under the pool lock) to create a new object when the pool may still grow.
    /// If `func` throws, the exception propagates and the pool counters are left untouched.
    template <typename FactoryFunc>
    void borrowObject(T & dest, FactoryFunc && func)
    {
        std::unique_lock<std::mutex> lock(objects_mutex);

        if (tryBorrowWithoutWaiting(lock, dest, std::forward<FactoryFunc>(func)))
            return;

        condition_variable.wait(lock, [this] { return !objects.empty(); });
        dest = borrowFromObjects(lock);
    }

    /// Same as borrowObject function, but wait with timeout.
    /// Returns true if object was borrowed during timeout.
    /// With the default timeout of 0 the call does not block waiting for a returned object.
    template <typename FactoryFunc>
    bool tryBorrowObject(T & dest, FactoryFunc && func, size_t timeout_in_milliseconds = 0)
    {
        std::unique_lock<std::mutex> lock(objects_mutex);

        if (tryBorrowWithoutWaiting(lock, dest, std::forward<FactoryFunc>(func)))
            return true;

        bool wait_result = condition_variable.wait_for(
            lock, std::chrono::milliseconds(timeout_in_milliseconds), [this] { return !objects.empty(); });

        if (wait_result)
            dest = borrowFromObjects(lock);

        return wait_result;
    }

    /// Return object into pool. Client must return same object that was borrowed.
    void returnObject(T && object_to_return)
    {
        std::unique_lock<std::mutex> lck(objects_mutex);

        objects.emplace_back(std::move(object_to_return));
        --borrowed_objects_size;

        condition_variable.notify_one();
    }

    /// Max pool size
    size_t maxSize() const
    {
        return max_size;
    }

    /// Allocated objects size by the pool. If allocatedObjectsSize == maxSize then pool is full.
    size_t allocatedObjectsSize() const
    {
        std::unique_lock<std::mutex> lock(objects_mutex);
        return allocated_objects_size;
    }

    /// Returns allocatedObjectsSize == maxSize
    /// NOTE(review): for an unlimited pool (max_size == 0) this is true only while nothing has been
    /// allocated yet, even though such a pool never blocks - confirm callers do not rely on it.
    bool isFull() const
    {
        std::unique_lock<std::mutex> lock(objects_mutex);
        return allocated_objects_size == max_size;
    }

    /// Borrowed objects size. If borrowedObjectsSize == allocatedObjectsSize and pool is full.
    /// Then client will wait during borrowObject function call.
    size_t borrowedObjectsSize() const
    {
        std::unique_lock<std::mutex> lock(objects_mutex);
        return borrowed_objects_size;
    }

private:
    /// Steps 1 and 2 of the borrowing strategy, shared by borrowObject and tryBorrowObject:
    /// take an idle object if there is one, otherwise create a new one if the pool may still grow.
    /// Returns false when the caller has to wait for an object to be returned.
    template <typename FactoryFunc>
    bool tryBorrowWithoutWaiting(const std::unique_lock<std::mutex> & lock, T & dest, FactoryFunc && func)
    {
        if (!objects.empty())
        {
            dest = borrowFromObjects(lock);
            return true;
        }

        bool has_unlimited_size = (max_size == 0);

        if (unlikely(has_unlimited_size) || allocated_objects_size < max_size)
        {
            dest = allocateObjectForBorrowing(lock, std::forward<FactoryFunc>(func));
            return true;
        }

        return false;
    }

    /// Creates a new object with `func` and accounts for it as allocated and borrowed.
    /// The lock argument only documents that the caller must hold objects_mutex.
    template <typename FactoryFunc>
    T allocateObjectForBorrowing(const std::unique_lock<std::mutex> &, FactoryFunc && func)
    {
        /// Create the object first: if the factory throws, the counters must stay unchanged,
        /// otherwise a slot of a bounded pool would be lost forever and borrowers could block
        /// waiting for an object that does not exist.
        T object = std::forward<FactoryFunc>(func)();

        ++allocated_objects_size;
        ++borrowed_objects_size;

        return object;
    }

    /// Takes the most recently returned idle object out of the pool and accounts for it as borrowed.
    /// The lock argument only documents that the caller must hold objects_mutex.
    T borrowFromObjects(const std::unique_lock<std::mutex> &)
    {
        T dst;
        detail::moveOrCopyIfThrow(std::move(objects.back()), dst);
        objects.pop_back();

        ++borrowed_objects_size;

        return dst;
    }

    size_t max_size;

    mutable std::mutex objects_mutex;
    std::condition_variable condition_variable;
    size_t allocated_objects_size = 0;
    size_t borrowed_objects_size = 0;
    std::vector<T> objects;
};

View File

@ -6,38 +6,7 @@
#include <Poco/Mutex.h>
#include <Poco/Semaphore.h>
#include <common/types.h>
/// Helpers that assign `src` to `dst` by move when T's move assignment is noexcept,
/// and by copy otherwise.
namespace detail
{
/// Primary template; the second parameter selects the specialization below.
template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
struct MoveOrCopyIfThrow;
/// Move assignment is noexcept: forward `src` as an rvalue so `dst` is move-assigned.
template <typename T>
struct MoveOrCopyIfThrow<T, true>
{
void operator()(T && src, T & dst) const
{
dst = std::forward<T>(src);
}
};
/// Move assignment may throw: `src` is used as an lvalue, so `dst` is copy-assigned.
template <typename T>
struct MoveOrCopyIfThrow<T, false>
{
void operator()(T && src, T & dst) const
{
dst = src;
}
};
/// Entry point: picks the specialization from T's noexcept-ness of move assignment.
template <typename T>
void moveOrCopyIfThrow(T && src, T & dst)
{
MoveOrCopyIfThrow<T>()(std::forward<T>(src), dst);
}
}
#include <Common/MoveOrCopyIfThrow.h>
/** A very simple thread-safe queue of limited size.
* If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty.
@ -53,8 +22,10 @@ private:
Poco::Semaphore empty_count;
public:
ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill), empty_count(max_fill, max_fill) {}
explicit ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill)
, empty_count(max_fill, max_fill)
{}
void push(const T & x)
{

View File

@ -27,7 +27,7 @@ ConfigReloader::ConfigReloader(
, updater(std::move(updater_))
{
if (!already_loaded)
reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true);
reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true, /* initial_loading = */ true);
}
@ -66,7 +66,7 @@ void ConfigReloader::run()
if (quit)
return;
reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false);
reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false, /* initial_loading = */ false);
}
catch (...)
{
@ -76,7 +76,7 @@ void ConfigReloader::run()
}
}
void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed)
void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading)
{
std::lock_guard lock(reload_mutex);
@ -131,7 +131,7 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac
try
{
updater(loaded_config.configuration);
updater(loaded_config.configuration, initial_loading);
}
catch (...)
{

View File

@ -27,7 +27,7 @@ class Context;
class ConfigReloader
{
public:
using Updater = std::function<void(ConfigurationPtr)>;
using Updater = std::function<void(ConfigurationPtr, bool)>;
/** include_from_path is usually /etc/metrika.xml (i.e. value of <include_from> tag)
*/
@ -46,12 +46,12 @@ public:
void start();
/// Reload immediately. For SYSTEM RELOAD CONFIG query.
void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false); }
void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false, /* initial_loading = */ false); }
private:
void run();
void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed);
void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading);
struct FileWithTimestamp;

View File

@ -541,12 +541,14 @@
M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \
M(573, EPOLL_ERROR) \
M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \
M(575, UNKNOWN_SNAPSHOT) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
M(1001, STD_EXCEPTION) \
M(1002, UNKNOWN_EXCEPTION) \
M(1003, INVALID_SHARD_ID) \
M(1004, INVALID_FORMAT_INSERT_QUERY_WITH_DATA)
/* See END */

View File

@ -1,8 +1,9 @@
#pragma once
#include <common/types.h>
#include <Common/UInt128.h>
#include <common/unaligned.h>
#include <common/StringRef.h>
#include <Common/UInt128.h>
#include <type_traits>
@ -178,13 +179,19 @@ inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) <= sizeof(UInt64)), T> k
}
template <typename T>
inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> key)
static constexpr bool UseDefaultHashForBigInts =
std::is_same_v<T, DB::Int128> ||
std::is_same_v<T, DB::UInt128> ||
(is_big_int_v<T> && sizeof(T) == 32);
template <typename T>
inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64) && UseDefaultHashForBigInts<T>), T> key)
{
if constexpr (std::is_same_v<T, DB::Int128>)
{
return intHash64(static_cast<UInt64>(key) ^ static_cast<UInt64>(key >> 64));
}
if constexpr (std::is_same_v<T, DB::UInt128>)
else if constexpr (std::is_same_v<T, DB::UInt128>)
{
return intHash64(key.low ^ key.high);
}
@ -195,6 +202,8 @@ inline size_t DefaultHash64(std::enable_if_t<(sizeof(T) > sizeof(UInt64)), T> ke
static_cast<UInt64>(key >> 128) ^
static_cast<UInt64>(key >> 256));
}
assert(false);
__builtin_unreachable();
}
@ -341,6 +350,11 @@ struct IntHash32
}
else if constexpr (sizeof(T) <= sizeof(UInt64))
return intHash32<salt>(key);
assert(false);
__builtin_unreachable();
}
};
template <>
struct DefaultHash<StringRef> : public StringRefHash {};

View File

@ -77,7 +77,7 @@ struct LRUHashMapCellNodeTraits
static void set_previous(node * __restrict ptr, node * __restrict prev) { ptr->prev = prev; }
};
template <typename TKey, typename TValue, typename Hash, bool save_hash_in_cells>
template <typename TKey, typename TValue, typename Disposer, typename Hash, bool save_hash_in_cells>
class LRUHashMapImpl :
private HashMapTable<
TKey,
@ -108,24 +108,33 @@ public:
boost::intrusive::value_traits<LRUHashMapCellIntrusiveValueTraits>,
boost::intrusive::constant_time_size<false>>;
using LookupResult = typename Base::LookupResult;
using ConstLookupResult = typename Base::ConstLookupResult;
using iterator = typename LRUList::iterator;
using const_iterator = typename LRUList::const_iterator;
using reverse_iterator = typename LRUList::reverse_iterator;
using const_reverse_iterator = typename LRUList::const_reverse_iterator;
LRUHashMapImpl(size_t max_size_, bool preallocate_max_size_in_hash_map = false)
explicit LRUHashMapImpl(size_t max_size_, bool preallocate_max_size_in_hash_map = false, Disposer disposer_ = Disposer())
: Base(preallocate_max_size_in_hash_map ? max_size_ : 32)
, max_size(max_size_)
, disposer(std::move(disposer_))
{
assert(max_size > 0);
}
std::pair<Cell *, bool> insert(const Key & key, const Value & value)
~LRUHashMapImpl()
{
clear();
}
std::pair<Cell *, bool> ALWAYS_INLINE insert(const Key & key, const Value & value)
{
return emplace(key, value);
}
std::pair<Cell *, bool> insert(const Key & key, Value && value)
std::pair<Cell *, bool> ALWAYS_INLINE insert(const Key & key, Value && value)
{
return emplace(key, std::move(value));
}
@ -147,15 +156,16 @@ public:
if (size() == max_size)
{
/// Erase least recently used element from front of the list
Cell & node = lru_list.front();
Cell copy_node = lru_list.front();
const Key & element_to_remove_key = node.getKey();
size_t key_hash = node.getHash(*this);
const Key & element_to_remove_key = copy_node.getKey();
lru_list.pop_front();
[[maybe_unused]] bool erased = Base::erase(element_to_remove_key, key_hash);
[[maybe_unused]] bool erased = Base::erase(element_to_remove_key);
assert(erased);
disposer(element_to_remove_key, copy_node.getMapped());
}
[[maybe_unused]] bool inserted;
@ -174,46 +184,64 @@ public:
return std::make_pair(it, true);
}
using Base::find;
Value & get(const Key & key)
LookupResult ALWAYS_INLINE find(const Key & key)
{
auto it = Base::find(key);
assert(it);
Value & value = it->getMapped();
if (!it)
return nullptr;
/// Put cell to the end of lru list
lru_list.splice(lru_list.end(), lru_list, lru_list.iterator_to(*it));
return value;
return it;
}
const Value & get(const Key & key) const
ConstLookupResult ALWAYS_INLINE find(const Key & key) const
{
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(key);
}
Value & ALWAYS_INLINE get(const Key & key)
{
auto it = find(key);
assert(it);
return it->getMapped();
}
const Value & ALWAYS_INLINE get(const Key & key) const
{
return const_cast<std::decay_t<decltype(*this)> *>(this)->get(key);
}
bool contains(const Key & key) const
bool ALWAYS_INLINE contains(const Key & key) const
{
return Base::has(key);
return find(key) != nullptr;
}
bool erase(const Key & key)
bool ALWAYS_INLINE erase(const Key & key)
{
auto hash = Base::hash(key);
auto it = Base::find(key, hash);
auto key_hash = Base::hash(key);
auto it = Base::find(key, key_hash);
if (!it)
return false;
lru_list.erase(lru_list.iterator_to(*it));
return Base::erase(key, hash);
Cell copy_node = *it;
Base::erase(key, key_hash);
disposer(copy_node.getKey(), copy_node.getMapped());
return true;
}
void clear()
void ALWAYS_INLINE clear()
{
for (auto & cell : lru_list)
disposer(cell.getKey(), cell.getMapped());
lru_list.clear();
Base::clear();
}
@ -222,6 +250,10 @@ public:
size_t getMaxSize() const { return max_size; }
size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); }
using Base::hash;
iterator begin() { return lru_list.begin(); }
const_iterator begin() const { return lru_list.cbegin(); }
iterator end() { return lru_list.end(); }
@ -235,10 +267,17 @@ public:
private:
size_t max_size;
LRUList lru_list;
Disposer disposer;
};
template <typename Key, typename Value, typename Hash = DefaultHash<Key>>
using LRUHashMap = LRUHashMapImpl<Key, Value, Hash, false>;
template <typename Key, typename Mapped>
struct DefaultCellDisposer
{
void operator()(const Key &, const Mapped &) const {}
};
template <typename Key, typename Value, typename Hash = DefaultHash<Key>>
using LRUHashMapWithSavedHash = LRUHashMapImpl<Key, Value, Hash, true>;
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMap = LRUHashMapImpl<Key, Value, Disposer, Hash, false>;
template <typename Key, typename Value, typename Disposer = DefaultCellDisposer<Key, Value>, typename Hash = DefaultHash<Key>>
using LRUHashMapWithSavedHash = LRUHashMapImpl<Key, Value, Disposer, Hash, true>;

View File

@ -0,0 +1,33 @@
#pragma once
#include <common/types.h>
/// Helpers that assign a source object to a destination by move when T's move assignment
/// is noexcept, and by copy otherwise, so a throwing move cannot leave the source half-moved.
namespace detail
{
    /// General case (move assignment may throw): `source` is used as an lvalue,
    /// so the destination is copy-assigned and the source is left intact.
    template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
    struct MoveOrCopyIfThrow
    {
        void operator()(T && source, T & target) const
        {
            target = source;
        }
    };

    /// Move assignment is noexcept: hand `source` over as an rvalue.
    template <typename T>
    struct MoveOrCopyIfThrow<T, true>
    {
        void operator()(T && source, T & target) const
        {
            target = std::forward<T>(source);
        }
    };

    /// Entry point: dispatches on whether T is nothrow move assignable.
    template <typename T>
    void moveOrCopyIfThrow(T && src, T & dst)
    {
        const MoveOrCopyIfThrow<T> assign{};
        assign(std::forward<T>(src), dst);
    }
}

View File

@ -2,15 +2,18 @@
#include <sys/wait.h>
#include <fcntl.h>
#include <dlfcn.h>
#include <unistd.h>
#include <time.h>
#include <csignal>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <Common/Exception.h>
#include <Common/ShellCommand.h>
#include <Common/PipeFDs.h>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <unistd.h>
#include <csignal>
namespace
{
@ -36,9 +39,9 @@ namespace ErrorCodes
extern const int CANNOT_CREATE_CHILD_PROCESS;
}
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_)
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_)
: pid(pid_)
, terminate_in_destructor(terminate_in_destructor_)
, destructor_strategy(destructor_strategy_)
, in(in_fd_)
, out(out_fd_)
, err(err_fd_)
@ -52,14 +55,24 @@ Poco::Logger * ShellCommand::getLogger()
ShellCommand::~ShellCommand()
{
if (terminate_in_destructor)
if (wait_called)
return;
if (destructor_strategy.terminate_in_destructor)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
size_t try_wait_timeout = destructor_strategy.wait_for_normal_exit_before_termination_seconds;
bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout);
if (!process_terminated_normally)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
}
}
else if (!wait_called)
else
{
try
{
@ -72,6 +85,53 @@ ShellCommand::~ShellCommand()
}
}
/// Closes the pipes to the child process and polls it with non-blocking waitpid,
/// sleeping one second between polls, for at most `timeout_in_seconds` seconds.
/// With a zero timeout the process is polled exactly once, so the caller can avoid
/// sending a signal to a process that has already terminated.
/// Marks the command as waited (wait_called) regardless of the outcome.
/// Returns true if waitpid reported the process as terminated, false if the timeout
/// elapsed first or waitpid failed with an error other than EINTR.
bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds)
{
    int status = 0;

    LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds);

    wait_called = true;
    struct timespec interval {.tv_sec = 1, .tv_nsec = 0};

    in.close();
    out.close();
    err.close();

    /// The process state is checked once up front and once more after every sleep,
    /// including the last one, so a process that exits during the final second
    /// of the timeout is still reported as terminated.
    while (true)
    {
        int waitpid_res = waitpid(pid, &status, WNOHANG);

        if (waitpid_res == pid)
            return true;

        if (waitpid_res == -1)
        {
            /// Interrupted by a signal: retry immediately without spending the timeout.
            if (errno == EINTR)
                continue;

            return false;
        }

        /// The process is still running.
        if (timeout_in_seconds == 0)
            return false;

        --timeout_in_seconds;
        nanosleep(&interval, nullptr);
    }
}
void ShellCommand::logCommand(const char * filename, char * const argv[])
{
WriteBufferFromOwnString args;
@ -87,7 +147,10 @@ void ShellCommand::logCommand(const char * filename, char * const argv[])
}
std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor)
const char * filename,
char * const argv[],
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
logCommand(filename, argv);
@ -144,7 +207,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
}
std::unique_ptr<ShellCommand> res(new ShellCommand(
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor));
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor_strategy));
LOG_TRACE(getLogger(), "Started shell command '{}' with pid {}", filename, pid);
return res;
@ -152,7 +215,9 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
std::unique_ptr<ShellCommand> ShellCommand::execute(
const std::string & command, bool pipe_stdin_only, bool terminate_in_destructor)
const std::string & command,
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
/// Arguments in non-constant chunks of memory (as required for `execv`).
/// Moreover, their copying must be done before calling `vfork`, so after `vfork` do a minimum of things.
@ -162,12 +227,14 @@ std::unique_ptr<ShellCommand> ShellCommand::execute(
char * const argv[] = { argv0.data(), argv1.data(), argv2.data(), nullptr };
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor);
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor_strategy);
}
std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor)
const std::string & path,
const std::vector<std::string> & arguments,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
size_t argv_sum_size = path.size() + 1;
for (const auto & arg : arguments)
@ -188,7 +255,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
argv[arguments.size() + 1] = nullptr;
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor);
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor_strategy);
}
@ -203,8 +270,11 @@ int ShellCommand::tryWait()
LOG_TRACE(getLogger(), "Will wait for shell command pid {}", pid);
int status = 0;
if (-1 == waitpid(pid, &status, 0))
throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID);
while (waitpid(pid, &status, 0) < 0)
{
if (errno != EINTR)
throwFromErrno("Cannot waitpid", ErrorCodes::CANNOT_WAITPID);
}
LOG_TRACE(getLogger(), "Wait for shell command pid {} completed with status {}", pid, status);

View File

@ -23,21 +23,38 @@ namespace DB
* The second difference - allows to work simultaneously with stdin, and with stdout, and with stderr of running process,
* and also to obtain the return code and completion status.
*/
class ShellCommand
/// Describes what the ShellCommand destructor does with a process that has not been waited for.
struct ShellCommandDestructorStrategy final
{
    /// If true, the destructor may send SIGTERM to the process.
    bool terminate_in_destructor;

    /// Used when terminate_in_destructor is true: how many seconds the destructor gives the process
    /// to exit on its own before SIGTERM is sent.
    size_t wait_for_normal_exit_before_termination_seconds = 0;

    explicit ShellCommandDestructorStrategy(bool terminate, size_t wait_seconds = 0)
        : terminate_in_destructor(terminate)
        , wait_for_normal_exit_before_termination_seconds(wait_seconds)
    {
    }
};
class ShellCommand final
{
private:
pid_t pid;
bool wait_called = false;
bool terminate_in_destructor;
ShellCommandDestructorStrategy destructor_strategy;
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_);
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_);
bool tryWaitProcessWithTimeout(size_t timeout_in_seconds);
static Poco::Logger * getLogger();
/// Print command name and the list of arguments to log. NOTE: No escaping of arguments is performed.
static void logCommand(const char * filename, char * const argv[]);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, ShellCommandDestructorStrategy terminate_in_destructor_strategy);
public:
WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there.
@ -48,11 +65,11 @@ public:
/// Run the command using /bin/sh -c.
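/// If terminate_in_destructor_strategy.terminate_in_destructor is true, the destructor gives the process up to wait_for_normal_exit_before_termination_seconds to exit on its own and then sends SIGTERM.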
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Run the executable with the specified arguments. `arguments` - without argv[0].
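/// If terminate_in_destructor_strategy.terminate_in_destructor is true, the destructor gives the process up to wait_for_normal_exit_before_termination_seconds to exit on its own and then sends SIGTERM.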
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself.
void wait();

View File

@ -345,7 +345,7 @@ struct ODBCBridgeMixin
LOG_TRACE(log, "Starting {}", serviceAlias());
return ShellCommand::executeDirect(path.toString(), cmd_args, true);
return ShellCommand::executeDirect(path.toString(), cmd_args, ShellCommandDestructorStrategy(true));
}
};
}

View File

@ -3,7 +3,6 @@
namespace Coordination
{
void write(size_t x, WriteBuffer & out)
{
x = __builtin_bswap64(x);

View File

@ -98,6 +98,8 @@ public:
if (sync)
plain_buf.sync();
else
plain_buf.next();
return result;
}
@ -143,6 +145,7 @@ private:
struct ChangelogReadResult
{
size_t entries_read;
size_t first_read_index;
off_t last_position;
bool error;
};
@ -207,9 +210,13 @@ public:
result.entries_read += 1;
if (record.header.index < start_log_index)
{
continue;
}
auto log_entry = nuraft::cs_new<nuraft::log_entry>(record.header.term, record.blob, record.header.value_type);
if (result.first_read_index == 0)
result.first_read_index = record.header.index;
logs.emplace(record.header.index, log_entry);
index_to_offset[record.header.index] = result.last_position;
@ -256,30 +263,46 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval
}
}
void Changelog::readChangelogAndInitWriter(size_t from_log_index)
void Changelog::readChangelogAndInitWriter(size_t last_commited_log_index, size_t logs_to_keep)
{
start_index = from_log_index == 0 ? 1 : from_log_index;
size_t total_read = 0;
size_t entries_in_last = 0;
size_t incomplete_log_index = 0;
ChangelogReadResult result{};
size_t first_read_index = 0;
size_t start_to_read_from = last_commited_log_index;
if (start_to_read_from > logs_to_keep)
start_to_read_from -= logs_to_keep;
else
start_to_read_from = 1;
bool started = false;
for (const auto & [changelog_start_index, changelog_description] : existing_changelogs)
{
entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1;
if (changelog_description.to_log_index >= from_log_index)
if (changelog_description.to_log_index >= start_to_read_from)
{
if (!started)
{
if (changelog_description.from_log_index > start_index)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Cannot read changelog from index {}, smallest available index {}", start_index, changelog_description.from_log_index);
started = true;
if (changelog_description.from_log_index > last_commited_log_index && (changelog_description.from_log_index - last_commited_log_index) > 1)
{
LOG_ERROR(log, "Some records was lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index);
incomplete_log_index = changelog_start_index;
break;
}
else if (changelog_description.from_log_index > start_to_read_from)
LOG_WARNING(log, "Don't have required amount of reserved log records. Need to read from {}, smalled available log index on disk {}.", start_to_read_from, changelog_description.from_log_index);
}
started = true;
ChangelogReader reader(changelog_description.path);
result = reader.readChangelog(logs, from_log_index, index_to_start_pos, log);
result = reader.readChangelog(logs, start_to_read_from, index_to_start_pos, log);
if (first_read_index == 0)
first_read_index = result.first_read_index;
total_read += result.entries_read;
/// May happen after truncate, crash or simply unfinished log
@ -291,13 +314,19 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index)
}
}
if (!started && start_index != 1)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Required to read data from {}, but we don't have any active changelogs", from_log_index);
if (first_read_index != 0)
start_index = first_read_index;
else
start_index = last_commited_log_index;
if (incomplete_log_index != 0)
{
auto start_remove_from = existing_changelogs.begin();
if (started)
start_remove_from = existing_changelogs.upper_bound(incomplete_log_index);
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();)
for (auto itr = start_remove_from; itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);

View File

@ -67,7 +67,7 @@ public:
/// Read changelog from files in changelogs_dir_, skipping all entries before (last_commited_log_index - logs_to_keep).
/// Truncate broken entries, remove files after broken entries.
void readChangelogAndInitWriter(size_t from_log_index);
void readChangelogAndInitWriter(size_t last_commited_log_index, size_t logs_to_keep);
/// Add entry to log with index. Call fsync if force_sync true.
void appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync);

View File

@ -22,14 +22,15 @@ struct Settings;
M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \
M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too often leader elections)", 0) \
M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Lower bound of election timer (avoid too often leader elections)", 0) \
M(UInt64, reserved_log_items, 50000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \
M(UInt64, max_stored_snapshots, 3, "How many snapshots we want to store", 0) \
M(UInt64, reserved_log_items, 10000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 10000, "How many log items we have to collect to write new snapshot", 0) \
M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \
M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \
M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \
M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \
M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \
M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -16,10 +16,10 @@ size_t NuKeeperLogStore::start_index() const
return changelog.getStartIndex();
}
void NuKeeperLogStore::init(size_t from_log_idx)
void NuKeeperLogStore::init(size_t last_commited_log_index, size_t logs_to_keep)
{
std::lock_guard lock(changelog_lock);
changelog.readChangelogAndInitWriter(from_log_idx);
changelog.readChangelogAndInitWriter(last_commited_log_index, logs_to_keep);
}
size_t NuKeeperLogStore::next_slot() const

View File

@ -14,7 +14,7 @@ class NuKeeperLogStore : public nuraft::log_store
public:
NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_);
void init(size_t from_log_idx);
void init(size_t last_commited_log_index, size_t logs_to_keep);
size_t start_index() const override;

View File

@ -22,10 +22,11 @@ NuKeeperServer::NuKeeperServer(
int server_id_,
const CoordinationSettingsPtr & coordination_settings_,
const Poco::Util::AbstractConfiguration & config,
ResponsesQueue & responses_queue_)
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_)
: server_id(server_id_)
, coordination_settings(coordination_settings_)
, state_machine(nuraft::cs_new<NuKeeperStateMachine>(responses_queue_, coordination_settings))
, state_machine(nuraft::cs_new<NuKeeperStateMachine>(responses_queue_, snapshots_queue_, config.getString("test_keeper_server.snapshot_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/snapshots"), coordination_settings))
, state_manager(nuraft::cs_new<NuKeeperStateManager>(server_id, "test_keeper_server", config, coordination_settings))
, responses_queue(responses_queue_)
{
@ -34,7 +35,10 @@ NuKeeperServer::NuKeeperServer(
void NuKeeperServer::startup()
{
state_manager->loadLogStore(state_machine->last_commit_index());
state_machine->init();
state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items);
bool single_server = state_manager->getTotalServers() == 1;
nuraft::raft_params params;
@ -54,6 +58,7 @@ void NuKeeperServer::startup()
params.reserved_log_items_ = coordination_settings->reserved_log_items;
params.snapshot_distance_ = coordination_settings->snapshot_distance;
params.stale_log_gap_ = coordination_settings->stale_log_gap;
params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
params.auto_forwarding_ = coordination_settings->auto_forwarding;
params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@ -174,8 +179,11 @@ bool NuKeeperServer::isLeaderAlive() const
nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */)
{
/// Only initial record
bool empty_store = state_manager->getLogStore()->size() == 1;
size_t last_commited = state_machine->last_commit_index();
size_t next_index = state_manager->getLogStore()->next_slot();
bool commited_store = false;
if (next_index < last_commited || next_index - last_commited <= 1)
commited_store = true;
auto set_initialized = [this] ()
{
@ -188,7 +196,7 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
{
case nuraft::cb_func::BecomeLeader:
{
if (empty_store) /// We become leader and store is empty, ready to serve requests
if (commited_store) /// We become leader and store is empty, ready to serve requests
set_initialized();
return nuraft::cb_func::ReturnCode::Ok;
}

View File

@ -41,7 +41,8 @@ public:
int server_id_,
const CoordinationSettingsPtr & coordination_settings_,
const Poco::Util::AbstractConfiguration & config,
ResponsesQueue & responses_queue_);
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_);
void startup();

View File

@ -0,0 +1,363 @@
#include <Coordination/NuKeeperSnapshotManager.h>
#include <IO/WriteHelpers.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/copyData.h>
#include <filesystem>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int UNKNOWN_SNAPSHOT;
extern const int LOGICAL_ERROR;
}
namespace
{
size_t getSnapshotPathUpToLogIdx(const String & snapshot_path)
{
std::filesystem::path path(snapshot_path);
std::string filename = path.stem();
Strings name_parts;
splitInto<'_'>(name_parts, filename);
return parse<size_t>(name_parts[1]);
}
/// Builds the on-disk file name of the snapshot that covers the log up to `up_to_log_idx`.
std::string getSnapshotFileName(size_t up_to_log_idx)
{
    std::string file_name = "snapshot_";
    file_name += std::to_string(up_to_log_idx);
    file_name += ".bin";
    return file_name;
}
std::string getBaseName(const String & path)
{
size_t basename_start = path.rfind('/');
return std::string{&path[basename_start + 1], path.length() - basename_start - 1};
}
/// Returns everything before the last '/' of `path`.
/// A path whose only '/' is the leading one (e.g. "/node") has the root "/" as its parent.
String parentPath(const String & path)
{
    const auto last_slash = path.rfind('/');
    if (last_slash == 0)
        return "/";
    return path.substr(0, last_slash);
}
/// Writes one storage node to `out` in binary form:
/// data, ACL list (count followed by entries), sequential flag, stat fields, sequence number.
/// readNode reads the fields back in exactly this order.
void writeNode(const NuKeeperStorage::Node & node, WriteBuffer & out)
{
    writeBinary(node.data, out);

    /// ACL list, prefixed with its length
    const auto acl_count = node.acls.size();
    writeBinary(acl_count, out);
    for (const auto & entry : node.acls)
    {
        writeBinary(entry.permissions, out);
        writeBinary(entry.scheme, out);
        writeBinary(entry.id, out);
    }

    writeBinary(node.is_sequental, out);

    /// Stat fields
    const auto & stat = node.stat;
    writeBinary(stat.czxid, out);
    writeBinary(stat.mzxid, out);
    writeBinary(stat.ctime, out);
    writeBinary(stat.mtime, out);
    writeBinary(stat.version, out);
    writeBinary(stat.cversion, out);
    writeBinary(stat.aversion, out);
    writeBinary(stat.ephemeralOwner, out);
    writeBinary(stat.dataLength, out);
    writeBinary(stat.numChildren, out);
    writeBinary(stat.pzxid, out);

    writeBinary(node.seq_num, out);
}
/// Reads one storage node from `in`, in the field order produced by writeNode:
/// data, ACL list (count followed by entries), sequential flag, stat fields, sequence number.
/// ACL entries are appended to whatever `node.acls` already contains.
void readNode(NuKeeperStorage::Node & node, ReadBuffer & in)
{
    readBinary(node.data, in);

    /// ACL list, prefixed with its length
    size_t acl_count;
    readBinary(acl_count, in);
    for (size_t remaining = acl_count; remaining > 0; --remaining)
    {
        Coordination::ACL entry;
        readBinary(entry.permissions, in);
        readBinary(entry.scheme, in);
        readBinary(entry.id, in);
        node.acls.push_back(entry);
    }

    readBinary(node.is_sequental, in);

    /// Stat fields
    auto & stat = node.stat;
    readBinary(stat.czxid, in);
    readBinary(stat.mzxid, in);
    readBinary(stat.ctime, in);
    readBinary(stat.mtime, in);
    readBinary(stat.version, in);
    readBinary(stat.cversion, in);
    readBinary(stat.aversion, in);
    readBinary(stat.ephemeralOwner, in);
    readBinary(stat.dataLength, in);
    readBinary(stat.numChildren, in);
    readBinary(stat.pzxid, in);

    readBinary(node.seq_num, in);
}
/// Writes the raft snapshot metadata to `out` as a var-uint length followed by the serialized bytes.
void serializeSnapshotMetadata(const SnapshotMetadataPtr & snapshot_meta, WriteBuffer & out)
{
    auto meta_buffer = snapshot_meta->serialize();
    const auto serialized_size = meta_buffer->size();
    const char * serialized_bytes = reinterpret_cast<const char *>(meta_buffer->data_begin());

    writeVarUInt(serialized_size, out);
    out.write(serialized_bytes, serialized_size);
}
/// Reads raft snapshot metadata written by serializeSnapshotMetadata:
/// a var-uint length followed by that many serialized bytes.
SnapshotMetadataPtr deserializeSnapshotMetadata(ReadBuffer & in)
{
    size_t serialized_size;
    readVarUInt(serialized_size, in);

    auto meta_buffer = nuraft::buffer::alloc(serialized_size);
    char * destination = reinterpret_cast<char *>(meta_buffer->data_begin());
    in.readStrict(destination, serialized_size);

    /// Reset the buffer position to the start before handing it to the parser.
    meta_buffer->pos(0);
    return SnapshotMetadata::deserialize(*meta_buffer);
}
}
/// Writes the whole snapshot to `out`:
/// version byte, raft metadata, session id counter, node count followed by (path, node) pairs,
/// then session count followed by (session id, timeout) pairs.
/// Throws LOGICAL_ERROR if a node was modified after the snapshot's last log index.
void NuKeeperStorageSnapshot::serialize(const NuKeeperStorageSnapshot & snapshot, WriteBuffer & out)
{
    /// Header
    writeBinary(static_cast<uint8_t>(snapshot.version), out);
    serializeSnapshotMetadata(snapshot.snapshot_meta, out);
    writeBinary(snapshot.session_id, out);

    /// Nodes: exactly snapshot_container_size entries starting from the saved iterator
    writeBinary(snapshot.snapshot_container_size, out);
    auto node_it = snapshot.begin;
    for (size_t written = 0; written < snapshot.snapshot_container_size; ++written, ++node_it)
    {
        const auto & node_path = node_it->key;
        const auto & node_value = node_it->value;

        if (static_cast<size_t>(node_value.stat.mzxid) > snapshot.snapshot_meta->get_last_log_idx())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to serialize node with mzxid {}, but last snapshot index {}", node_value.stat.mzxid, snapshot.snapshot_meta->get_last_log_idx());

        writeBinary(node_path, out);
        writeNode(node_value, out);
    }

    /// Sessions
    const size_t sessions_count = snapshot.session_and_timeout.size();
    writeBinary(sessions_count, out);
    for (const auto & [active_session_id, session_timeout] : snapshot.session_and_timeout)
    {
        writeBinary(active_session_id, out);
        writeBinary(session_timeout, out);
    }
}
/// Loads a snapshot from `in` into `storage` and returns its raft metadata.
/// Reads the layout written by serialize(): version, metadata, session id counter, nodes, sessions.
/// Children sets are not stored in the snapshot; they are rebuilt from the node paths.
/// Throws UNKNOWN_FORMAT_VERSION if the version byte is newer than V0.
SnapshotMetadataPtr NuKeeperStorageSnapshot::deserialize(NuKeeperStorage & storage, ReadBuffer & in)
{
    uint8_t version;
    readBinary(version, in);
    if (static_cast<SnapshotVersion>(version) > SnapshotVersion::V0)
        throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported snapshot version {}", version);

    SnapshotMetadataPtr snapshot_meta = deserializeSnapshotMetadata(in);

    int64_t session_id;
    readBinary(session_id, in);
    storage.zxid = snapshot_meta->get_last_log_idx();
    storage.session_id_counter = session_id;

    /// Nodes
    size_t nodes_count;
    readBinary(nodes_count, in);
    for (size_t nodes_read = 0; nodes_read < nodes_count; ++nodes_read)
    {
        std::string node_path;
        readBinary(node_path, in);
        NuKeeperStorage::Node node;
        readNode(node, in);

        storage.container.insertOrReplace(node_path, node);
        if (node.stat.ephemeralOwner != 0)
            storage.ephemerals[node.stat.ephemeralOwner].insert(node_path);
    }

    /// Register every non-root node in its parent's children set.
    for (const auto & entry : storage.container)
    {
        if (entry.key == "/")
            continue;

        auto parent = parentPath(entry.key);
        storage.container.updateValue(parent, [&child_path = entry.key] (NuKeeperStorage::Node & parent_node)
        {
            parent_node.children.insert(getBaseName(child_path));
        });
    }

    /// Sessions
    size_t sessions_count;
    readBinary(sessions_count, in);
    for (size_t sessions_read = 0; sessions_read < sessions_count; ++sessions_read)
    {
        int64_t active_session_id, timeout;
        readBinary(active_session_id, in);
        readBinary(timeout, in);
        storage.addSessionID(active_session_id, timeout);
    }

    return snapshot_meta;
}
/// Creates a snapshot of `storage_` up to `up_to_log_idx_`.
/// Builds fresh metadata (term 0, empty cluster config) and delegates to the metadata-taking
/// constructor, which switches the storage into snapshot mode and captures its state.
NuKeeperStorageSnapshot::NuKeeperStorageSnapshot(NuKeeperStorage * storage_, size_t up_to_log_idx_)
    : NuKeeperStorageSnapshot(
        storage_,
        std::make_shared<SnapshotMetadata>(up_to_log_idx_, 0, std::make_shared<nuraft::cluster_config>()))
{
}
/// Creates a snapshot of `storage_` described by the already existing metadata `snapshot_meta_`.
/// Remembers the current session id counter, switches the storage into snapshot mode and then
/// captures the container snapshot size, the begin iterator and the active sessions.
/// Snapshot mode is switched off again in the destructor.
NuKeeperStorageSnapshot::NuKeeperStorageSnapshot(NuKeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_)
: storage(storage_)
, snapshot_meta(snapshot_meta_)
, session_id(storage->session_id_counter)
{
/// NOTE(review): presumably snapshot mode has to be enabled before the size/iterator are captured — confirm against NuKeeperStorage.
storage->enableSnapshotMode();
snapshot_container_size = storage->container.snapshotSize();
begin = storage->getSnapshotIteratorBegin();
session_and_timeout = storage->getActiveSessions();
}
/// Switches the storage out of the snapshot mode that the constructors enabled.
NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot()
{
storage->disableSnapshotMode();
}
/// Opens (creating it if needed) the snapshots directory and indexes the snapshots found there.
///  - Files whose name starts with "tmp_" are unfinished writes left by serializeSnapshotBufferToDisk
///    and are deleted.
///  - Every other file is registered in `existing_snapshots` under the log index parsed from its name.
///  - Finally, the oldest snapshots beyond `snapshots_to_keep_` are removed.
NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_)
    : snapshots_path(snapshots_path_)
    , snapshots_to_keep(snapshots_to_keep_)
{
    namespace fs = std::filesystem;

    if (!fs::exists(snapshots_path))
        fs::create_directories(snapshots_path);

    for (const auto & p : fs::directory_iterator(snapshots_path))
    {
        /// The prefix must be checked on the file name only: p.path() includes the directory,
        /// so matching it against "tmp_" would never detect leftover temporary files.
        const std::string file_name = p.path().filename().string();
        if (startsWith(file_name, "tmp_")) /// Unfinished tmp files
        {
            fs::remove(p);
            continue;
        }

        size_t snapshot_up_to = getSnapshotPathUpToLogIdx(p.path());
        existing_snapshots[snapshot_up_to] = p.path();
    }

    removeOutdatedSnapshotsIfNeeded();
}
/// Persists an already serialized snapshot `buffer` as "snapshot_<up_to_log_idx>.bin".
/// The data is first written and synced to a "tmp_"-prefixed file and then renamed to the final name.
/// Registers the new snapshot, drops outdated ones and returns the final path.
std::string NuKeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx)
{
    namespace fs = std::filesystem;

    const auto final_file_name = getSnapshotFileName(up_to_log_idx);
    const auto tmp_file_name = "tmp_" + final_file_name;

    std::string tmp_path = fs::path{snapshots_path} / tmp_file_name;
    std::string final_path = fs::path{snapshots_path} / final_file_name;

    ReadBufferFromNuraftBuffer source(buffer);
    WriteBufferFromFile plain_buf(tmp_path);
    copyData(source, plain_buf);
    plain_buf.sync();

    fs::rename(tmp_path, final_path);

    existing_snapshots.emplace(up_to_log_idx, final_path);
    removeOutdatedSnapshotsIfNeeded();

    return final_path;
}
/// Reads the newest snapshot file into a buffer.
/// If reading throws DB::Exception, that snapshot file is deleted, unregistered and the next
/// newest one is tried. Returns nullptr when no snapshot could be read.
nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk()
{
    while (!existing_snapshots.empty())
    {
        /// Copy key and path: the map entry is erased in the failure branch.
        const auto [latest_log_idx, latest_path] = *existing_snapshots.rbegin();

        try
        {
            return deserializeSnapshotBufferFromDisk(latest_log_idx);
        }
        catch (const DB::Exception &)
        {
            std::filesystem::remove(latest_path);
            existing_snapshots.erase(latest_log_idx);
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }

    return nullptr;
}
/// Reads the file of the snapshot registered under `up_to_log_idx` into a nuraft buffer, as is.
/// Throws std::out_of_range (from map::at) if no such snapshot is registered.
nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const
{
    const std::string & file_path = existing_snapshots.at(up_to_log_idx);

    WriteBufferFromNuraftBuffer destination;
    ReadBufferFromFile source(file_path);
    copyData(source, destination);

    return destination.getBuffer();
}
/// Serializes `snapshot` through a compressing writer into a nuraft buffer.
nuraft::ptr<nuraft::buffer> NuKeeperSnapshotManager::serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot)
{
    WriteBufferFromNuraftBuffer raw_out;
    CompressedWriteBuffer compressed_out(raw_out);

    NuKeeperStorageSnapshot::serialize(snapshot, compressed_out);
    /// Finalize the compressed stream before taking the underlying buffer.
    compressed_out.finalize();

    return raw_out.getBuffer();
}
/// Decompresses `buffer` and loads the snapshot it contains into `storage`.
/// Returns the raft metadata stored in the snapshot.
SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer)
{
    ReadBufferFromNuraftBuffer raw_in(buffer);
    CompressedReadBuffer compressed_in(raw_in);

    return NuKeeperStorageSnapshot::deserialize(*storage, compressed_in);
}
/// Loads the newest readable snapshot from disk into `storage`.
/// Returns its metadata, or nullptr if there are no snapshots or none of them could be read.
SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage)
{
    if (!existing_snapshots.empty())
    {
        auto latest_buffer = deserializeLatestSnapshotBufferFromDisk();
        if (latest_buffer)
            return deserializeSnapshotFromBuffer(storage, latest_buffer);
    }

    return nullptr;
}
/// Deletes snapshots with the smallest log indexes until at most `snapshots_to_keep` remain.
void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()
{
    while (snapshots_to_keep < existing_snapshots.size())
    {
        const size_t oldest_log_idx = existing_snapshots.begin()->first;
        removeSnapshot(oldest_log_idx);
    }
}
/// Deletes the file of the snapshot registered under `log_idx` and unregisters it.
/// Throws UNKNOWN_SNAPSHOT if no such snapshot is registered.
void NuKeeperSnapshotManager::removeSnapshot(size_t log_idx)
{
    auto found = existing_snapshots.find(log_idx);
    if (found != existing_snapshots.end())
    {
        std::filesystem::remove(found->second);
        existing_snapshots.erase(found);
        return;
    }

    throw Exception(ErrorCodes::UNKNOWN_SNAPSHOT, "Unknown snapshot with log index {}", log_idx);
}
}

View File

@ -0,0 +1,85 @@
#pragma once
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Coordination/NuKeeperStorage.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
namespace DB
{
using SnapshotMetadata = nuraft::snapshot;
using SnapshotMetadataPtr = std::shared_ptr<SnapshotMetadata>;
/// Version of the on-disk snapshot format. It is written as the first byte of a serialized snapshot;
/// deserialization rejects any value greater than the newest version listed here.
enum SnapshotVersion : uint8_t
{
V0 = 0,
};
/// In-memory view of a NuKeeperStorage taken for snapshotting.
/// Constructing it switches the storage into snapshot mode; destroying it switches the mode off.
struct NuKeeperStorageSnapshot
{
public:
/// Snapshot up to the given log index; metadata is created internally.
NuKeeperStorageSnapshot(NuKeeperStorage * storage_, size_t up_to_log_idx_);
/// Snapshot described by already existing raft metadata.
NuKeeperStorageSnapshot(NuKeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_);
~NuKeeperStorageSnapshot();
/// Write `snapshot` to `out` in binary form.
static void serialize(const NuKeeperStorageSnapshot & snapshot, WriteBuffer & out);
/// Read a snapshot from `in` into `storage`; returns the metadata stored in it.
static SnapshotMetadataPtr deserialize(NuKeeperStorage & storage, ReadBuffer & in);
/// Storage being snapshotted (not owned).
NuKeeperStorage * storage;
SnapshotVersion version = SnapshotVersion::V0;
/// Raft metadata of the snapshot (carries the last log index).
SnapshotMetadataPtr snapshot_meta;
/// Value of the storage session id counter at construction time.
int64_t session_id;
/// Number of container entries that serialize() writes, starting from `begin`.
size_t snapshot_container_size;
NuKeeperStorage::Container::const_iterator begin;
/// Active sessions captured at construction time.
SessionAndTimeout session_and_timeout;
};
using NuKeeperStorageSnapshotPtr = std::shared_ptr<NuKeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<void(NuKeeperStorageSnapshotPtr &&)>;
/// Manages snapshot files in a single directory: indexes existing files, writes new ones,
/// reads them back and removes the oldest ones when more than `snapshots_to_keep` exist.
class NuKeeperSnapshotManager
{
public:
/// Creates the directory if needed and indexes the snapshot files found in it.
NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_);
/// Loads the newest readable snapshot into `storage`; returns nullptr if there is none.
SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage);
/// Serializes (with compression) a storage snapshot into a buffer.
static nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot);
/// Writes an already serialized snapshot to disk and returns the path of the created file.
std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx);
/// Decompresses the buffer and loads the snapshot from it into `storage`.
static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr<nuraft::buffer> buffer);
/// Reads the file of the snapshot with the given log index into a buffer.
nuraft::ptr<nuraft::buffer> deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const;
/// Same for the newest snapshot; unreadable snapshots are removed along the way.
nuraft::ptr<nuraft::buffer> deserializeLatestSnapshotBufferFromDisk();
/// Deletes the snapshot file with the given log index; throws if it is not registered.
void removeSnapshot(size_t log_idx);
/// Number of snapshots currently registered.
size_t totalSnapshots() const
{
return existing_snapshots.size();
}
/// Largest log index among registered snapshots, or 0 if there are none.
size_t getLatestSnapshotIndex() const
{
if (!existing_snapshots.empty())
return existing_snapshots.rbegin()->first;
return 0;
}
private:
/// Removes snapshots with the smallest log indexes while more than `snapshots_to_keep` exist.
void removeOutdatedSnapshotsIfNeeded();
const std::string snapshots_path;
const size_t snapshots_to_keep;
/// Snapshot log index -> path of the snapshot file.
std::map<size_t, std::string> existing_snapshots;
};
/// A unit of work for snapshot creation: the in-memory snapshot plus the callback
/// that receives it (by rvalue reference) to finish the job.
struct CreateSnapshotTask
{
NuKeeperStorageSnapshotPtr snapshot;
CreateSnapshotCallback create_snapshot;
};
}

View File

@ -3,11 +3,16 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/NuKeeperStorageSerializer.h>
#include <Coordination/NuKeeperSnapshotManager.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
{
ReadBufferFromNuraftBuffer buffer(data);
@ -21,6 +26,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
Coordination::read(xid, buffer);
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
@ -29,26 +35,54 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
return request_for_session;
}
nuraft::ptr<nuraft::buffer> writeResponses(NuKeeperStorage::ResponsesForSessions & responses)
{
WriteBufferFromNuraftBuffer buffer;
for (const auto & response_and_session : responses)
{
writeIntBinary(response_and_session.session_id, buffer);
response_and_session.response->write(buffer);
}
return buffer.getBuffer();
}
NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_)
NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_)
: coordination_settings(coordination_settings_)
, storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep)
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, last_committed_idx(0)
, log(&Poco::Logger::get("NuKeeperStateMachine"))
{
LOG_DEBUG(log, "Created nukeeper state machine");
}
/// Restores the state machine from the newest readable snapshot on disk.
/// Snapshots that fail to load with DB::Exception are removed and the next newest one is tried.
/// On success `latest_snapshot_buf`, `latest_snapshot_meta` and `last_committed_idx` are set.
void NuKeeperStateMachine::init()
{
    /// Do everything without mutexes, no other threads exist.
    LOG_DEBUG(log, "Totally have {} snapshots", snapshot_manager.totalSnapshots());

    const bool has_snapshots = snapshot_manager.totalSnapshots() != 0;
    bool loaded = false;

    while (!loaded && snapshot_manager.totalSnapshots() != 0)
    {
        const size_t latest_log_index = snapshot_manager.getLatestSnapshotIndex();
        LOG_DEBUG(log, "Trying to load state machine from snapshot up to log index {}", latest_log_index);

        try
        {
            latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index);
            latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf);
            last_committed_idx = latest_snapshot_meta->get_last_log_idx();
            loaded = true;
        }
        catch (const DB::Exception & ex)
        {
            LOG_WARNING(log, "Failed to load from snapshot with index {}, with error {}, will remove it from disk", latest_log_index, ex.displayText());
            snapshot_manager.removeSnapshot(latest_log_index);
        }
    }

    if (!has_snapshots)
    {
        LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx);
    }
    else if (loaded)
    {
        LOG_DEBUG(log, "Loaded snapshot with last committed log index {}", last_committed_idx);
    }
    else
    {
        LOG_WARNING(log, "All snapshots broken, last committed log index {}", last_committed_idx);
    }
}
nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
@ -75,7 +109,7 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
NuKeeperStorage::ResponsesForSessions responses_for_sessions;
{
std::lock_guard lock(storage_lock);
responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id);
responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx);
for (auto & response_for_session : responses_for_sessions)
responses_queue.push(response_for_session);
}
@ -88,59 +122,28 @@ nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, n
bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{
LOG_DEBUG(log, "Applying snapshot {}", s.get_last_log_idx());
StorageSnapshotPtr snapshot;
nuraft::ptr<nuraft::buffer> latest_snapshot_ptr;
{
std::lock_guard<std::mutex> lock(snapshots_lock);
auto entry = snapshots.find(s.get_last_log_idx());
if (entry == snapshots.end())
return false;
snapshot = entry->second;
std::lock_guard lock(snapshots_lock);
if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Required to apply snapshot with last log index {}, but our last log index is {}",
s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx());
latest_snapshot_ptr = latest_snapshot_buf;
}
{
std::lock_guard lock(storage_lock);
snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr);
}
std::lock_guard lock(storage_lock);
storage = snapshot->storage;
last_committed_idx = s.get_last_log_idx();
return true;
}
nuraft::ptr<nuraft::snapshot> NuKeeperStateMachine::last_snapshot()
{
// Just return the latest snapshot.
/// Just return the latest snapshot.
std::lock_guard<std::mutex> lock(snapshots_lock);
auto entry = snapshots.rbegin();
if (entry == snapshots.rend())
return nullptr;
return entry->second->snapshot;
}
NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::createSnapshotInternal(nuraft::snapshot & s)
{
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
nuraft::ptr<nuraft::snapshot> ss = nuraft::snapshot::deserialize(*snp_buf);
std::lock_guard lock(storage_lock);
return std::make_shared<NuKeeperStateMachine::StorageSnapshot>(ss, storage);
}
NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::readSnapshot(nuraft::snapshot & s, nuraft::buffer & in)
{
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
nuraft::ptr<nuraft::snapshot> ss = nuraft::snapshot::deserialize(*snp_buf);
NuKeeperStorageSerializer serializer;
ReadBufferFromNuraftBuffer reader(in);
NuKeeperStorage new_storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds());
serializer.deserialize(new_storage, reader);
return std::make_shared<StorageSnapshot>(ss, new_storage);
}
void NuKeeperStateMachine::writeSnapshot(const NuKeeperStateMachine::StorageSnapshotPtr & snapshot, nuraft::ptr<nuraft::buffer> & out)
{
NuKeeperStorageSerializer serializer;
WriteBufferFromNuraftBuffer writer;
serializer.serialize(snapshot->storage, writer);
out = writer.getBuffer();
return latest_snapshot_meta;
}
void NuKeeperStateMachine::create_snapshot(
@ -148,29 +151,53 @@ void NuKeeperStateMachine::create_snapshot(
nuraft::async_result<bool>::handler_type & when_done)
{
LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx());
auto snapshot = createSnapshotInternal(s);
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
auto snapshot_meta_copy = nuraft::snapshot::deserialize(*snp_buf);
CreateSnapshotTask snapshot_task;
{
std::lock_guard<std::mutex> lock(snapshots_lock);
snapshots[s.get_last_log_idx()] = snapshot;
size_t num = snapshots.size();
if (num > coordination_settings->max_stored_snapshots)
{
auto entry = snapshots.begin();
for (size_t i = 0; i < num - coordination_settings->max_stored_snapshots; ++i)
{
if (entry == snapshots.end())
break;
entry = snapshots.erase(entry);
}
}
std::lock_guard lock(storage_lock);
snapshot_task.snapshot = std::make_shared<NuKeeperStorageSnapshot>(&storage, snapshot_meta_copy);
}
LOG_DEBUG(log, "Created snapshot {}", s.get_last_log_idx());
nuraft::ptr<std::exception> except(nullptr);
bool ret = true;
when_done(ret, except);
snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot)
{
nuraft::ptr<std::exception> exception(nullptr);
bool ret = true;
try
{
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
{
std::lock_guard lock(snapshots_lock);
latest_snapshot_buf = snapshot_buf;
latest_snapshot_meta = snapshot->snapshot_meta;
}
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path);
{
/// Must do it with lock (clearing elements from list)
std::lock_guard lock(storage_lock);
storage.clearGarbageAfterSnapshot();
/// Destroy snapshot with lock
snapshot.reset();
LOG_TRACE(log, "Cleared garbage after snapshot");
}
}
catch (...)
{
LOG_TRACE(log, "Exception happened during snapshot");
tryLogCurrentException(log);
ret = false;
}
when_done(ret, exception);
};
LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flash to disk", s.get_last_log_idx());
snapshots_queue.push(std::move(snapshot_task));
}
void NuKeeperStateMachine::save_logical_snp_obj(
@ -182,20 +209,32 @@ void NuKeeperStateMachine::save_logical_snp_obj(
{
LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
nuraft::ptr<nuraft::buffer> cloned_buffer;
nuraft::ptr<nuraft::snapshot> cloned_meta;
if (obj_id == 0)
{
auto new_snapshot = createSnapshotInternal(s);
std::lock_guard<std::mutex> lock(snapshots_lock);
snapshots.try_emplace(s.get_last_log_idx(), std::move(new_snapshot));
std::lock_guard lock(storage_lock);
NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx());
cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot);
}
else
{
auto received_snapshot = readSnapshot(s, data);
std::lock_guard<std::mutex> lock(snapshots_lock);
snapshots[s.get_last_log_idx()] = std::move(received_snapshot);
cloned_buffer = nuraft::buffer::clone(data);
}
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
{
std::lock_guard lock(snapshots_lock);
latest_snapshot_buf = cloned_buffer;
latest_snapshot_meta = cloned_meta;
}
LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path);
obj_id++;
}
@ -208,29 +247,20 @@ int NuKeeperStateMachine::read_logical_snp_obj(
{
LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
StorageSnapshotPtr required_snapshot;
{
std::lock_guard<std::mutex> lock(snapshots_lock);
auto entry = snapshots.find(s.get_last_log_idx());
if (entry == snapshots.end())
{
// Snapshot doesn't exist.
data_out = nullptr;
is_last_obj = true;
return 0;
}
required_snapshot = entry->second;
}
if (obj_id == 0)
{
auto new_snapshot = createSnapshotInternal(s);
writeSnapshot(new_snapshot, data_out);
data_out = nuraft::buffer::alloc(sizeof(int32_t));
nuraft::buffer_serializer bs(data_out);
bs.put_i32(0);
is_last_obj = false;
}
else
{
writeSnapshot(required_snapshot, data_out);
std::lock_guard lock(snapshots_lock);
if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Required to apply snapshot with last log index {}, but our last log index is {}",
s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx());
data_out = nuraft::buffer::clone(*latest_snapshot_buf);
is_last_obj = true;
}
return 0;
@ -241,7 +271,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS
NuKeeperStorage::ResponsesForSessions responses;
{
std::lock_guard lock(storage_lock);
responses = storage.processRequest(request_for_session.request, request_for_session.session_id);
responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt);
}
for (const auto & response : responses)
responses_queue.push(response);

View File

@ -5,16 +5,20 @@
#include <common/logger_useful.h>
#include <Coordination/ThreadSafeQueue.h>
#include <Coordination/CoordinationSettings.h>
#include <Coordination/NuKeeperSnapshotManager.h>
namespace DB
{
using ResponsesQueue = ThreadSafeQueue<NuKeeperStorage::ResponseForSession>;
using SnapshotsQueue = ConcurrentBoundedQueue<CreateSnapshotTask>;
class NuKeeperStateMachine : public nuraft::state_machine
{
public:
NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_);
NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_);
void init();
nuraft::ptr<nuraft::buffer> pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }
@ -58,39 +62,25 @@ public:
void shutdownStorage();
private:
struct StorageSnapshot
{
StorageSnapshot(const nuraft::ptr<nuraft::snapshot> & s, const NuKeeperStorage & storage_)
: snapshot(s)
, storage(storage_)
{}
nuraft::ptr<nuraft::snapshot> snapshot;
NuKeeperStorage storage;
};
using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
StorageSnapshotPtr createSnapshotInternal(nuraft::snapshot & s);
StorageSnapshotPtr readSnapshot(nuraft::snapshot & s, nuraft::buffer & in);
static void writeSnapshot(const StorageSnapshotPtr & snapshot, nuraft::ptr<nuraft::buffer> & out);
SnapshotMetadataPtr latest_snapshot_meta = nullptr;
nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr;
CoordinationSettingsPtr coordination_settings;
NuKeeperStorage storage;
NuKeeperSnapshotManager snapshot_manager;
ResponsesQueue & responses_queue;
SnapshotsQueue & snapshots_queue;
/// Mutex for snapshots
std::mutex snapshots_lock;
/// Lock for storage
std::mutex storage_lock;
/// Fake snapshot storage
std::map<uint64_t, StorageSnapshotPtr> snapshots;
/// Last committed Raft log number.
std::atomic<size_t> last_committed_idx;
Poco::Logger * log;

View File

@ -64,9 +64,9 @@ NuKeeperStateManager::NuKeeperStateManager(
throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without <start_as_follower>)");
}
void NuKeeperStateManager::loadLogStore(size_t start_log_index)
void NuKeeperStateManager::loadLogStore(size_t last_commited_index, size_t logs_to_keep)
{
log_store->init(start_log_index);
log_store->init(last_commited_index, logs_to_keep);
}
void NuKeeperStateManager::flushLogStore()

View File

@ -25,7 +25,7 @@ public:
int port,
const std::string & logs_path);
void loadLogStore(size_t start_log_index);
void loadLogStore(size_t last_commited_index, size_t logs_to_keep);
void flushLogStore();

View File

@ -70,7 +70,7 @@ static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & p
NuKeeperStorage::NuKeeperStorage(int64_t tick_time_ms)
: session_expiry_queue(tick_time_ms)
{
container.emplace("/", Node());
container.insert("/", Node());
}
using Undo = std::function<void()>;
@ -124,19 +124,20 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
Coordination::ZooKeeperCreateResponse & response = dynamic_cast<Coordination::ZooKeeperCreateResponse &>(*response_ptr);
Coordination::ZooKeeperCreateRequest & request = dynamic_cast<Coordination::ZooKeeperCreateRequest &>(*zk_request);
if (container.count(request.path))
if (container.contains(request.path))
{
response.error = Coordination::Error::ZNODEEXISTS;
}
else
{
auto it = container.find(parentPath(request.path));
auto parent_path = parentPath(request.path);
auto it = container.find(parent_path);
if (it == container.end())
{
response.error = Coordination::Error::ZNONODE;
}
else if (it->second.is_ephemeral)
else if (it->value.stat.ephemeralOwner != 0)
{
response.error = Coordination::Error::ZNOCHILDRENFOREPHEMERALS;
}
@ -149,14 +150,14 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
created_node.stat.mtime = created_node.stat.ctime;
created_node.stat.numChildren = 0;
created_node.stat.dataLength = request.data.length();
created_node.stat.ephemeralOwner = request.is_ephemeral ? session_id : 0;
created_node.data = request.data;
created_node.is_ephemeral = request.is_ephemeral;
created_node.is_sequental = request.is_sequential;
std::string path_created = request.path;
if (request.is_sequential)
{
auto seq_num = it->second.seq_num;
auto seq_num = it->value.seq_num;
std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
seq_num_str.exceptions(std::ios::failbit);
@ -165,32 +166,36 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
path_created += seq_num_str.str();
}
/// Increment sequential number even if node is not sequential
++it->second.seq_num;
response.path_created = path_created;
container.emplace(path_created, std::move(created_node));
auto child_path = getBaseName(path_created);
it->second.children.insert(child_path);
container.updateValue(parent_path, [child_path] (NuKeeperStorage::Node & parent)
{
/// Increment sequential number even if node is not sequential
++parent.seq_num;
parent.children.insert(child_path);
++parent.stat.cversion;
++parent.stat.numChildren;
});
response.path_created = path_created;
container.insert(path_created, std::move(created_node));
if (request.is_ephemeral)
ephemerals[session_id].emplace(path_created);
undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path]
undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path, child_path]
{
container.erase(path_created);
if (is_ephemeral)
ephemerals[session_id].erase(path_created);
auto & undo_parent = container.at(parent_path);
--undo_parent.stat.cversion;
--undo_parent.stat.numChildren;
--undo_parent.seq_num;
undo_parent.children.erase(child_path);
};
++it->second.stat.cversion;
++it->second.stat.numChildren;
container.updateValue(parent_path, [child_path] (NuKeeperStorage::Node & undo_parent)
{
--undo_parent.stat.cversion;
--undo_parent.stat.numChildren;
--undo_parent.seq_num;
undo_parent.children.erase(child_path);
});
};
response.error = Coordination::Error::ZOK;
}
@ -216,8 +221,8 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest
}
else
{
response.stat = it->second.stat;
response.data = it->second.data;
response.stat = it->value.stat;
response.data = it->value.data;
response.error = Coordination::Error::ZOK;
}
@ -240,39 +245,44 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
response.error = Coordination::Error::ZNONODE;
}
else if (request.version != -1 && request.version != it->second.stat.version)
else if (request.version != -1 && request.version != it->value.stat.version)
{
response.error = Coordination::Error::ZBADVERSION;
}
else if (it->second.stat.numChildren)
else if (it->value.stat.numChildren)
{
response.error = Coordination::Error::ZNOTEMPTY;
}
else
{
auto prev_node = it->second;
if (prev_node.is_ephemeral)
auto prev_node = it->value;
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].erase(request.path);
auto child_basename = getBaseName(it->first);
auto & parent = container.at(parentPath(request.path));
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(child_basename);
auto child_basename = getBaseName(it->key);
container.updateValue(parentPath(request.path), [&child_basename] (NuKeeperStorage::Node & parent)
{
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(child_basename);
});
response.error = Coordination::Error::ZOK;
container.erase(it);
container.erase(request.path);
undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename]
{
if (prev_node.is_ephemeral)
if (prev_node.stat.ephemeralOwner != 0)
ephemerals[session_id].emplace(path);
container.emplace(path, prev_node);
auto & undo_parent = container.at(parentPath(path));
++undo_parent.stat.numChildren;
--undo_parent.stat.cversion;
undo_parent.children.insert(child_basename);
container.insert(path, prev_node);
container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent)
{
++parent.stat.numChildren;
--parent.stat.cversion;
parent.children.insert(child_basename);
});
};
}
@ -297,7 +307,7 @@ struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest
auto it = container.find(request.path);
if (it != container.end())
{
response.stat = it->second.stat;
response.stat = it->value.stat;
response.error = Coordination::Error::ZOK;
}
else
@ -324,24 +334,35 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest
{
response.error = Coordination::Error::ZNONODE;
}
else if (request.version == -1 || request.version == it->second.stat.version)
else if (request.version == -1 || request.version == it->value.stat.version)
{
auto prev_node = it->second;
auto prev_node = it->value;
it->second.data = request.data;
++it->second.stat.version;
it->second.stat.mzxid = zxid;
it->second.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
it->second.stat.dataLength = request.data.length();
it->second.data = request.data;
++container.at(parentPath(request.path)).stat.cversion;
response.stat = it->second.stat;
auto itr = container.updateValue(request.path, [zxid, request] (NuKeeperStorage::Node & value)
{
value.data = request.data;
value.stat.version++;
value.stat.mzxid = zxid;
value.stat.mtime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
value.stat.dataLength = request.data.length();
value.data = request.data;
});
container.updateValue(parentPath(request.path), [] (NuKeeperStorage::Node & parent)
{
parent.stat.cversion++;
});
response.stat = itr->value.stat;
response.error = Coordination::Error::ZOK;
undo = [prev_node, &container, path = request.path]
{
container.at(path) = prev_node;
--container.at(parentPath(path)).stat.cversion;
container.updateValue(path, [&prev_node] (NuKeeperStorage::Node & value) { value = prev_node; });
container.updateValue(parentPath(path), [] (NuKeeperStorage::Node & parent)
{
parent.stat.cversion--;
});
};
}
else
@ -378,11 +399,11 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest
if (path_prefix.empty())
throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR);
response.names.insert(response.names.end(), it->second.children.begin(), it->second.children.end());
response.names.insert(response.names.end(), it->value.children.begin(), it->value.children.end());
std::sort(response.names.begin(), response.names.end());
response.stat = it->second.stat;
response.stat = it->value.stat;
response.error = Coordination::Error::ZOK;
}
@ -403,7 +424,7 @@ struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest
{
response.error = Coordination::Error::ZNONODE;
}
else if (request.version != -1 && request.version != it->second.stat.version)
else if (request.version != -1 && request.version != it->value.stat.version)
{
response.error = Coordination::Error::ZBADVERSION;
}
@ -601,9 +622,17 @@ NuKeeperWrapperFactory::NuKeeperWrapperFactory()
}
NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id)
NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id, std::optional<int64_t> new_last_zxid)
{
NuKeeperStorage::ResponsesForSessions results;
if (new_last_zxid)
{
if (zxid >= *new_last_zxid)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Got new ZXID {} smaller or equal than current {}. It's a bug", *new_last_zxid, zxid);
zxid = *new_last_zxid;
}
session_expiry_queue.update(session_id, session_and_timeout[session_id]);
if (zk_request->getOpNum() == Coordination::OpNum::Close)
{
auto it = ephemerals.find(session_id);
@ -629,7 +658,6 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor
}
else if (zk_request->getOpNum() == Coordination::OpNum::Heartbeat)
{
session_expiry_queue.update(session_id, session_and_timeout[session_id]);
NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request);
auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id);
response->xid = zk_request->xid;
@ -639,7 +667,6 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor
}
else
{
NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request);
auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id);

View File

@ -5,6 +5,7 @@
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/SessionExpiryQueue.h>
#include <Coordination/SnapshotableHashTable.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
@ -17,17 +18,19 @@ struct NuKeeperStorageRequest;
using NuKeeperStorageRequestPtr = std::shared_ptr<NuKeeperStorageRequest>;
using ResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr &)>;
using ChildrenSet = std::unordered_set<std::string>;
using SessionAndTimeout = std::unordered_map<int64_t, int64_t>;
struct NuKeeperStorageSnapshot;
class NuKeeperStorage
{
public:
int64_t session_id_counter{0};
int64_t session_id_counter{1};
struct Node
{
String data;
Coordination::ACLs acls{};
bool is_ephemeral = false;
bool is_sequental = false;
Coordination::Stat stat{};
int32_t seq_num = 0;
@ -50,10 +53,9 @@ public:
using RequestsForSessions = std::vector<RequestForSession>;
using Container = std::unordered_map<std::string, Node>;
using Container = SnapshotableHashTable<Node>;
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndTimeout = std::unordered_map<int64_t, long>;
using SessionIDs = std::vector<int64_t>;
using Watches = std::map<String /* path, relative of root_path */, SessionIDs>;
@ -72,9 +74,9 @@ public:
void clearDeadWatches(int64_t session_id);
int64_t getZXID()
int64_t getZXID() const
{
return zxid++;
return zxid;
}
public:
@ -88,10 +90,41 @@ public:
return result;
}
ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id);
void addSessionID(int64_t session_id, int64_t session_timeout_ms)
{
session_and_timeout.emplace(session_id, session_timeout_ms);
session_expiry_queue.update(session_id, session_timeout_ms);
}
ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, std::optional<int64_t> new_last_zxid);
void finalize();
void enableSnapshotMode()
{
container.enableSnapshotMode();
}
void disableSnapshotMode()
{
container.disableSnapshotMode();
}
Container::const_iterator getSnapshotIteratorBegin() const
{
return container.begin();
}
void clearGarbageAfterSnapshot()
{
container.clearOutdatedNodes();
}
const SessionAndTimeout & getActiveSessions() const
{
return session_and_timeout;
}
std::unordered_set<int64_t> getDeadSessions()
{
return session_expiry_queue.getExpiredSessions();

View File

@ -69,6 +69,28 @@ void NuKeeperStorageDispatcher::responseThread()
}
}
void NuKeeperStorageDispatcher::snapshotThread()
{
setThreadName("NuKeeperSnpT");
while (!shutdown_called)
{
CreateSnapshotTask task;
snapshots_queue.pop(task);
if (shutdown_called)
break;
try
{
task.create_snapshot(std::move(task.snapshot));
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
void NuKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response)
{
std::lock_guard lock(session_to_response_callback_mutex);
@ -110,7 +132,7 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config);
server = std::make_unique<NuKeeperServer>(myid, coordination_settings, config, responses_queue);
server = std::make_unique<NuKeeperServer>(myid, coordination_settings, config, responses_queue, snapshots_queue);
try
{
LOG_DEBUG(log, "Waiting server to initialize");
@ -129,6 +151,7 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); });
snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); });
LOG_DEBUG(log, "Dispatcher initialized");
}
@ -149,11 +172,18 @@ void NuKeeperStorageDispatcher::shutdown()
if (session_cleaner_thread.joinable())
session_cleaner_thread.join();
/// FIXME not the best way to notify
requests_queue.push({});
if (request_thread.joinable())
request_thread.join();
responses_queue.push({});
if (responses_thread.joinable())
responses_thread.join();
snapshots_queue.push({});
if (snapshot_thread.joinable())
snapshot_thread.join();
}
if (server)
@ -162,9 +192,16 @@ void NuKeeperStorageDispatcher::shutdown()
NuKeeperStorage::RequestForSession request_for_session;
while (requests_queue.tryPop(request_for_session))
{
auto response = request_for_session.request->makeResponse();
response->error = Coordination::Error::ZSESSIONEXPIRED;
setResponse(request_for_session.session_id, response);
if (request_for_session.request)
{
auto response = request_for_session.request->makeResponse();
response->error = Coordination::Error::ZSESSIONEXPIRED;
setResponse(request_for_session.session_id, response);
}
else
{
break;
}
}
session_to_response_callback.clear();
}

View File

@ -30,18 +30,21 @@ private:
CoordinationSettingsPtr coordination_settings;
using RequestsQueue = ConcurrentBoundedQueue<NuKeeperStorage::RequestForSession>;
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
RequestsQueue requests_queue{1};
ResponsesQueue responses_queue;
SnapshotsQueue snapshots_queue{1};
std::atomic<bool> shutdown_called{false};
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
std::mutex session_to_response_callback_mutex;
SessionToResponseCallback session_to_response_callback;
ThreadFromGlobalPool request_thread;
ThreadFromGlobalPool responses_thread;
ThreadFromGlobalPool session_cleaner_thread;
ThreadFromGlobalPool snapshot_thread;
std::unique_ptr<NuKeeperServer> server;
@ -51,6 +54,7 @@ private:
void requestThread();
void responseThread();
void sessionCleanerTask();
void snapshotThread();
void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response);
public:

View File

@ -1,90 +0,0 @@
#include <Coordination/NuKeeperStorageSerializer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
namespace DB
{
namespace
{
/// Writes one storage node to `out`.
/// Field order: data, acls, is_ephemeral, is_sequental, stat, seq_num.
/// readNode() below reads the fields in exactly this order, so the two
/// functions have to be changed together.
void writeNode(const NuKeeperStorage::Node & node, WriteBuffer & out)
{
Coordination::write(node.data, out);
Coordination::write(node.acls, out);
Coordination::write(node.is_ephemeral, out);
Coordination::write(node.is_sequental, out);
Coordination::write(node.stat, out);
Coordination::write(node.seq_num, out);
}
/// Reads one storage node from `in` into `node`.
/// Fields are read in the same order writeNode() emits them:
/// data, acls, is_ephemeral, is_sequental, stat, seq_num.
void readNode(NuKeeperStorage::Node & node, ReadBuffer & in)
{
Coordination::read(node.data, in);
Coordination::read(node.acls, in);
Coordination::read(node.is_ephemeral, in);
Coordination::read(node.is_sequental, in);
Coordination::read(node.stat, in);
Coordination::read(node.seq_num, in);
}
}
/// Writes the whole storage state to `out`.
///
/// Layout, in order:
///   zxid, session_id_counter,
///   number of nodes, then for every node: path followed by the node fields (writeNode),
///   number of sessions owning ephemerals, then for every session:
///   session id, number of paths, the paths themselves.
void NuKeeperStorageSerializer::serialize(const NuKeeperStorage & storage, WriteBuffer & out)
{
    Coordination::write(storage.zxid, out);
    Coordination::write(storage.session_id_counter, out);

    /// Nodes section
    const auto & nodes = storage.container;
    Coordination::write(nodes.size(), out);
    for (auto node_it = nodes.begin(); node_it != nodes.end(); ++node_it)
    {
        Coordination::write(node_it->first, out);
        writeNode(node_it->second, out);
    }

    /// Ephemerals section
    const auto & ephemerals_by_session = storage.ephemerals;
    Coordination::write(ephemerals_by_session.size(), out);
    for (auto session_it = ephemerals_by_session.begin(); session_it != ephemerals_by_session.end(); ++session_it)
    {
        const auto & owned_paths = session_it->second;
        Coordination::write(session_it->first, out);
        Coordination::write(owned_paths.size(), out);
        for (auto path_it = owned_paths.begin(); path_it != owned_paths.end(); ++path_it)
            Coordination::write(*path_it, out);
    }
}
/// Restores storage state from `in`; the exact inverse of serialize().
///
/// Reads, in order: zxid, session_id_counter, the node count followed by that
/// many (path, node) records, then the session count followed by that many
/// (session id, path count, paths...) records.
///
/// Every section is read by counting records, not by watching the size of the
/// destination container. This keeps the reader in step with the stream even if
/// `storage` already holds entries, or the stream repeats a session id or path:
/// exactly the announced number of records is always consumed.
void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffer & in)
{
    int64_t session_id_counter, zxid;
    Coordination::read(zxid, in);
    Coordination::read(session_id_counter, in);
    storage.zxid = zxid;
    storage.session_id_counter = session_id_counter;

    size_t container_size;
    Coordination::read(container_size, in);
    for (size_t node_idx = 0; node_idx < container_size; ++node_idx)
    {
        std::string path;
        Coordination::read(path, in);
        NuKeeperStorage::Node node;
        readNode(node, in);
        /// Overwrites a node with the same path if one is already present
        /// (e.g. the root node created by the storage constructor).
        storage.container[path] = node;
    }

    size_t ephemerals_size;
    Coordination::read(ephemerals_size, in);
    for (size_t session_idx = 0; session_idx < ephemerals_size; ++session_idx)
    {
        int64_t session_id;
        size_t ephemerals_for_session;
        Coordination::read(session_id, in);
        Coordination::read(ephemerals_for_session, in);

        /// operator[] creates the entry even when the session has no paths,
        /// which is what the size-driven loop did as well.
        auto & session_paths = storage.ephemerals[session_id];
        for (size_t path_idx = 0; path_idx < ephemerals_for_session; ++path_idx)
        {
            std::string ephemeral_path;
            Coordination::read(ephemeral_path, in);
            session_paths.emplace(ephemeral_path);
        }
    }
}
}

View File

@ -1,17 +0,0 @@
#pragma once
#include <Coordination/NuKeeperStorage.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
namespace DB
{
/// Stateless helper that converts a NuKeeperStorage to and from a byte stream.
/// Both operations are static; the class only groups them.
class NuKeeperStorageSerializer
{
public:
/// Writes the contents of `storage` to `out`.
static void serialize(const NuKeeperStorage & storage, WriteBuffer & out);
/// Reads data from `in` into `storage`.
static void deserialize(NuKeeperStorage & storage, ReadBuffer & in);
};
}

View File

@ -0,0 +1,197 @@
#pragma once
#include <common/StringRef.h>
#include <unordered_map>
#include <list>
#include <atomic>
namespace DB
{
/// One element of the list that backs SnapshotableHashTable.
/// The field order matters: nodes are created with aggregate initialization
/// as {key, value, active_in_map}.
template<typename V>
struct ListNode
{
/// Key of the element. The table's index map is keyed by a StringRef built from this string.
std::string key;
/// Stored value.
V value;
/// True while the table's index map points at this node. Set to false when the
/// node is erased or superseded in snapshot mode; such nodes stay in the list
/// until SnapshotableHashTable::clearOutdatedNodes() removes them.
bool active_in_map;
};
template <class V>
class SnapshotableHashTable
{
private:
using ListElem = ListNode<V>;
using List = std::list<ListElem>;
using IndexMap = std::unordered_map<StringRef, typename List::iterator, StringRefHash>;
List list;
IndexMap map;
bool snapshot_mode{false};
public:
using iterator = typename List::iterator;
using const_iterator = typename List::const_iterator;
using reverse_iterator = typename List::reverse_iterator;
using const_reverse_iterator = typename List::const_reverse_iterator;
using ValueUpdater = std::function<void(V & value)>;
bool insert(const std::string & key, const V & value)
{
auto it = map.find(key);
if (it == map.end())
{
ListElem elem{key, value, true};
auto itr = list.insert(list.end(), elem);
map.emplace(itr->key, itr);
return true;
}
return false;
}
void insertOrReplace(const std::string & key, const V & value)
{
auto it = map.find(key);
if (it == map.end())
{
ListElem elem{key, value, true};
auto itr = list.insert(list.end(), elem);
map.emplace(itr->key, itr);
}
else
{
auto list_itr = it->second;
if (snapshot_mode)
{
ListElem elem{key, value, true};
list_itr->active_in_map = false;
auto new_list_itr = list.insert(list.end(), elem);
map.erase(it);
map.emplace(new_list_itr->key, new_list_itr);
}
else
{
list_itr->value = value;
}
}
}
bool erase(const std::string & key)
{
auto it = map.find(key);
if (it == map.end())
return false;
auto list_itr = it->second;
if (snapshot_mode)
{
list_itr->active_in_map = false;
map.erase(it);
}
else
{
map.erase(it);
list.erase(list_itr);
}
return true;
}
bool contains(const std::string & key) const
{
return map.find(key) != map.end();
}
const_iterator updateValue(const std::string & key, ValueUpdater updater)
{
auto it = map.find(key);
assert(it != map.end());
if (snapshot_mode)
{
auto list_itr = it->second;
auto elem_copy = *(list_itr);
list_itr->active_in_map = false;
map.erase(it);
updater(elem_copy.value);
auto itr = list.insert(list.end(), elem_copy);
map.emplace(itr->key, itr);
return itr;
}
else
{
auto list_itr = it->second;
updater(list_itr->value);
return list_itr;
}
}
const_iterator find(const std::string & key) const
{
auto map_it = map.find(key);
if (map_it != map.end())
return map_it->second;
return list.end();
}
const V & getValue(const std::string & key) const
{
auto it = map.find(key);
assert(it != map.end());
return it->second->value;
}
void clearOutdatedNodes()
{
auto start = list.begin();
auto end = list.end();
for (auto itr = start; itr != end;)
{
if (!itr->active_in_map)
itr = list.erase(itr);
else
itr++;
}
}
void clear()
{
list.clear();
map.clear();
}
void enableSnapshotMode()
{
snapshot_mode = true;
}
void disableSnapshotMode()
{
snapshot_mode = false;
}
size_t size() const
{
return map.size();
}
size_t snapshotSize() const
{
return list.size();
}
iterator begin() { return list.begin(); }
const_iterator begin() const { return list.cbegin(); }
iterator end() { return list.end(); }
const_iterator end() const { return list.cend(); }
reverse_iterator rbegin() { return list.rbegin(); }
const_reverse_iterator rbegin() const { return list.crbegin(); }
reverse_iterator rend() { return list.rend(); }
const_reverse_iterator rend() const { return list.crend(); }
};
}

View File

@ -10,7 +10,7 @@
#include <Poco/Logger.h>
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/NuKeeperStateManager.h>
#include <Coordination/NuKeeperStorageSerializer.h>
#include <Coordination/NuKeeperSnapshotManager.h>
#include <Coordination/SummingStateMachine.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/LoggerWrapper.h>
@ -28,6 +28,8 @@
#include <Coordination/Changelog.h>
#include <filesystem>
#include <Coordination/SnapshotableHashTable.h>
namespace fs = std::filesystem;
struct ChangelogDirTest
{
@ -102,7 +104,7 @@ struct SimpliestRaftServer
, state_machine(nuraft::cs_new<StateMachine>())
, state_manager(nuraft::cs_new<DB::NuKeeperStateManager>(server_id, hostname, port, logs_path))
{
state_manager->loadLogStore(1);
state_manager->loadLogStore(1, 0);
nuraft::raft_params params;
params.heart_beat_interval_ = 100;
params.election_timeout_lower_bound_ = 200;
@ -195,68 +197,6 @@ TEST(CoordinationTest, TestSummingRaft1)
s1.launcher.shutdown(5);
}
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
DB::WriteBufferFromNuraftBuffer buf;
DB::writeIntBinary(session_id, buf);
request->write(buf);
return buf.getBuffer();
}
DB::NuKeeperStorage::ResponsesForSessions getZooKeeperResponses(nuraft::ptr<nuraft::buffer> & buffer, const Coordination::ZooKeeperRequestPtr & request)
{
DB::NuKeeperStorage::ResponsesForSessions results;
DB::ReadBufferFromNuraftBuffer buf(buffer);
while (!buf.eof())
{
int64_t session_id;
DB::readIntBinary(session_id, buf);
int32_t length;
Coordination::XID xid;
int64_t zxid;
Coordination::Error err;
Coordination::read(length, buf);
Coordination::read(xid, buf);
Coordination::read(zxid, buf);
Coordination::read(err, buf);
auto response = request->makeResponse();
response->readImpl(buf);
results.push_back(DB::NuKeeperStorage::ResponseForSession{session_id, response});
}
return results;
}
TEST(CoordinationTest, TestStorageSerialization)
{
DB::NuKeeperStorage storage(500);
storage.container["/hello"] = DB::NuKeeperStorage::Node{.data="world"};
storage.container["/hello/somepath"] = DB::NuKeeperStorage::Node{.data="somedata"};
storage.session_id_counter = 5;
storage.zxid = 156;
storage.ephemerals[3] = {"/hello", "/"};
storage.ephemerals[1] = {"/hello/somepath"};
DB::WriteBufferFromOwnString buffer;
DB::NuKeeperStorageSerializer serializer;
serializer.serialize(storage, buffer);
std::string serialized = buffer.str();
EXPECT_NE(serialized.size(), 0);
DB::ReadBufferFromString read(serialized);
DB::NuKeeperStorage new_storage(500);
serializer.deserialize(new_storage, read);
EXPECT_EQ(new_storage.container.size(), 3);
EXPECT_EQ(new_storage.container["/hello"].data, "world");
EXPECT_EQ(new_storage.container["/hello/somepath"].data, "somedata");
EXPECT_EQ(new_storage.session_id_counter, 5);
EXPECT_EQ(new_storage.zxid, 156);
EXPECT_EQ(new_storage.ephemerals.size(), 2);
EXPECT_EQ(new_storage.ephemerals[3].size(), 2);
EXPECT_EQ(new_storage.ephemerals[1].size(), 1);
}
DB::LogEntryPtr getLogEntry(const std::string & s, size_t term)
{
DB::WriteBufferFromNuraftBuffer bufwriter;
@ -268,7 +208,7 @@ TEST(CoordinationTest, ChangelogTestSimple)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
EXPECT_EQ(changelog.next_slot(), 2);
@ -282,7 +222,7 @@ TEST(CoordinationTest, ChangelogTestFile)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
@ -303,7 +243,7 @@ TEST(CoordinationTest, ChangelogReadWrite)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 1000, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
@ -311,7 +251,7 @@ TEST(CoordinationTest, ChangelogReadWrite)
}
EXPECT_EQ(changelog.size(), 10);
DB::NuKeeperLogStore changelog_reader("./logs", 1000, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term());
EXPECT_EQ(changelog_reader.start_index(), changelog.start_index());
@ -330,7 +270,7 @@ TEST(CoordinationTest, ChangelogWriteAt)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 1000, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
@ -346,7 +286,7 @@ TEST(CoordinationTest, ChangelogWriteAt)
EXPECT_EQ(changelog.next_slot(), 8);
DB::NuKeeperLogStore changelog_reader("./logs", 1000, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
EXPECT_EQ(changelog_reader.size(), changelog.size());
EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term());
@ -359,7 +299,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 7; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
@ -371,7 +311,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead)
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
EXPECT_EQ(changelog_reader.size(), 7);
for (size_t i = 7; i < 10; ++i)
@ -407,7 +347,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 3; ++i)
{
@ -448,7 +388,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
EXPECT_EQ(changelog.last_entry()->get_term(), 60);
/// And we able to read it
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(7);
changelog_reader.init(7, 0);
EXPECT_EQ(changelog_reader.size(), 1);
EXPECT_EQ(changelog_reader.start_index(), 7);
EXPECT_EQ(changelog_reader.next_slot(), 8);
@ -459,7 +399,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 100, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
@ -471,7 +411,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations)
auto entries = changelog.pack(1, 5);
DB::NuKeeperLogStore apply_changelog("./logs", 100, true);
apply_changelog.init(1);
apply_changelog.init(1, 0);
for (size_t i = 0; i < 10; ++i)
{
@ -501,7 +441,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 100, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
@ -514,7 +454,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
ChangelogDirTest test1("./logs1");
DB::NuKeeperLogStore changelog_new("./logs1", 100, true);
changelog_new.init(1);
changelog_new.init(1, 0);
EXPECT_EQ(changelog_new.size(), 0);
changelog_new.apply_pack(5, *entries);
@ -533,7 +473,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
EXPECT_EQ(changelog_new.next_slot(), 11);
DB::NuKeeperLogStore changelog_reader("./logs1", 100, true);
changelog_reader.init(5);
changelog_reader.init(5, 0);
}
@ -541,7 +481,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 33; ++i)
{
@ -576,7 +516,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile)
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_read("./logs", 5, true);
changelog_read.init(1);
changelog_read.init(1, 0);
EXPECT_EQ(changelog_read.size(), 7);
EXPECT_EQ(changelog_read.start_index(), 1);
EXPECT_EQ(changelog_read.next_slot(), 8);
@ -587,7 +527,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 33; ++i)
{
@ -622,7 +562,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder)
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_read("./logs", 5, true);
changelog_read.init(1);
changelog_read.init(1, 0);
EXPECT_EQ(changelog_read.size(), 11);
EXPECT_EQ(changelog_read.start_index(), 1);
EXPECT_EQ(changelog_read.next_slot(), 12);
@ -633,7 +573,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 33; ++i)
{
@ -672,7 +612,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
{
@ -691,7 +631,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead)
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
auto entry = getLogEntry("36_hello_world", 360);
changelog_reader.append(entry);
@ -713,7 +653,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
{
@ -733,7 +673,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
plain_buf.truncate(0);
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90);
@ -762,7 +702,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_reader2("./logs", 5, true);
changelog_reader2.init(1);
changelog_reader2.init(1, 0);
EXPECT_EQ(changelog_reader2.size(), 11);
EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777);
}
@ -772,7 +712,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 20, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
{
@ -787,7 +727,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
plain_buf.truncate(140);
DB::NuKeeperLogStore changelog_reader("./logs", 20, true);
changelog_reader.init(1);
changelog_reader.init(1, 0);
EXPECT_EQ(changelog_reader.size(), 2);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450);
@ -800,7 +740,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
DB::NuKeeperLogStore changelog_reader2("./logs", 20, true);
changelog_reader2.init(1);
changelog_reader2.init(1, 0);
EXPECT_EQ(changelog_reader2.size(), 3);
EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777);
}
@ -810,7 +750,7 @@ TEST(CoordinationTest, ChangelogTestLostFiles)
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 20, true);
changelog.init(1);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
{
@ -824,10 +764,476 @@ TEST(CoordinationTest, ChangelogTestLostFiles)
fs::remove("./logs/changelog_1_20.bin");
DB::NuKeeperLogStore changelog_reader("./logs", 20, true);
EXPECT_THROW(changelog_reader.init(5), DB::Exception);
/// It should print error message, but still able to start
changelog_reader.init(5, 0);
EXPECT_FALSE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin"));
}
fs::remove("./logs/changelog_21_40.bin");
EXPECT_THROW(changelog_reader.init(3), DB::Exception);
/// Basic life cycle of a single key outside of snapshot mode:
/// insert, rejected duplicate insert, in-place update, erase.
TEST(CoordinationTest, SnapshotableHashMapSimple)
{
    const char * const key = "hello";
    DB::SnapshotableHashTable<int> table;

    /// First insert succeeds and the value becomes visible.
    EXPECT_TRUE(table.insert(key, 5));
    EXPECT_TRUE(table.contains(key));
    EXPECT_EQ(table.getValue(key), 5);

    /// Inserting an existing key is rejected and leaves the stored value untouched.
    EXPECT_FALSE(table.insert(key, 145));
    EXPECT_EQ(table.getValue(key), 5);

    /// updateValue mutates the stored value through the callback.
    table.updateValue(key, [](int & stored) { stored = 7; });
    EXPECT_EQ(table.getValue(key), 7);
    EXPECT_EQ(table.size(), 1);

    /// Erasing the only key empties the table.
    EXPECT_TRUE(table.erase(key));
    EXPECT_EQ(table.size(), 0);
}
/// Exercises SnapshotableHashTable while snapshot mode is enabled.
/// The assertions below pin the iteration order and the `active_in_map` flag of every element,
/// so the order of the statements matters.
TEST(CoordinationTest, SnapshotableHashMapTrySnapshot)
{
DB::SnapshotableHashTable<int> map_snp;
EXPECT_TRUE(map_snp.insert("/hello", 7));
EXPECT_FALSE(map_snp.insert("/hello", 145));
map_snp.enableSnapshotMode();
/// Duplicate inserts are still rejected in snapshot mode.
EXPECT_FALSE(map_snp.insert("/hello", 145));
/// After an update in snapshot mode the new value is what getValue() returns,
/// snapshotSize() counts two elements while size() still counts one.
map_snp.updateValue("/hello", [](int & value) { value = 554; });
EXPECT_EQ(map_snp.getValue("/hello"), 554);
EXPECT_EQ(map_snp.snapshotSize(), 2);
EXPECT_EQ(map_snp.size(), 1);
/// Iteration yields the old version first (value 7, not active), then the new one (value 554, active).
auto itr = map_snp.begin();
EXPECT_EQ(itr->key, "/hello");
EXPECT_EQ(itr->value, 7);
EXPECT_EQ(itr->active_in_map, false);
itr = std::next(itr);
EXPECT_EQ(itr->key, "/hello");
EXPECT_EQ(itr->value, 554);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
EXPECT_EQ(itr, map_snp.end());
/// Five more keys: "/hello0" .. "/hello4".
for (size_t i = 0; i < 5; ++i)
{
EXPECT_TRUE(map_snp.insert("/hello" + std::to_string(i), i));
}
EXPECT_EQ(map_snp.getValue("/hello3"), 3);
EXPECT_EQ(map_snp.snapshotSize(), 7);
EXPECT_EQ(map_snp.size(), 6);
/// Skip the two "/hello" versions checked above; the new keys follow in insertion order.
itr = std::next(map_snp.begin(), 2);
for (size_t i = 0; i < 5; ++i)
{
EXPECT_EQ(itr->key, "/hello" + std::to_string(i));
EXPECT_EQ(itr->value, i);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
}
/// Erasing in snapshot mode shrinks size() but not snapshotSize():
/// the erased elements are still iterated, only flagged as not active.
EXPECT_TRUE(map_snp.erase("/hello3"));
EXPECT_TRUE(map_snp.erase("/hello2"));
EXPECT_EQ(map_snp.snapshotSize(), 7);
EXPECT_EQ(map_snp.size(), 4);
itr = std::next(map_snp.begin(), 2);
for (size_t i = 0; i < 5; ++i)
{
EXPECT_EQ(itr->key, "/hello" + std::to_string(i));
EXPECT_EQ(itr->value, i);
EXPECT_EQ(itr->active_in_map, i != 3 && i != 2);
itr = std::next(itr);
}
/// After clearOutdatedNodes() only the four active elements remain, in the same relative order.
map_snp.clearOutdatedNodes();
EXPECT_EQ(map_snp.snapshotSize(), 4);
EXPECT_EQ(map_snp.size(), 4);
itr = map_snp.begin();
EXPECT_EQ(itr->key, "/hello");
EXPECT_EQ(itr->value, 554);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
EXPECT_EQ(itr->key, "/hello0");
EXPECT_EQ(itr->value, 0);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
EXPECT_EQ(itr->key, "/hello1");
EXPECT_EQ(itr->value, 1);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
EXPECT_EQ(itr->key, "/hello4");
EXPECT_EQ(itr->value, 4);
EXPECT_EQ(itr->active_in_map, true);
itr = std::next(itr);
EXPECT_EQ(itr, map_snp.end());
map_snp.disableSnapshotMode();
}
/// Test helper: puts a node with the given data and ephemeral owner at `path`
/// directly into the storage container via insertOrReplace (no request processing involved).
void addNode(DB::NuKeeperStorage & storage, const std::string & path, const std::string & data, int64_t ephemeral_owner=0)
{
    DB::NuKeeperStorage::Node new_node{};
    new_node.stat.ephemeralOwner = ephemeral_owner;
    new_node.data = data;
    storage.container.insertOrReplace(path, new_node);
}
/// Round trip of a small storage through a snapshot: serialize to disk, read back into a fresh storage,
/// then compare nodes, counters, ephemerals and sessions.
TEST(CoordinationTest, TestStorageSnapshotSimple)
{
    ChangelogDirTest snapshots_dir("./snapshots");
    DB::NuKeeperSnapshotManager snapshot_manager("./snapshots", 3);

    /// Source storage: two nodes, two ephemeral owners, two sessions.
    DB::NuKeeperStorage source(500);
    addNode(source, "/hello", "world", 1);
    addNode(source, "/hello/somepath", "somedata", 3);
    source.session_id_counter = 5;
    source.zxid = 2;
    source.ephemerals[3] = {"/hello"};
    source.ephemerals[1] = {"/hello/somepath"};
    source.getSessionID(130);
    source.getSessionID(130);

    DB::NuKeeperStorageSnapshot snap(&source, 2);
    EXPECT_EQ(snap.snapshot_meta->get_last_log_idx(), 2);
    EXPECT_EQ(snap.session_id, 7);
    EXPECT_EQ(snap.snapshot_container_size, 3);
    EXPECT_EQ(snap.session_and_timeout.size(), 2);

    /// Write the snapshot to disk ...
    auto serialized = snapshot_manager.serializeSnapshotToBuffer(snap);
    snapshot_manager.serializeSnapshotBufferToDisk(*serialized, 2);
    EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin"));

    /// ... and load it into an empty storage.
    DB::NuKeeperStorage restored(500);
    auto loaded = snapshot_manager.deserializeSnapshotBufferFromDisk(2);
    snapshot_manager.deserializeSnapshotFromBuffer(&restored, loaded);

    auto & nodes = restored.container;
    EXPECT_EQ(nodes.size(), 3);

    EXPECT_EQ(nodes.getValue("/").children.size(), 1);
    EXPECT_EQ(nodes.getValue("/hello").children.size(), 1);
    EXPECT_EQ(nodes.getValue("/hello/somepath").children.size(), 0);

    EXPECT_EQ(nodes.getValue("/").data, "");
    EXPECT_EQ(nodes.getValue("/hello").data, "world");
    EXPECT_EQ(nodes.getValue("/hello/somepath").data, "somedata");

    EXPECT_EQ(restored.session_id_counter, 7);
    EXPECT_EQ(restored.zxid, 2);
    EXPECT_EQ(restored.ephemerals.size(), 2);
    EXPECT_EQ(restored.ephemerals[3].size(), 1);
    EXPECT_EQ(restored.ephemerals[1].size(), 1);
    EXPECT_EQ(restored.session_and_timeout.size(), 2);
}
/// Nodes added after the snapshot object was created must not end up in the serialized snapshot.
TEST(CoordinationTest, TestStorageSnapshotMoreWrites)
{
    ChangelogDirTest snapshots_dir("./snapshots");
    DB::NuKeeperSnapshotManager snapshot_manager("./snapshots", 3);

    DB::NuKeeperStorage source(500);
    source.getSessionID(130);

    /// Adds nodes "/hello_<n>" -> "world_<n>" for n in [first, last).
    const auto add_nodes = [&source](size_t first, size_t last)
    {
        for (size_t n = first; n < last; ++n)
            addNode(source, "/hello_" + std::to_string(n), "world_" + std::to_string(n));
    };

    add_nodes(0, 50);

    DB::NuKeeperStorageSnapshot snap(&source, 50);
    EXPECT_EQ(snap.snapshot_meta->get_last_log_idx(), 50);
    EXPECT_EQ(snap.snapshot_container_size, 51);

    /// Writes that happen after the snapshot was taken.
    add_nodes(50, 100);
    EXPECT_EQ(source.container.size(), 101);

    auto serialized = snapshot_manager.serializeSnapshotToBuffer(snap);
    snapshot_manager.serializeSnapshotBufferToDisk(*serialized, 50);
    EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin"));

    DB::NuKeeperStorage restored(500);
    auto loaded = snapshot_manager.deserializeSnapshotBufferFromDisk(50);
    snapshot_manager.deserializeSnapshotFromBuffer(&restored, loaded);

    /// Only the first 50 nodes (plus the root, per the size of 51) are restored.
    EXPECT_EQ(restored.container.size(), 51);
    for (size_t n = 0; n < 50; ++n)
    {
        const std::string suffix = std::to_string(n);
        EXPECT_EQ(restored.container.getValue("/hello_" + suffix).data, "world_" + suffix);
    }
}
/// Five snapshots are written with a manager constructed with the argument 3:
/// the two oldest files must be gone and restoring picks the latest one.
TEST(CoordinationTest, TestStorageSnapshotManySnapshots)
{
    ChangelogDirTest snapshots_dir("./snapshots");
    DB::NuKeeperSnapshotManager snapshot_manager("./snapshots", 3);

    DB::NuKeeperStorage source(500);
    source.getSessionID(130);

    const auto snapshot_path = [](size_t log_idx)
    {
        return std::string{"./snapshots/snapshot_"} + std::to_string(log_idx) + ".bin";
    };

    for (size_t round = 1; round <= 5; ++round)
    {
        const size_t up_to = round * 50;

        /// Add the next 50 nodes, then snapshot everything written so far.
        for (size_t n = (round - 1) * 50; n < up_to; ++n)
            addNode(source, "/hello_" + std::to_string(n), "world_" + std::to_string(n));

        DB::NuKeeperStorageSnapshot snap(&source, up_to);
        auto serialized = snapshot_manager.serializeSnapshotToBuffer(snap);
        snapshot_manager.serializeSnapshotBufferToDisk(*serialized, up_to);
        EXPECT_TRUE(fs::exists(snapshot_path(up_to)));
    }

    /// Only the three most recent snapshot files are left on disk.
    EXPECT_FALSE(fs::exists(snapshot_path(50)));
    EXPECT_FALSE(fs::exists(snapshot_path(100)));
    EXPECT_TRUE(fs::exists(snapshot_path(150)));
    EXPECT_TRUE(fs::exists(snapshot_path(200)));
    EXPECT_TRUE(fs::exists(snapshot_path(250)));

    DB::NuKeeperStorage restored(500);
    snapshot_manager.restoreFromLatestSnapshot(&restored);

    EXPECT_EQ(restored.container.size(), 251);
    for (size_t n = 0; n < 250; ++n)
    {
        const std::string suffix = std::to_string(n);
        EXPECT_EQ(restored.container.getValue("/hello_" + suffix).data, "world_" + suffix);
    }
}
/// Checks that a snapshot object keeps seeing the storage content from the moment it was created,
/// even though the storage is overwritten and partially erased before the snapshot is serialized.
/// The inner scope controls the lifetime of the snapshot object, so statement order is significant.
TEST(CoordinationTest, TestStorageSnapshotMode)
{
ChangelogDirTest test("./snapshots");
DB::NuKeeperSnapshotManager manager("./snapshots", 3);
DB::NuKeeperStorage storage(500);
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
}
{
DB::NuKeeperStorageSnapshot snapshot(&storage, 50);
/// Overwrite every node while the snapshot object is alive ...
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "wlrd_" + std::to_string(i));
}
/// ... the live storage returns the new values ...
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i));
}
/// ... and erase every even-numbered node.
for (size_t i = 0; i < 50; ++i)
{
if (i % 2 == 0)
storage.container.erase("/hello_" + std::to_string(i));
}
/// size() reflects the erases (26), snapshotSize() still counts the older versions as well (101).
EXPECT_EQ(storage.container.size(), 26);
EXPECT_EQ(storage.container.snapshotSize(), 101);
auto buf = manager.serializeSnapshotToBuffer(snapshot);
manager.serializeSnapshotBufferToDisk(*buf, 50);
}
EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin"));
EXPECT_EQ(storage.container.size(), 26);
/// After the cleanup snapshotSize() drops to the same 26 elements as size().
storage.clearGarbageAfterSnapshot();
EXPECT_EQ(storage.container.snapshotSize(), 26);
for (size_t i = 0; i < 50; ++i)
{
if (i % 2 != 0)
EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i));
else
EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i)));
}
/// The snapshot on disk holds the original "world_" values for all 50 nodes.
DB::NuKeeperStorage restored_storage(500);
manager.restoreFromLatestSnapshot(&restored_storage);
for (size_t i = 0; i < 50; ++i)
{
EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i));
}
}
/// Restoring from a snapshot file that was truncated on disk must throw DB::Exception.
TEST(CoordinationTest, TestStorageSnapshotBroken)
{
    ChangelogDirTest snapshots_dir("./snapshots");
    DB::NuKeeperSnapshotManager snapshot_manager("./snapshots", 3);

    DB::NuKeeperStorage source(500);
    for (size_t n = 0; n < 50; ++n)
    {
        const std::string suffix = std::to_string(n);
        addNode(source, "/hello_" + suffix, "world_" + suffix);
    }

    {
        DB::NuKeeperStorageSnapshot snap(&source, 50);
        auto serialized = snapshot_manager.serializeSnapshotToBuffer(snap);
        snapshot_manager.serializeSnapshotBufferToDisk(*serialized, 50);
    }
    EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin"));

    /// Let's corrupt the file: cut it down to its first 34 bytes.
    DB::WriteBufferFromFile snapshot_file("./snapshots/snapshot_50.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
    snapshot_file.truncate(34);
    snapshot_file.sync();

    DB::NuKeeperStorage restored(500);
    EXPECT_THROW(snapshot_manager.restoreFromLatestSnapshot(&restored), DB::Exception);
}
/// Serializes `session_id` (binary integer) followed by the request itself into a NuRaft buffer.
nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
    DB::WriteBufferFromNuraftBuffer out;

    DB::writeIntBinary(session_id, out);
    request->write(out);

    return out.getBuffer();
}
/// Wraps a serialized (session_id, request) pair into a NuRaft log entry with the given term.
nuraft::ptr<nuraft::log_entry> getLogEntryFromZKRequest(size_t term, int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
    return nuraft::cs_new<nuraft::log_entry>(term, getBufferFromZKRequest(session_id, request));
}
/// End-to-end check of recovery from snapshot + changelog.
///
/// Appends `total_logs` create requests ("/hello_<i>") to a log store and commits each of them to a state machine.
/// Every `settings->snapshot_distance` commits a snapshot is created; after a successful snapshot the log is
/// compacted up to `i - settings->reserved_log_items` when it holds more than `reserved_log_items` entries.
/// Then a second state machine and a second log store are initialised from the same "./snapshots" and "./logs"
/// directories, the tail of the restored log is replayed into the restored state machine and both storages
/// are compared.
void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, size_t total_logs)
{
    using namespace Coordination;
    using namespace DB;

    ChangelogDirTest snapshots("./snapshots");
    ChangelogDirTest logs("./logs");

    ResponsesQueue queue;
    SnapshotsQueue snapshots_queue{1};
    auto state_machine = std::make_shared<NuKeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
    state_machine->init();

    DB::NuKeeperLogStore changelog("./logs", settings->rotate_log_storage_interval, true);
    changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items);

    for (size_t i = 1; i < total_logs + 1; ++i)
    {
        std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
        request->path = "/hello_" + std::to_string(i);
        auto entry = getLogEntryFromZKRequest(0, 1, request);
        changelog.append(entry);

        state_machine->commit(i, changelog.entry_at(i)->get_buf());

        bool snapshot_created = false;
        if (i % settings->snapshot_distance == 0)
        {
            nuraft::snapshot s(i, 0, std::make_shared<nuraft::cluster_config>());
            nuraft::async_result<bool>::handler_type when_done = [&snapshot_created] (bool & ret, nuraft::ptr<std::exception> &/*exception*/)
            {
                snapshot_created = ret;
                std::cerr << "Snapshot finished\n";
            };

            /// create_snapshot only enqueues a task; the test pops it and runs it inline.
            state_machine->create_snapshot(s, when_done);
            CreateSnapshotTask snapshot_task;
            snapshots_queue.pop(snapshot_task);
            snapshot_task.create_snapshot(std::move(snapshot_task.snapshot));
        }

        if (snapshot_created)
        {
            if (changelog.size() > settings->reserved_log_items)
            {
                changelog.compact(i - settings->reserved_log_items);
            }
        }
    }

    SnapshotsQueue snapshots_queue1{1};
    auto restore_machine = std::make_shared<NuKeeperStateMachine>(queue, snapshots_queue1, "./snapshots", settings);
    restore_machine->init();
    /// The restored machine starts at the index of the last snapshot.
    EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance);

    DB::NuKeeperLogStore restore_changelog("./logs", settings->rotate_log_storage_interval, true);
    restore_changelog.init(restore_machine->last_commit_index() + 1, settings->reserved_log_items);

    EXPECT_EQ(restore_changelog.size(), std::min(settings->reserved_log_items + total_logs % settings->snapshot_distance, total_logs));
    EXPECT_EQ(restore_changelog.next_slot(), total_logs + 1);
    if (total_logs > settings->reserved_log_items + 1)
        EXPECT_EQ(restore_changelog.start_index(), total_logs - total_logs % settings->snapshot_distance - settings->reserved_log_items + 1);
    else
        EXPECT_EQ(restore_changelog.start_index(), 1);

    /// Replay the entries that are not covered by the snapshot. They are taken from the *restored* log,
    /// so that the content read back from disk is what actually gets verified below.
    for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i)
    {
        restore_machine->commit(i, restore_changelog.entry_at(i)->get_buf());
    }

    auto & source_storage = state_machine->getStorage();
    auto & restored_storage = restore_machine->getStorage();

    EXPECT_EQ(source_storage.container.size(), restored_storage.container.size());
    for (size_t i = 1; i < total_logs + 1; ++i)
    {
        auto path = "/hello_" + std::to_string(i);
        EXPECT_EQ(source_storage.container.getValue(path).data, restored_storage.container.getValue(path).data);
    }
}
/// Runs testLogAndStateMachine over several combinations of snapshot distance, reserved log items,
/// log rotation interval and number of log entries. Every scenario gets a fresh settings object.
TEST(CoordinationTest, TestStateMachineAndLogStore)
{
    using namespace Coordination;
    using namespace DB;

    const auto run_scenario = [](uint64_t snapshot_distance, uint64_t reserved_log_items, uint64_t rotate_log_storage_interval, size_t total_logs)
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = snapshot_distance;
        settings->reserved_log_items = reserved_log_items;
        settings->rotate_log_storage_interval = rotate_log_storage_interval;
        testLogAndStateMachine(settings, total_logs);
    };

    ///           distance  reserved  rotate  total
    run_scenario(10,       10,       10,     37);
    run_scenario(10,       10,       10,     11);
    run_scenario(10,       10,       10,     40);
    run_scenario(10,       20,       30,     40);
    run_scenario(10,       0,        10,     40);
    run_scenario(1,        1,        32,     32);
    run_scenario(10,       7,        1,      33);
    run_scenario(37,       1000,     5000,   33);
    run_scenario(37,       1000,     5000,   45);
}
int main(int argc, char ** argv)

View File

@ -518,6 +518,7 @@ class IColumn;
M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \
M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \
M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \
M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \
// End of FORMAT_FACTORY_SETTINGS

View File

@ -366,4 +366,20 @@ private:
}
};
/// Lexicographic "strictly less" of row `i` of `lhs` against row `j` of `rhs` under the sort description.
/// For every described column the result of compareAt is multiplied by the column's direction;
/// the first non-zero result decides. Rows that compare equal on all columns are not "less".
template <typename TLeftColumns, typename TRightColumns>
bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescription & descr)
{
    for (const auto & sort_column : descr)
    {
        const size_t column_index = sort_column.column_number;
        const int ordering
            = sort_column.direction * lhs[column_index]->compareAt(i, j, *rhs[column_index], sort_column.nulls_direction);

        if (ordering != 0)
            return ordering < 0;
    }

    return false;
}
}

View File

@ -108,6 +108,13 @@ StoragePtr DatabaseAtomic::detachTable(const String & name)
void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay)
{
if (auto * mv = dynamic_cast<StorageMaterializedView *>(tryGetTable(table_name, context).get()))
{
/// Remove the inner table (if any) to avoid deadlock
/// (due to attempt to execute DROP from the worker thread)
mv->dropInnerTable(no_delay, context);
}
String table_metadata_path = getObjectMetadataPath(table_name);
String table_metadata_path_drop;
StoragePtr table;
@ -131,10 +138,7 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam
}
if (table->storesDataOnDisk())
tryRemoveSymlink(table_name);
/// Remove the inner table (if any) to avoid deadlock
/// (due to attempt to execute DROP from the worker thread)
if (auto * mv = dynamic_cast<StorageMaterializedView *>(table.get()))
mv->dropInnerTable(no_delay, context);
/// Notify DatabaseCatalog that table was dropped. It will remove table data in background.
/// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete.
DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay);

View File

@ -1,226 +0,0 @@
#pragma once
#include <Common/HashTable/Hash.h>
#include <common/logger_useful.h>
#include <type_traits>
#include <vector>
namespace DB
{
namespace
{
    /// Returns the smallest power of two that is not less than `x`, with a lower bound of 8.
    /// Despite the name, zero is never returned: inputs 0..8 all yield 8.
    ///
    /// If `x` is larger than the greatest power of two representable in size_t, that greatest power of two
    /// is returned. Without this guard the shift would wrap to 0 and the loop would never terminate.
    inline size_t roundUpToPowerOfTwoOrZero(size_t x)
    {
        /// Only the top bit set.
        constexpr size_t max_power_of_two = (static_cast<size_t>(-1) >> 1) + 1;

        size_t r = 8;
        while (x > r && r < max_power_of_two)
            r <<= 1;
        return r;
    }
}
/// Tag type meaning "no deleter": it is the default `Deleter` argument of BucketCacheIndex,
/// which compares its Deleter against this type to decide whether a deleter has to be invoked.
struct EmptyDeleter {};
/// Hash functor for integer keys: forwards the key to intHash64.
struct Int64Hasher
{
    size_t operator()(const size_t value) const { return intHash64(value); }
};
/*
    Class for storing cache index.
    It consists of two arrays.
    The first one is split into buckets (each stores 8 elements (cells)) determined by hash of the element key.
    The second one is split into 4bit numbers, which are positions in bucket for next element write
    (two positions are packed into every byte).

    Requirements visible from the code below:
    - V provides exists() and setNotExists();
    - Hasher is callable on K from const methods;
    - K is comparable with == and !=;
    - when Deleter is not EmptyDeleter it is callable on K.

    NOTE(review): the write position of a bucket advances only when an empty cell is filled. Once the cell at
    the write position is occupied, further new keys overwrite that same cell. The original description calls
    this "FIFO eviction inside each bucket" - confirm that this is the intended behaviour.
*/
template <typename K, typename V, typename Hasher, typename Deleter = EmptyDeleter>
class BucketCacheIndex
{
    struct Cell
    {
        K key;
        V index;
    };

public:
    /// Creates an index without a deleter. Participates in overload resolution only if Deleter is EmptyDeleter.
    /// The requested number of cells is rounded up by roundUpToPowerOfTwoOrZero.
    /// (The extra parameter D makes the condition dependent, otherwise SFINAE cannot apply.)
    template <typename D = Deleter, typename = std::enable_if_t<std::is_same_v<EmptyDeleter, D>>>
    BucketCacheIndex(size_t cells_)
        : buckets(roundUpToPowerOfTwoOrZero(cells_) / bucket_size)
        , bucket_mask(buckets - 1)
        , cells(buckets * bucket_size)
        , positions((buckets / 2) + 1)
    {
        resetCellsAndPositions();
    }

    /// Creates an index with a deleter. Participates in overload resolution only if Deleter is not EmptyDeleter.
    template <typename D = Deleter, typename = std::enable_if_t<!std::is_same_v<EmptyDeleter, D>>>
    BucketCacheIndex(size_t cells_, Deleter deleter_)
        : deleter(deleter_)
        , buckets(roundUpToPowerOfTwoOrZero(cells_) / bucket_size)
        , bucket_mask(buckets - 1)
        , cells(buckets * bucket_size)
        , positions((buckets / 2) + 1)
    {
        resetCellsAndPositions();
    }

    /// Stores `val` under `key`. If the chosen cell is occupied (by the same or by another key),
    /// it is overwritten without calling the deleter.
    void set(K key, V val)
    {
        const size_t bucket = (hash(key) & bucket_mask);
        const size_t idx = getCellIndex(key, bucket);
        if (!cells[idx].index.exists())
        {
            incPosition(bucket);
            ++sz;
        }

        cells[idx].key = key;
        cells[idx].index = val;
    }

    /// Same as set(), but the deleter is called for the key of an occupied cell before it is overwritten.
    /// Participates in overload resolution only if Deleter is not EmptyDeleter.
    template <typename D = Deleter, typename = std::enable_if_t<!std::is_same_v<EmptyDeleter, D>>>
    void setWithDelete(K key, V val)
    {
        const size_t bucket = (hash(key) & bucket_mask);
        const size_t idx = getCellIndex(key, bucket);
        if (!cells[idx].index.exists())
        {
            incPosition(bucket);
            ++sz;
        }
        else
        {
            deleter(cells[idx].key);
        }

        cells[idx].key = key;
        cells[idx].index = val;
    }

    /// Copies the value stored under `key` into `val`. Returns false (and leaves `val` untouched) if absent.
    bool get(K key, V & val) const
    {
        const size_t bucket = (hash(key) & bucket_mask);
        const size_t idx = getCellIndex(key, bucket);
        if (!cells[idx].index.exists() || cells[idx].key != key)
            return false;

        val = cells[idx].index;
        return true;
    }

    /// Like get(), but on success `key` is additionally replaced by the key object stored in the cell.
    bool getKeyAndValue(K & key, V & val) const
    {
        const size_t bucket = (hash(key) & bucket_mask);
        const size_t idx = getCellIndex(key, bucket);
        if (!cells[idx].index.exists() || cells[idx].key != key)
            return false;

        key = cells[idx].key;
        val = cells[idx].index;
        return true;
    }

    /// Marks the cell holding `key` as not existing and, if a deleter is configured, calls it for the stored key.
    /// Returns false if the key is absent.
    bool erase(K key)
    {
        const size_t bucket = (hash(key) & bucket_mask);
        const size_t idx = getCellIndex(key, bucket);
        if (!cells[idx].index.exists() || cells[idx].key != key)
            return false;

        cells[idx].index.setNotExists();
        --sz;
        if constexpr (!std::is_same_v<EmptyDeleter, Deleter>)
            deleter(cells[idx].key);

        return true;
    }

    /// Number of existing cells.
    size_t size() const
    {
        return sz;
    }

    /// Total number of cells.
    size_t capacity() const
    {
        return cells.size();
    }

    /// Keys of all existing cells, in cell order.
    auto keys() const
    {
        std::vector<K> res;
        for (const auto & cell : cells)
        {
            if (cell.index.exists())
            {
                res.push_back(cell.key);
            }
        }
        return res;
    }

private:
    /// Shared tail of both constructors: every cell starts as not existing, every write position starts at 0.
    void resetCellsAndPositions()
    {
        for (auto & cell : cells)
            cell.index.setNotExists();
        for (size_t bucket = 0; bucket < buckets; ++bucket)
            setPosition(bucket, 0);
    }

    /// Searches for the key in the bucket.
    /// Returns index of cell with provided key;
    /// if there is no such cell, returns the index of the bucket's current write position.
    size_t getCellIndex(const K key, const size_t bucket) const
    {
        const size_t pos = getPosition(bucket);
        /// Visit all cells of the bucket, starting at the write position itself and going down to pos + 1.
        for (size_t step = bucket_size; step > 0; --step)
        {
            const size_t cur = ((pos + step) & pos_mask);
            if (cells[bucket * bucket_size + cur].index.exists() &&
                cells[bucket * bucket_size + cur].key == key)
            {
                return bucket * bucket_size + cur;
            }
        }

        return bucket * bucket_size + pos;
    }

    /// Returns current position for write in the bucket.
    /// Even buckets use the high 4 bits of the byte, odd buckets the low 4 bits.
    size_t getPosition(const size_t bucket) const
    {
        const size_t idx = (bucket >> 1);
        if ((bucket & 1) == 0)
            return ((positions[idx] >> 4) & pos_mask);
        return (positions[idx] & pos_mask);
    }

    /// Sets current position in the bucket, keeping the other half of the byte intact.
    void setPosition(const size_t bucket, const size_t pos)
    {
        const size_t idx = bucket >> 1;
        if ((bucket & 1) == 0)
            positions[idx] = ((pos << 4) | (positions[idx] & ((1 << 4) - 1)));
        else
            positions[idx] = (pos | (positions[idx] & (((1 << 4) - 1) << 4)));
    }

    /// Moves the write position of the bucket to the next cell, wrapping around inside the bucket.
    void incPosition(const size_t bucket)
    {
        setPosition(bucket, (getPosition(bucket) + 1) & pos_mask);
    }

    static constexpr size_t bucket_size = 8;
    static constexpr size_t pos_size = 3;
    static constexpr size_t pos_mask = (1 << pos_size) - 1;

    Hasher hash;
    Deleter deleter;

    size_t buckets;
    size_t bucket_mask;

    std::vector<Cell> cells;
    std::vector<char> positions;
    size_t sz = 0;
};
}

View File

@ -20,6 +20,10 @@ target_link_libraries(clickhouse_dictionaries
string_utils
)
# abseil_swiss_tables is the INTERFACE library bundling absl::flat_hash_map and absl::flat_hash_set.
# It is linked PUBLIC, so targets that link clickhouse_dictionaries inherit it as well.
target_link_libraries(clickhouse_dictionaries PUBLIC abseil_swiss_tables)

# Cassandra headers are added (as SYSTEM includes) only when Cassandra support is enabled.
if(USE_CASSANDRA)
    target_include_directories(
        clickhouse_dictionaries
        SYSTEM PRIVATE
        ${CASSANDRA_INCLUDE_DIR}
    )
endif()

File diff suppressed because it is too large Load Diff

View File

@ -3,72 +3,76 @@
#include <atomic>
#include <chrono>
#include <cmath>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <utility>
#include <variant>
#include <vector>
#include <common/logger_useful.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Common/ThreadPool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <pcg_random.hpp>
#include <Common/ArenaWithFreeLists.h>
#include <common/logger_useful.h>
#include <Common/randomSeed.h>
#include <Common/ThreadPool.h>
#include <Common/CurrentMetrics.h>
#include <ext/bit_cast.h>
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "IDictionarySource.h"
#include "DictionaryHelpers.h"
namespace CurrentMetrics
{
extern const Metric CacheDictionaryUpdateQueueBatches;
extern const Metric CacheDictionaryUpdateQueueKeys;
}
#include <Dictionaries/IDictionary.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionarySource.h>
#include <Dictionaries/DictionaryHelpers.h>
#include <Dictionaries/CacheDictionaryUpdateQueue.h>
namespace DB
{
/** CacheDictionary stores keys in a cache storage and can perform both asynchronous and synchronous updates while keys are fetched.
namespace ErrorCodes
{
}
If keys are not found in storage during a fetch, the dictionary starts an update operation through the update queue.
/*
*
* This dictionary is stored in a cache that has a fixed number of cells.
* These cells contain frequently used elements.
* When searching for a dictionary, the cache is searched first and special heuristic is used:
* while looking for the key, we take a look only at max_collision_length elements.
* So, our cache is not perfect. It has errors like "the key is in cache, but the cache says that it does not".
* And in this case we simply ask external source for the key which is faster.
* You have to keep this logic in mind.
* */
During the update operation the necessary keys are fetched from the source and inserted into storage.
After that, data from storage and from the source is aggregated and returned to the client.
Typical flow:
1. The client requests data, for example during a getColumn function call.
2. CacheDictionary requests data from storage; if all data is found in storage, it returns the result to the client.
3. If some data is not in storage, the cache dictionary tries to perform an update.
If all keys are merely expired and the allow_read_expired_keys option is set, the dictionary starts an asynchronous update and
returns the result to the client.
If there are keys that were not found, the dictionary starts a synchronous update and waits for the result.
4. After getting the result of the synchronous update, the dictionary aggregates the data previously fetched from
storage with the data fetched during the update and returns the result to the client.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionary final : public IDictionary
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by cache dictionary");
CacheDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
CacheDictionaryStoragePtr cache_storage_ptr_,
CacheDictionaryUpdateQueueConfiguration update_queue_configuration_,
DictionaryLifetime dict_lifetime_,
size_t strict_max_lifetime_seconds,
size_t size_,
bool allow_read_expired_keys_,
size_t max_update_queue_size_,
size_t update_queue_push_timeout_milliseconds_,
size_t query_wait_timeout_milliseconds,
size_t max_threads_for_updates);
bool allow_read_expired_keys_);
~CacheDictionary() override;
std::string getTypeName() const override { return "Cache"; }
std::string getTypeName() const override { return cache_storage_ptr->getName(); }
size_t getElementCount() const override;
size_t getBytesAllocated() const override;
double getLoadFactor() const override;
size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }
double getHitRate() const override
@ -76,10 +80,6 @@ public:
return static_cast<double>(hit_count.load(std::memory_order_acquire)) / query_count.load(std::memory_order_relaxed);
}
size_t getElementCount() const override { return element_count.load(std::memory_order_relaxed); }
double getLoadFactor() const override { return static_cast<double>(element_count.load(std::memory_order_relaxed)) / size; }
bool supportUpdates() const override { return false; }
std::shared_ptr<const IExternalLoadable> clone() const override
@ -88,14 +88,10 @@ public:
getDictionaryID(),
dict_struct,
getSourceAndUpdateIfNeeded()->clone(),
cache_storage_ptr,
update_queue.getConfiguration(),
dict_lifetime,
strict_max_lifetime_seconds,
size,
allow_read_expired_keys,
max_update_queue_size,
update_queue_push_timeout_milliseconds,
query_wait_timeout_milliseconds,
max_threads_for_updates);
allow_read_expired_keys);
}
const IDictionarySource * getSource() const override;
@ -106,133 +102,78 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
bool hasHierarchy() const override { return hierarchical_attribute; }
void toParent(const PaddedPODArray<Key> & ids, PaddedPODArray<Key> & out) const override;
void isInVectorVector(
const PaddedPODArray<Key> & child_ids, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(const PaddedPODArray<Key> & child_ids, const Key ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(const Key child_id, const PaddedPODArray<Key> & ancestor_ids, PaddedPODArray<UInt8> & out) const override;
std::exception_ptr getLastException() const override;
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; }
DictionaryKeyType getKeyType() const override
{
return dictionary_key_type == DictionaryKeyType::simple ? DictionaryKeyType::simple : DictionaryKeyType::complex;
}
ColumnPtr getColumn(
const std::string& attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr default_values_column) const override;
const ColumnPtr & default_values_column) const override;
Columns getColumns(
const Strings & attribute_names,
const DataTypes & result_types,
const Columns & key_columns,
const DataTypes & key_types,
const Columns & default_values_columns) const override;
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
template <typename T>
using ResultArrayType = std::conditional_t<IsDecimalNumber<T>, DecimalPaddedPODArray<T>, PaddedPODArray<T>>;
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
std::exception_ptr getLastException() const override;
bool hasHierarchy() const override { return dictionary_key_type == DictionaryKeyType::simple && hierarchical_attribute; }
void toParent(const PaddedPODArray<UInt64> & ids, PaddedPODArray<UInt64> & out) const override;
void isInVectorVector(
const PaddedPODArray<UInt64> & child_ids,
const PaddedPODArray<UInt64> & ancestor_ids,
PaddedPODArray<UInt8> & out) const override;
void isInVectorConstant(
const PaddedPODArray<UInt64> & child_ids,
const UInt64 ancestor_id, PaddedPODArray<UInt8> & out) const override;
void isInConstantVector(
const UInt64 child_id,
const PaddedPODArray<UInt64> & ancestor_ids,
PaddedPODArray<UInt8> & out) const override;
private:
template <typename Value>
using ContainerType = Value[];
template <typename Value>
using ContainerPtrType = std::unique_ptr<ContainerType<Value>>;
using FetchResult = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, SimpleKeysStorageFetchResult, ComplexKeysStorageFetchResult>;
using time_point_t = std::chrono::system_clock::time_point;
Columns getColumnsImpl(
const Strings & attribute_names,
const Columns & key_columns,
const PaddedPODArray<KeyType> & keys,
const Columns & default_values_columns) const;
struct CellMetadata final
{
UInt64 id;
time_point_t deadline;
bool is_default{false};
static MutableColumns aggregateColumnsInOrderOfKeys(
const PaddedPODArray<KeyType> & keys,
const DictionaryStorageFetchRequest & request,
const MutableColumns & fetched_columns,
const PaddedPODArray<KeyState> & key_index_to_state);
time_point_t expiresAt() const { return deadline; }
void setExpiresAt(const time_point_t & t) { deadline = t; is_default = false; }
bool isDefault() const { return is_default; }
void setDefault() { is_default = true; }
};
static MutableColumns aggregateColumns(
const PaddedPODArray<KeyType> & keys,
const DictionaryStorageFetchRequest & request,
const MutableColumns & fetched_columns_from_storage,
const PaddedPODArray<KeyState> & key_index_to_fetched_columns_from_storage_result,
const MutableColumns & fetched_columns_during_update,
const HashMap<KeyType, size_t> & found_keys_to_fetched_columns_during_update_index);
using AttributeValue = std::variant<
UInt8, UInt16, UInt32, UInt64, UInt128,
Int8, Int16, Int32, Int64,
Decimal32, Decimal64, Decimal128,
Float32, Float64, String>;
void setupHierarchicalAttribute();
struct AttributeValuesForKey
{
bool found{false};
std::vector<AttributeValue> values;
std::string dump();
};
using FoundValuesForKeys = std::unordered_map<Key, AttributeValuesForKey>;
struct Attribute final
{
AttributeUnderlyingType type;
String name;
/// Default value for each type. Could be defined in config.
AttributeValue null_value;
/// We store attribute value for all keys. It is a "row" in a hand-made open addressing hashtable,
/// where "column" is key.
std::variant<
ContainerPtrType<UInt8>,
ContainerPtrType<UInt16>,
ContainerPtrType<UInt32>,
ContainerPtrType<UInt64>,
ContainerPtrType<UInt128>,
ContainerPtrType<Int8>,
ContainerPtrType<Int16>,
ContainerPtrType<Int32>,
ContainerPtrType<Int64>,
ContainerPtrType<Decimal32>,
ContainerPtrType<Decimal64>,
ContainerPtrType<Decimal128>,
ContainerPtrType<Float32>,
ContainerPtrType<Float64>,
ContainerPtrType<StringRef>>
arrays;
};
void createAttributes();
/* NOLINTNEXTLINE(readability-convert-member-functions-to-static) */
Attribute createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value);
template <typename AttributeType, typename OutputType, typename DefaultValueExtractor>
void getItemsNumberImpl(
Attribute & attribute,
const PaddedPODArray<Key> & ids,
ResultArrayType<OutputType> & out,
DefaultValueExtractor & default_value_extractor) const;
void getItemsString(
Attribute & attribute,
const PaddedPODArray<Key> & ids,
ColumnString * out,
DictionaryDefaultValueExtractor<String> & default_value_extractor) const;
PaddedPODArray<Key> getCachedIds() const;
bool isEmptyCell(const UInt64 idx) const;
size_t getCellIdx(const Key id) const;
void setDefaultAttributeValue(Attribute & attribute, const Key idx) const;
void setAttributeValue(Attribute & attribute, const Key idx, const Field & value) const;
static std::vector<AttributeValue> getAttributeValuesFromBlockAtPosition(const std::vector<const IColumn *> & column_ptrs, size_t position);
Attribute & getAttribute(const std::string & attribute_name) const;
size_t getAttributeIndex(const std::string & attribute_name) const;
using SharedDictionarySourcePtr = std::shared_ptr<IDictionarySource>;
void update(CacheDictionaryUpdateUnitPtr<dictionary_key_type> update_unit_ptr);
/// Update dictionary source pointer if required and return it. Thread safe.
/// MultiVersion is not used here because it works with constant pointers.
@ -252,47 +193,6 @@ private:
return source_ptr;
}
inline void setLifetime(CellMetadata & cell, time_point_t now)
{
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
{
std::uniform_int_distribution<UInt64> distribution{dict_lifetime.min_sec, dict_lifetime.max_sec};
cell.setExpiresAt(now + std::chrono::seconds{distribution(rnd_engine)});
}
else
{
/// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds
/// to the expiration time. And it overflows pretty well.
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(strict_max_lifetime_seconds));
}
}
inline bool isExpired(time_point_t now, time_point_t deadline) const
{
return now > deadline;
}
inline bool isExpiredPermanently(time_point_t now, time_point_t deadline) const
{
return now > deadline + std::chrono::seconds(strict_max_lifetime_seconds);
}
enum class ResultState
{
NotFound,
FoundAndValid,
FoundButExpired,
/// Here is a gap between there two states in which a key could be read
/// with an enabled setting in config enable_read_expired_keys.
FoundButExpiredPermanently
};
using FindResult = std::pair<size_t, ResultState>;
FindResult findCellIdxForGet(const Key & id, const time_point_t now) const;
size_t findCellIdxForSet(const Key & id) const;
template <typename AncestorType>
void isInImpl(const PaddedPODArray<Key> & child_ids, const AncestorType & ancestor_ids, PaddedPODArray<UInt8> & out) const;
@ -302,110 +202,34 @@ private:
mutable std::mutex source_mutex;
mutable SharedDictionarySourcePtr source_ptr;
CacheDictionaryStoragePtr cache_storage_ptr;
mutable CacheDictionaryUpdateQueue<dictionary_key_type> update_queue;
const DictionaryLifetime dict_lifetime;
const size_t strict_max_lifetime_seconds;
const bool allow_read_expired_keys;
const size_t max_update_queue_size;
const size_t update_queue_push_timeout_milliseconds;
const size_t query_wait_timeout_milliseconds;
const size_t max_threads_for_updates;
Poco::Logger * log;
const bool allow_read_expired_keys;
mutable pcg64 rnd_engine;
/// This lock is used for the inner cache state update function lock it for
/// write, when it need to update cache state all other functions just
/// readers. Surprisingly this lock is also used for last_exception pointer.
mutable std::shared_mutex rw_lock;
/// Actual size will be increased to match power of 2
const size_t size;
/// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111)
const size_t size_overlap_mask;
/// Max tries to find cell, overlapped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3
static constexpr size_t max_collision_length = 10;
const size_t zero_cell_idx{getCellIdx(0)};
std::map<std::string, size_t> attribute_index_by_name;
mutable std::vector<Attribute> attributes;
mutable std::vector<CellMetadata> cells;
Attribute * hierarchical_attribute = nullptr;
std::unique_ptr<ArenaWithFreeLists> string_arena;
const DictionaryAttribute * hierarchical_attribute = nullptr;
mutable std::exception_ptr last_exception;
mutable std::atomic<size_t> error_count{0};
mutable std::atomic<size_t> error_count {0};
mutable std::atomic<std::chrono::system_clock::time_point> backoff_end_time{std::chrono::system_clock::time_point{}};
mutable pcg64 rnd_engine;
mutable size_t bytes_allocated = 0;
mutable std::atomic<size_t> element_count{0};
mutable std::atomic<size_t> hit_count{0};
mutable std::atomic<size_t> query_count{0};
/*
* How the update goes: we basically have a method like get(keys)->values. Values are cached, so sometimes we
* can return them from the cache. For values not in cache, we query them from the source, and add to the
* cache. The cache is lossy, so we can't expect it to store all the keys, and we store them separately.
* So, there is a map of found keys to all its attributes.
*/
struct UpdateUnit
{
explicit UpdateUnit(std::vector<Key> && requested_ids_) :
requested_ids(std::move(requested_ids_)),
alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, requested_ids.size())
{
found_ids.reserve(requested_ids.size());
for (const auto id : requested_ids)
found_ids.insert({id, {}});
}
std::vector<Key> requested_ids;
FoundValuesForKeys found_ids;
std::atomic<bool> is_done{false};
std::exception_ptr current_exception{nullptr};
/// While UpdateUnit is alive, it is accounted in update_queue size.
CurrentMetrics::Increment alive_batch{CurrentMetrics::CacheDictionaryUpdateQueueBatches};
CurrentMetrics::Increment alive_keys;
std::string dumpFoundIds();
};
using UpdateUnitPtr = std::shared_ptr<UpdateUnit>;
using UpdateQueue = ConcurrentBoundedQueue<UpdateUnitPtr>;
mutable UpdateQueue update_queue;
ThreadPool update_pool;
/*
* Actually, we can divide all requested keys into two 'buckets'. There are only four possible states and they
* are described in the table.
*
* cache_not_found_ids |0|0|1|1|
* cache_expired_ids |0|1|0|1|
*
* 0 - if set is empty, 1 - otherwise
*
* Only if there are no cache_not_found_ids and some cache_expired_ids
* (with allow_read_expired_keys setting) we can perform async update.
* Otherwise we have no concatenate ids and update them sync.
*
*/
void updateThreadFunction();
void update(UpdateUnitPtr & update_unit_ptr);
void tryPushToUpdateQueueOrThrow(UpdateUnitPtr & update_unit_ptr) const;
void waitForCurrentUpdateFinish(UpdateUnitPtr & update_unit_ptr) const;
mutable std::mutex update_mutex;
mutable std::condition_variable is_update_finished;
std::atomic<bool> finished{false};
};
extern template class CacheDictionary<DictionaryKeyType::simple>;
extern template class CacheDictionary<DictionaryKeyType::complex>;
}

View File

@ -0,0 +1,418 @@
#pragma once
#include <chrono>
#include <pcg_random.hpp>
#include <Common/randomSeed.h>
#include <Common/Arena.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/HashTable/LRUHashMap.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
#include <Dictionaries/DictionaryHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/// Settings of CacheDictionaryStorage. The storage keeps its own copy of this structure.
struct CacheDictionaryStorageConfiguration
{
/// Max size of storage in cells. Passed to the LRU cache as its size limit.
const size_t max_size_in_cells;
/// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime.
/// A cell whose deadline was passed by more than this many seconds is reported as not found during fetch.
const size_t strict_max_lifetime_seconds;
/// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds.
/// If both min and max are zero, cells get no deadline.
const DictionaryLifetime lifetime;
};
/** Keys are stored in LRUCache and column values are serialized into arena.
Cell in LRUCache consists of allocated size and place in arena where the columns' serialized data is stored.
Columns are serialized by rows.
When a cell is removed from LRUCache, the data associated with it is also removed from arena.
In case of a complex key we also store the key data in arena, and it is removed from arena together with the cell.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionaryStorage final : public ICacheDictionaryStorage
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage");
explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_)
: configuration(configuration_)
, rnd_engine(randomSeed())
, cache(configuration.max_size_in_cells, false, { arena })
{
}
/// Always true for this storage: fetchColumnsForKeysImpl appends rows while walking the keys in request order.
bool returnsFetchedColumnsInOrderOfRequestedKeys() const override
{
    return true;
}
/// Storage name: "Cache" for simple keys, "ComplexKeyCache" otherwise.
String getName() const override
{
    return dictionary_key_type == DictionaryKeyType::simple ? "Cache" : "ComplexKeyCache";
}
/// True only when the storage is instantiated for simple (UInt64) keys.
bool supportsSimpleKeys() const override
{
    return dictionary_key_type == DictionaryKeyType::simple;
}
/// Fetch for simple keys. Forwards to fetchColumnsForKeysImpl;
/// throws NOT_IMPLEMENTED when the storage is instantiated for complex keys.
SimpleKeysStorageFetchResult fetchColumnsForKeys(
    const PaddedPODArray<UInt64> & keys,
    const DictionaryStorageFetchRequest & fetch_request) override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::simple)
        throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        return fetchColumnsForKeysImpl<SimpleKeysStorageFetchResult>(keys, fetch_request);
}
/// Insert for simple keys. Forwards to insertColumnsForKeysImpl;
/// throws NOT_IMPLEMENTED when the storage is instantiated for complex keys.
void insertColumnsForKeys(const PaddedPODArray<UInt64> & keys, Columns columns) override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::simple)
        throw Exception("Method insertColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        insertColumnsForKeysImpl(keys, columns);
}
/// Marks the given simple keys as "default" (present in cache, but without stored values).
/// Throws NOT_IMPLEMENTED when the storage is instantiated for complex keys.
void insertDefaultKeys(const PaddedPODArray<UInt64> & keys) override
{
    if constexpr (dictionary_key_type == DictionaryKeyType::simple)
        insertDefaultKeysImpl(keys);
    else
        /// The message names the public method that was actually called, not the private *Impl helper.
        throw Exception("Method insertDefaultKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
}
/// Returns the simple keys held in the cache (see getCachedKeysImpl);
/// throws NOT_IMPLEMENTED when the storage is instantiated for complex keys.
PaddedPODArray<UInt64> getCachedSimpleKeys() const override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::simple)
        throw Exception("Method getCachedSimpleKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        return getCachedKeysImpl();
}
/// True only when the storage is instantiated for complex (StringRef) keys.
bool supportsComplexKeys() const override
{
    return dictionary_key_type == DictionaryKeyType::complex;
}
/// Fetch for complex keys. Forwards to fetchColumnsForKeysImpl;
/// throws NOT_IMPLEMENTED when the storage is instantiated for simple keys.
ComplexKeysStorageFetchResult fetchColumnsForKeys(
    const PaddedPODArray<StringRef> & keys,
    const DictionaryStorageFetchRequest & column_fetch_requests) override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::complex)
        throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        return fetchColumnsForKeysImpl<ComplexKeysStorageFetchResult>(keys, column_fetch_requests);
}
/// Insert for complex keys. Forwards to insertColumnsForKeysImpl;
/// throws NOT_IMPLEMENTED when the storage is instantiated for simple keys.
void insertColumnsForKeys(const PaddedPODArray<StringRef> & keys, Columns columns) override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::complex)
        throw Exception("Method insertColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        insertColumnsForKeysImpl(keys, columns);
}
/// Marks the given complex keys as "default" (present in cache, but without stored values).
/// Throws NOT_IMPLEMENTED when the storage is instantiated for simple keys.
void insertDefaultKeys(const PaddedPODArray<StringRef> & keys) override
{
    if constexpr (dictionary_key_type == DictionaryKeyType::complex)
        insertDefaultKeysImpl(keys);
    else
        /// The message names the public method that was actually called, not the private *Impl helper.
        throw Exception("Method insertDefaultKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
}
/// Returns the complex keys held in the cache (see getCachedKeysImpl);
/// throws NOT_IMPLEMENTED when the storage is instantiated for simple keys.
PaddedPODArray<StringRef> getCachedComplexKeys() const override
{
    if constexpr (dictionary_key_type != DictionaryKeyType::complex)
        throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED);
    else
        return getCachedKeysImpl();
}
/// Number of cells currently held by the LRU cache.
size_t getSize() const override
{
    return cache.size();
}

/// Size limit of the LRU cache, as reported by the cache itself.
size_t getMaxSize() const override
{
    return cache.getMaxSize();
}

/// Sum of the arena size and the size in bytes reported by the LRU cache.
size_t getBytesAllocated() const override
{
    return arena.size() + cache.getSizeInBytes();
}
private:
/** Looks up every requested key in the LRU cache and fills a fetch result.
  *
  * For each key, in request order:
  *  - missing from the cache, or deadline passed by more than strict_max_lifetime_seconds:
  *    state is not_found and no row is appended to the fetched columns;
  *  - deadline passed, but within strict_max_lifetime_seconds: state is expired;
  *  - otherwise (including cells without a deadline): state is found.
  * For expired and found keys one row is appended to result.fetched_columns: either default
  * values (for default cells, which are additionally flagged and counted) or the values
  * deserialized from the cell's place in the arena. The row number is stored in the key state.
  */
template <typename KeysStorageFetchResult>
ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl(
    const PaddedPODArray<KeyType> & keys,
    const DictionaryStorageFetchRequest & fetch_request)
{
    KeysStorageFetchResult result;

    result.fetched_columns = fetch_request.makeAttributesResultColumns();
    result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found});

    const auto now = std::chrono::system_clock::now();
    const std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds);

    /// Index of the next row to be appended to result.fetched_columns.
    size_t fetched_columns_index = 0;

    const size_t keys_size = keys.size();

    for (size_t key_index = 0; key_index < keys_size; ++key_index)
    {
        auto key = keys[key_index];
        auto * found_node = cache.find(key);

        if (!found_node)
        {
            result.key_index_to_state[key_index] = {KeyState::not_found};
            ++result.not_found_keys_size;
            continue;
        }

        const auto & cell = found_node->getMapped();
        const bool has_deadline = cellHasDeadline(cell);

        /// Expired for longer than the strict limit: treat the key as if it was never cached.
        if (has_deadline && now > cell.deadline + max_lifetime_seconds)
        {
            result.key_index_to_state[key_index] = {KeyState::not_found};
            ++result.not_found_keys_size;
            continue;
        }

        if (has_deadline && now > cell.deadline)
        {
            result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index};
            ++result.expired_keys_size;
        }
        else
        {
            result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index};
            ++result.found_keys_size;
        }

        ++fetched_columns_index;

        if (!cell.isDefault())
        {
            /// Column values for the key are stored serialized in the arena, deserialize them.
            const char * place_for_serialized_columns = cell.place_for_serialized_columns;
            deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns);
            continue;
        }

        result.key_index_to_state[key_index].setDefault();
        ++result.default_keys_size;
        insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index);
    }

    return result;
}
void insertColumnsForKeysImpl(const PaddedPODArray<KeyType> & keys, Columns columns)
{
Arena temporary_values_pool;
size_t columns_to_serialize_size = columns.size();
PaddedPODArray<StringRef> temporary_column_data(columns_to_serialize_size);
const auto now = std::chrono::system_clock::now();
size_t keys_size = keys.size();
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
size_t allocated_size_for_columns = 0;
const char * block_start = nullptr;
auto key = keys[key_index];
auto * it = cache.find(key);
for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index)
{
auto & column = columns[column_index];
temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start);
allocated_size_for_columns += temporary_column_data[column_index].size;
}
char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns);
memcpy(reinterpret_cast<void*>(place_for_serialized_columns), reinterpret_cast<const void*>(block_start), allocated_size_for_columns);
if (it)
{
/// Cell exists need to free previous serialized place and update deadline
auto & cell = it->getMapped();
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
}
else
{
/// No cell exists so create and put in cache
Cell cell;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = allocated_size_for_columns;
cell.place_for_serialized_columns = place_for_serialized_columns;
insertCellInCache(key, cell);
}
temporary_values_pool.rollback(allocated_size_for_columns);
}
}
void insertDefaultKeysImpl(const PaddedPODArray<KeyType> & keys)
{
const auto now = std::chrono::system_clock::now();
for (auto key : keys)
{
auto * it = cache.find(key);
if (it)
{
auto & cell = it->getMapped();
setCellDeadline(cell, now);
if (cell.place_for_serialized_columns)
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
}
else
{
Cell cell;
setCellDeadline(cell, now);
cell.allocated_size_for_columns = 0;
cell.place_for_serialized_columns = nullptr;
insertCellInCache(key, cell);
}
}
}
/// Collects the keys of all cells that hold stored values; default cells are skipped.
/// Cell deadlines are not checked here.
PaddedPODArray<KeyType> getCachedKeysImpl() const
{
    PaddedPODArray<KeyType> cached_keys;
    cached_keys.reserve(cache.size());

    for (auto & node : cache)
    {
        auto & cell = node.getMapped();

        if (!cell.isDefault())
            cached_keys.emplace_back(node.getKey());
    }

    return cached_keys;
}
using TimePoint = std::chrono::system_clock::time_point;
/// Value stored in the LRU cache for each key.
/// The fields have no default initializers; the code that creates a Cell sets all of them.
struct Cell
{
    /// Moment after which the cell is considered expired.
    TimePoint deadline;
    /// Size of the block referenced by place_for_serialized_columns.
    size_t allocated_size_for_columns;
    /// Serialized column values in the storage arena; nullptr marks a default cell.
    char * place_for_serialized_columns;

    inline bool isDefault() const { return !place_for_serialized_columns; }

    inline void setDefault()
    {
        allocated_size_for_columns = 0;
        place_for_serialized_columns = nullptr;
    }
};
/** Puts the cell into the LRU cache under `key`.
  * For complex keys the key bytes are first copied into the storage arena and `key`
  * (passed by reference) is re-pointed to that copy, so the cache never refers to caller-owned memory.
  */
void insertCellInCache(KeyType & key, const Cell & cell)
{
    if constexpr (dictionary_key_type == DictionaryKeyType::complex)
    {
        const size_t key_size = key.size;
        char * key_copy = arena.alloc(key_size);

        memcpy(key_copy, key.data, key_size);

        key = KeyType{key_copy, key_size};
    }

    cache.insert(key, cell);
}
/// A deadline equal to time_t(0) is the marker setCellDeadline uses for "no deadline".
inline static bool cellHasDeadline(const Cell & cell)
{
    const auto no_deadline_marker = std::chrono::system_clock::from_time_t(0);
    return !(cell.deadline == no_deadline_marker);
}
/** Assigns the cell a deadline of `now` plus a random number of seconds, drawn uniformly
  * from [lifetime.min_sec, lifetime.max_sec].
  * When both bounds are zero the cell gets the "no deadline" marker (time_t(0)) instead.
  */
inline void setCellDeadline(Cell & cell, TimePoint now)
{
    const bool lifetime_is_set = configuration.lifetime.min_sec != 0 || configuration.lifetime.max_sec != 0;

    if (lifetime_is_set)
    {
        size_t min_sec_lifetime = configuration.lifetime.min_sec;
        size_t max_sec_lifetime = configuration.lifetime.max_sec;

        std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
        const auto lifetime_seconds = std::chrono::seconds(distribution(rnd_engine));

        cell.deadline = now + lifetime_seconds;
    }
    else
    {
        cell.deadline = std::chrono::system_clock::from_time_t(0);
    }
}
template <typename>
friend class ArenaCellDisposer;
CacheDictionaryStorageConfiguration configuration;
ArenaWithFreeLists arena;
pcg64 rnd_engine;
/// Functor given to the LRU cache; it returns the arena memory owned by a key/cell pair.
class ArenaCellDisposer
{
public:
    ArenaWithFreeLists & arena;

    template <typename Key, typename Value>
    void operator()(const Key & key, const Value & value) const
    {
        /// Complex keys live in the arena as well, so their bytes are released first.
        if constexpr (std::is_same_v<Key, StringRef>)
        {
            char * key_data = const_cast<char *>(key.data);
            arena.free(key_data, key.size);
        }

        /// Default cells have no serialized values to release.
        if (value.place_for_serialized_columns != nullptr)
            arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns);
    }
};
using SimpleKeyLRUHashMap = LRUHashMap<UInt64, Cell, ArenaCellDisposer>;
using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash<StringRef, Cell, ArenaCellDisposer>;
using CacheLRUHashMap = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
SimpleKeyLRUHashMap,
ComplexKeyLRUHashMap>;
CacheLRUHashMap cache;
};
}

View File

@ -0,0 +1,162 @@
#include "CacheDictionaryUpdateQueue.h"
#include <Dictionaries/CacheDictionaryUpdateQueue.h>
#include <Common/setThreadName.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CACHE_DICTIONARY_UPDATE_FAIL;
extern const int UNSUPPORTED_METHOD;
extern const int TIMEOUT_EXCEEDED;
}
template class CacheDictionaryUpdateUnit<DictionaryKeyType::simple>;
template class CacheDictionaryUpdateUnit<DictionaryKeyType::complex>;
/// Stores the name, configuration and update callback, sizes the queue and the thread pool from the
/// configuration, and schedules max_threads_for_updates jobs that each run updateThreadFunction().
template <DictionaryKeyType dictionary_key_type>
CacheDictionaryUpdateQueue<dictionary_key_type>::CacheDictionaryUpdateQueue(
    String dictionary_name_for_logs_,
    CacheDictionaryUpdateQueueConfiguration configuration_,
    UpdateFunction && update_func_)
    : dictionary_name_for_logs(std::move(dictionary_name_for_logs_))
    , configuration(configuration_)
    , update_func(std::move(update_func_))
    , update_queue(configuration.max_update_queue_size)
    , update_pool(configuration.max_threads_for_updates)
{
    const size_t workers_count = configuration.max_threads_for_updates;

    for (size_t worker_index = 0; worker_index != workers_count; ++worker_index)
    {
        update_pool.scheduleOrThrowOnError([this] { updateThreadFunction(); });
    }
}
/// Stops the worker threads if stopAndWait() has not been called yet.
/// A destructor must not let exceptions escape, so a failure of stopAndWait() is caught here;
/// it is written to the log instead of being silently discarded.
template <DictionaryKeyType dictionary_key_type>
CacheDictionaryUpdateQueue<dictionary_key_type>::~CacheDictionaryUpdateQueue()
{
    try
    {
        if (!finished)
            stopAndWait();
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}
/** Enqueues an update unit for the worker threads.
  * Throws UNSUPPORTED_METHOD if the queue is already finished, and CACHE_DICTIONARY_UPDATE_FAIL
  * if the unit could not be pushed within update_queue_push_timeout_milliseconds.
  */
template <DictionaryKeyType dictionary_key_type>
void CacheDictionaryUpdateQueue<dictionary_key_type>::tryPushToUpdateQueueOrThrow(CacheDictionaryUpdateUnitPtr<dictionary_key_type> & update_unit_ptr)
{
    if (finished)
        throw Exception{"CacheDictionaryUpdateQueue finished", ErrorCodes::UNSUPPORTED_METHOD};

    const bool pushed = update_queue.tryPush(update_unit_ptr, configuration.update_queue_push_timeout_milliseconds);
    if (pushed)
        return;

    throw DB::Exception(
        ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL,
        "Cannot push to internal update queue in dictionary {}. "
        "Timelimit of {} ms. exceeded. Current queue size is {}",
        dictionary_name_for_logs,
        std::to_string(configuration.update_queue_push_timeout_milliseconds),
        std::to_string(update_queue.size()));
}
/** Blocks until the given update unit is done or has failed, for at most query_wait_timeout_milliseconds.
  * Throws UNSUPPORTED_METHOD if the queue is finished, TIMEOUT_EXCEEDED if the wait timed out,
  * and CACHE_DICTIONARY_UPDATE_FAIL (carrying the original message) if the update stored an exception.
  */
template <DictionaryKeyType dictionary_key_type>
void CacheDictionaryUpdateQueue<dictionary_key_type>::waitForCurrentUpdateFinish(CacheDictionaryUpdateUnitPtr<dictionary_key_type> & update_unit_ptr) const
{
    if (finished)
        throw Exception{"CacheDictionaryUpdateQueue finished", ErrorCodes::UNSUPPORTED_METHOD};

    std::unique_lock<std::mutex> update_lock(update_mutex);

    const auto wait_timeout = std::chrono::milliseconds(configuration.query_wait_timeout_milliseconds);

    /// The unit is complete either when the worker marked it done or when it stored an exception.
    auto update_completed = [&]
    {
        return update_unit_ptr->is_done || update_unit_ptr->current_exception;
    };

    const bool completed_in_time = is_update_finished.wait_for(update_lock, wait_timeout, update_completed);

    if (!completed_in_time)
        throw DB::Exception(
            ErrorCodes::TIMEOUT_EXCEEDED,
            "Dictionary {} source seems unavailable, because {} ms timeout exceeded.",
            dictionary_name_for_logs,
            toString(configuration.query_wait_timeout_milliseconds));

    if (!update_unit_ptr->current_exception)
        return;

    /// The stored exception object is not rethrown to the caller as is: sharing one exception
    /// object between threads can misbehave if some of them modify it (e.g. add context).
    /// A new exception with the original message is thrown instead.
    try
    {
        std::rethrow_exception(update_unit_ptr->current_exception);
    }
    catch (...)
    {
        throw DB::Exception(
            ErrorCodes::CACHE_DICTIONARY_UPDATE_FAIL,
            "Update failed for dictionary '{}': {}",
            dictionary_name_for_logs,
            getCurrentExceptionMessage(true /*with stack trace*/, true /*check embedded stack trace*/));
    }
}
/** Stops the queue: sets the finished flag, drops all pending units, pushes one empty unit per
  * worker thread (so that workers blocked in pop() wake up and see the flag) and waits for the pool.
  */
template <DictionaryKeyType dictionary_key_type>
void CacheDictionaryUpdateQueue<dictionary_key_type>::stopAndWait()
{
    finished = true;
    update_queue.clear();

    const size_t workers_count = configuration.max_threads_for_updates;

    for (size_t worker_index = 0; worker_index != workers_count; ++worker_index)
    {
        auto wake_up_unit = std::make_shared<CacheDictionaryUpdateUnit<dictionary_key_type>>();
        update_queue.push(wake_up_unit);
    }

    update_pool.wait();
}
/// Body of each worker job scheduled in the constructor.
/// Pops update units from the queue and runs update_func on them until `finished` is set.
/// The outcome is published on the unit under update_mutex, and all waiters are notified.
template <DictionaryKeyType dictionary_key_type>
void CacheDictionaryUpdateQueue<dictionary_key_type>::updateThreadFunction()
{
setThreadName("UpdQueue");
while (!finished)
{
CacheDictionaryUpdateUnitPtr<dictionary_key_type> unit_to_update;
update_queue.pop(unit_to_update);
/// stopAndWait() sets `finished` and then pushes empty units; such a unit must not be processed.
if (finished)
break;
try
{
/// Update
update_func(unit_to_update);
/// Notify thread about finished updating the bunch of ids
/// where their own ids were included.
std::unique_lock<std::mutex> lock(update_mutex);
unit_to_update->is_done = true;
is_update_finished.notify_all();
}
catch (...)
{
/// Store the failure on the unit; waitForCurrentUpdateFinish() reports it to the waiting caller.
std::unique_lock<std::mutex> lock(update_mutex);
unit_to_update->current_exception = std::current_exception(); // NOLINT(bugprone-throw-keyword-missing)
is_update_finished.notify_all();
}
}
}
/// Explicit instantiations for the two supported key types
/// (the range key type is rejected by a static_assert in the class).
template class CacheDictionaryUpdateQueue<DictionaryKeyType::simple>;
template class CacheDictionaryUpdateQueue<DictionaryKeyType::complex>;
}

View File

@ -0,0 +1,172 @@
#pragma once
#include <atomic>
#include <mutex>
#include <shared_mutex>
#include <utility>
#include <vector>
#include <functional>
#include <Common/ThreadPool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/CurrentMetrics.h>
#include <Common/PODArray.h>
#include <Common/HashTable/HashMap.h>
#include <Columns/IColumn.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
/// Metrics defined elsewhere; they are incremented for the lifetime of each update unit
/// (see the CurrentMetrics::Increment members of CacheDictionaryUpdateUnit).
namespace CurrentMetrics
{
extern const Metric CacheDictionaryUpdateQueueBatches;
extern const Metric CacheDictionaryUpdateQueueKeys;
}
namespace DB
{
/** This class is passed between update queue and update queue client during update.
For simple keys we pass simple keys.
For complex keys we pass complex keys columns and requested rows to update.
During update cache dictionary should fill requested_keys_to_fetched_columns_during_update_index and
fetched_columns_during_update.
For complex keys, complex_key_arena should be used to extend the lifetime of the keys.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionaryUpdateUnit
{
public:
/// UInt64 for simple keys, StringRef for every other key type.
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
/// Constructor for an update request (the only non-default constructor, used for every key type).
/// The key states are copied into the unit; keys_to_update_size_ is also the amount
/// by which the CacheDictionaryUpdateQueueKeys metric is incremented while the unit is alive.
explicit CacheDictionaryUpdateUnit(
const Columns & key_columns_,
const PaddedPODArray<KeyState> & key_index_to_state_from_storage_,
const DictionaryStorageFetchRequest & request_,
size_t keys_to_update_size_)
: key_columns(key_columns_)
, key_index_to_state(key_index_to_state_from_storage_.begin(), key_index_to_state_from_storage_.end())
, request(request_)
, keys_to_update_size(keys_to_update_size_)
, alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, keys_to_update_size)
{}
/// Constructs an empty unit with no keys to update
/// (the update queue pushes such units from stopAndWait()).
CacheDictionaryUpdateUnit()
: keys_to_update_size(0)
, alive_keys(CurrentMetrics::CacheDictionaryUpdateQueueKeys, 0)
{}
/// Input of the update, immutable after construction.
const Columns key_columns;
const PaddedPODArray<KeyState> key_index_to_state;
const DictionaryStorageFetchRequest request;
const size_t keys_to_update_size;
/// Output of the update, to be filled by the cache dictionary during update.
HashMap<KeyType, size_t> requested_keys_to_fetched_columns_during_update_index;
MutableColumns fetched_columns_during_update;
/// Complex keys are serialized in this arena
Arena complex_key_arena;
private:
/// The queue sets is_done / current_exception and reads them while waiting for the unit.
template <DictionaryKeyType>
friend class CacheDictionaryUpdateQueue;
std::atomic<bool> is_done{false};
std::exception_ptr current_exception{nullptr};
/// While UpdateUnit is alive, it is accounted in update_queue size.
CurrentMetrics::Increment alive_batch{CurrentMetrics::CacheDictionaryUpdateQueueBatches};
/// Declared after keys_to_update_size, which its initializer reads.
CurrentMetrics::Increment alive_keys;
};
/// Update units are shared between the update queue and its client through a shared pointer.
template <DictionaryKeyType dictionary_key_type>
using CacheDictionaryUpdateUnitPtr = std::shared_ptr<CacheDictionaryUpdateUnit<dictionary_key_type>>;
/// Explicit instantiation declarations: the instantiations themselves are expected in a source file.
extern template class CacheDictionaryUpdateUnit<DictionaryKeyType::simple>;
extern template class CacheDictionaryUpdateUnit<DictionaryKeyType::complex>;
/// Settings of CacheDictionaryUpdateQueue; all values are fixed at construction.
struct CacheDictionaryUpdateQueueConfiguration
{
/// Size of update queue
const size_t max_update_queue_size;
/// Size of thread pool of update queue
const size_t max_threads_for_updates;
/// Timeout for trying to push update unit into queue
const size_t update_queue_push_timeout_milliseconds;
/// Timeout during synchronous waiting for update unit
const size_t query_wait_timeout_milliseconds;
};
/** Responsibility of this class is to provide asynchronous and synchronous update support for CacheDictionary
It is responsibility of CacheDictionary to perform update with UpdateUnit using UpdateFunction.
*/
template <DictionaryKeyType dictionary_key_type>
class CacheDictionaryUpdateQueue
{
public:
/// Client of update queue must provide this function in constructor and perform update using update unit.
using UpdateFunction = std::function<void (CacheDictionaryUpdateUnitPtr<dictionary_key_type>)>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryUpdateQueue");
CacheDictionaryUpdateQueue(
String dictionary_name_for_logs_,
CacheDictionaryUpdateQueueConfiguration configuration_,
UpdateFunction && update_func_);
~CacheDictionaryUpdateQueue();
/// Get configuration that was passed to constructor
const CacheDictionaryUpdateQueueConfiguration & getConfiguration() const { return configuration; }
/// Is queue finished
bool isFinished() const { return finished; }
/// Synchronous wait for update queue to stop
void stopAndWait();
/** Try to add update unit into queue.
If queue is full and push cannot be performed in update_queue_push_timeout_milliseconds from configuration
an exception will be thrown.
If queue already finished an exception will be thrown.
*/
void tryPushToUpdateQueueOrThrow(CacheDictionaryUpdateUnitPtr<dictionary_key_type> & update_unit_ptr);
/** Try to synchronously wait for update completion.
If an exception was stored by the update function during update, a new exception
carrying its message will be thrown.
If update will not be finished in query_wait_timeout_milliseconds from configuration
an exception will be thrown.
If queue already finished an exception will be thrown.
*/
void waitForCurrentUpdateFinish(CacheDictionaryUpdateUnitPtr<dictionary_key_type> & update_unit_ptr) const;
private:
/// Body of an update thread: pops units from update_queue and calls update_func on them.
void updateThreadFunction();
using UpdateQueue = ConcurrentBoundedQueue<CacheDictionaryUpdateUnitPtr<dictionary_key_type>>;
String dictionary_name_for_logs;
CacheDictionaryUpdateQueueConfiguration configuration;
UpdateFunction update_func;
UpdateQueue update_queue;
ThreadPool update_pool;
/// Mutable because waitForCurrentUpdateFinish is const but has to lock and wait.
/// update_mutex guards the is_done / current_exception state of update units.
mutable std::mutex update_mutex;
mutable std::condition_variable is_update_finished;
std::atomic<bool> finished{false};
};
/// Explicit instantiation declarations for the two supported key types.
extern template class CacheDictionaryUpdateQueue<DictionaryKeyType::simple>;
extern template class CacheDictionaryUpdateQueue<DictionaryKeyType::complex>;
}

View File

@ -1,915 +0,0 @@
#include "ComplexKeyCacheDictionary.h"
#include <Common/Arena.h>
#include <Common/BitHelpers.h>
#include <Common/CurrentMetrics.h>
#include <Common/ProfileEvents.h>
#include <Common/ProfilingScopedRWLock.h>
#include <Common/Stopwatch.h>
#include <Common/randomSeed.h>
#include <ext/map.h>
#include <ext/range.h>
#include "DictionaryBlockInputStream.h"
#include "DictionaryFactory.h"
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypesDecimal.h>
/// Profile events defined elsewhere and incremented by the cache lookup and update code in this file.
namespace ProfileEvents
{
extern const Event DictCacheKeysRequested;
extern const Event DictCacheKeysRequestedMiss;
extern const Event DictCacheKeysRequestedFound;
extern const Event DictCacheKeysExpired;
extern const Event DictCacheKeysNotFound;
extern const Event DictCacheKeysHit;
extern const Event DictCacheRequestTimeNs;
extern const Event DictCacheLockWriteNs;
extern const Event DictCacheLockReadNs;
}
/// Metric defined elsewhere.
namespace CurrentMetrics
{
extern const Metric DictCacheRequests;
}
namespace DB
{
/// Error codes defined elsewhere and thrown from this file.
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
extern const int BAD_ARGUMENTS;
extern const int UNSUPPORTED_METHOD;
extern const int TOO_SMALL_BUFFER_SIZE;
}
/// Maps a key to a cell index: the key hash masked by size_overlap_mask.
inline UInt64 ComplexKeyCacheDictionary::getCellIdx(const StringRef key) const
{
    return StringRefHash{}(key) & size_overlap_mask;
}
/// Creates the dictionary with a cell array of `size_` cells, rounded up to a power of two
/// and never smaller than max_collision_length.
/// Throws UNSUPPORTED_METHOD if the source does not support selective load.
ComplexKeyCacheDictionary::ComplexKeyCacheDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
const DictionaryLifetime dict_lifetime_,
const size_t size_)
: IDictionaryBase(dict_id_)
, dict_struct(dict_struct_)
, source_ptr{std::move(source_ptr_)}
, dict_lifetime(dict_lifetime_)
, size{roundUpToPowerOfTwoOrZero(std::max(size_, size_t(max_collision_length)))}
/// size is a power of two, so (size - 1) works as a bit mask for cell indices.
, size_overlap_mask{this->size - 1}
, rnd_engine(randomSeed())
{
/// Only the requested keys are loaded from the source on a miss (see update()),
/// so the source has to support selective load.
if (!this->source_ptr->supportsSelectiveLoad())
throw Exception{full_name + ": source cannot be used with ComplexKeyCacheDictionary", ErrorCodes::UNSUPPORTED_METHOD};
createAttributes();
}
/// Returns a column with the values of `attribute_name` for every row of `key_columns`.
/// Key types are validated first; rows without a value receive a default taken from
/// `default_values_column` or from the attribute null value (via DictionaryDefaultValueExtractor).
ColumnPtr ComplexKeyCacheDictionary::getColumn(
    const std::string & attribute_name,
    const DataTypePtr & result_type,
    const Columns & key_columns,
    const DataTypes & key_types,
    const ColumnPtr default_values_column) const
{
    dict_struct.validateKeyTypes(key_types);

    auto & attribute = getAttribute(attribute_name);
    const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
    const auto rows_count = key_columns.front()->size();

    ColumnPtr result_column;

    /// Invoked with a tag object whose type carries the concrete AttributeType.
    auto fill_result = [&](const auto & attribute_type_tag)
    {
        using AttributeType = typename std::decay_t<decltype(attribute_type_tag)>::AttributeType;

        const auto & null_value = std::get<AttributeType>(attribute.null_values);
        DictionaryDefaultValueExtractor<AttributeType> default_value_extractor(null_value, default_values_column);

        auto column = DictionaryAttributeColumnProvider<AttributeType>::getColumn(dictionary_attribute, rows_count);

        if constexpr (std::is_same_v<AttributeType, String>)
            getItemsString(attribute, key_columns, column.get(), default_value_extractor);
        else
            getItemsNumberImpl<AttributeType, AttributeType>(attribute, key_columns, column->getData(), default_value_extractor);

        result_column = std::move(column);
    };

    callOnDictionaryAttributeType(attribute.type, fill_result);

    return result_column;
}
/// Probes up to max_collision_length consecutive cells starting at `hash`.
/// Returns {cell_idx, valid, outdated}; cell_idx is always usable for replacing:
/// valid  outdated
/// true   false     key found and not expired
/// false  true      key found, but the cell has expired
/// false  false     key not found; cell_idx is the replacement candidate chosen among probed cells
/// true   true      impossible
///
/// todo: split this func to two: find_for_get and find_for_set
ComplexKeyCacheDictionary::FindResult
ComplexKeyCacheDictionary::findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const
{
auto pos = hash;
auto oldest_id = pos;
auto oldest_time = CellMetadata::time_point_t::max();
const auto stop = pos + max_collision_length;
for (; pos < stop; ++pos)
{
const auto cell_idx = pos & size_overlap_mask;
const auto & cell = cells[cell_idx];
if (cell.hash != hash || cell.key != key)
{
/// Another key lives here: remember the cell with the earliest expiration time as
/// a replacement candidate. Once the candidate is already expired (oldest_time <= now)
/// it is not replaced anymore.
/// maybe we already found nearest expired cell
if (oldest_time > now && oldest_time > cell.expiresAt())
{
oldest_time = cell.expiresAt();
oldest_id = cell_idx;
}
continue;
}
/// The cell holds our key.
if (cell.expiresAt() < now)
{
return {cell_idx, false, true};
}
return {cell_idx, true, false};
}
/// Key was not found. If no candidate was selected, oldest_id still holds the raw hash,
/// so it is masked to get a valid cell index.
oldest_id &= size_overlap_mask;
return {oldest_id, false, false};
}
/// Returns a UInt8 column with 1 for rows whose key is present in the dictionary and 0 otherwise.
/// Keys with a valid (non-expired) cell are answered from the cache;
/// the remaining keys are requested from the source via update().
ColumnUInt8::Ptr ComplexKeyCacheDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
{
    dict_struct.validateKeyTypes(key_types);

    const auto rows_num = key_columns.front()->size();

    auto result = ColumnUInt8::create(rows_num);
    auto & out = result->getData();

    for (const auto row : ext::range(0, rows_num))
        out[row] = false;

    /// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
    MapType<std::vector<size_t>> outdated_keys;

    const auto keys_size = dict_struct.key->size();
    StringRefs keys(keys_size);
    Arena temporary_keys_pool;
    PODArray<StringRef> keys_array(rows_num);

    size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
    {
        const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};

        const auto now = std::chrono::system_clock::now();
        /// fetch up-to-date values, decide which ones require update
        for (const auto row : ext::range(0, rows_num))
        {
            const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool);
            keys_array[row] = key;
            const auto find_result = findCellIdx(key, now);
            const auto & cell_idx = find_result.cell_idx;

            /** cell should be updated if either:
                *    1. keys (or hash) do not match,
                *    2. cell has expired,
                *    3. explicit defaults were specified and cell was set default. */
            if (!find_result.valid)
            {
                outdated_keys[key].push_back(row);
                if (find_result.outdated)
                    ++cache_expired;
                else
                    ++cache_not_found;
            }
            else
            {
                ++cache_hit;
                const auto & cell = cells[cell_idx];
                out[row] = !cell.isDefault();
            }
        }
    }

    ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);

    query_count.fetch_add(rows_num, std::memory_order_relaxed);
    /// query_count is counted in rows, so hits must be counted in rows too.
    /// `rows_num - outdated_keys.size()` would subtract the number of distinct missing keys
    /// and overcount hits whenever several rows share the same missing key.
    hit_count.fetch_add(cache_hit, std::memory_order_release);

    if (outdated_keys.empty())
        return result;

    /// For every distinct outdated key take the first row where it occurs.
    std::vector<size_t> required_rows(outdated_keys.size());
    std::transform(
        std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); });

    /// request new values
    update(
        key_columns,
        keys_array,
        required_rows,
        [&](const StringRef key, const auto)
        {
            /// Key was returned by the source.
            for (const auto out_idx : outdated_keys[key])
                out[out_idx] = true;
        },
        [&](const StringRef key, const auto)
        {
            /// Key was not returned by the source.
            for (const auto out_idx : outdated_keys[key])
                out[out_idx] = false;
        });

    return result;
}
/// Fills `out` with the values of a non-string attribute for every row of `key_columns`.
/// Rows with a valid (non-expired) cell are answered from the cache, the remaining keys are
/// requested from the source via update(). Rows whose cell is marked default, or whose key
/// the source did not return, receive the value from `default_value_extractor`.
template <typename AttributeType, typename OutputType, typename DefaultValueExtractor>
void ComplexKeyCacheDictionary::getItemsNumberImpl(
    Attribute & attribute,
    const Columns & key_columns,
    PaddedPODArray<OutputType> & out,
    DefaultValueExtractor & default_value_extractor) const
{
    /// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
    MapType<std::vector<size_t>> outdated_keys;
    auto & attribute_array = std::get<ContainerPtrType<AttributeType>>(attribute.arrays);

    const auto rows_num = key_columns.front()->size();
    const auto keys_size = dict_struct.key->size();
    StringRefs keys(keys_size);
    Arena temporary_keys_pool;
    PODArray<StringRef> keys_array(rows_num);

    size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
    {
        const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};

        const auto now = std::chrono::system_clock::now();
        /// fetch up-to-date values, decide which ones require update
        for (const auto row : ext::range(0, rows_num))
        {
            const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool);
            keys_array[row] = key;
            const auto find_result = findCellIdx(key, now);

            /** cell should be updated if either:
                *    1. keys (or hash) do not match,
                *    2. cell has expired,
                *    3. explicit defaults were specified and cell was set default. */
            if (!find_result.valid)
            {
                outdated_keys[key].push_back(row);
                if (find_result.outdated)
                    ++cache_expired;
                else
                    ++cache_not_found;
            }
            else
            {
                ++cache_hit;
                const auto & cell_idx = find_result.cell_idx;
                const auto & cell = cells[cell_idx];
                out[row] = cell.isDefault() ? default_value_extractor[row] : static_cast<OutputType>(attribute_array[cell_idx]);
            }
        }
    }

    ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);

    query_count.fetch_add(rows_num, std::memory_order_relaxed);
    /// query_count is counted in rows, so hits must be counted in rows too.
    /// `rows_num - outdated_keys.size()` would subtract the number of distinct missing keys
    /// and overcount hits whenever several rows share the same missing key.
    hit_count.fetch_add(cache_hit, std::memory_order_release);

    if (outdated_keys.empty())
        return;

    /// For every distinct outdated key take the first row where it occurs.
    std::vector<size_t> required_rows(outdated_keys.size());
    std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair)
    {
        return pair.getMapped().front();
    });

    /// request new values
    update(
        key_columns,
        keys_array,
        required_rows,
        [&](const StringRef key, const size_t cell_idx)
        {
            /// Key was returned by the source: take the freshly stored value.
            for (const auto row : outdated_keys[key])
                out[row] = static_cast<OutputType>(attribute_array[cell_idx]);
        },
        [&](const StringRef key, const size_t)
        {
            /// Key was not returned by the source: use the per-row default.
            for (const auto row : outdated_keys[key])
                out[row] = default_value_extractor[row];
        });
}
/// Fills `out` with the values of a string attribute for every row of `key_columns`.
///
/// Works in two phases:
///  1. optimistic: under a read lock try to answer every row from the cache and write the
///     strings straight into `out`; abandoned as soon as one row has no valid cell;
///  2. pessimistic: discard the partial output, collect cached strings into a temporary map,
///     request outdated keys from the source via update(), then build `out` row by row.
void ComplexKeyCacheDictionary::getItemsString(
    Attribute & attribute,
    const Columns & key_columns,
    ColumnString * out,
    DictionaryDefaultValueExtractor<String> & default_value_extractor) const
{
    const auto rows_num = key_columns.front()->size();
    /// save on some allocations
    out->getOffsets().reserve(rows_num);

    const auto keys_size = dict_struct.key->size();
    StringRefs keys(keys_size);
    Arena temporary_keys_pool;

    auto & attribute_array = std::get<ContainerPtrType<StringRef>>(attribute.arrays);

    auto found_outdated_values = false;

    /// perform optimistic version, fallback to pessimistic if failed
    {
        const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};

        const auto now = std::chrono::system_clock::now();
        /// fetch up-to-date values, discard on fail
        for (const auto row : ext::range(0, rows_num))
        {
            const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool);
            /// The key is needed only for this lookup, so its memory is returned to the pool right away.
            SCOPE_EXIT(temporary_keys_pool.rollback(key.size));
            const auto find_result = findCellIdx(key, now);

            if (!find_result.valid)
            {
                found_outdated_values = true;
                break;
            }
            else
            {
                const auto & cell_idx = find_result.cell_idx;
                const auto & cell = cells[cell_idx];
                const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx];
                out->insertData(string_ref.data, string_ref.size);
            }
        }
    }

    /// optimistic code completed successfully
    if (!found_outdated_values)
    {
        query_count.fetch_add(rows_num, std::memory_order_relaxed);
        hit_count.fetch_add(rows_num, std::memory_order_release);
        return;
    }

    /// now onto the pessimistic one, discard possible partial results from the optimistic path
    out->getChars().resize_assume_reserved(0);
    out->getOffsets().resize_assume_reserved(0);

    /// Mapping: <key> -> { all indices `i` of `key_columns` such that `key_columns[i]` = <key> }
    MapType<std::vector<size_t>> outdated_keys;
    /// we are going to store every string separately
    MapType<StringRef> map;
    PODArray<StringRef> keys_array(rows_num);

    size_t total_length = 0;
    size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0;
    {
        const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};

        const auto now = std::chrono::system_clock::now();
        for (const auto row : ext::range(0, rows_num))
        {
            const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool);
            keys_array[row] = key;
            const auto find_result = findCellIdx(key, now);

            if (!find_result.valid)
            {
                outdated_keys[key].push_back(row);
                if (find_result.outdated)
                    ++cache_expired;
                else
                    ++cache_not_found;
            }
            else
            {
                ++cache_hit;
                const auto & cell_idx = find_result.cell_idx;
                const auto & cell = cells[cell_idx];
                const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx];

                /// Default cells are not stored in the map: rows missing from the map
                /// get their default value when the output is built below.
                if (!cell.isDefault())
                    map[key] = copyIntoArena(string_ref, temporary_keys_pool);

                total_length += string_ref.size + 1;
            }
        }
    }

    ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found);
    ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit);

    query_count.fetch_add(rows_num, std::memory_order_relaxed);
    /// query_count is counted in rows, so hits must be counted in rows too.
    /// `rows_num - outdated_keys.size()` would subtract the number of distinct missing keys
    /// and overcount hits whenever several rows share the same missing key.
    hit_count.fetch_add(cache_hit, std::memory_order_release);

    /// request new values
    if (!outdated_keys.empty())
    {
        /// For every distinct outdated key take the first row where it occurs.
        std::vector<size_t> required_rows(outdated_keys.size());
        std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair)
        {
            return pair.getMapped().front();
        });

        update(
            key_columns,
            keys_array,
            required_rows,
            [&](const StringRef key, const size_t cell_idx)
            {
                const StringRef attribute_value = attribute_array[cell_idx];

                /// We must copy key and value to own memory, because it may be replaced with another
                ///  in next iterations of inner loop of update.
                const StringRef copied_key = copyIntoArena(key, temporary_keys_pool);
                const StringRef copied_value = copyIntoArena(attribute_value, temporary_keys_pool);

                map[copied_key] = copied_value;
                total_length += (attribute_value.size + 1) * outdated_keys[key].size();
            },
            [&](const StringRef key, const size_t)
            {
                for (const auto row : outdated_keys[key])
                    total_length += default_value_extractor[row].size + 1;
            });
    }

    out->getChars().reserve(total_length);

    /// Build the output: a string from the map if the key has one, the per-row default otherwise.
    for (const auto row : ext::range(0, ext::size(keys_array)))
    {
        const StringRef key = keys_array[row];
        auto * const it = map.find(key);
        const auto string_ref = it ? it->getMapped() : default_value_extractor[row];
        out->insertData(string_ref.data, string_ref.size);
    }
}
/// Loads the keys of `in_requested_rows` from the source and stores them in the cache.
///
/// in_key_columns     key columns passed to the source;
/// in_keys            serialized key for every row of in_key_columns;
/// in_requested_rows  rows whose keys have to be loaded;
/// on_cell_updated    called as (key, cell_idx) for every row returned by the source;
/// on_key_not_found   called as (key, cell_idx) for every requested key the source did not
///                    return; such cells are filled with null values and marked default.
///
/// The whole operation, including reading from the source, runs under the write lock.
template <typename PresentKeyHandler, typename AbsentKeyHandler>
void ComplexKeyCacheDictionary::update(
const Columns & in_key_columns,
const PODArray<StringRef> & in_keys,
const std::vector<size_t> & in_requested_rows,
PresentKeyHandler && on_cell_updated,
AbsentKeyHandler && on_key_not_found) const
{
/// Requested key -> whether it has been returned by the source.
MapType<bool> remaining_keys{in_requested_rows.size()};
for (const auto row : in_requested_rows)
remaining_keys.insert({in_keys[row], false});
/// Cell lifetime is picked uniformly from [min_sec, max_sec].
std::uniform_int_distribution<UInt64> distribution(dict_lifetime.min_sec, dict_lifetime.max_sec);
const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs};
{
Stopwatch watch;
auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows);
stream->readPrefix();
const auto keys_size = dict_struct.key->size();
StringRefs keys(keys_size);
const auto attributes_size = attributes.size();
const auto now = std::chrono::system_clock::now();
while (const auto block = stream->read())
{
/// cache column pointers
/// The first keys_size columns of the block are keys, attribute columns follow them.
const auto key_columns = ext::map<Columns>(
ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; });
const auto attribute_columns = ext::map<Columns>(ext::range(0, attributes_size), [&](const size_t attribute_idx)
{
return block.safeGetByPosition(keys_size + attribute_idx).column;
});
const auto rows_num = block.rows();
for (const auto row : ext::range(0, rows_num))
{
/// Serialize the key into the dictionary's own key pool and pick a cell for it.
auto key = allocKey(row, key_columns, keys);
const auto hash = StringRefHash{}(key);
const auto find_result = findCellIdx(key, now, hash);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
for (const auto attribute_idx : ext::range(0, attributes.size()))
{
const auto & attribute_column = *attribute_columns[attribute_idx];
auto & attribute = attributes[attribute_idx];
setAttributeValue(attribute, cell_idx, attribute_column[row]);
}
/// if cell id is zero and zero does not map to this cell, then the cell is unused
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
/// handle memory allocated for old key
if (key == cell.key)
{
/// The cell already stores an equal key: drop the fresh copy and reuse the stored one.
freeKey(key);
key = cell.key;
}
else
{
/// new key is different from the old one
if (cell.key.data)
freeKey(cell.key);
cell.key = key;
}
cell.hash = hash;
/// A zero min or max lifetime means that the cell never expires.
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
/// inform caller
on_cell_updated(key, cell_idx);
/// mark corresponding id as found
remaining_keys[key] = true;
}
}
stream->readSuffix();
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, in_requested_rows.size());
ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed());
}
size_t found_num = 0;
size_t not_found_num = 0;
const auto now = std::chrono::system_clock::now();
/// Check which ids have not been found and require setting null_value
for (const auto & key_found_pair : remaining_keys)
{
if (key_found_pair.getMapped())
{
++found_num;
continue;
}
++not_found_num;
auto key = key_found_pair.getKey();
const auto hash = StringRefHash{}(key);
const auto find_result = findCellIdx(key, now, hash);
const auto & cell_idx = find_result.cell_idx;
auto & cell = cells[cell_idx];
/// Set null_value for each attribute
for (auto & attribute : attributes)
setDefaultAttributeValue(attribute, cell_idx);
/// Check if cell had not been occupied before and increment element counter if it hadn't
if (cell.key == StringRef{} && cell_idx != zero_cell_idx)
element_count.fetch_add(1, std::memory_order_relaxed);
if (key == cell.key)
key = cell.key;
else
{
if (cell.key.data)
freeKey(cell.key);
/// copy key from temporary pool
key = copyKey(key);
cell.key = key;
}
cell.hash = hash;
/// A zero min or max lifetime means that the cell never expires.
if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0)
cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)});
else
cell.setExpiresAt(std::chrono::time_point<std::chrono::system_clock>::max());
cell.setDefault();
/// inform caller that the cell has not been found
on_key_not_found(key, cell_idx);
}
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num);
ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num);
}
/// Creates storage for every attribute of the dictionary structure and accounts for the
/// allocated memory. Throws TYPE_MISMATCH on a hierarchical attribute.
void ComplexKeyCacheDictionary::createAttributes()
{
    const auto attributes_count = dict_struct.attributes.size();
    attributes.reserve(attributes_count);

    bytes_allocated += size * sizeof(CellMetadata);
    bytes_allocated += attributes_count * sizeof(attributes.front());

    for (const auto & structure_attribute : dict_struct.attributes)
    {
        /// The new attribute is appended, so its index is the current size.
        const auto attribute_index = attributes.size();
        attribute_index_by_name.emplace(structure_attribute.name, attribute_index);
        attributes.push_back(createAttributeWithType(structure_attribute.underlying_type, structure_attribute.null_value));

        if (!structure_attribute.hierarchical)
            continue;

        throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(),
                        ErrorCodes::TYPE_MISMATCH};
    }
}
/// Looks up an attribute by name; throws BAD_ARGUMENTS if there is no such attribute.
ComplexKeyCacheDictionary::Attribute & ComplexKeyCacheDictionary::getAttribute(const std::string & attribute_name) const
{
    const auto found = attribute_index_by_name.find(attribute_name);
    if (found != std::end(attribute_index_by_name))
        return attributes[found->second];

    throw Exception{full_name + ": no such attribute '" + attribute_name + "'", ErrorCodes::BAD_ARGUMENTS};
}
/// Stores the attribute's null value into cell `idx`.
/// For strings the cell is pointed at the null value itself (no copy is made), and the
/// previously stored string, if any, is returned to the string arena.
void ComplexKeyCacheDictionary::setDefaultAttributeValue(Attribute & attribute, const size_t idx) const
{
    auto assign_null_value = [&](const auto & attribute_type_tag)
    {
        using AttributeType = typename std::decay_t<decltype(attribute_type_tag)>::AttributeType;

        if constexpr (!std::is_same_v<AttributeType, String>)
        {
            std::get<ContainerPtrType<AttributeType>>(attribute.arrays)[idx] = std::get<AttributeType>(attribute.null_values);
        }
        else
        {
            const auto & null_string = std::get<String>(attribute.null_values);
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];

            /// Already pointing at the null value: nothing to free or to assign.
            if (stored.data == null_string.data())
                return;

            if (stored.data)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            stored = StringRef{null_string};
        }
    };

    callOnDictionaryAttributeType(attribute.type, assign_null_value);
}
/// Creates an attribute of the given underlying type: remembers its null value, allocates
/// an array of `size` values and accounts for the allocated memory.
/// The string arena is created lazily, when the first string attribute appears.
ComplexKeyCacheDictionary::Attribute
ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value)
{
    Attribute new_attribute{type, {}, {}};

    auto init_storage = [&](const auto & attribute_type_tag)
    {
        using AttributeType = typename std::decay_t<decltype(attribute_type_tag)>::AttributeType;

        if constexpr (!std::is_same_v<AttributeType, String>)
        {
            new_attribute.null_values = AttributeType(null_value.get<NearestFieldType<AttributeType>>()); /* NOLINT */
            new_attribute.arrays = std::make_unique<ContainerType<AttributeType>>(size); /* NOLINT */
            bytes_allocated += size * sizeof(AttributeType);
        }
        else
        {
            /// Strings are stored as StringRef pointing into string_arena (or at the null value).
            new_attribute.null_values = null_value.get<String>();
            new_attribute.arrays = std::make_unique<ContainerType<StringRef>>(size);
            bytes_allocated += size * sizeof(StringRef);

            if (!string_arena)
                string_arena = std::make_unique<ArenaWithFreeLists>();
        }
    };

    callOnDictionaryAttributeType(type, init_storage);

    return new_attribute;
}
/// Stores `value` into cell `idx` of the attribute.
/// Strings are copied into the string arena; the previously stored string is freed unless
/// it points at the attribute's null value. An empty string is stored as an empty StringRef.
void ComplexKeyCacheDictionary::setAttributeValue(Attribute & attribute, const size_t idx, const Field & value) const
{
    auto store_value = [&](const auto & attribute_type_tag)
    {
        using AttributeType = typename std::decay_t<decltype(attribute_type_tag)>::AttributeType;

        if constexpr (!std::is_same_v<AttributeType, String>)
        {
            std::get<ContainerPtrType<AttributeType>>(attribute.arrays)[idx] = value.get<NearestFieldType<AttributeType>>();
        }
        else
        {
            const auto & new_string = value.get<String>();
            auto & stored = std::get<ContainerPtrType<StringRef>>(attribute.arrays)[idx];
            const auto & null_string = std::get<String>(attribute.null_values);

            /// The null value is owned by the attribute, not by the arena, so it is never freed.
            const bool owns_arena_memory = stored.data && stored.data != null_string.data();
            if (owns_arena_memory)
                string_arena->free(const_cast<char *>(stored.data), stored.size);

            const auto new_size = new_string.size();
            if (new_size == 0)
            {
                stored = {};
                return;
            }

            auto * arena_copy = string_arena->alloc(new_size);
            std::copy(new_string.data(), new_string.data() + new_size, arena_copy);
            stored = StringRef{arena_copy, new_size};
        }
    };

    callOnDictionaryAttributeType(attribute.type, store_value);
}
/// Serializes the key of `row` into the dictionary's own key storage:
/// the fixed size pool when key_size_is_fixed is set, keys_pool otherwise.
StringRef ComplexKeyCacheDictionary::allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) const
{
    return key_size_is_fixed
        ? placeKeysInFixedSizePool(row, key_columns)
        : placeKeysInPool(row, key_columns, keys, *dict_struct.key, *keys_pool);
}
/// Returns the memory of a key to the pool that matches the key layout
/// (fixed size pool or keys_pool).
void ComplexKeyCacheDictionary::freeKey(const StringRef key) const
{
    auto * const key_memory = const_cast<char *>(key.data);

    if (!key_size_is_fixed)
    {
        keys_pool->free(key_memory, key.size);
        return;
    }

    fixed_size_keys_pool->free(key_memory);
}
/// Serializes the complex key of `row` into one contiguous chunk allocated from `pool`.
///
/// Key parts follow each other in the order of `key_columns`:
///  - a part whose attribute type is utString is written as
///    [size_t: string size + 1][string bytes]['\0'];
///  - any other part is written as the raw bytes returned by getDataAt(row).
///
/// On return keys[j] refers to the serialized j-th part inside the chunk.
/// Returns a reference to the whole chunk.
template <typename Pool>
StringRef ComplexKeyCacheDictionary::placeKeysInPool(
const size_t row, const Columns & key_columns, StringRefs & keys, const std::vector<DictionaryAttribute> & key_attributes, Pool & pool)
{
const auto keys_size = key_columns.size();
size_t sum_keys_size{};
/// First pass: collect the key parts and compute the total serialized size.
for (size_t j = 0; j < keys_size; ++j)
{
keys[j] = key_columns[j]->getDataAt(row);
sum_keys_size += keys[j].size;
/// Strings additionally take a size prefix and a terminating zero byte.
if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString)
sum_keys_size += sizeof(size_t) + 1;
}
auto place = pool.alloc(sum_keys_size);
auto key_start = place;
/// Second pass: copy the parts into the chunk and repoint keys[j] at the copies.
for (size_t j = 0; j < keys_size; ++j)
{
if (key_attributes[j].underlying_type == AttributeUnderlyingType::utString)
{
auto start = key_start;
/// The stored size includes the terminating zero byte.
auto key_size = keys[j].size + 1;
memcpy(key_start, &key_size, sizeof(size_t));
key_start += sizeof(size_t);
memcpy(key_start, keys[j].data, keys[j].size);
key_start += keys[j].size;
*key_start = '\0';
++key_start;
keys[j].data = start;
keys[j].size += sizeof(size_t) + 1;
}
else
{
memcpy(key_start, keys[j].data, keys[j].size);
keys[j].data = key_start;
key_start += keys[j].size;
}
}
return {place, sum_keys_size};
}
/// Explicit instantiations for the two pool types used in this file:
/// Arena (temporary keys of a request) and ArenaWithFreeLists (keys stored in the cache).
template StringRef ComplexKeyCacheDictionary::placeKeysInPool<Arena>(
const size_t row,
const Columns & key_columns,
StringRefs & keys,
const std::vector<DictionaryAttribute> & key_attributes,
Arena & pool);
template StringRef ComplexKeyCacheDictionary::placeKeysInPool<ArenaWithFreeLists>(
const size_t row,
const Columns & key_columns,
StringRefs & keys,
const std::vector<DictionaryAttribute> & key_attributes,
ArenaWithFreeLists & pool);
/// Serializes the key of `row` into a chunk taken from the fixed size pool:
/// the raw bytes of every key part are written one after another.
/// The returned reference always has size key_size.
StringRef ComplexKeyCacheDictionary::placeKeysInFixedSizePool(const size_t row, const Columns & key_columns) const
{
    auto * const chunk_begin = fixed_size_keys_pool->alloc();
    auto * cursor = chunk_begin;

    for (const auto & column : key_columns)
    {
        const StringRef part = column->getDataAt(row);
        memcpy(cursor, part.data, part.size);
        cursor += part.size;
    }

    return {chunk_begin, key_size};
}
/// Copies the bytes of `src` into `arena` and returns a reference to the copy.
StringRef ComplexKeyCacheDictionary::copyIntoArena(StringRef src, Arena & arena)
{
    const auto length = src.size;
    char * const destination = arena.alloc(length);
    memcpy(destination, src.data, length);
    return StringRef{destination, length};
}
/// Copies a key into the dictionary's own key storage
/// (the fixed size pool when key_size_is_fixed is set, keys_pool otherwise).
StringRef ComplexKeyCacheDictionary::copyKey(const StringRef key) const
{
    const auto length = key.size;
    auto * const storage = key_size_is_fixed ? fixed_size_keys_pool->alloc() : keys_pool->alloc(length);
    memcpy(storage, key.data, length);
    return StringRef{storage, length};
}
bool ComplexKeyCacheDictionary::isEmptyCell(const UInt64 idx) const
{
return (
cells[idx].key == StringRef{}
&& (idx != zero_cell_idx || cells[idx].data == ext::safe_bit_cast<CellMetadata::time_point_urep_t>(CellMetadata::time_point_t())));
}
/// Returns a stream over the keys currently stored in the cache.
/// Empty cells and cells marked default are skipped; keys are collected under the read lock.
BlockInputStreamPtr ComplexKeyCacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
    std::vector<StringRef> stored_keys;
    {
        const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs};

        for (auto cell_idx : ext::range(0, cells.size()))
        {
            if (isEmptyCell(cell_idx) || cells[cell_idx].isDefault())
                continue;

            stored_keys.push_back(cells[cell_idx].key);
        }
    }

    return std::make_shared<DictionaryBlockInputStream<UInt64>>(shared_from_this(), max_block_size, stored_keys, column_names);
}
/// Registers the `complex_key_cache` layout in `factory`.
///
/// The creator lambda validates the configuration and builds a ComplexKeyCacheDictionary:
///  - the dictionary structure must define a `key`;
///  - `<config_prefix>.layout.complex_key_cache.size_in_cells` must be a positive number;
///  - `<config_prefix>.require_nonempty` must not be set.
/// Any violation is reported by throwing Exception.
void registerDictionaryComplexKeyCache(DictionaryFactory & factory)
{
    auto create_layout = [=](const std::string & full_name,
                             const DictionaryStructure & dict_struct,
                             const Poco::Util::AbstractConfiguration & config,
                             const std::string & config_prefix,
                             DictionarySourcePtr source_ptr) -> DictionaryPtr
    {
        /// The message used to name 'complex_key_hashed', which is a different layout.
        if (!dict_struct.key)
            throw Exception{"'key' is required for dictionary of layout 'complex_key_cache'", ErrorCodes::BAD_ARGUMENTS};

        const std::string layout_prefix = config_prefix + ".layout";
        const auto size = config.getInt(layout_prefix + ".complex_key_cache.size_in_cells");
        if (size == 0)
            throw Exception{full_name + ": dictionary of layout 'cache' cannot have 0 cells", ErrorCodes::TOO_SMALL_BUFFER_SIZE};

        /// getInt yields a signed value; a negative one would otherwise be converted
        /// to a huge unsigned cell count when passed to the dictionary constructor.
        if (size < 0)
            throw Exception{full_name + ": dictionary of layout 'cache' cannot have negative number of cells",
                            ErrorCodes::TOO_SMALL_BUFFER_SIZE};

        const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
        if (require_nonempty)
            throw Exception{full_name + ": dictionary of layout 'cache' cannot have 'require_nonempty' attribute set",
                            ErrorCodes::BAD_ARGUMENTS};

        const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
        const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
        return std::make_unique<ComplexKeyCacheDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, size);
    };

    /// NOTE(review): the trailing `true` presumably marks the layout as complex-key —
    /// confirm against DictionaryFactory::registerLayout.
    factory.registerLayout("complex_key_cache", create_layout, true);
}
}

View File

@ -1,276 +0,0 @@
#pragma once
#include <atomic>
#include <chrono>
#include <map>
#include <shared_mutex>
#include <variant>
#include <vector>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <pcg_random.hpp>
#include <Common/ArenaWithFreeLists.h>
#include <Common/HashTable/HashMap.h>
#include <Common/ProfilingScopedRWLock.h>
#include <Common/SmallObjectPool.h>
#include <common/StringRef.h>
#include <ext/bit_cast.h>
#include <ext/map.h>
#include <ext/range.h>
#include <ext/size.h>
#include <ext/scope_guard.h>
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "IDictionarySource.h"
#include <DataStreams/IBlockInputStream.h>
#include "DictionaryHelpers.h"
/// Profile event identifiers used by the cache dictionary code.
/// They are only declared here (`extern`); their definitions live in another translation unit.
namespace ProfileEvents
{
extern const Event DictCacheKeysRequested;
extern const Event DictCacheKeysRequestedMiss;
extern const Event DictCacheKeysRequestedFound;
extern const Event DictCacheKeysExpired;
extern const Event DictCacheKeysNotFound;
extern const Event DictCacheKeysHit;
extern const Event DictCacheRequestTimeNs;
extern const Event DictCacheLockWriteNs;
extern const Event DictCacheLockReadNs;
}
namespace DB
{
/** Dictionary with a complex (composite) key; reports its type name as "ComplexKeyCache".
  *
  * Cell metadata is kept in the `cells` vector, attribute values in per-attribute arrays
  * (see `Attribute::arrays`). Key bytes live either in `fixed_size_keys_pool` (when the
  * dictionary structure reports a fixed key size) or in `keys_pool`.
  * Shared state is guarded by `rw_lock`; the counters are atomics.
  */
class ComplexKeyCacheDictionary final : public IDictionaryBase
{
public:
    ComplexKeyCacheDictionary(
        const StorageID & dict_id_,
        const DictionaryStructure & dict_struct_,
        DictionarySourcePtr source_ptr_,
        const DictionaryLifetime dict_lifetime_,
        const size_t size_);

    std::string getKeyDescription() const { return key_description; }

    std::string getTypeName() const override { return "ComplexKeyCache"; }

    /// `bytes_allocated` plus the size of the key pool in use and, if present, of the string arena.
    size_t getBytesAllocated() const override
    {
        return bytes_allocated + (key_size_is_fixed ? fixed_size_keys_pool->size() : keys_pool->size())
            + (string_arena ? string_arena->size() : 0);
    }

    size_t getQueryCount() const override { return query_count.load(std::memory_order_relaxed); }

    /// Ratio of `hit_count` to `query_count`.
    /// Returns 0 while no query has been counted, instead of NaN from dividing 0 by 0.
    double getHitRate() const override
    {
        const size_t queries = query_count.load(std::memory_order_relaxed);
        if (queries == 0)
            return 0;

        return static_cast<double>(hit_count.load(std::memory_order_acquire)) / queries;
    }

    size_t getElementCount() const override { return element_count.load(std::memory_order_relaxed); }

    /// Ratio of `element_count` to the number of cells.
    double getLoadFactor() const override { return static_cast<double>(element_count.load(std::memory_order_relaxed)) / size; }

    bool supportUpdates() const override { return false; }

    /// Creates a new dictionary with the same ID, structure, lifetime and size and a cloned source.
    std::shared_ptr<const IExternalLoadable> clone() const override
    {
        return std::make_shared<ComplexKeyCacheDictionary>(getDictionaryID(), dict_struct, source_ptr->clone(), dict_lifetime, size);
    }

    const IDictionarySource * getSource() const override { return source_ptr.get(); }

    const DictionaryLifetime & getLifetime() const override { return dict_lifetime; }

    const DictionaryStructure & getStructure() const override { return dict_struct; }

    /// The position of the attribute inside `attributes` is used as the index into `dict_struct.attributes`.
    bool isInjective(const std::string & attribute_name) const override
    {
        return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
    }

    DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; }

    ColumnPtr getColumn(
        const std::string& attribute_name,
        const DataTypePtr & result_type,
        const Columns & key_columns,
        const DataTypes & key_types,
        const ColumnPtr default_values_column) const override;

    ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;

    BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;

private:
    template <typename Value>
    using MapType = HashMapWithSavedHash<StringRef, Value, StringRefHash>;
    template <typename Value>
    using ContainerType = Value[];
    template <typename Value>
    using ContainerPtrType = std::unique_ptr<ContainerType<Value>>;

    /// Per-cell bookkeeping: the key, its hash and the packed expiration time / `is_default` flag.
    struct CellMetadata final
    {
        using time_point_t = std::chrono::system_clock::time_point;
        using time_point_rep_t = time_point_t::rep;
        using time_point_urep_t = std::make_unsigned_t<time_point_rep_t>;

        /// Largest value of the signed representation: every bit except the most significant one.
        static constexpr UInt64 EXPIRES_AT_MASK = std::numeric_limits<time_point_rep_t>::max();
        /// Complement of the mask above, i.e. the most significant bit.
        static constexpr UInt64 IS_DEFAULT_MASK = ~EXPIRES_AT_MASK;

        StringRef key;
        decltype(StringRefHash{}(key)) hash;
        /// Stores both expiration time and `is_default` flag in the most significant bit
        time_point_urep_t data;

        /// Returns the expiration time, with the `is_default` bit masked out
        time_point_t expiresAt() const { return ext::safe_bit_cast<time_point_t>(data & EXPIRES_AT_MASK); }
        /// Sets expiration time, resets `is_default` flag to false
        void setExpiresAt(const time_point_t & t) { data = ext::safe_bit_cast<time_point_urep_t>(t); }

        bool isDefault() const { return (data & IS_DEFAULT_MASK) == IS_DEFAULT_MASK; }
        void setDefault() { data |= IS_DEFAULT_MASK; }
    };

    /// One dictionary attribute: its underlying type, its null value and the array holding its values.
    struct Attribute final
    {
        AttributeUnderlyingType type;
        std::variant<
            UInt8,
            UInt16,
            UInt32,
            UInt64,
            UInt128,
            Int8,
            Int16,
            Int32,
            Int64,
            Decimal32,
            Decimal64,
            Decimal128,
            Float32,
            Float64,
            String>
            null_values;
        std::variant<
            ContainerPtrType<UInt8>,
            ContainerPtrType<UInt16>,
            ContainerPtrType<UInt32>,
            ContainerPtrType<UInt64>,
            ContainerPtrType<UInt128>,
            ContainerPtrType<Int8>,
            ContainerPtrType<Int16>,
            ContainerPtrType<Int32>,
            ContainerPtrType<Int64>,
            ContainerPtrType<Decimal32>,
            ContainerPtrType<Decimal64>,
            ContainerPtrType<Decimal128>,
            ContainerPtrType<Float32>,
            ContainerPtrType<Float64>,
            ContainerPtrType<StringRef>>
            arrays;
    };

    void createAttributes();

    Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value);

    template <typename AttributeType, typename OutputType, typename DefaultValueExtractor>
    void getItemsNumberImpl(
        Attribute & attribute,
        const Columns & key_columns,
        PaddedPODArray<OutputType> & out,
        DefaultValueExtractor & default_value_extractor) const;

    void getItemsString(
        Attribute & attribute,
        const Columns & key_columns,
        ColumnString * out,
        DictionaryDefaultValueExtractor<String> & default_value_extractor) const;

    template <typename PresentKeyHandler, typename AbsentKeyHandler>
    void update(
        const Columns & in_key_columns,
        const PODArray<StringRef> & in_keys,
        const std::vector<size_t> & in_requested_rows,
        PresentKeyHandler && on_cell_updated,
        AbsentKeyHandler && on_key_not_found) const;

    UInt64 getCellIdx(const StringRef key) const;

    void setDefaultAttributeValue(Attribute & attribute, const size_t idx) const;

    void setAttributeValue(Attribute & attribute, const size_t idx, const Field & value) const;

    Attribute & getAttribute(const std::string & attribute_name) const;

    StringRef allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) const;

    void freeKey(const StringRef key) const;

    template <typename Arena>
    static StringRef placeKeysInPool(
        const size_t row,
        const Columns & key_columns,
        StringRefs & keys,
        const std::vector<DictionaryAttribute> & key_attributes,
        Arena & pool);

    StringRef placeKeysInFixedSizePool(const size_t row, const Columns & key_columns) const;

    static StringRef copyIntoArena(StringRef src, Arena & arena);
    StringRef copyKey(const StringRef key) const;

    /// Result of a cell lookup.
    struct FindResult
    {
        const size_t cell_idx;
        const bool valid;
        const bool outdated;
    };

    FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now, const size_t hash) const;
    /// Convenience overload that computes the hash of `key` itself.
    FindResult findCellIdx(const StringRef & key, const CellMetadata::time_point_t now) const
    {
        const auto hash = StringRefHash{}(key);
        return findCellIdx(key, now, hash);
    }

    bool isEmptyCell(const UInt64 idx) const;

    const DictionaryStructure dict_struct;
    const DictionarySourcePtr source_ptr;
    const DictionaryLifetime dict_lifetime;
    const std::string key_description{dict_struct.getKeyDescription()};

    mutable std::shared_mutex rw_lock;

    /// Actual size will be increased to match power of 2
    const size_t size;

    /// all bits to 1 mask (size - 1) (0b1000 - 1 = 0b111)
    const size_t size_overlap_mask;

    /// Max tries to find cell, overlapped with mask: if size = 16 and start_cell=10: will try cells: 10,11,12,13,14,15,0,1,2,3
    static constexpr size_t max_collision_length = 10;

    /// Index of the cell that the empty key maps to.
    const UInt64 zero_cell_idx{getCellIdx(StringRef{})};
    std::map<std::string, size_t> attribute_index_by_name;
    mutable std::vector<Attribute> attributes;
    mutable std::vector<CellMetadata> cells{size};
    const bool key_size_is_fixed{dict_struct.isKeySizeFixed()};
    size_t key_size{key_size_is_fixed ? dict_struct.getKeySize() : 0};
    /// Exactly one of the two key pools below is created, depending on `key_size_is_fixed`.
    std::unique_ptr<ArenaWithFreeLists> keys_pool = key_size_is_fixed ? nullptr : std::make_unique<ArenaWithFreeLists>();
    std::unique_ptr<SmallObjectPool> fixed_size_keys_pool = key_size_is_fixed ? std::make_unique<SmallObjectPool>(key_size) : nullptr;
    std::unique_ptr<ArenaWithFreeLists> string_arena;

    mutable pcg64 rnd_engine;

    mutable size_t bytes_allocated = 0;
    mutable std::atomic<size_t> element_count{0};
    mutable std::atomic<size_t> hit_count{0};
    mutable std::atomic<size_t> query_count{0};

    const std::chrono::time_point<std::chrono::system_clock> creation_time = std::chrono::system_clock::now();
};
}

View File

@ -39,7 +39,7 @@ ColumnPtr ComplexKeyDirectDictionary::getColumn(
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr default_values_column) const
const ColumnPtr & default_values_column) const
{
dict_struct.validateKeyTypes(key_types);

View File

@ -66,7 +66,7 @@ public:
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr default_values_column) const override;
const ColumnPtr & default_values_column) const override;
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;

View File

@ -41,7 +41,7 @@ ColumnPtr ComplexKeyHashedDictionary::getColumn(
const DataTypePtr & result_type,
const Columns & key_columns,
const DataTypes & key_types,
const ColumnPtr default_values_column) const
const ColumnPtr & default_values_column) const
{
dict_struct.validateKeyTypes(key_types);
@ -563,7 +563,13 @@ std::vector<StringRef> ComplexKeyHashedDictionary::getKeys(const Attribute & att
/// Creates a block input stream over all keys of the dictionary.
/// The keys returned by getKeys() are copied into a PaddedPODArray, which is what gets
/// passed to the stream. The earlier `return` that passed getKeys() directly is removed:
/// it made the conversion below unreachable.
BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const
{
    using BlockInputStreamType = DictionaryBlockInputStream<UInt64>;

    auto vector_keys = getKeys();

    PaddedPODArray<StringRef> keys;
    keys.reserve(vector_keys.size());
    keys.assign(vector_keys.begin(), vector_keys.end());

    return std::make_shared<BlockInputStreamType>(shared_from_this(), max_block_size, keys, column_names);
}
void registerDictionaryComplexKeyHashed(DictionaryFactory & factory)

Some files were not shown because too many files have changed in this diff Show More