Merge branch 'master' into fix-number-of-threads-in-subqueries

This commit is contained in:
Nikolai Kochetov 2021-02-16 16:29:30 +03:00
commit 0b315fccc2
203 changed files with 6443 additions and 1063 deletions

2
.gitmodules vendored
View File

@ -220,4 +220,4 @@
url = https://github.com/ClickHouse-Extras/boringssl.git
[submodule "contrib/NuRaft"]
path = contrib/NuRaft
url = https://github.com/eBay/NuRaft.git
url = https://github.com/ClickHouse-Extras/NuRaft.git

View File

@ -13,6 +13,3 @@ ClickHouse® is an open-source column-oriented database management system that a
* [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation.
* [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any.
* You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.
## Upcoming Events
* [Chinese ClickHouse Meetup (online)](http://hdxu.cn/8KxZE) on 6 February 2021.

View File

@ -1,6 +1,8 @@
#pragma once
#include <common/extended_types.h>
#include <common/defines.h>
namespace common
{
@ -156,4 +158,11 @@ namespace common
return false;
return (x * y) / y != x;
}
/// Computes the product of `x` and `y` with no overflow detection at all:
/// whatever `x * y` evaluates to for the given operand types is returned as is.
/// The function is marked NO_SANITIZE_UNDEFINED (a project macro, presumably from
/// common/defines.h and presumably switching off UBSan instrumentation here - confirm).
/// Use it only where a wrapped or otherwise meaningless result on overflow is acceptable.
template <typename Lhs, typename Rhs>
inline auto NO_SANITIZE_UNDEFINED mulIgnoreOverflow(Lhs x, Rhs y)
{
    auto product = x * y;
    return product;
}
}

View File

@ -562,6 +562,7 @@ void debugIncreaseOOMScore()
{
DB::WriteBufferFromFile buf("/proc/self/oom_score_adj");
buf.write(new_score.c_str(), new_score.size());
buf.close();
}
catch (const Poco::Exception & e)
{
@ -784,7 +785,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
/// Setup signal handlers.
/// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime.
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, &handled_signals);
addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);

View File

@ -11,7 +11,7 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/NuRaft/CMakeLists.txt")
return()
endif ()
if (NOT OS_FREEBSD)
if (NOT OS_FREEBSD AND NOT OS_DARWIN)
set (USE_NURAFT 1)
set (NURAFT_LIBRARY nuraft)
@ -20,5 +20,5 @@ if (NOT OS_FREEBSD)
message (STATUS "Using NuRaft=${USE_NURAFT}: ${NURAFT_INCLUDE_DIR} : ${NURAFT_LIBRARY}")
else()
set (USE_NURAFT 0)
message (STATUS "Using internal NuRaft library on FreeBSD is not supported")
message (STATUS "Using internal NuRaft library on FreeBSD and Darwin is not supported")
endif()

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 410bd149da84cdde60b4436b02b738749f4e87e1
Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 8e259cd2a6b60d75dd17e73432f11bb7b9351bb1
Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79

View File

@ -30,7 +30,12 @@ set(SRCS
add_library(nuraft ${SRCS})
target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
if (NOT OPENSSL_SSL_LIBRARY OR NOT OPENSSL_CRYPTO_LIBRARY)
target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1 SSL_LIBRARY_NOT_FOUND=1)
else()
target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
endif()
target_include_directories (nuraft SYSTEM PRIVATE ${LIBRARY_DIR}/include/libnuraft)
# for some reason include "asio.h" directly without "boost/" prefix.

View File

@ -56,7 +56,7 @@ $ echo 'SELECT version()' | curl 'http://localhost:8123/' --data-binary @-
20.12.3.3
```
### Volumes
### Volumes
Typically you may want to mount the following folders inside your container to achieve persistence:
@ -76,7 +76,7 @@ You may also want to mount:
* `/etc/clickhouse-server/users.d/*.xml` - files with user settings adjustments
* `/docker-entrypoint-initdb.d/` - folder with database initialization scripts (see below).
### Linux capabilities
### Linux capabilities
ClickHouse has some advanced functionality which requires enabling several [Linux capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html).
@ -113,10 +113,10 @@ $ docker run --rm -e CLICKHOUSE_UID=0 -e CLICKHOUSE_GID=0 --name clickhouse-serv
### How to create default database and user on starting
Sometimes you may want to create user (user named `default` is used by default) and database on image starting. You can do it using environment variables `CLICKHOUSE_DB`, `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD`:
Sometimes you may want to create user (user named `default` is used by default) and database on image starting. You can do it using environment variables `CLICKHOUSE_DB`, `CLICKHOUSE_USER`, `CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT` and `CLICKHOUSE_PASSWORD`:
```
$ docker run --rm -e CLICKHOUSE_DB=my_database -e CLICKHOUSE_USER=username -e CLICKHOUSE_PASSWORD=password -p 9000:9000/tcp yandex/clickhouse-server
$ docker run --rm -e CLICKHOUSE_DB=my_database -e CLICKHOUSE_USER=username -e CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 -e CLICKHOUSE_PASSWORD=password -p 9000:9000/tcp yandex/clickhouse-server
```
## How to extend this image

View File

@ -54,6 +54,7 @@ FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_
CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
CLICKHOUSE_DB="${CLICKHOUSE_DB:-}"
CLICKHOUSE_ACCESS_MANAGEMENT="${CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT:-0}"
for dir in "$DATA_DIR" \
"$ERROR_LOG_DIR" \
@ -97,6 +98,7 @@ if [ -n "$CLICKHOUSE_USER" ] && [ "$CLICKHOUSE_USER" != "default" ] || [ -n "$CL
</networks>
<password>${CLICKHOUSE_PASSWORD}</password>
<quota>default</quota>
<access_management>${CLICKHOUSE_ACCESS_MANAGEMENT}</access_management>
</${CLICKHOUSE_USER}>
</users>
</yandex>

View File

@ -163,6 +163,7 @@ function clone_submodules
contrib/xz
contrib/dragonbox
contrib/fast_float
contrib/NuRaft
)
git submodule sync
@ -182,6 +183,7 @@ function run_cmake
"-DENABLE_EMBEDDED_COMPILER=0"
"-DENABLE_THINLTO=0"
"-DUSE_UNWIND=1"
"-DENABLE_NURAFT=1"
)
# TODO remove this? we don't use ccache anyway. An option would be to download it

View File

@ -0,0 +1,29 @@
---
toc_priority:
toc_title:
---
# data_type_name {#data_type-name}
Description.
**Parameters** (Optional)
- `x` — Description. [Type name](relative/path/to/type/dscr.md#type).
- `y` — Description. [Type name](relative/path/to/type/dscr.md#type).
**Examples**
```sql
```
## Additional Info {#additional-info} (Optional)
The name of an additional section can be any, for example, **Usage**.
**See Also** (Optional)
- [link](#)
[Original article](https://clickhouse.tech/docs/en/data_types/<data-type-name>/) <!--hide-->

View File

@ -136,8 +136,7 @@ The following settings can be specified in configuration file for given endpoint
- `access_key_id` and `secret_access_key` — Optional. Specifies credentials to use with given endpoint.
- `use_environment_credentials` — Optional, default value is `false`. If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint.
- `header` — Optional, can be specified multiple times. Adds specified HTTP header to a request to given endpoint.
This configuration also applies to S3 disks in `MergeTree` table engine family.
- `server_side_encryption_customer_key_base64` — Optional. If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
Example:
@ -149,6 +148,7 @@ Example:
<!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
<!-- <use_environment_credentials>false</use_environment_credentials> -->
<!-- <header>Authorization: Bearer SOME-TOKEN</header> -->
<!-- <server_side_encryption_customer_key_base64>BASE64-ENCODED-KEY</server_side_encryption_customer_key_base64> -->
</endpoint-name>
</s3>
```

View File

@ -715,6 +715,7 @@ Configuration markup:
<endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
<access_key_id>your_access_key_id</access_key_id>
<secret_access_key>your_secret_access_key</secret_access_key>
<server_side_encryption_customer_key_base64>your_base64_encoded_customer_key</server_side_encryption_customer_key_base64>
<proxy>
<uri>http://proxy1</uri>
<uri>http://proxy2</uri>
@ -750,7 +751,8 @@ Optional parameters:
- `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
- `cache_enabled` — Allows to cache mark and index files on local FS. Default value is `true`.
- `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`.
- `skip_access_check` — If true disk access checks will not be performed on disk start-up. Default value is `false`.
- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
S3 disk can be configured as `main` or `cold` storage:

View File

@ -39,4 +39,4 @@ More details on [manipulating partitions](../../sql-reference/statements/alter/p
It's rather radical to drop all data from a table, but in some cases it might be exactly what you need.
More details on [table truncation](../../sql-reference/statements/alter/partition.md#alter_drop-partition).
More details on [table truncation](../../sql-reference/statements/truncate.md).

View File

@ -31,8 +31,8 @@ The supported formats are:
| [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
| [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
| [JSONStringEachRow](#jsonstringeachrow) | ✔ | ✔ |
| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress) | ✗ | ✔ |
| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
| [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
| [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
| [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
@ -612,7 +612,7 @@ Example:
```
## JSONEachRow {#jsoneachrow}
## JSONStringEachRow {#jsonstringeachrow}
## JSONStringsEachRow {#jsonstringseachrow}
## JSONCompactEachRow {#jsoncompacteachrow}
## JSONCompactStringEachRow {#jsoncompactstringeachrow}
@ -627,9 +627,9 @@ When using these formats, ClickHouse outputs rows as separated, newline-delimite
When inserting the data, you should provide a separate JSON value for each row.
## JSONEachRowWithProgress {#jsoneachrowwithprogress}
## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
Differs from `JSONEachRow`/`JSONStringEachRow` in that ClickHouse will also yield progress information as JSON values.
Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yield progress information as JSON values.
```json
{"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}

View File

@ -312,7 +312,7 @@ Enables or disables parsing enum values as enum ids for TSV input format.
Possible values:
- 0 — Enum values are parsed as values.
- 1 — Enum values are parsed as enum IDs
- 1 — Enum values are parsed as enum IDs.
Default value: 0.
@ -2592,6 +2592,18 @@ Possible values:
Default value: `16`.
## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability}
Sets the probability that ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied).
Possible values:
- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied).
- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, ClickHouse can start a trace on average for half of the queries.
- 1 — The trace for all executed queries is enabled.
Default value: `0`.
## optimize_on_insert {#optimize-on-insert}
Enables or disables data transformation before the insertion, as if merge was done on this block (according to table engine).

View File

@ -0,0 +1,53 @@
# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log}
Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries.
Columns:
- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for executed query.
- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.
- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`.
- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation.
- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds).
- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds).
- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — The finish date of the `trace span`.
- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [Attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations in the [OpenTelemetry](https://opentelemetry.io/) standard.
- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Attribute values depending on the `trace span`. They are filled in according to the recommendations in the `OpenTelemetry` standard.
**Example**
Query:
``` sql
SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical;
```
Result:
``` text
Row 1:
──────
trace_id: cdab0847-0d62-61d5-4d38-dd65b19a1914
span_id: 701487461015578150
parent_span_id: 2991972114672045096
operation_name: DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
start_time_us: 1612374594529090
finish_time_us: 1612374594529108
finish_date: 2021-02-03
attribute.names: []
attribute.values: []
```
**See Also**
- [OpenTelemetry](../../operations/opentelemetry.md)
[Original article](https://clickhouse.tech/docs/en/operations/system_tables/opentelemetry_span_log) <!--hide-->

View File

@ -1,9 +1,9 @@
---
toc_priority: 47
toc_title: ClickHouse Update
toc_title: ClickHouse Upgrade
---
# ClickHouse Update {#clickhouse-update}
# ClickHouse Upgrade {#clickhouse-upgrade}
If ClickHouse was installed from `deb` packages, execute the following commands on the server:
@ -16,3 +16,19 @@ $ sudo service clickhouse-server restart
If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method.
ClickHouse does not support a distributed update. The operation should be performed consecutively on each separate server. Do not update all the servers on a cluster simultaneously, or the cluster will be unavailable for some time.
Upgrading from an older version of ClickHouse to a specific version:
As an example:
`xx.yy.a.b` is a current stable version. The latest stable version could be found [here](https://github.com/ClickHouse/ClickHouse/releases)
```bash
$ sudo apt-get update
$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b
$ sudo service clickhouse-server restart
```

View File

@ -0,0 +1,83 @@
---
toc_priority: 65
toc_title: Map(key, value)
---
# Map(key, value) {#data_type-map}
`Map(key, value)` data type stores `key:value` pairs.
**Parameters**
- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).
!!! warning "Warning"
Currently `Map` data type is an experimental feature. To work with it you must set `allow_experimental_map_type = 1`.
To get the value from an `a Map('key', 'value')` column, use the `a['key']` syntax. This lookup currently has linear complexity.
**Examples**
Consider the table:
``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory;
INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
```
Select all `key2` values:
```sql
SELECT a['key2'] FROM table_map;
```
Result:
```text
┌─arrayElement(a, 'key2')─┐
│ 10 │
│ 20 │
│ 30 │
└─────────────────────────┘
```
If there's no such `key` in the `Map()` column, the query returns zeros for numerical values, empty strings or empty arrays.
```sql
INSERT INTO table_map VALUES ({'key3':100}), ({});
SELECT a['key3'] FROM table_map;
```
Result:
```text
┌─arrayElement(a, 'key3')─┐
│ 100 │
│ 0 │
└─────────────────────────┘
┌─arrayElement(a, 'key3')─┐
│ 0 │
│ 0 │
│ 0 │
└─────────────────────────┘
```
## Convert Tuple to Map Type {#map-and-tuple}
You can cast `Tuple()` as `Map()` using [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function:
``` sql
SELECT CAST(([1, 2, 3], ['Ready', 'Steady', 'Go']), 'Map(UInt8, String)') AS map;
```
``` text
┌─map───────────────────────────┐
│ {1:'Ready',2:'Steady',3:'Go'} │
└───────────────────────────────┘
```
**See Also**
- [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) function
- [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function
[Original article](https://clickhouse.tech/docs/en/data-types/map/) <!--hide-->

View File

@ -265,32 +265,81 @@ SELECT toIPv6('127.0.0.1')
└─────────────────────┘
```
## isIPv4String
## isIPv4String {#isipv4string}
Determines if the input string is an IPv4 address or not. Returns `1` if true `0` otherwise.
Determines whether the input string is an IPv4 address or not. If `string` is an IPv6 address, returns `0`.
``` sql
SELECT isIPv4String('127.0.0.1')
**Syntax**
```sql
isIPv4String(string)
```
**Parameters**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- `1` if `string` is IPv4 address, `0` otherwise.
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
```sql
SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr
```
Result:
``` text
┌─isIPv4String('127.0.0.1')─┐
│ 1 │
└───────────────────────────┘
┌─addr─────────────┬─isIPv4String(addr)─┐
│ 0.0.0.0 │ 1 │
│ 127.0.0.1 │ 1 │
│ ::ffff:127.0.0.1 │ 0 │
└──────────────────┴────────────────────┘
```
## isIPv6String
## isIPv6String {#isipv6string}
Determines if the input string is an IPv6 address or not. Returns `1` if true `0` otherwise.
Determines whether the input string is an IPv6 address or not. If `string` is an IPv4 address, returns `0`.
**Syntax**
```sql
isIPv6String(string)
```
**Parameters**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- `1` if `string` is IPv6 address, `0` otherwise.
Type: [UInt8](../../sql-reference/data-types/int-uint.md).
**Examples**
Query:
``` sql
SELECT isIPv6String('2001:438:ffff::407d:1bc1')
SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr
```
Result:
``` text
┌─isIPv6String('2001:438:ffff::407d:1bc1')─┐
│ 1 │
└──────────────────────────────────────────┘
┌─addr─────────────┬─isIPv6String(addr)─┐
│ :: │ 1 │
│ 1111::ffff │ 1 │
│ ::ffff:127.0.0.1 │ 1 │
│ 127.0.0.1 │ 0 │
└──────────────────┴────────────────────┘
```
[Original article](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) <!--hide-->

View File

@ -909,6 +909,66 @@ WHERE diff != 1
Same as [runningDifference](../../sql-reference/functions/other-functions.md#other_functions-runningdifference), except that the first row returns its own value, and each subsequent row returns the difference from the previous row.
## runningConcurrency {#runningconcurrency}
Given a series of beginning and ending times of events, this function calculates the concurrency of the events at each data point, that is, at each beginning time.
!!! warning "Warning"
Events spanning multiple data blocks will not be processed correctly. The function resets its state for each new data block.
The result of the function depends on the order of data in the block. It assumes the beginning time is sorted in ascending order.
**Syntax**
``` sql
runningConcurrency(begin, end)
```
**Parameters**
- `begin` — A column for the beginning time of events (inclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md).
- `end` — A column for the ending time of events (exclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md).
Note that two columns `begin` and `end` must have the same type.
**Returned values**
- The concurrency of events at the data point.
Type: [UInt32](../../sql-reference/data-types/int-uint.md)
**Example**
Input table:
``` text
┌───────────────begin─┬─────────────────end─┐
│ 2020-12-01 00:00:00 │ 2020-12-01 00:59:59 │
│ 2020-12-01 00:30:00 │ 2020-12-01 00:59:59 │
│ 2020-12-01 00:40:00 │ 2020-12-01 01:30:30 │
│ 2020-12-01 01:10:00 │ 2020-12-01 01:30:30 │
│ 2020-12-01 01:50:00 │ 2020-12-01 01:59:59 │
└─────────────────────┴─────────────────────┘
```
Query:
``` sql
SELECT runningConcurrency(begin, end) FROM example
```
Result:
``` text
┌─runningConcurrency(begin, end)─┐
│ 1 │
│ 2 │
│ 3 │
│ 2 │
│ 1 │
└────────────────────────────────┘
```
## MACNumToString(num) {#macnumtostringnum}
Accepts a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form).

View File

@ -5,6 +5,68 @@ toc_title: Working with maps
# Functions for maps {#functions-for-working-with-tuple-maps}
## map {#function-map}
Arranges `key:value` pairs into [Map(key, value)](../../sql-reference/data-types/map.md) data type.
**Syntax**
``` sql
map(key1, value1[, key2, value2, ...])
```
**Parameters**
- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).
**Returned value**
- Data structure as `key:value` pairs.
Type: [Map(key, value)](../../sql-reference/data-types/map.md).
**Examples**
Query:
``` sql
SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
```
Result:
``` text
┌─map('key1', number, 'key2', multiply(number, 2))─┐
│ {'key1':0,'key2':0} │
│ {'key1':1,'key2':2} │
│ {'key1':2,'key2':4} │
└──────────────────────────────────────────────────┘
```
Query:
``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
SELECT a['key2'] FROM table_map;
```
Result:
``` text
┌─arrayElement(a, 'key2')─┐
│ 0 │
│ 2 │
│ 4 │
└─────────────────────────┘
```
**See Also**
- [Map(key, value)](../../sql-reference/data-types/map.md) data type
## mapAdd {#function-mapadd}
Collect all the keys and sum corresponding values.
@ -112,4 +174,4 @@ Result:
└──────────────────────────────┴───────────────────────────────────┘
```
[Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) <!--hide-->
[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-map-functions/) <!--hide-->

View File

@ -133,10 +133,9 @@ For example:
### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom}
Same as `cutToFirstSignificantSubdomain` but accept custom TLD list name, useful if:
Returns the part of the domain that includes top-level subdomains up to the first significant subdomain. Accepts custom [TLD list](https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains) name.
- you need fresh TLD list,
- or you have custom.
Can be useful if you need fresh TLD list or you have custom.
Configuration example:
@ -149,21 +148,150 @@ Configuration example:
</top_level_domains_lists>
```
Example:
**Syntax**
- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/', 'public_suffix_list') = 'yandex.com.tr'`.
``` sql
cutToFirstSignificantSubdomainCustom(URL, TLD)
```
**Parameters**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
**Returned value**
- Part of the domain that includes top-level subdomains up to the first significant subdomain.
Type: [String](../../sql-reference/data-types/string.md).
**Example**
Query:
```sql
SELECT cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
```
Result:
```text
┌─cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
│ foo.there-is-no-such-domain │
└───────────────────────────────────────────────────────────────────────────────────────────────┘
```
**See Also**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
Returns the part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`. Accepts custom TLD list name.
Can be useful if you need fresh TLD list or you have custom.
Configuration example:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
**Syntax**
```sql
cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD)
```
**Parameters**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
**Returned value**
- Part of the domain that includes top-level subdomains up to the first significant subdomain without stripping `www`.
Type: [String](../../sql-reference/data-types/string.md).
**Example**
Query:
```sql
SELECT cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list');
```
Result:
```text
┌─cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list')─┐
│ www.foo │
└──────────────────────────────────────────────────────────────────────────────┘
```
**See Also**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
Same as `firstSignificantSubdomain` but accept custom TLD list name.
Returns the first significant subdomain. Accepts a custom TLD list name.
### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
Can be useful if you need fresh TLD list or you have custom.
Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
Configuration example:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
**Syntax**
```sql
firstSignificantSubdomainCustom(URL, TLD)
```
**Parameters**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
**Returned value**
- First significant subdomain.
Type: [String](../../sql-reference/data-types/string.md).
**Example**
Query:
```sql
SELECT firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
```
Result:
```text
┌─firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
│ foo │
└──────────────────────────────────────────────────────────────────────────────────────────┘
```
**See Also**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### port(URL\[, default_port = 0\]) {#port}

View File

@ -20,6 +20,7 @@ The following actions are supported:
- [ADD COLUMN](#alter_add-column) — Adds a new column to the table.
- [DROP COLUMN](#alter_drop-column) — Deletes the column.
- [RENAME COLUMN](#alter_rename-column) — Renames the column.
- [CLEAR COLUMN](#alter_clear-column) — Resets column values.
- [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column.
- [MODIFY COLUMN](#alter_modify-column) — Changes columns type, default expression and TTL.
@ -78,6 +79,22 @@ Example:
ALTER TABLE visits DROP COLUMN browser
```
## RENAME COLUMN {#alter_rename-column}
``` sql
RENAME COLUMN [IF EXISTS] name to new_name
```
Renames the column `name` to `new_name`. If the `IF EXISTS` clause is specified, the query won't return an error if the column doesn't exist. Since renaming does not involve the underlying data, the query is completed almost instantly.
**NOTE**: Columns specified in the key expression of the table (either with `ORDER BY` or `PRIMARY KEY`) cannot be renamed. Trying to change these columns will produce `SQL Error [524]`.
Example:
``` sql
ALTER TABLE visits RENAME COLUMN webBrowser TO browser
```
## CLEAR COLUMN {#alter_clear-column}
``` sql

View File

@ -14,14 +14,16 @@ ClickHouse supports the standard grammar for defining windows and window functio
| Feature | Support or workaround |
| --------| ----------|
| ad hoc window specification (`count(*) over (partition by id order by time desc)`) | yes |
| `WINDOW` clause (`select ... from table window w as (partiton by id)`) | yes |
| `ROWS` frame | yes |
| `RANGE` frame | yes, it is the default |
| `GROUPS` frame | no |
| ad hoc window specification (`count(*) over (partition by id order by time desc)`) | supported |
| expressions involving window functions, e.g. `(count(*) over ()) / 2` | not supported, wrap in a subquery ([feature request](https://github.com/ClickHouse/ClickHouse/issues/19857)) |
| `WINDOW` clause (`select ... from table window w as (partition by id)`) | supported |
| `ROWS` frame | supported |
| `RANGE` frame | supported, the default |
| `INTERVAL` syntax for `DateTime` `RANGE OFFSET` frame | not supported, specify the number of seconds instead |
| `GROUPS` frame | not supported |
| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
| `rank()`, `dense_rank()`, `row_number()` | yes |
| `lag/lead(value, offset)` | no, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
| `rank()`, `dense_rank()`, `row_number()` | supported |
| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead`|
## References

View File

@ -283,12 +283,10 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (
## input_format_tsv_empty_as_default {#settings-input-format-tsv-empty-as-default}
Если эта настройка включена, замените пустые поля ввода в TSV значениями по умолчанию. Для сложных выражений по умолчанию также должна быть включена настройка `input_format_defaults_for_omitted_fields`.
Если эта настройка включена, все пустые поля во входящем TSV заменяются значениями по умолчанию. Для сложных выражений по умолчанию также должна быть включена настройка `input_format_defaults_for_omitted_fields`.
По умолчанию отключена.
Disabled by default.
## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number}
Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата TSV.
@ -708,7 +706,7 @@ ClickHouse использует этот параметр при чтении д
Установка логирования запроса.
Запросы, переданные в ClickHouse с этой установкой, логируются согласно правилам конфигурационного параметра сервера [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log).
Запросы, переданные в ClickHouse с этой настройкой, логируются согласно правилам конфигурационного параметра сервера [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log).
Пример:
@ -1521,7 +1519,7 @@ ClickHouse генерирует исключение
- Тип: секунды
- Значение по умолчанию: 60 секунд
Управляет скоростью обнуления ошибок в распределенных таблицах. Если реплика недоступна в течение некоторого времени, накапливает 5 ошибок, а distributed_replica_error_half_life установлена на 1 секунду, то реплика считается нормальной через 3 секунды после последней ошибки.
Управляет скоростью обнуления счетчика ошибок в распределенных таблицах. Предположим, реплика остается недоступна в течение какого-то времени, и за этот период накопилось 5 ошибок. Если настройка `distributed_replica_error_half_life` установлена в значение 1 секунда, то реплика снова будет считаться доступной через 3 секунды после последней ошибки.
См. также:
@ -1673,7 +1671,7 @@ ClickHouse генерирует исключение
- Тип: bool
- Значение по умолчанию: True
Обеспечивает параллельный анализ форматов данных с сохранением порядка. Поддерживается только для форматов TSV, TSKV, CSV и JSONEachRow.
Включает режим, при котором входящие данные парсятся параллельно, но с сохранением исходного порядка следования. Поддерживается только для форматов TSV, TSKV, CSV и JSONEachRow.
## min_chunk_bytes_for_parallel_parsing {#min-chunk-bytes-for-parallel-parsing}
@ -1987,7 +1985,7 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1;
## output_format_pretty_grid_charset {#output-format-pretty-grid-charset}
Позволяет изменить кодировку, которая используется для печати грид-границ. Доступны следующие кодировки: UTF-8, ASCII.
Позволяет изменить кодировку, которая используется для отрисовки таблицы при выводе результатов запросов. Доступны следующие кодировки: UTF-8, ASCII.
**Пример**
@ -2473,6 +2471,18 @@ SELECT SUM(-1), MAX(0) FROM system.one WHERE 0;
Значение по умолчанию: `16`.
## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability}
Задает вероятность того, что ClickHouse начнет трассировку для выполненных запросов (если не указан [входящий контекст](https://www.w3.org/TR/trace-context/) трассировки).
Возможные значения:
- 0 — трассировка для выполненных запросов отключена (если не указан входящий контекст трассировки).
- Положительное число с плавающей точкой в диапазоне [0..1]. Например, при значении настройки, равном `0.5`, ClickHouse начнет трассировку в среднем для половины запросов.
- 1 — трассировка для всех выполненных запросов включена.
Значение по умолчанию: `0`.
## optimize_on_insert {#optimize-on-insert}
Включает или выключает преобразование данных перед добавлением в таблицу, как будто над добавляемым блоком предварительно было произведено слияние (в соответствии с движком таблицы).

View File

@ -0,0 +1,49 @@
# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log}
Содержит информацию о [trace spans](https://opentracing.io/docs/overview/spans/) для выполненных запросов.
Столбцы:
- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — идентификатор трассировки для выполненного запроса.
- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — идентификатор `trace span`.
- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — идентификатор родительского `trace span`.
- `operation_name` ([String](../../sql-reference/data-types/string.md)) — имя операции.
- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — время начала `trace span` (в микросекундах).
- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — время окончания `trace span` (в микросекундах).
- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — дата окончания `trace span`.
- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — имена [атрибутов](https://opentelemetry.io/docs/go/instrumentation/#attributes) в зависимости от `trace span`. Заполняются согласно рекомендациям в стандарте [OpenTelemetry](https://opentelemetry.io/).
- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — значения атрибутов в зависимости от `trace span`. Заполняются согласно рекомендациям в стандарте `OpenTelemetry`.
**Пример**
Запрос:
``` sql
SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical;
```
Результат:
``` text
Row 1:
──────
trace_id: cdab0847-0d62-61d5-4d38-dd65b19a1914
span_id: 701487461015578150
parent_span_id: 2991972114672045096
operation_name: DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
start_time_us: 1612374594529090
finish_time_us: 1612374594529108
finish_date: 2021-02-03
attribute.names: []
attribute.values: []
```
[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/opentelemetry_span_log) <!--hide-->

View File

@ -0,0 +1,69 @@
---
toc_priority: 65
toc_title: Map(key, value)
---
# Map(key, value) {#data_type-map}
Тип данных `Map(key, value)` хранит пары `ключ:значение`.
**Параметры**
- `key` — ключ. [String](../../sql-reference/data-types/string.md) или [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — значение. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) или [Array](../../sql-reference/data-types/array.md).
!!! warning "Предупреждение"
Сейчас использование типа данных `Map` является экспериментальной возможностью. Чтобы использовать этот тип данных, включите настройку `allow_experimental_map_type = 1`.
Чтобы получить значение из колонки `a Map('key', 'value')`, используйте синтаксис `a['key']`. В настоящее время такая подстановка работает по алгоритму с линейной сложностью.
**Примеры**
Рассмотрим таблицу:
``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory;
INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
```
Выборка всех значений ключа `key2`:
```sql
SELECT a['key2'] FROM table_map;
```
Результат:
```text
┌─arrayElement(a, 'key2')─┐
│ 10 │
│ 20 │
│ 30 │
└─────────────────────────┘
```
Если для какого-то ключа `key` в колонке с типом `Map()` нет значения, запрос возвращает нули для числовых колонок, пустые строки или пустые массивы.
```sql
INSERT INTO table_map VALUES ({'key3':100}), ({});
SELECT a['key3'] FROM table_map;
```
Результат:
```text
┌─arrayElement(a, 'key3')─┐
│ 100 │
│ 0 │
└─────────────────────────┘
┌─arrayElement(a, 'key3')─┐
│ 0 │
│ 0 │
│ 0 │
└─────────────────────────┘
```
**См. также**
- функция [map()](../../sql-reference/functions/tuple-map-functions.md#function-map)
- функция [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast)
[Оригинальная статья](https://clickhouse.tech/docs/ru/data-types/map/) <!--hide-->

View File

@ -243,4 +243,81 @@ SELECT
└───────────────────────────────────┴──────────────────────────────────┘
```
## isIPv4String {#isipv4string}
Определяет, является ли строка адресом IPv4 или нет. Также вернет `0`, если `string` — адрес IPv6.
**Синтаксис**
```sql
isIPv4String(string)
```
**Параметры**
- `string` — IP адрес. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- `1`, если `string` является адресом IPv4, иначе — `0`.
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
**Примеры**
Запрос:
```sql
SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr
```
Результат:
``` text
┌─addr─────────────┬─isIPv4String(addr)─┐
│ 0.0.0.0 │ 1 │
│ 127.0.0.1 │ 1 │
│ ::ffff:127.0.0.1 │ 0 │
└──────────────────┴────────────────────┘
```
## isIPv6String {#isipv6string}
Определяет, является ли строка адресом IPv6 или нет. Также вернет `0`, если `string` — адрес IPv4.
**Синтаксис**
```sql
isIPv6String(string)
```
**Параметры**
- `string` — IP адрес. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- `1`, если `string` является адресом IPv6, иначе — `0`.
Тип: [UInt8](../../sql-reference/data-types/int-uint.md).
**Примеры**
Запрос:
``` sql
SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr
```
Результат:
``` text
┌─addr─────────────┬─isIPv6String(addr)─┐
│ :: │ 1 │
│ 1111::ffff │ 1 │
│ ::ffff:127.0.0.1 │ 1 │
│ 127.0.0.1 │ 0 │
└──────────────────┴────────────────────┘
```
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/ip_address_functions/) <!--hide-->

View File

@ -5,6 +5,66 @@ toc_title: Работа с контейнерами map
# Функции для работы с контейнерами map {#functions-for-working-with-tuple-maps}
## map {#function-map}
Преобразовывает пары `ключ:значение` в тип данных [Map(key, value)](../../sql-reference/data-types/map.md).
**Синтаксис**
``` sql
map(key1, value1[, key2, value2, ...])
```
**Параметры**
- `key` — ключ. [String](../../sql-reference/data-types/string.md) или [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — значение. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) или [Array](../../sql-reference/data-types/array.md).
**Возвращаемое значение**
- Структура данных в виде пар `ключ:значение`.
Тип: [Map(key, value)](../../sql-reference/data-types/map.md).
**Примеры**
Запрос:
``` sql
SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
```
Результат:
``` text
┌─map('key1', number, 'key2', multiply(number, 2))─┐
│ {'key1':0,'key2':0} │
│ {'key1':1,'key2':2} │
│ {'key1':2,'key2':4} │
└──────────────────────────────────────────────────┘
```
Запрос:
``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
SELECT a['key2'] FROM table_map;
```
Результат:
``` text
┌─arrayElement(a, 'key2')─┐
│ 0 │
│ 2 │
│ 4 │
└─────────────────────────┘
```
**См. также**
- тип данных [Map(key, value)](../../sql-reference/data-types/map.md)
## mapAdd {#function-mapadd}
Собирает все ключи и суммирует соответствующие значения.

View File

@ -115,6 +115,168 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk')
Например, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`.
### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom}
Возвращает часть домена, включающую поддомены верхнего уровня до первого существенного поддомена. Принимает имя пользовательского [списка доменов верхнего уровня](https://ru.wikipedia.org/wiki/Список_доменов_верхнего_уровня).
Полезно, если требуется актуальный список доменов верхнего уровня или если есть пользовательский.
Пример конфигурации:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
**Синтаксис**
``` sql
cutToFirstSignificantSubdomainCustom(URL, TLD)
```
**Параметры**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- Часть домена, включающая поддомены верхнего уровня до первого существенного поддомена.
Тип: [String](../../sql-reference/data-types/string.md).
**Пример**
Запрос:
```sql
SELECT cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
```
Результат:
```text
┌─cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
│ foo.there-is-no-such-domain │
└───────────────────────────────────────────────────────────────────────────────────────────────┘
```
**Смотрите также**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
Возвращает часть домена, включающую поддомены верхнего уровня до первого существенного поддомена, не опуская "www". Принимает имя пользовательского списка доменов верхнего уровня.
Полезно, если требуется актуальный список доменов верхнего уровня или если есть пользовательский.
Пример конфигурации:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
**Синтаксис**
```sql
cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD)
```
**Параметры**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- Часть домена, включающая поддомены верхнего уровня до первого существенного поддомена, без удаления `www`.
Тип: [String](../../sql-reference/data-types/string.md).
**Пример**
Запрос:
```sql
SELECT cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list');
```
Результат:
```text
┌─cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list')─┐
│ www.foo │
└──────────────────────────────────────────────────────────────────────────────┘
```
**Смотрите также**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
Возвращает первый существенный поддомен. Принимает имя пользовательского списка доменов верхнего уровня.
Полезно, если требуется актуальный список доменов верхнего уровня или если есть пользовательский.
Пример конфигурации:
```xml
<!-- <top_level_domains_path>/var/lib/clickhouse/top_level_domains/</top_level_domains_path> -->
<top_level_domains_lists>
<!-- https://publicsuffix.org/list/public_suffix_list.dat -->
<public_suffix_list>public_suffix_list.dat</public_suffix_list>
<!-- NOTE: path is under top_level_domains_path -->
</top_level_domains_lists>
```
**Синтаксис**
```sql
firstSignificantSubdomainCustom(URL, TLD)
```
**Параметры**
- `URL` — URL. [String](../../sql-reference/data-types/string.md).
- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
**Возвращаемое значение**
- Первый существенный поддомен.
Тип: [String](../../sql-reference/data-types/string.md).
**Пример**
Запрос:
```sql
SELECT firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
```
Результат:
```text
┌─firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
│ foo │
└──────────────────────────────────────────────────────────────────────────────────────────┘
```
**Смотрите также**
- [firstSignificantSubdomain](#firstsignificantsubdomain).
### port(URL[, default_port = 0]) {#port}
Возвращает порт или значение `default_port`, если в URL-адресе нет порта (или передан невалидный URL)

View File

@ -59,7 +59,6 @@
#include <Disks/registerDisks.h>
#include <Common/Config/ConfigReloader.h>
#include <Server/HTTPHandlerFactory.h>
#include <Server/TestKeeperTCPHandlerFactory.h>
#include "MetricsTransmitter.h"
#include <Common/StatusFile.h>
#include <Server/TCPHandlerFactory.h>
@ -94,6 +93,9 @@
# include <Server/GRPCServer.h>
#endif
#if USE_NURAFT
# include <Server/NuKeeperTCPHandlerFactory.h>
#endif
namespace CurrentMetrics
{
@ -842,23 +844,33 @@ int Server::main(const std::vector<std::string> & /*args*/)
listen_try = true;
}
for (const auto & listen_host : listen_hosts)
if (config().has("test_keeper_server"))
{
/// TCP TestKeeper
const char * port_name = "test_keeper_server.tcp_port";
createServer(listen_host, port_name, listen_try, [&](UInt16 port)
#if USE_NURAFT
/// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config.
global_context->initializeNuKeeperStorageDispatcher();
for (const auto & listen_host : listen_hosts)
{
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
servers_to_start_before_tables->emplace_back(
port_name,
std::make_unique<Poco::Net::TCPServer>(
new TestKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
/// TCP NuKeeper
const char * port_name = "test_keeper_server.tcp_port";
createServer(listen_host, port_name, listen_try, [&](UInt16 port)
{
Poco::Net::ServerSocket socket;
auto address = socketBindListen(socket, listen_host, port);
socket.setReceiveTimeout(settings.receive_timeout);
socket.setSendTimeout(settings.send_timeout);
servers_to_start_before_tables->emplace_back(
port_name,
std::make_unique<Poco::Net::TCPServer>(
new NuKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
LOG_INFO(log, "Listening for connections to NuKeeper (tcp): {}", address.toString());
});
}
#else
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination.");
#endif
LOG_INFO(log, "Listening for connections to fake zookeeper (tcp): {}", address.toString());
});
}
for (auto & server : *servers_to_start_before_tables)
@ -898,6 +910,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections after context shutdown.", current_connections);
else
LOG_INFO(log, "Closed connections to servers for tables.");
global_context->shutdownNuKeeperStorageDispatcher();
}
/** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available.

View File

@ -421,9 +421,15 @@
<!-- Comma-separated list of prefixes for user-defined settings. -->
<custom_settings_prefixes></custom_settings_prefixes>
<!-- System profile of settings. This settings are used by internal processes (Buffer storage, Distributed DDL worker and so on). -->
<!-- System profile of settings. These settings are used by internal processes (Distributed DDL worker and so on). -->
<!-- <system_profile>default</system_profile> -->
<!-- Buffer profile of settings.
These settings are used by Buffer storage to flush data to the underlying table.
Default: used from system_profile directive.
-->
<!-- <buffer_profile>default</buffer_profile> -->
<!-- Default database. -->
<default_database>default</default_database>

View File

@ -217,6 +217,7 @@ namespace
/// Write the file.
WriteBufferFromFile out{tmp_file_path.string()};
out.write(file_contents.data(), file_contents.size());
out.close();
/// Rename.
std::filesystem::rename(tmp_file_path, file_path);
@ -274,6 +275,7 @@ namespace
writeStringBinary(name, out);
writeUUIDText(id, out);
}
out.close();
}

View File

@ -34,6 +34,14 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
factory.registerFunction("any", { createAggregateFunctionAny, properties });
factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });
// Synonyms for use as window functions.
factory.registerFunction("first_value",
{ createAggregateFunctionAny, properties },
AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("last_value",
{ createAggregateFunctionAnyLast, properties },
AggregateFunctionFactory::CaseInsensitive);
}
}

View File

@ -40,7 +40,7 @@ struct MovingData
Array value; /// Prefix sums.
T sum = 0;
void add(T val, Arena * arena)
void NO_SANITIZE_UNDEFINED add(T val, Arena * arena)
{
sum += val;
value.push_back(sum, arena);
@ -120,7 +120,7 @@ public:
this->data(place).add(static_cast<ResultT>(value), arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
void NO_SANITIZE_UNDEFINED merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & cur_elems = this->data(place);
auto & rhs_elems = this->data(rhs);

View File

@ -26,6 +26,7 @@ class ReadBuffer;
class WriteBuffer;
class IColumn;
class IDataType;
class IWindowFunction;
using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;
@ -215,6 +216,20 @@ public:
const DataTypes & getArgumentTypes() const { return argument_types; }
const Array & getParameters() const { return parameters; }
// Any aggregate function can be calculated over a window, but there are some
// window functions such as rank() that require a different interface, e.g.
// because they don't respect the window frame, or need to be notified when
// a new peer group starts. They pretend to be normal aggregate functions,
// but will fail if you actually try to use them in Aggregator. The
// WindowTransform recognizes these functions and handles them differently.
// We could have a separate factory for window functions, and make all
// aggregate functions implement IWindowFunction interface and so on. This
// would be more logically correct, but more complex. We only have a handful
// of true window functions, so this hack-ish interface suffices.
virtual IWindowFunction * asWindowFunction() { return nullptr; }
virtual const IWindowFunction * asWindowFunction() const
{ return const_cast<IAggregateFunction *>(this)->asWindowFunction(); }
protected:
DataTypes argument_types;
Array parameters;

View File

@ -58,6 +58,8 @@ void registerAggregateFunctionCombinatorOrFill(AggregateFunctionCombinatorFactor
void registerAggregateFunctionCombinatorResample(AggregateFunctionCombinatorFactory &);
void registerAggregateFunctionCombinatorDistinct(AggregateFunctionCombinatorFactory &);
void registerWindowFunctions(AggregateFunctionFactory & factory);
void registerAggregateFunctions()
{
@ -103,6 +105,8 @@ void registerAggregateFunctions()
registerAggregateFunctionMannWhitney(factory);
registerAggregateFunctionWelchTTest(factory);
registerAggregateFunctionStudentTTest(factory);
registerWindowFunctions(factory);
}
{

View File

@ -60,6 +60,7 @@ add_subdirectory (Processors)
add_subdirectory (Formats)
add_subdirectory (Compression)
add_subdirectory (Server)
add_subdirectory (Coordination)
set(dbms_headers)
@ -192,6 +193,10 @@ add_object_library(clickhouse_processors_merges_algorithms Processors/Merges/Alg
add_object_library(clickhouse_processors_queryplan Processors/QueryPlan)
add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations)
if (USE_NURAFT)
add_object_library(clickhouse_coordination Coordination)
endif()
set (DBMS_COMMON_LIBRARIES)
# libgcc_s does not provide an implementation of an atomics library. Instead,
# GCCs libatomic library can be used to supply these when using libgcc_s.
@ -314,7 +319,7 @@ if (USE_KRB5)
endif()
if (USE_NURAFT)
dbms_target_link_libraries(PRIVATE ${NURAFT_LIBRARY})
dbms_target_link_libraries(PUBLIC ${NURAFT_LIBRARY})
endif()
if(RE2_INCLUDE_DIR)

View File

@ -756,7 +756,11 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
{
in->setAsyncCallback(std::move(async_callback));
SCOPE_EXIT(in->setAsyncCallback({}));
SCOPE_EXIT({
/// disconnect() will reset "in".
if (in)
in->setAsyncCallback({});
});
try
{

View File

@ -455,7 +455,14 @@ template <>
struct LowCardinalityKeys<false> {};
/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true, bool need_offset = false>
template <
typename Value,
typename Key,
typename Mapped,
bool has_nullable_keys_ = false,
bool has_low_cardinality_ = false,
bool use_cache = true,
bool need_offset = false>
struct HashMethodKeysFixed
: private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
, public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
@ -471,6 +478,12 @@ struct HashMethodKeysFixed
Sizes key_sizes;
size_t keys_size;
/// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
std::unique_ptr<uint8_t[]> masks;
std::unique_ptr<const char*[]> columns_data;
#endif
HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
: Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
{
@ -491,6 +504,58 @@ struct HashMethodKeysFixed
low_cardinality_keys.nested_columns[i] = key_columns[i];
}
}
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
{
/** The task is to "pack" multiple fixed-size fields into single larger Key.
* Example: pack UInt8, UInt32, UInt16, UInt64 into UInt128 key:
* [- ---- -- -------- -] - the resulting uint128 key
* ^ ^ ^ ^ ^
* u8 u32 u16 u64 zero
*
* We can do it with the help of SSSE3 shuffle instruction.
*
* There will be a mask for every GROUP BY element (keys_size masks in total).
* Every mask has 16 bytes but only sizeof(Key) bytes are used (other we don't care).
*
* Every byte in the mask has the following meaning:
* - if it is 0..15, take the element at this index from source register and place here in the result;
* - if it is 0xFF - set the element in the result to zero.
*
* Example:
* We want to copy UInt32 to offset 1 in the destination and set other bytes in the destination as zero.
* The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
*
* The max size of destination is 16 bytes, because we cannot process more with SSSE3.
*
* The method is disabled under MSan, because it's allowed
* to load into SSE register and process up to 15 bytes of uninitialized memory in columns padding.
* We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
*
* 16-bytes masks can be placed overlapping, only first sizeof(Key) bytes are relevant in each mask.
* We initialize them to 0xFF and then set the needed elements.
*/
size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
masks.reset(new uint8_t[total_masks_size]);
memset(masks.get(), 0xFF, total_masks_size);
size_t offset = 0;
for (size_t i = 0; i < keys_size; ++i)
{
for (size_t j = 0; j < key_sizes[i]; ++j)
{
masks[i * sizeof(Key) + offset] = j;
++offset;
}
}
columns_data.reset(new const char*[keys_size]);
for (size_t i = 0; i < keys_size; ++i)
columns_data[i] = Base::getActualColumns()[i]->getRawData().data;
}
#endif
}
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
@ -506,6 +571,10 @@ struct HashMethodKeysFixed
return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
&low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
#endif
return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
}
}

View File

@ -534,6 +534,7 @@
M(565, TOO_MANY_PARTITIONS) \
M(566, CANNOT_RMDIR) \
M(567, DUPLICATED_PART_UUIDS) \
M(568, RAFT_ERROR) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -1,139 +0,0 @@
#include <Common/ZooKeeper/TestKeeperStorageDispatcher.h>
#include <Common/setThreadName.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int TIMEOUT_EXCEEDED;
}
}
namespace zkutil
{
void TestKeeperStorageDispatcher::processingThread()
{
setThreadName("TestKeeperSProc");
while (!shutdown)
{
RequestInfo info;
UInt64 max_wait = UInt64(operation_timeout.totalMilliseconds());
if (requests_queue.tryPop(info, max_wait))
{
if (shutdown)
break;
try
{
auto responses = storage.processRequest(info.request, info.session_id);
for (const auto & response_for_session : responses)
setResponse(response_for_session.session_id, response_for_session.response);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
}
/// Delivers `response` to the callback registered for `session_id`.
/// Does nothing if no callback is registered for that session.
/// A non-watch Close response also unregisters the session callback.
void TestKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response)
{
    std::lock_guard lock(session_to_response_callback_mutex);

    auto callback_it = session_to_response_callback.find(session_id);
    if (callback_it != session_to_response_callback.end())
    {
        callback_it->second(response);

        /// Session closed, no more writes
        const bool closes_session
            = response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close;
        if (closes_session)
            session_to_response_callback.erase(callback_it);
    }
}
void TestKeeperStorageDispatcher::finalize()
{
{
std::lock_guard lock(push_request_mutex);
if (shutdown)
return;
shutdown = true;
if (processing_thread.joinable())
processing_thread.join();
}
RequestInfo info;
TestKeeperStorage::RequestsForSessions expired_requests;
while (requests_queue.tryPop(info))
expired_requests.push_back(TestKeeperStorage::RequestForSession{info.session_id, info.request});
auto expired_responses = storage.finalize(expired_requests);
for (const auto & response_for_session : expired_responses)
setResponse(response_for_session.session_id, response_for_session.response);
}
/// Enqueues `request` on behalf of `session_id` for the processing thread.
/// Throws LOGICAL_ERROR if the session has no registered response callback.
/// Throws TIMEOUT_EXCEEDED if a non-Close request cannot be pushed within operation_timeout.
void TestKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id)
{
    {
        std::lock_guard lock(session_to_response_callback_mutex);
        const bool session_is_known = session_to_response_callback.find(session_id) != session_to_response_callback.end();
        if (!session_is_known)
            throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown session id {}", session_id);
    }

    RequestInfo queued;
    queued.time = clock::now();
    queued.request = request;
    queued.session_id = session_id;

    std::lock_guard lock(push_request_mutex);

    /// Put close requests without timeouts
    if (request->getOpNum() == Coordination::OpNum::Close)
    {
        requests_queue.push(std::move(queued));
        return;
    }

    if (!requests_queue.tryPush(std::move(queued), operation_timeout.totalMilliseconds()))
        throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED);
}
/// Starts the background thread that runs processingThread().
/// The thread is launched from the constructor body, not from a member initializer,
/// so all data members are already constructed when the worker begins to run.
TestKeeperStorageDispatcher::TestKeeperStorageDispatcher()
{
processing_thread = ThreadFromGlobalPool([this] { processingThread(); });
}
/// Runs finalize() on destruction.
/// Any exception thrown by finalize() is caught and logged so that it never leaves the destructor.
TestKeeperStorageDispatcher::~TestKeeperStorageDispatcher()
{
try
{
finalize();
}
catch (...)
{
/// Log and swallow: a destructor must not throw.
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
/// Associates a response callback with a session id.
/// Throws LOGICAL_ERROR if the session already has a callback.
void TestKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback)
{
    std::lock_guard guard(session_to_response_callback_mutex);

    const auto [position, inserted] = session_to_response_callback.try_emplace(session_id, callback);
    if (!inserted)
        throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id);
}
/// Forgets the response callback of a session; unknown session ids are ignored.
void TestKeeperStorageDispatcher::finishSession(int64_t session_id)
{
    std::lock_guard guard(session_to_response_callback_mutex);

    /// Erasing by key is a no-op when the session is not registered.
    session_to_response_callback.erase(session_id);
}
}

View File

@ -1,60 +0,0 @@
#pragma once
#include <Common/ThreadPool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/TestKeeperStorage.h>
#include <functional>
namespace zkutil
{
using ZooKeeperResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr & response)>;
/// Serializes client requests into a single queue that is consumed by one background thread,
/// and routes responses back to per-session callbacks.
class TestKeeperStorageDispatcher
{
private:
/// Maximum time putRequest() waits for a free slot in the queue (non-close requests only).
Poco::Timespan operation_timeout{0, Coordination::DEFAULT_OPERATION_TIMEOUT_MS * 1000};
using clock = std::chrono::steady_clock;
/// A queued request together with its session and the moment it was enqueued.
struct RequestInfo
{
Coordination::ZooKeeperRequestPtr request;
clock::time_point time;
int64_t session_id;
};
/// Taken by putRequest() while pushing and by finalize() while shutting down.
std::mutex push_request_mutex;
using RequestsQueue = ConcurrentBoundedQueue<RequestInfo>;
/// NOTE(review): constructed with 1, presumably the queue capacity - confirm against ConcurrentBoundedQueue.
RequestsQueue requests_queue{1};
/// Set once by finalize(); a second finalize() call returns immediately.
std::atomic<bool> shutdown{false};
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
/// Guards session_to_response_callback.
std::mutex session_to_response_callback_mutex;
SessionToResponseCallback session_to_response_callback;
/// Runs processingThread(); started in the constructor, joined in finalize().
ThreadFromGlobalPool processing_thread;
TestKeeperStorage storage;
private:
void processingThread();
/// Stops the processing thread and answers the requests that are still queued.
void finalize();
/// Invokes the callback registered for the session, if there is one.
void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response);
public:
TestKeeperStorageDispatcher();
~TestKeeperStorageDispatcher();
/// Enqueues a request; throws if the session is unknown or the queue stays full until the timeout.
void putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id);
/// Obtains a session id from the underlying storage.
int64_t getSessionID()
{
return storage.getSessionID();
}
/// Registers the callback that receives responses for the session; throws if already registered.
void registerSession(int64_t session_id, ZooKeeperResponseCallback callback);
/// Call when responses for this session are no longer needed (for example, the session has expired).
void finishSession(int64_t session_id);
};
}

View File

@ -37,6 +37,26 @@ void ZooKeeperRequest::write(WriteBuffer & out) const
out.next();
}
/// The body of a Sync request consists of the path only.
void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const
{
    Coordination::write(path, out);
}
/// Reads the body of a Sync request: the path only.
void ZooKeeperSyncRequest::readImpl(ReadBuffer & in)
{
    Coordination::read(path, in);
}
/// Reads the body of a Sync response: the path only.
void ZooKeeperSyncResponse::readImpl(ReadBuffer & in)
{
    Coordination::read(path, in);
}
/// The body of a Sync response consists of the path only.
void ZooKeeperSyncResponse::writeImpl(WriteBuffer & out) const
{
    Coordination::write(path, out);
}
void ZooKeeperWatchResponse::readImpl(ReadBuffer & in)
{
Coordination::read(type, in);
@ -51,6 +71,13 @@ void ZooKeeperWatchResponse::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
/// Serializes a watch notification. Notifications that carry an error are skipped:
/// nothing at all is written for them.
void ZooKeeperWatchResponse::write(WriteBuffer & out) const
{
    if (error != Error::ZOK)
        return;

    ZooKeeperResponse::write(out);
}
void ZooKeeperAuthRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(type, out);
@ -326,6 +353,12 @@ void ZooKeeperMultiRequest::readImpl(ReadBuffer & in)
}
}
/// A multi request is always treated as a non-read request, without inspecting its
/// sub-requests. Possibly we can do better.
bool ZooKeeperMultiRequest::isReadRequest() const
{
    return false;
}
void ZooKeeperMultiResponse::readImpl(ReadBuffer & in)
{
for (auto & response : responses)
@ -410,6 +443,7 @@ void ZooKeeperMultiResponse::writeImpl(WriteBuffer & out) const
}
/// Every request type creates a default-constructed response object of the matching type.
ZooKeeperResponsePtr ZooKeeperHeartbeatRequest::makeResponse() const { return std::make_shared<ZooKeeperHeartbeatResponse>(); }
ZooKeeperResponsePtr ZooKeeperSyncRequest::makeResponse() const { return std::make_shared<ZooKeeperSyncResponse>(); }
ZooKeeperResponsePtr ZooKeeperAuthRequest::makeResponse() const { return std::make_shared<ZooKeeperAuthResponse>(); }
ZooKeeperResponsePtr ZooKeeperCreateRequest::makeResponse() const { return std::make_shared<ZooKeeperCreateResponse>(); }
ZooKeeperResponsePtr ZooKeeperRemoveRequest::makeResponse() const { return std::make_shared<ZooKeeperRemoveResponse>(); }
@ -465,6 +499,7 @@ void registerZooKeeperRequest(ZooKeeperRequestFactory & factory)
ZooKeeperRequestFactory::ZooKeeperRequestFactory()
{
registerZooKeeperRequest<OpNum::Heartbeat, ZooKeeperHeartbeatRequest>(*this);
registerZooKeeperRequest<OpNum::Sync, ZooKeeperSyncRequest>(*this);
registerZooKeeperRequest<OpNum::Auth, ZooKeeperAuthRequest>(*this);
registerZooKeeperRequest<OpNum::Close, ZooKeeperCloseRequest>(*this);
registerZooKeeperRequest<OpNum::Create, ZooKeeperCreateRequest>(*this);

View File

@ -30,7 +30,7 @@ struct ZooKeeperResponse : virtual Response
virtual ~ZooKeeperResponse() override = default;
virtual void readImpl(ReadBuffer &) = 0;
virtual void writeImpl(WriteBuffer &) const = 0;
void write(WriteBuffer & out) const;
virtual void write(WriteBuffer & out) const;
virtual OpNum getOpNum() const = 0;
};
@ -60,6 +60,7 @@ struct ZooKeeperRequest : virtual Request
static std::shared_ptr<ZooKeeperRequest> read(ReadBuffer & in);
virtual ZooKeeperResponsePtr makeResponse() const = 0;
virtual bool isReadRequest() const = 0;
};
using ZooKeeperRequestPtr = std::shared_ptr<ZooKeeperRequest>;
@ -71,6 +72,26 @@ struct ZooKeeperHeartbeatRequest final : ZooKeeperRequest
void writeImpl(WriteBuffer &) const override {}
void readImpl(ReadBuffer &) override {}
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
/// Sync request (OpNum::Sync). Its body is a single path.
struct ZooKeeperSyncRequest final : ZooKeeperRequest
{
String path;
String getPath() const override { return path; }
OpNum getOpNum() const override { return OpNum::Sync; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
/// Sync is never classified as a read request.
bool isReadRequest() const override { return false; }
};
/// Response to a Sync request (OpNum::Sync). Its body is a single path.
struct ZooKeeperSyncResponse final : ZooKeeperResponse
{
String path;
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Sync; }
};
struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse
@ -86,6 +107,8 @@ struct ZooKeeperWatchResponse final : WatchResponse, ZooKeeperResponse
void writeImpl(WriteBuffer & out) const override;
void write(WriteBuffer & out) const override;
OpNum getOpNum() const override
{
throw Exception("OpNum for watch response doesn't exist", Error::ZRUNTIMEINCONSISTENCY);
@ -104,6 +127,7 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
struct ZooKeeperAuthResponse final : ZooKeeperResponse
@ -122,6 +146,7 @@ struct ZooKeeperCloseRequest final : ZooKeeperRequest
void readImpl(ReadBuffer &) override {}
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
struct ZooKeeperCloseResponse final : ZooKeeperResponse
@ -146,6 +171,7 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse
@ -167,6 +193,7 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse
@ -183,6 +210,7 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
};
struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse
@ -199,6 +227,7 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
};
struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse
@ -217,6 +246,7 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse
@ -232,6 +262,7 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
};
struct ZooKeeperSimpleListRequest final : ZooKeeperListRequest
@ -261,6 +292,7 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
};
struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse
@ -290,6 +322,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override;
};
struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse

View File

@ -15,6 +15,7 @@ static const std::unordered_set<int32_t> VALID_OPERATIONS =
static_cast<int32_t>(OpNum::Get),
static_cast<int32_t>(OpNum::Set),
static_cast<int32_t>(OpNum::SimpleList),
static_cast<int32_t>(OpNum::Sync),
static_cast<int32_t>(OpNum::Heartbeat),
static_cast<int32_t>(OpNum::List),
static_cast<int32_t>(OpNum::Check),
@ -48,6 +49,8 @@ std::string toString(OpNum op_num)
return "Check";
case OpNum::Multi:
return "Multi";
case OpNum::Sync:
return "Sync";
case OpNum::Heartbeat:
return "Heartbeat";
case OpNum::Auth:

View File

@ -24,6 +24,7 @@ enum class OpNum : int32_t
Get = 4,
Set = 5,
SimpleList = 8,
Sync = 9,
Heartbeat = 11,
List = 12,
Check = 13,

View File

@ -3,6 +3,13 @@
namespace Coordination
{
/// Writes a size_t with its bytes reversed relative to the in-memory representation.
void write(size_t x, WriteBuffer & out)
{
    const size_t swapped = __builtin_bswap64(x);
    writeBinary(swapped, out);
}
void write(int64_t x, WriteBuffer & out)
{
x = __builtin_bswap64(x);
@ -57,6 +64,12 @@ void write(const Error & x, WriteBuffer & out)
write(static_cast<int32_t>(x), out);
}
/// Reads a size_t and reverses its bytes (the inverse of the matching write()).
void read(size_t & x, ReadBuffer & in)
{
    readBinary(x, in);
    x = __builtin_bswap64(x);
}
void read(int64_t & x, ReadBuffer & in)
{
readBinary(x, in);

View File

@ -13,6 +13,7 @@ namespace Coordination
using namespace DB;
void write(size_t x, WriteBuffer & out);
void write(int64_t x, WriteBuffer & out);
void write(int32_t x, WriteBuffer & out);
void write(OpNum x, WriteBuffer & out);
@ -37,6 +38,7 @@ void write(const std::vector<T> & arr, WriteBuffer & out)
write(elem, out);
}
void read(size_t & x, ReadBuffer & in);
void read(int64_t & x, ReadBuffer & in);
void read(int32_t & x, ReadBuffer & in);
void read(OpNum & x, ReadBuffer & in);

View File

@ -50,6 +50,7 @@ struct Test
{
DB::WriteBufferFromFile wb(filename);
wb.write(reinterpret_cast<const char *>(&store), sizeof(store));
wb.close();
}
{

View File

@ -83,8 +83,6 @@ SRCS(
WeakHash.cpp
ZooKeeper/IKeeper.cpp
ZooKeeper/TestKeeper.cpp
ZooKeeper/TestKeeperStorage.cpp
ZooKeeper/TestKeeperStorageDispatcher.cpp
ZooKeeper/ZooKeeper.cpp
ZooKeeper/ZooKeeperCommon.cpp
ZooKeeper/ZooKeeperConstants.cpp

View File

View File

@ -0,0 +1,35 @@
#include <Coordination/CoordinationSettings.h>
#include <Core/Settings.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_SETTING;
}
IMPLEMENT_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
/// Applies every key found under `config_elem` as a coordination setting.
/// Does nothing when the element is absent. An UNKNOWN_SETTING error gets extra context
/// appended before being rethrown; other exceptions are rethrown unchanged.
void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config)
{
    if (config.has(config_elem))
    {
        Poco::Util::AbstractConfiguration::Keys setting_names;
        config.keys(config_elem, setting_names);

        try
        {
            for (const String & name : setting_names)
            {
                const auto value = config.getString(config_elem + "." + name);
                set(name, value);
            }
        }
        catch (Exception & e)
        {
            if (e.code() == ErrorCodes::UNKNOWN_SETTING)
                e.addMessage("in Coordination settings config");
            throw;
        }
    }
}
}

View File

@ -0,0 +1,43 @@
#pragma once
#include <Core/Defines.h>
#include <Core/BaseSettings.h>
#include <Core/SettingsEnums.h>
#include <Common/ZooKeeper/ZooKeeperConstants.h>
#include <Poco/Util/AbstractConfiguration.h>
namespace DB
{
struct Settings;
/** These settings represent fine tunes for internal details of Coordination storages
* and should not be changed by the user without a reason.
*/
/// Each entry is M(type, name, default value, description, flags).
#define LIST_OF_COORDINATION_SETTINGS(M) \
M(Milliseconds, session_timeout_ms, Coordination::DEFAULT_SESSION_TIMEOUT_MS, "Default client session timeout", 0) \
M(Milliseconds, operation_timeout_ms, Coordination::DEFAULT_OPERATION_TIMEOUT_MS, "Default client operation timeout", 0) \
M(Milliseconds, dead_session_check_period_ms, 500, "How often leader will check sessions to consider them dead and remove", 0) \
M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \
M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too often leader elections)", 0) \
M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Upper bound of election timer (avoid too often leader elections)", 0) \
M(UInt64, reserved_log_items, 5000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 5000, "How many log items we have to collect to write new snapshot", 0) \
M(UInt64, max_stored_snapshots, 3, "How many snapshots we want to store", 0) \
M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \
M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \
M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \
M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
/// Settings container generated from LIST_OF_COORDINATION_SETTINGS.
struct CoordinationSettings : public BaseSettings<CoordinationSettingsTraits>
{
/// Sets every key found under `config_elem` in `config`; does nothing if the element is absent.
void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config);
};
using CoordinationSettingsPtr = std::shared_ptr<CoordinationSettings>;
}

View File

@ -0,0 +1,194 @@
#include <Coordination/InMemoryLogStore.h>
namespace DB
{
namespace
{
using namespace nuraft;
/// Creates an independent copy of a log entry: same term and value type, cloned payload buffer.
ptr<log_entry> makeClone(const ptr<log_entry> & entry)
{
    return cs_new<log_entry>(entry->get_term(), buffer::clone(entry->get_buf()), entry->get_val_type());
}
}
/// Creates a store whose first real index is 1. Index 0 always holds a dummy entry
/// (term 0), which lookups fall back to when the requested index is missing.
InMemoryLogStore::InMemoryLogStore()
    : start_idx(1)
{
    auto dummy_payload = nuraft::buffer::alloc(sizeof(size_t));
    logs[0] = nuraft::cs_new<nuraft::log_entry>(0, dummy_payload);
}
/// Index of the first log entry kept in the store.
size_t InMemoryLogStore::start_index() const
{
    return start_idx;
}
/// Index that the next appended entry will get.
size_t InMemoryLogStore::next_slot() const
{
    std::lock_guard<std::mutex> guard(logs_lock);

    /// The dummy entry at index 0 is not a real log record, hence the "- 1".
    const size_t stored_entries = logs.size() - 1;
    return start_idx + stored_entries;
}
/// Returns a copy of the most recent entry, or of the dummy entry if that index is absent.
nuraft::ptr<nuraft::log_entry> InMemoryLogStore::last_entry() const
{
    /// next_slot() takes the lock itself, so it has to be called before locking here.
    const size_t last_idx = next_slot() - 1;

    std::lock_guard<std::mutex> guard(logs_lock);
    auto it = logs.find(last_idx);
    if (it == logs.end())
        it = logs.find(0);

    return makeClone(it->second);
}
/// Stores a copy of `entry` at the next free index and returns that index.
size_t InMemoryLogStore::append(nuraft::ptr<nuraft::log_entry> & entry)
{
    /// Clone outside of the critical section.
    auto copy = makeClone(entry);

    std::lock_guard<std::mutex> guard(logs_lock);
    const size_t new_idx = start_idx + logs.size() - 1;
    logs[new_idx] = copy;
    return new_idx;
}
/// Overwrites the log starting from `index`: every entry with an index equal to or greater
/// than `index` is discarded, then a copy of `entry` is stored at `index`.
void InMemoryLogStore::write_at(size_t index, nuraft::ptr<nuraft::log_entry> & entry)
{
    auto copy = makeClone(entry);

    std::lock_guard<std::mutex> guard(logs_lock);
    logs.erase(logs.lower_bound(index), logs.end());
    logs[index] = copy;
}
/// Returns copies of the entries with indexes in [start, end).
/// A missing index is a logic error (asserted); without assertions the dummy entry is used.
nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> InMemoryLogStore::log_entries(size_t start, size_t end)
{
    auto result = nuraft::cs_new<std::vector<nuraft::ptr<nuraft::log_entry>>>();
    result->resize(end - start);

    for (size_t idx = start; idx < end; ++idx)
    {
        nuraft::ptr<nuraft::log_entry> found = nullptr;
        {
            /// The lock is taken per entry; cloning happens outside of it.
            std::lock_guard<std::mutex> guard(logs_lock);
            auto it = logs.find(idx);
            if (it == logs.end())
            {
                it = logs.find(0);
                assert(0);
            }
            found = it->second;
        }
        (*result)[idx - start] = makeClone(found);
    }

    return result;
}
/// Returns a copy of the entry at `index`, or of the dummy entry when `index` is absent.
nuraft::ptr<nuraft::log_entry> InMemoryLogStore::entry_at(size_t index)
{
    nuraft::ptr<nuraft::log_entry> found = nullptr;
    {
        std::lock_guard<std::mutex> guard(logs_lock);
        auto it = logs.find(index);
        found = (it != logs.end()) ? it->second : logs.find(0)->second;
    }
    /// Clone after releasing the lock.
    return makeClone(found);
}
/// Returns the term of the entry at `index`, or the dummy entry's term when `index` is absent.
size_t InMemoryLogStore::term_at(size_t index)
{
    std::lock_guard<std::mutex> guard(logs_lock);

    auto it = logs.find(index);
    if (it == logs.end())
        it = logs.find(0);

    return it->second->get_term();
}
/// Serializes `cnt` entries starting at `index` into one buffer laid out as:
/// [number of entries][size of entry 0][entry 0][size of entry 1][entry 1]...
nuraft::ptr<nuraft::buffer> InMemoryLogStore::pack(size_t index, Int32 cnt)
{
    std::vector<nuraft::ptr<nuraft::buffer>> serialized;
    size_t payload_bytes = 0;

    const size_t end_index = index + cnt;
    for (size_t idx = index; idx < end_index; ++idx)
    {
        ptr<log_entry> current = nullptr;
        {
            std::lock_guard<std::mutex> guard(logs_lock);
            current = logs[idx];
        }
        assert(current.get());

        serialized.push_back(current->serialize());
        payload_bytes += serialized.back()->size();
    }

    /// One counter for the number of entries plus one size prefix per entry.
    nuraft::ptr<buffer> packed = nuraft::buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + payload_bytes);
    packed->pos(0);
    packed->put(static_cast<Int32>(cnt));

    for (auto & chunk : serialized)
    {
        packed->put(static_cast<Int32>(chunk->size()));
        packed->put(*chunk);
    }

    return packed;
}
void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack)
{
pack.pos(0);
Int32 num_logs = pack.get_int();
for (Int32 ii = 0; ii < num_logs; ++ii)
{
size_t cur_idx = index + ii;
Int32 buf_size = pack.get_int();
nuraft::ptr<nuraft::buffer> buf_local = nuraft::buffer::alloc(buf_size);
pack.get(buf_local);
nuraft::ptr<nuraft::log_entry> le = nuraft::log_entry::deserialize(*buf_local);
{
std::lock_guard<std::mutex> l(logs_lock);
logs[cur_idx] = le;
}
}
{
std::lock_guard<std::mutex> l(logs_lock);
auto entry = logs.upper_bound(0);
if (entry != logs.end())
start_idx = entry->first;
else
start_idx = 1;
}
}
/// Drops every entry with an index up to and including `last_log_index` and moves the start
/// index right after it. Always returns true.
bool InMemoryLogStore::compact(size_t last_log_index)
{
    std::lock_guard<std::mutex> guard(logs_lock);

    /// Erasing by key silently ignores indexes that are not present.
    for (size_t idx = start_idx; idx <= last_log_index; ++idx)
        logs.erase(idx);

    start_idx = last_log_index + 1;
    return true;
}
}

View File

@ -0,0 +1,47 @@
#pragma once
#include <atomic>
#include <map>
#include <mutex>
#include <Core/Types.h>
#include <libnuraft/log_store.hxx> // Y_IGNORE
namespace DB
{
/// nuraft::log_store implementation that keeps all log entries in a std::map in memory.
/// Key 0 always holds a dummy entry that lookups fall back to when an index is missing.
class InMemoryLogStore : public nuraft::log_store
{
public:
InMemoryLogStore();
/// Index of the first stored entry.
size_t start_index() const override;
/// Index the next appended entry will receive.
size_t next_slot() const override;
/// Copy of the most recent entry (or of the dummy entry).
nuraft::ptr<nuraft::log_entry> last_entry() const override;
/// Stores a copy of the entry at the next free index and returns that index.
size_t append(nuraft::ptr<nuraft::log_entry> & entry) override;
/// Discards entries with indexes >= index, then stores a copy of the entry at index.
void write_at(size_t index, nuraft::ptr<nuraft::log_entry> & entry) override;
/// Copies of the entries in [start, end).
nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> log_entries(size_t start, size_t end) override;
/// Copy of the entry at index (or of the dummy entry).
nuraft::ptr<nuraft::log_entry> entry_at(size_t index) override;
/// Term of the entry at index (or of the dummy entry).
size_t term_at(size_t index) override;
/// Serializes cnt entries starting at index into a single buffer.
nuraft::ptr<nuraft::buffer> pack(size_t index, Int32 cnt) override;
/// Restores entries produced by pack() starting at index.
void apply_pack(size_t index, nuraft::buffer & pack) override;
/// Removes entries up to and including last_log_index.
bool compact(size_t last_log_index) override;
/// Nothing to flush: the data lives in memory only.
bool flush() override { return true; }
private:
std::map<size_t, nuraft::ptr<nuraft::log_entry>> logs;
/// Guards `logs`; mutable so that const accessors can lock it.
mutable std::mutex logs_lock;
std::atomic<size_t> start_idx;
};
}

View File

@ -0,0 +1,78 @@
#include <Coordination/InMemoryStateManager.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int RAFT_ERROR;
}
/// Creates a state manager for a cluster that consists of this single server at host:port.
InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port)
    : my_server_id(server_id_)
    , my_port(port)
    , log_store(nuraft::cs_new<InMemoryLogStore>())
    , cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{
    const std::string endpoint = host + ":" + std::to_string(port);
    cluster_config->get_servers().push_back(nuraft::cs_new<nuraft::srv_config>(my_server_id, endpoint));
}
/// Builds the cluster description from the servers listed under `config_prefix`.
/// For every server it reads: id, hostname, port, can_become_leader (default true),
/// priority (default 1) and start_as_follower (default false).
/// Throws RAFT_ERROR when our own id is not among the listed servers, or when every
/// server is marked start_as_follower.
InMemoryStateManager::InMemoryStateManager(
    int my_server_id_,
    const std::string & config_prefix,
    const Poco::Util::AbstractConfiguration & config)
    : my_server_id(my_server_id_)
    , log_store(nuraft::cs_new<InMemoryLogStore>())
    , cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{
    Poco::Util::AbstractConfiguration::Keys keys;
    config.keys(config_prefix, keys);

    for (const auto & server_key : keys)
    {
        std::string full_prefix = config_prefix + "." + server_key;
        int server_id = config.getInt(full_prefix + ".id");
        std::string hostname = config.getString(full_prefix + ".hostname");
        int port = config.getInt(full_prefix + ".port");
        bool can_become_leader = config.getBool(full_prefix + ".can_become_leader", true);
        int32_t priority = config.getInt(full_prefix + ".priority", 1);
        bool start_as_follower = config.getBool(full_prefix + ".start_as_follower", false);
        if (start_as_follower)
            start_as_follower_servers.insert(server_id);

        auto endpoint = hostname + ":" + std::to_string(port);
        /// srv_config expects a "learner" flag, which is the opposite of can_become_leader.
        auto peer_config = nuraft::cs_new<nuraft::srv_config>(server_id, 0, endpoint, "", !can_become_leader, priority);
        if (server_id == my_server_id)
        {
            my_server_config = peer_config;
            my_port = port;
        }

        cluster_config->get_servers().push_back(peer_config);
    }

    /// The message has a placeholder for the id, so the id must be passed as an argument.
    if (!my_server_config)
        throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id);

    if (start_as_follower_servers.size() == cluster_config->get_servers().size())
        throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without <start_as_follower>)");
}
/// Keeps a private copy of the cluster configuration in memory.
/// Nothing is written to disk, so the configuration is not durable.
void InMemoryStateManager::save_config(const nuraft::cluster_config & config)
{
    /// A serialize/deserialize round trip produces an independent copy.
    auto serialized = config.serialize();
    cluster_config = nuraft::cluster_config::deserialize(*serialized);
}
/// Keeps a private copy of the server state in memory.
/// Nothing is written to disk, so the state is not durable.
void InMemoryStateManager::save_state(const nuraft::srv_state & state)
{
    /// A serialize/deserialize round trip produces an independent copy.
    auto serialized = state.serialize();
    server_state = nuraft::srv_state::deserialize(*serialized);
}
}

View File

@ -0,0 +1,58 @@
#pragma once
#include <Core/Types.h>
#include <string>
#include <Coordination/InMemoryLogStore.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Poco/Util/AbstractConfiguration.h>
namespace DB
{
/// nuraft::state_mgr implementation that keeps the cluster configuration, the server state
/// and the log store in memory only.
class InMemoryStateManager : public nuraft::state_mgr
{
public:
/// Reads the list of cluster servers from the configuration section `config_prefix`.
InMemoryStateManager(
int server_id_,
const std::string & config_prefix,
const Poco::Util::AbstractConfiguration & config);
/// Describes a cluster made of this single server at host:port.
InMemoryStateManager(
int server_id_,
const std::string & host,
int port);
nuraft::ptr<nuraft::cluster_config> load_config() override { return cluster_config; }
void save_config(const nuraft::cluster_config & config) override;
void save_state(const nuraft::srv_state & state) override;
/// Null until save_state() has been called.
nuraft::ptr<nuraft::srv_state> read_state() override { return server_state; }
nuraft::ptr<nuraft::log_store> load_log_store() override { return log_store; }
Int32 server_id() override { return my_server_id; }
/// Set only by the configuration-based constructor.
nuraft::ptr<nuraft::srv_config> get_srv_config() const { return my_server_config; }
/// Intentionally does nothing.
void system_exit(const int /* exit_code */) override {}
int getPort() const { return my_port; }
/// True if this server was listed with start_as_follower in the configuration.
bool shouldStartAsFollower() const
{
return start_as_follower_servers.count(my_server_id);
}
private:
int my_server_id;
int my_port;
/// Ids of the servers configured with start_as_follower.
std::unordered_set<int> start_as_follower_servers;
nuraft::ptr<InMemoryLogStore> log_store;
nuraft::ptr<nuraft::srv_config> my_server_config;
nuraft::ptr<nuraft::cluster_config> cluster_config;
nuraft::ptr<nuraft::srv_state> server_state;
};
}

View File

@ -0,0 +1,47 @@
#pragma once
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <common/logger_useful.h>
#include <Core/SettingsEnums.h>
namespace DB
{
/// Adapter that lets NuRaft write its internal messages through a Poco logger.
class LoggerWrapper : public nuraft::logger
{
public:
/// NOTE(review): the numeric value of LogsLevel is passed straight to Poco::Logger::setLevel;
/// this assumes both level scales use the same numbers - confirm.
LoggerWrapper(const std::string & name, LogsLevel level_)
: log(&Poco::Logger::get(name))
, level(static_cast<int>(level_))
{
log->setLevel(level);
}
/// Forwards the message to the Poco logger; source file, function name and line number are ignored.
void put_details(
int level_,
const char * /* source_file */,
const char * /* func_name */,
size_t /* line_number */,
const std::string & msg) override
{
LOG_IMPL(log, static_cast<DB::LogsLevel>(level_), static_cast<Poco::Message::Priority>(level_), msg);
}
/// Clamps the requested level to the range [1, 6] before applying it.
void set_level(int level_) override
{
level_ = std::min(6, std::max(1, level_));
log->setLevel(level_);
level = level_;
}
int get_level() override
{
return level;
}
private:
Poco::Logger * log;
std::atomic<int> level;
};
}

View File

@ -0,0 +1,24 @@
#pragma once
#include <Common/ZooKeeper/ZooKeeperCommon.h>
namespace DB
{
/// A request together with the id of the session that issued it.
struct NuKeeperRequest
{
int64_t session_id;
Coordination::ZooKeeperRequestPtr request;
};
using NuKeeperRequests = std::vector<NuKeeperRequest>;
/// A response together with the id of the session it is addressed to.
struct NuKeeperResponse
{
int64_t session_id;
/// NOTE(review): the field is named `response` but is declared as a request pointer;
/// ZooKeeperResponsePtr was probably intended - confirm before relying on this type.
Coordination::ZooKeeperRequestPtr response;
};
using NuKeeperResponses = std::vector<NuKeeperResponse>;
}

View File

@ -0,0 +1,182 @@
#include <Coordination/NuKeeperServer.h>
#include <Coordination/LoggerWrapper.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <chrono>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <string>
namespace DB
{
namespace ErrorCodes
{
extern const int RAFT_ERROR;
}
/// Creates the state machine and the state manager; the RAFT instance itself is launched
/// later by startup(). The cluster layout is read from "test_keeper_server.raft_configuration".
NuKeeperServer::NuKeeperServer(
    int server_id_,
    const CoordinationSettingsPtr & coordination_settings_,
    const Poco::Util::AbstractConfiguration & config,
    ResponsesQueue & responses_queue_)
    : server_id(server_id_)
    , coordination_settings(coordination_settings_)
    , state_machine(nuraft::cs_new<NuKeeperStateMachine>(responses_queue_, coordination_settings))
    , state_manager(nuraft::cs_new<InMemoryStateManager>(server_id, "test_keeper_server.raft_configuration", config))
    , responses_queue(responses_queue_)
{
}
/// Translates the coordination settings into RAFT parameters and launches the RAFT instance.
/// Throws RAFT_ERROR if the instance cannot be created.
void NuKeeperServer::startup()
{
    auto & settings = *coordination_settings;
    const auto operation_timeout_ms = settings.operation_timeout_ms.totalMilliseconds();

    nuraft::raft_params params;

    /// Timers.
    params.heart_beat_interval_ = settings.heart_beat_interval_ms.totalMilliseconds();
    params.election_timeout_lower_bound_ = settings.election_timeout_lower_bound_ms.totalMilliseconds();
    params.election_timeout_upper_bound_ = settings.election_timeout_upper_bound_ms.totalMilliseconds();

    /// Log retention and snapshots.
    params.reserved_log_items_ = settings.reserved_log_items;
    params.snapshot_distance_ = settings.snapshot_distance;

    /// Client requests; forwarded requests get twice the operation timeout.
    params.client_req_timeout_ = operation_timeout_ms;
    params.auto_forwarding_ = settings.auto_forwarding;
    params.auto_forwarding_req_timeout_ = operation_timeout_ms * 2;

    params.return_method_ = nuraft::raft_params::blocking;

    nuraft::asio_service::options asio_opts{};

    nuraft::raft_server::init_options init_options;
    init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower();
    init_options.raft_callback_ = [this] (nuraft::cb_func::Type type, nuraft::cb_func::Param * param)
    {
        return callbackFunc(type, param);
    };

    auto raft_logger = nuraft::cs_new<LoggerWrapper>("RaftInstance", settings.raft_logs_level);
    raft_instance = launcher.init(
        state_machine, state_manager, raft_logger, state_manager->getPort(),
        asio_opts, params, init_options);

    if (!raft_instance)
        throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance");
}
/// Shuts down the storage of the state machine, then the RAFT server itself.
/// A RAFT shutdown that does not finish within the configured timeout is only logged.
void NuKeeperServer::shutdown()
{
    state_machine->shutdownStorage();

    /// Report the timeout that was actually used rather than a hard-coded number.
    const auto shutdown_timeout_sec = coordination_settings->shutdown_timeout.totalSeconds();
    if (!launcher.shutdown(shutdown_timeout_sec))
        LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", shutdown_timeout_sec);
}
namespace
{
/// Builds a RAFT log entry: the session id followed by the serialized request.
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
    DB::WriteBufferFromNuraftBuffer out;

    DB::writeIntBinary(session_id, out);
    request->write(out);

    return out.getBuffer();
}
}
/// Processes one request of a session.
/// Read requests are served from the local state machine while the leader is alive;
/// everything else is appended to the RAFT log. When the append is rejected or times out,
/// a ZOPERATIONTIMEOUT response is queued for the session. Any other non-OK result code
/// raises RAFT_ERROR.
void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session)
{
    const auto & [session_id, request] = request_for_session;

    if (isLeaderAlive() && request->isReadRequest())
    {
        state_machine->processReadRequest(request_for_session);
        return;
    }

    /// Queues an "operation timeout" response for the given request.
    auto push_timeout_response = [this](int64_t failed_session_id, const Coordination::ZooKeeperRequestPtr & failed_request)
    {
        auto response = failed_request->makeResponse();
        response->xid = failed_request->xid;
        response->zxid = 0;
        response->error = Coordination::Error::ZOPERATIONTIMEOUT;
        responses_queue.push(DB::NuKeeperStorage::ResponseForSession{failed_session_id, response});
    };

    std::vector<nuraft::ptr<nuraft::buffer>> entries;
    entries.push_back(getZooKeeperLogEntry(session_id, request));

    std::lock_guard lock(append_entries_mutex);

    auto result = raft_instance->append_entries(entries);

    /// NOTE(review): this check and the result-code check below are independent, so a request
    /// that is both "not accepted" and TIMEOUT gets two responses, and a "not accepted" request
    /// with another non-OK code gets a response followed by an exception. Kept as is.
    if (!result->get_accepted())
        push_timeout_response(session_id, request);

    if (result->get_result_code() == nuraft::cmd_result_code::TIMEOUT)
        push_timeout_response(session_id, request);
    else if (result->get_result_code() != nuraft::cmd_result_code::OK)
        throw Exception(ErrorCodes::RAFT_ERROR, "Requests result failed with code {} and message: '{}'", result->get_result_code(), result->get_result_str());
}
int64_t NuKeeperServer::getSessionID(int64_t session_timeout_ms)
{
auto entry = nuraft::buffer::alloc(sizeof(int64_t));
/// Just special session request
nuraft::buffer_serializer bs(entry);
bs.put_i64(session_timeout_ms);
std::lock_guard lock(append_entries_mutex);
auto result = raft_instance->append_entries({entry});
if (!result->get_accepted())
throw Exception(ErrorCodes::RAFT_ERROR, "Cannot send session_id request to RAFT");
if (result->get_result_code() != nuraft::cmd_result_code::OK)
throw Exception(ErrorCodes::RAFT_ERROR, "session_id request failed to RAFT");
auto resp = result->get();
if (resp == nullptr)
throw Exception(ErrorCodes::RAFT_ERROR, "Received nullptr as session_id");
nuraft::buffer_serializer bs_resp(resp);
return bs_resp.get_i64();
}
/// Whether the RAFT instance reports this server as the leader.
bool NuKeeperServer::isLeader() const
{
    return raft_instance->is_leader();
}
/// Whether the RAFT instance reports that the leader is alive.
bool NuKeeperServer::isLeaderAlive() const
{
    return raft_instance->is_leader_alive();
}
/// RAFT event callback. The BecomeFresh and BecomeLeader events mark the server as
/// initialized and wake up everybody blocked in waitInit(). Always returns Ok.
nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */)
{
    const bool became_ready
        = type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader;

    if (became_ready)
    {
        std::unique_lock guard(initialized_mutex);
        initialized_flag = true;
        initialized_cv.notify_all();
    }

    return nuraft::cb_func::ReturnCode::Ok;
}
void NuKeeperServer::waitInit()
{
std::unique_lock lock(initialized_mutex);
int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds();
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; }))
throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization");
}
/// Returns the set of dead session ids reported by the state machine.
std::unordered_set<int64_t> NuKeeperServer::getDeadSessions()
{
    return state_machine->getDeadSessions();
}
}

View File

@ -0,0 +1,63 @@
#pragma once
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/NuKeeperStorage.h>
#include <Coordination/CoordinationSettings.h>
#include <unordered_map>
namespace DB
{
/// Wraps a NuRaft server instance together with the NuKeeper state machine and state manager.
class NuKeeperServer
{
private:
/// Identifier of this server, received through the constructor.
int server_id;
CoordinationSettingsPtr coordination_settings;
nuraft::ptr<NuKeeperStateMachine> state_machine;
nuraft::ptr<InMemoryStateManager> state_manager;
nuraft::raft_launcher launcher;
nuraft::ptr<nuraft::raft_server> raft_instance;
/// Held while calling raft_instance->append_entries (see getSessionID).
std::mutex append_entries_mutex;
ResponsesQueue & responses_queue;
/// Protects initialized_flag; used together with initialized_cv.
std::mutex initialized_mutex;
/// Set to true by callbackFunc on BecomeFresh / BecomeLeader events.
bool initialized_flag = false;
/// Notified by callbackFunc, waited on by waitInit.
std::condition_variable initialized_cv;
/// Raft event callback; marks the server initialized on BecomeFresh / BecomeLeader and returns Ok.
nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);
public:
NuKeeperServer(
int server_id_,
const CoordinationSettingsPtr & coordination_settings_,
const Poco::Util::AbstractConfiguration & config,
ResponsesQueue & responses_queue_);
void startup();
void putRequest(const NuKeeperStorage::RequestForSession & request);
/// Appends a session request carrying the timeout to the Raft log and returns the session id
/// read from the response. Throws RAFT_ERROR if the entry is not accepted, fails, or yields no response.
int64_t getSessionID(int64_t session_timeout_ms);
/// Forwards to the state machine's getDeadSessions().
std::unordered_set<int64_t> getDeadSessions();
/// Forwards to raft_instance->is_leader().
bool isLeader() const;
/// Forwards to raft_instance->is_leader_alive().
bool isLeaderAlive() const;
/// Blocks until the server is marked initialized; throws RAFT_ERROR after startup_timeout.
void waitInit();
void shutdown();
};
}

View File

@ -0,0 +1,262 @@
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/NuKeeperStorageSerializer.h>
namespace DB
{
/// Decodes a log entry payload into a request bound to a session.
/// Fields are read in this order: session id, length, xid, opnum, then the request body,
/// which is read by the request object created for that opnum.
NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
{
    ReadBufferFromNuraftBuffer in(data);
    NuKeeperStorage::RequestForSession parsed;

    readIntBinary(parsed.session_id, in);

    /// The length field is read to advance the buffer position; its value is not used afterwards.
    int32_t length;
    Coordination::read(length, in);

    int32_t xid;
    Coordination::read(xid, in);

    Coordination::OpNum opnum;
    Coordination::read(opnum, in);

    auto & request = parsed.request;
    request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
    request->xid = xid;
    request->readImpl(in);

    return parsed;
}
/// Serializes every (session id, response) pair into a single NuRaft buffer, in input order.
nuraft::ptr<nuraft::buffer> writeResponses(NuKeeperStorage::ResponsesForSessions & responses)
{
    WriteBufferFromNuraftBuffer out;
    for (const auto & item : responses)
    {
        const auto & session_id = item.session_id;
        writeIntBinary(session_id, out);
        item.response->write(out);
    }
    return out.getBuffer();
}
/// Builds a state machine with an empty storage whose session expiry tick is taken from
/// dead_session_check_period_ms, starting at committed index 0.
NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_)
    : coordination_settings(coordination_settings_)
    , storage(coordination_settings_->dead_session_check_period_ms.totalMilliseconds())
    , responses_queue(responses_queue_)
    , last_committed_idx(0)
    , log(&Poco::Logger::get("NuRaftStateMachine"))
{
    LOG_DEBUG(log, "Created nukeeper state machine");
}
/// Applies a committed log entry to the storage.
/// An entry of exactly sizeof(int64_t) bytes is handled as a session-id request: the payload is
/// the session timeout and the new session id is returned in the response buffer.
/// Any other entry is parsed as a client request; its responses are pushed to responses_queue
/// and nullptr is returned. In both cases last_committed_idx is set to log_idx.
nuraft::ptr<nuraft::buffer> NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
{
    const bool is_session_id_request = data.size() == sizeof(int64_t);

    if (!is_session_id_request)
    {
        auto parsed = parseRequest(data);
        NuKeeperStorage::ResponsesForSessions produced;
        {
            std::lock_guard guard(storage_lock);
            produced = storage.processRequest(parsed.request, parsed.session_id);
            for (auto & item : produced)
                responses_queue.push(item);
        }
        last_committed_idx = log_idx;
        return nullptr;
    }

    nuraft::buffer_serializer timeout_reader(data);
    int64_t session_timeout_ms = timeout_reader.get_i64();

    auto reply = nuraft::buffer::alloc(sizeof(int64_t));
    int64_t session_id;
    nuraft::buffer_serializer reply_writer(reply);
    {
        std::lock_guard guard(storage_lock);
        session_id = storage.getSessionID(session_timeout_ms);
        reply_writer.put_i64(session_id);
    }

    LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms);
    last_committed_idx = log_idx;
    return reply;
}
/// Replaces the current storage with the stored snapshot whose index equals s.get_last_log_idx().
/// Returns false when no such snapshot is kept in memory; otherwise updates last_committed_idx
/// and returns true.
bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{
    const auto snapshot_idx = s.get_last_log_idx();
    LOG_DEBUG(log, "Applying snapshot {}", snapshot_idx);

    StorageSnapshotPtr found;
    {
        std::lock_guard<std::mutex> snapshots_guard(snapshots_lock);
        const auto it = snapshots.find(snapshot_idx);
        if (it == snapshots.end())
            return false;
        found = it->second;
    }

    std::lock_guard storage_guard(storage_lock);
    storage = found->storage;
    last_committed_idx = snapshot_idx;
    return true;
}
/// Returns the descriptor of the snapshot with the greatest log index, or nullptr when none is stored.
nuraft::ptr<nuraft::snapshot> NuKeeperStateMachine::last_snapshot()
{
    std::lock_guard<std::mutex> guard(snapshots_lock);
    if (snapshots.empty())
        return nullptr;

    /// The map is ordered by log index, so its last element is the latest snapshot.
    const auto & newest = *snapshots.rbegin();
    return newest.second->snapshot;
}
/// Makes a StorageSnapshot that pairs a copy of the descriptor `s` with a copy of the current storage.
NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::createSnapshotInternal(nuraft::snapshot & s)
{
    /// The descriptor is duplicated by serializing it and deserializing the result.
    nuraft::ptr<nuraft::buffer> serialized_descriptor = s.serialize();
    nuraft::ptr<nuraft::snapshot> descriptor_copy = nuraft::snapshot::deserialize(*serialized_descriptor);

    /// The storage is copied while storage_lock is held.
    std::lock_guard guard(storage_lock);
    auto result = std::make_shared<NuKeeperStateMachine::StorageSnapshot>(descriptor_copy, storage);
    return result;
}
/// Builds a StorageSnapshot from a copy of the descriptor `s` and a storage deserialized from `in`.
NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::readSnapshot(nuraft::snapshot & s, nuraft::buffer & in)
{
    /// The descriptor is duplicated by serializing it and deserializing the result.
    nuraft::ptr<nuraft::buffer> serialized_descriptor = s.serialize();
    nuraft::ptr<nuraft::snapshot> descriptor_copy = nuraft::snapshot::deserialize(*serialized_descriptor);

    NuKeeperStorageSerializer storage_serializer;
    ReadBufferFromNuraftBuffer payload(in);
    NuKeeperStorage restored(coordination_settings->dead_session_check_period_ms.totalMilliseconds());
    storage_serializer.deserialize(restored, payload);

    auto result = std::make_shared<StorageSnapshot>(descriptor_copy, restored);
    return result;
}
/// Serializes the storage held by `snapshot` and stores the resulting buffer in `out`.
void NuKeeperStateMachine::writeSnapshot(const NuKeeperStateMachine::StorageSnapshotPtr & snapshot, nuraft::ptr<nuraft::buffer> & out)
{
    WriteBufferFromNuraftBuffer sink;
    NuKeeperStorageSerializer storage_serializer;
    storage_serializer.serialize(snapshot->storage, sink);
    out = sink.getBuffer();
}
/// Takes a snapshot of the current storage, stores it under s.get_last_log_idx(), drops the
/// oldest snapshots beyond max_stored_snapshots and reports success through when_done.
void NuKeeperStateMachine::create_snapshot(
    nuraft::snapshot & s,
    nuraft::async_result<bool>::handler_type & when_done)
{
    LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx());

    auto fresh_snapshot = createSnapshotInternal(s);
    {
        std::lock_guard<std::mutex> guard(snapshots_lock);
        snapshots[s.get_last_log_idx()] = fresh_snapshot;

        /// The map is ordered by log index, so begin() is always the oldest entry.
        const auto & max_stored = coordination_settings->max_stored_snapshots;
        while (snapshots.size() > max_stored)
            snapshots.erase(snapshots.begin());
    }

    LOG_DEBUG(log, "Created snapshot {}", s.get_last_log_idx());

    bool succeeded = true;
    nuraft::ptr<std::exception> no_error(nullptr);
    when_done(succeeded, no_error);
}
/// Stores one received snapshot object and advances obj_id.
/// Object 0: a snapshot built from the current storage is inserted only if no snapshot exists
/// for this index yet; `data` is not read. Any other object: the storage is deserialized from
/// `data` and replaces whatever is stored for this index.
void NuKeeperStateMachine::save_logical_snp_obj(
    nuraft::snapshot & s,
    size_t & obj_id,
    nuraft::buffer & data,
    bool /*is_first_obj*/,
    bool /*is_last_obj*/)
{
    LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);

    const bool is_initial_object = obj_id == 0;
    StorageSnapshotPtr incoming = is_initial_object ? createSnapshotInternal(s) : readSnapshot(s, data);
    {
        std::lock_guard<std::mutex> guard(snapshots_lock);
        if (is_initial_object)
            snapshots.try_emplace(s.get_last_log_idx(), std::move(incoming));
        else
            snapshots[s.get_last_log_idx()] = std::move(incoming);
    }

    obj_id++;
}
/// Produces one object of the snapshot identified by s.get_last_log_idx() for transfer to a peer.
///
/// - If the snapshot is not stored, data_out is set to nullptr and is_last_obj to true.
/// - Object 0 is a placeholder: the receiving side (save_logical_snp_obj) does not read its
///   payload, so only a small dummy buffer is sent and is_last_obj is false.
/// - Any other object carries the serialized storage of the requested snapshot and is the last one.
///
/// Always returns 0.
int NuKeeperStateMachine::read_logical_snp_obj(
    nuraft::snapshot & s,
    void* & /*user_snp_ctx*/,
    ulong obj_id,
    nuraft::ptr<nuraft::buffer> & data_out,
    bool & is_last_obj)
{
    LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);

    StorageSnapshotPtr required_snapshot;
    {
        std::lock_guard<std::mutex> lock(snapshots_lock);
        auto entry = snapshots.find(s.get_last_log_idx());
        if (entry == snapshots.end())
        {
            // Snapshot doesn't exist.
            data_out = nullptr;
            is_last_obj = true;
            return 0;
        }
        required_snapshot = entry->second;
    }

    if (obj_id == 0)
    {
        /// Previously the whole live storage was copied under storage_lock and serialized here,
        /// although the payload of object 0 is never read by the receiver. Send dummy data instead.
        data_out = nuraft::buffer::alloc(sizeof(int32_t));
        nuraft::buffer_serializer bs(data_out);
        bs.put_i32(0);
        is_last_obj = false;
    }
    else
    {
        writeSnapshot(required_snapshot, data_out);
        is_last_obj = true;
    }
    return 0;
}
/// Runs the request against the storage directly and pushes the produced responses to
/// responses_queue. The queue is filled after storage_lock has been released.
void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session)
{
    NuKeeperStorage::ResponsesForSessions produced;
    {
        std::lock_guard guard(storage_lock);
        produced = storage.processRequest(request_for_session.request, request_for_session.session_id);
    }

    for (const auto & item : produced)
        responses_queue.push(item);
}
/// Returns the storage's dead session ids; the storage is queried under storage_lock.
std::unordered_set<int64_t> NuKeeperStateMachine::getDeadSessions()
{
    std::lock_guard guard(storage_lock);
    auto dead_sessions = storage.getDeadSessions();
    return dead_sessions;
}
/// Finalizes the storage while holding storage_lock.
void NuKeeperStateMachine::shutdownStorage()
{
    std::lock_guard guard(storage_lock);
    auto & target = storage;
    target.finalize();
}
}

View File

@ -0,0 +1,99 @@
#pragma once
#include <Coordination/NuKeeperStorage.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <common/logger_useful.h>
#include <Coordination/ThreadSafeQueue.h>
#include <Coordination/CoordinationSettings.h>
namespace DB
{
using ResponsesQueue = ThreadSafeQueue<NuKeeperStorage::ResponseForSession>;
/// NuRaft state machine backed by an in-memory NuKeeperStorage.
/// Snapshots are kept in memory as full copies of the storage, keyed by log index.
class NuKeeperStateMachine : public nuraft::state_machine
{
public:
NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_);
/// Nothing is done before commit.
nuraft::ptr<nuraft::buffer> pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }
/// Applies a committed log entry to the storage.
nuraft::ptr<nuraft::buffer> commit(const size_t log_idx, nuraft::buffer & data) override;
/// Nothing to roll back because pre_commit changes no state.
void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {}
size_t last_commit_index() override { return last_committed_idx; }
/// Replaces the storage with a stored snapshot; returns false if that snapshot is not stored.
bool apply_snapshot(nuraft::snapshot & s) override;
/// Returns the descriptor of the stored snapshot with the greatest log index, or nullptr.
nuraft::ptr<nuraft::snapshot> last_snapshot() override;
/// Copies the current storage into a new stored snapshot and calls when_done.
void create_snapshot(
nuraft::snapshot & s,
nuraft::async_result<bool>::handler_type & when_done) override;
/// Stores one received snapshot object and increments obj_id.
void save_logical_snp_obj(
nuraft::snapshot & s,
size_t & obj_id,
nuraft::buffer & data,
bool is_first_obj,
bool is_last_obj) override;
/// Produces one snapshot object for sending; always returns 0.
int read_logical_snp_obj(
nuraft::snapshot & s,
void* & user_snp_ctx,
ulong obj_id,
nuraft::ptr<nuraft::buffer> & data_out,
bool & is_last_obj) override;
/// Returns a reference to the storage; storage_lock is not taken here.
NuKeeperStorage & getStorage()
{
return storage;
}
/// Processes a request against the storage directly and pushes responses to responses_queue.
void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session);
std::unordered_set<int64_t> getDeadSessions();
/// Calls storage.finalize() under storage_lock.
void shutdownStorage();
private:
/// Snapshot descriptor together with a full copy of the storage.
struct StorageSnapshot
{
StorageSnapshot(const nuraft::ptr<nuraft::snapshot> & s, const NuKeeperStorage & storage_)
: snapshot(s)
, storage(storage_)
{}
nuraft::ptr<nuraft::snapshot> snapshot;
NuKeeperStorage storage;
};
using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;
/// Snapshot of the current storage, copied under storage_lock.
StorageSnapshotPtr createSnapshotInternal(nuraft::snapshot & s);
/// Snapshot whose storage is deserialized from `in`.
StorageSnapshotPtr readSnapshot(nuraft::snapshot & s, nuraft::buffer & in);
/// Serializes the snapshot's storage into `out`.
static void writeSnapshot(const StorageSnapshotPtr & snapshot, nuraft::ptr<nuraft::buffer> & out);
CoordinationSettingsPtr coordination_settings;
NuKeeperStorage storage;
ResponsesQueue & responses_queue;
/// Mutex for snapshots
std::mutex snapshots_lock;
/// Lock for storage
std::mutex storage_lock;
/// Fake snapshot storage
std::map<uint64_t, StorageSnapshotPtr> snapshots;
/// Last committed Raft log number.
std::atomic<size_t> last_committed_idx;
Poco::Logger * log;
};
}

View File

@ -1,4 +1,4 @@
#include <Common/ZooKeeper/TestKeeperStorage.h>
#include <Coordination/NuKeeperStorage.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/setThreadName.h>
#include <mutex>
@ -17,13 +17,6 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
}
namespace zkutil
{
using namespace DB;
static String parentPath(const String & path)
{
auto rslash_pos = path.rfind('/');
@ -38,20 +31,20 @@ static String baseName(const String & path)
return path.substr(rslash_pos + 1);
}
static TestKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches, Coordination::Event event_type)
static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type)
{
TestKeeperStorage::ResponsesForSessions result;
NuKeeperStorage::ResponsesForSessions result;
auto it = watches.find(path);
if (it != watches.end())
{
std::shared_ptr<Coordination::ZooKeeperWatchResponse> watch_response = std::make_shared<Coordination::ZooKeeperWatchResponse>();
watch_response->path = path;
watch_response->xid = -1;
watch_response->xid = Coordination::WATCH_XID;
watch_response->zxid = -1;
watch_response->type = event_type;
watch_response->state = Coordination::State::CONNECTED;
for (auto watcher_session : it->second)
result.push_back(TestKeeperStorage::ResponseForSession{watcher_session, watch_response});
result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_response});
watches.erase(it);
}
@ -62,58 +55,69 @@ static TestKeeperStorage::ResponsesForSessions processWatchesImpl(const String &
{
std::shared_ptr<Coordination::ZooKeeperWatchResponse> watch_list_response = std::make_shared<Coordination::ZooKeeperWatchResponse>();
watch_list_response->path = parent_path;
watch_list_response->xid = -1;
watch_list_response->xid = Coordination::WATCH_XID;
watch_list_response->zxid = -1;
watch_list_response->type = Coordination::Event::CHILD;
watch_list_response->state = Coordination::State::CONNECTED;
for (auto watcher_session : it->second)
result.push_back(TestKeeperStorage::ResponseForSession{watcher_session, watch_list_response});
result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_list_response});
list_watches.erase(it);
}
return result;
}
TestKeeperStorage::TestKeeperStorage()
NuKeeperStorage::NuKeeperStorage(int64_t tick_time_ms)
: session_expiry_queue(tick_time_ms)
{
container.emplace("/", Node());
}
using Undo = std::function<void()>;
struct TestKeeperStorageRequest
struct NuKeeperStorageRequest
{
Coordination::ZooKeeperRequestPtr zk_request;
explicit TestKeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_)
explicit NuKeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_)
: zk_request(zk_request_)
{}
virtual std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const = 0;
virtual TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & /*watches*/, TestKeeperStorage::Watches & /*list_watches*/) const { return {}; }
virtual std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const = 0;
virtual NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & /*watches*/, NuKeeperStorage::Watches & /*list_watches*/) const { return {}; }
virtual ~TestKeeperStorageRequest() = default;
virtual ~NuKeeperStorageRequest() = default;
};
struct TestKeeperStorageHeartbeatRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageHeartbeatRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & /* container */, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override
{
return {zk_request->makeResponse(), {}};
}
};
struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageSyncRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override
{
auto response = zk_request->makeResponse();
dynamic_cast<Coordination::ZooKeeperSyncResponse *>(response.get())->path = dynamic_cast<Coordination::ZooKeeperSyncRequest *>(zk_request.get())->path;
return {response, {}};
}
};
TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override
struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
{
using NuKeeperStorageRequest::NuKeeperStorageRequest;
NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override
{
return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CREATED);
}
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Undo undo;
@ -138,8 +142,7 @@ struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest
}
else
{
TestKeeperStorage::Node created_node;
created_node.seq_num = 0;
NuKeeperStorage::Node created_node;
created_node.stat.czxid = zxid;
created_node.stat.mzxid = zxid;
created_node.stat.ctime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1);
@ -193,10 +196,10 @@ struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageGetRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperGetResponse & response = dynamic_cast<Coordination::ZooKeeperGetResponse &>(*response_ptr);
@ -218,10 +221,10 @@ struct TestKeeperStorageGetRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageRemoveRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperRemoveResponse & response = dynamic_cast<Coordination::ZooKeeperRemoveResponse &>(*response_ptr);
@ -268,16 +271,16 @@ struct TestKeeperStorageRemoveRequest final : public TestKeeperStorageRequest
return { response_ptr, undo };
}
TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override
NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override
{
return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::DELETED);
}
};
struct TestKeeperStorageExistsRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperExistsResponse & response = dynamic_cast<Coordination::ZooKeeperExistsResponse &>(*response_ptr);
@ -298,10 +301,10 @@ struct TestKeeperStorageExistsRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageSetRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperSetResponse & response = dynamic_cast<Coordination::ZooKeeperSetResponse &>(*response_ptr);
@ -341,17 +344,17 @@ struct TestKeeperStorageSetRequest final : public TestKeeperStorageRequest
return { response_ptr, undo };
}
TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override
NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override
{
return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED);
}
};
struct TestKeeperStorageListRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperListResponse & response = dynamic_cast<Coordination::ZooKeeperListResponse &>(*response_ptr);
@ -387,10 +390,10 @@ struct TestKeeperStorageListRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageCheckRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperCheckResponse & response = dynamic_cast<Coordination::ZooKeeperCheckResponse &>(*response_ptr);
@ -413,11 +416,11 @@ struct TestKeeperStorageCheckRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest
{
std::vector<TestKeeperStorageRequestPtr> concrete_requests;
explicit TestKeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_)
: TestKeeperStorageRequest(zk_request_)
std::vector<NuKeeperStorageRequestPtr> concrete_requests;
explicit NuKeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_)
: NuKeeperStorageRequest(zk_request_)
{
Coordination::ZooKeeperMultiRequest & request = dynamic_cast<Coordination::ZooKeeperMultiRequest &>(*zk_request);
concrete_requests.reserve(request.requests.size());
@ -427,26 +430,26 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest
auto sub_zk_request = std::dynamic_pointer_cast<Coordination::ZooKeeperRequest>(sub_request);
if (sub_zk_request->getOpNum() == Coordination::OpNum::Create)
{
concrete_requests.push_back(std::make_shared<TestKeeperStorageCreateRequest>(sub_zk_request));
concrete_requests.push_back(std::make_shared<NuKeeperStorageCreateRequest>(sub_zk_request));
}
else if (sub_zk_request->getOpNum() == Coordination::OpNum::Remove)
{
concrete_requests.push_back(std::make_shared<TestKeeperStorageRemoveRequest>(sub_zk_request));
concrete_requests.push_back(std::make_shared<NuKeeperStorageRemoveRequest>(sub_zk_request));
}
else if (sub_zk_request->getOpNum() == Coordination::OpNum::Set)
{
concrete_requests.push_back(std::make_shared<TestKeeperStorageSetRequest>(sub_zk_request));
concrete_requests.push_back(std::make_shared<NuKeeperStorageSetRequest>(sub_zk_request));
}
else if (sub_zk_request->getOpNum() == Coordination::OpNum::Check)
{
concrete_requests.push_back(std::make_shared<TestKeeperStorageCheckRequest>(sub_zk_request));
concrete_requests.push_back(std::make_shared<NuKeeperStorageCheckRequest>(sub_zk_request));
}
else
throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal command as part of multi ZooKeeper request {}", sub_zk_request->getOpNum());
}
}
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override
{
Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
Coordination::ZooKeeperMultiResponse & response = dynamic_cast<Coordination::ZooKeeperMultiResponse &>(*response_ptr);
@ -499,9 +502,9 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest
}
}
TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override
NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override
{
TestKeeperStorage::ResponsesForSessions result;
NuKeeperStorage::ResponsesForSessions result;
for (const auto & generic_request : concrete_requests)
{
auto responses = generic_request->processWatches(watches, list_watches);
@ -511,75 +514,49 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest
}
};
struct TestKeeperStorageCloseRequest final : public TestKeeperStorageRequest
struct NuKeeperStorageCloseRequest final : public NuKeeperStorageRequest
{
using TestKeeperStorageRequest::TestKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(TestKeeperStorage::Container &, TestKeeperStorage::Ephemerals &, int64_t, int64_t) const override
using NuKeeperStorageRequest::NuKeeperStorageRequest;
std::pair<Coordination::ZooKeeperResponsePtr, Undo> process(NuKeeperStorage::Container &, NuKeeperStorage::Ephemerals &, int64_t, int64_t) const override
{
throw DB::Exception("Called process on close request", ErrorCodes::LOGICAL_ERROR);
}
};
TestKeeperStorage::ResponsesForSessions TestKeeperStorage::finalize(const RequestsForSessions & expired_requests)
void NuKeeperStorage::finalize()
{
if (finalized)
throw DB::Exception("Testkeeper storage already finalized", ErrorCodes::LOGICAL_ERROR);
finalized = true;
ResponsesForSessions finalize_results;
auto finish_watch = [] (const auto & watch_pair) -> ResponsesForSessions
{
ResponsesForSessions results;
std::shared_ptr<Coordination::ZooKeeperWatchResponse> response = std::make_shared<Coordination::ZooKeeperWatchResponse>();
response->type = Coordination::SESSION;
response->state = Coordination::EXPIRED_SESSION;
response->error = Coordination::Error::ZSESSIONEXPIRED;
for (const auto & [session_id, ephemerals_paths] : ephemerals)
for (const String & ephemeral_path : ephemerals_paths)
container.erase(ephemeral_path);
for (auto & watcher_session : watch_pair.second)
results.push_back(ResponseForSession{watcher_session, response});
return results;
};
for (auto & path_watch : watches)
{
auto watch_responses = finish_watch(path_watch);
finalize_results.insert(finalize_results.end(), watch_responses.begin(), watch_responses.end());
}
ephemerals.clear();
watches.clear();
for (auto & path_watch : list_watches)
{
auto list_watch_responses = finish_watch(path_watch);
finalize_results.insert(finalize_results.end(), list_watch_responses.begin(), list_watch_responses.end());
}
list_watches.clear();
sessions_and_watchers.clear();
for (const auto & [session_id, zk_request] : expired_requests)
{
auto response = zk_request->makeResponse();
response->error = Coordination::Error::ZSESSIONEXPIRED;
finalize_results.push_back(ResponseForSession{session_id, response});
}
return finalize_results;
session_expiry_queue.clear();
}
class TestKeeperWrapperFactory final : private boost::noncopyable
class NuKeeperWrapperFactory final : private boost::noncopyable
{
public:
using Creator = std::function<TestKeeperStorageRequestPtr(const Coordination::ZooKeeperRequestPtr &)>;
using Creator = std::function<NuKeeperStorageRequestPtr(const Coordination::ZooKeeperRequestPtr &)>;
using OpNumToRequest = std::unordered_map<Coordination::OpNum, Creator>;
static TestKeeperWrapperFactory & instance()
static NuKeeperWrapperFactory & instance()
{
static TestKeeperWrapperFactory factory;
static NuKeeperWrapperFactory factory;
return factory;
}
TestKeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const
NuKeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const
{
auto it = op_num_to_request.find(zk_request->getOpNum());
if (it == op_num_to_request.end())
@ -596,36 +573,37 @@ public:
private:
OpNumToRequest op_num_to_request;
TestKeeperWrapperFactory();
NuKeeperWrapperFactory();
};
template<Coordination::OpNum num, typename RequestT>
void registerTestKeeperRequestWrapper(TestKeeperWrapperFactory & factory)
void registerNuKeeperRequestWrapper(NuKeeperWrapperFactory & factory)
{
factory.registerRequest(num, [] (const Coordination::ZooKeeperRequestPtr & zk_request) { return std::make_shared<RequestT>(zk_request); });
}
TestKeeperWrapperFactory::TestKeeperWrapperFactory()
NuKeeperWrapperFactory::NuKeeperWrapperFactory()
{
registerTestKeeperRequestWrapper<Coordination::OpNum::Heartbeat, TestKeeperStorageHeartbeatRequest>(*this);
//registerTestKeeperRequestWrapper<Coordination::OpNum::Auth, TestKeeperStorageAuthRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Close, TestKeeperStorageCloseRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Create, TestKeeperStorageCreateRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Remove, TestKeeperStorageRemoveRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Exists, TestKeeperStorageExistsRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Get, TestKeeperStorageGetRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Set, TestKeeperStorageSetRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::List, TestKeeperStorageListRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::SimpleList, TestKeeperStorageListRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Check, TestKeeperStorageCheckRequest>(*this);
registerTestKeeperRequestWrapper<Coordination::OpNum::Multi, TestKeeperStorageMultiRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Heartbeat, NuKeeperStorageHeartbeatRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Sync, NuKeeperStorageSyncRequest>(*this);
//registerNuKeeperRequestWrapper<Coordination::OpNum::Auth, NuKeeperStorageAuthRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Close, NuKeeperStorageCloseRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Create, NuKeeperStorageCreateRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Remove, NuKeeperStorageRemoveRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Exists, NuKeeperStorageExistsRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Get, NuKeeperStorageGetRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Set, NuKeeperStorageSetRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::List, NuKeeperStorageListRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::SimpleList, NuKeeperStorageListRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Check, NuKeeperStorageCheckRequest>(*this);
registerNuKeeperRequestWrapper<Coordination::OpNum::Multi, NuKeeperStorageMultiRequest>(*this);
}
TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id)
NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id)
{
TestKeeperStorage::ResponsesForSessions results;
NuKeeperStorage::ResponsesForSessions results;
if (zk_request->getOpNum() == Coordination::OpNum::Close)
{
auto it = ephemerals.find(session_id);
@ -645,12 +623,24 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const
auto response = std::make_shared<Coordination::ZooKeeperCloseResponse>();
response->xid = zk_request->xid;
response->zxid = getZXID();
session_expiry_queue.remove(session_id);
session_and_timeout.erase(session_id);
results.push_back(ResponseForSession{session_id, response});
}
else if (zk_request->getOpNum() == Coordination::OpNum::Heartbeat)
{
session_expiry_queue.update(session_id, session_and_timeout[session_id]);
NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request);
auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id);
response->xid = zk_request->xid;
response->zxid = getZXID();
results.push_back(ResponseForSession{session_id, response});
}
else
{
TestKeeperStorageRequestPtr storage_request = TestKeeperWrapperFactory::instance().get(zk_request);
NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request);
auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id);
if (zk_request->has_watch)
@ -669,15 +659,6 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const
watches[zk_request->getPath()].emplace_back(session_id);
sessions_and_watchers[session_id].emplace(zk_request->getPath());
}
else
{
std::shared_ptr<Coordination::ZooKeeperWatchResponse> watch_response = std::make_shared<Coordination::ZooKeeperWatchResponse>();
watch_response->path = zk_request->getPath();
watch_response->xid = -1;
watch_response->error = response->error;
watch_response->type = Coordination::Event::NOTWATCHING;
results.push_back(ResponseForSession{session_id, watch_response});
}
}
if (response->error == Coordination::Error::ZOK)
@ -696,7 +677,7 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const
}
void TestKeeperStorage::clearDeadWatches(int64_t session_id)
void NuKeeperStorage::clearDeadWatches(int64_t session_id)
{
auto watches_it = sessions_and_watchers.find(session_id);
if (watches_it != sessions_and_watchers.end())

View File

@ -4,27 +4,28 @@
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/SessionExpiryQueue.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace zkutil
namespace DB
{
using namespace DB;
struct TestKeeperStorageRequest;
using TestKeeperStorageRequestPtr = std::shared_ptr<TestKeeperStorageRequest>;
struct NuKeeperStorageRequest;
using NuKeeperStorageRequestPtr = std::shared_ptr<NuKeeperStorageRequest>;
using ResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr &)>;
class TestKeeperStorage
class NuKeeperStorage
{
public:
std::atomic<int64_t> session_id_counter{0};
int64_t session_id_counter{0};
struct Node
{
String data;
Coordination::ACLs acls;
Coordination::ACLs acls{};
bool is_ephemeral = false;
bool is_sequental = false;
Coordination::Stat stat{};
@ -50,6 +51,7 @@ public:
using Container = std::map<std::string, Node>;
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<String>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<String>>;
using SessionAndTimeout = std::unordered_map<int64_t, long>;
using SessionIDs = std::vector<int64_t>;
using Watches = std::map<String /* path, relative of root_path */, SessionIDs>;
@ -57,9 +59,11 @@ public:
Container container;
Ephemerals ephemerals;
SessionAndWatcher sessions_and_watchers;
SessionExpiryQueue session_expiry_queue;
SessionAndTimeout session_and_timeout;
std::atomic<int64_t> zxid{0};
std::atomic<bool> finalized{false};
int64_t zxid{0};
bool finalized{false};
Watches watches;
Watches list_watches; /// Watches for 'list' request (watches on children).
@ -68,18 +72,27 @@ public:
int64_t getZXID()
{
return zxid.fetch_add(1);
return zxid++;
}
public:
TestKeeperStorage();
NuKeeperStorage(int64_t tick_time_ms);
int64_t getSessionID(int64_t session_timeout_ms)
{
auto result = session_id_counter++;
session_and_timeout.emplace(result, session_timeout_ms);
session_expiry_queue.update(result, session_timeout_ms);
return result;
}
ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id);
ResponsesForSessions finalize(const RequestsForSessions & expired_requests);
int64_t getSessionID()
void finalize();
std::unordered_set<int64_t> getDeadSessions()
{
return session_id_counter.fetch_add(1);
return session_expiry_queue.getExpiredSessions();
}
};

View File

@ -0,0 +1,237 @@
#include <Coordination/NuKeeperStorageDispatcher.h>
#include <Common/setThreadName.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int TIMEOUT_EXCEEDED;
}
NuKeeperStorageDispatcher::NuKeeperStorageDispatcher()
: coordination_settings(std::make_shared<CoordinationSettings>())
, log(&Poco::Logger::get("NuKeeperDispatcher"))
{
}
/// Body of the request thread: pops requests from requests_queue and forwards them to the server
/// until shutdown_called is set. Exceptions from the server are logged and do not stop the loop.
void NuKeeperStorageDispatcher::requestThread()
{
    setThreadName("NuKeeperReqT");

    for (;;)
    {
        if (shutdown_called)
            return;

        NuKeeperStorage::RequestForSession request;
        UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds());

        /// Nothing arrived within the timeout: go back and re-check the shutdown flag.
        if (!requests_queue.tryPop(request, max_wait))
            continue;

        /// The flag may have been raised while we were waiting on the queue.
        if (shutdown_called)
            return;

        try
        {
            server->putRequest(request);
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }
}
/// Body of the response thread: pops responses from responses_queue and delivers each one to the
/// callback of its session until shutdown_called is set. Exceptions are logged and do not stop the loop.
void NuKeeperStorageDispatcher::responseThread()
{
    setThreadName("NuKeeperRspT");

    for (;;)
    {
        if (shutdown_called)
            return;

        NuKeeperStorage::ResponseForSession response_for_session;
        UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds());

        /// Nothing arrived within the timeout: go back and re-check the shutdown flag.
        if (!responses_queue.tryPop(response_for_session, max_wait))
            continue;

        /// The flag may have been raised while we were waiting on the queue.
        if (shutdown_called)
            return;

        try
        {
            setResponse(response_for_session.session_id, response_for_session.response);
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }
    }
}
/// Invokes the callback registered for `session_id` with `response`. Responses for unknown sessions are dropped.
/// After a non-watch Close response the callback is unregistered.
void NuKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response)
{
    std::lock_guard lock(session_to_response_callback_mutex);

    auto callback_it = session_to_response_callback.find(session_id);
    if (callback_it != session_to_response_callback.end())
    {
        callback_it->second(response);

        /// Session closed, no more writes
        const bool is_watch_event = response->xid == Coordination::WATCH_XID;
        if (!is_watch_event && response->getOpNum() == Coordination::OpNum::Close)
            session_to_response_callback.erase(callback_it);
    }
}
/// Enqueues `request` for `session_id`.
/// Returns false if the session has no registered callback, true once the request is in the queue.
/// Throws TIMEOUT_EXCEEDED if a non-Close request cannot be enqueued within operation_timeout_ms.
bool NuKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id)
{
    {
        std::lock_guard lock(session_to_response_callback_mutex);
        if (session_to_response_callback.find(session_id) == session_to_response_callback.end())
            return false;
    }

    NuKeeperStorage::RequestForSession request_info;
    request_info.session_id = session_id;
    request_info.request = request;

    std::lock_guard lock(push_request_mutex);

    /// Put close requests without timeouts
    if (request->getOpNum() == Coordination::OpNum::Close)
    {
        requests_queue.push(std::move(request_info));
        return true;
    }

    const bool pushed = requests_queue.tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds());
    if (!pushed)
        throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED);

    return true;
}
/// Reads the server id and coordination settings from `config`, creates and starts the server,
/// waits for it to initialize and then starts the request, response and session cleaner threads.
/// Any exception from server startup is logged and rethrown.
void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration & config)
{
    LOG_DEBUG(log, "Initializing storage dispatcher");

    int server_id = config.getInt("test_keeper_server.server_id");
    coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config);

    server = std::make_unique<NuKeeperServer>(server_id, coordination_settings, config, responses_queue);

    try
    {
        LOG_DEBUG(log, "Waiting server to initialize");
        server->startup();

        LOG_DEBUG(log, "Server initialized, waiting for quorum");
        server->waitInit();

        LOG_DEBUG(log, "Quorum initialized");
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
        throw;
    }

    /// Background workers are started only after the server is up.
    request_thread = ThreadFromGlobalPool([this] { requestThread(); });
    responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
    session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); });

    LOG_DEBUG(log, "Dispatcher initialized");
}
void NuKeeperStorageDispatcher::shutdown()
{
try
{
{
std::lock_guard lock(push_request_mutex);
if (shutdown_called)
return;
LOG_DEBUG(log, "Shutting down storage dispatcher");
shutdown_called = true;
if (session_cleaner_thread.joinable())
session_cleaner_thread.join();
if (request_thread.joinable())
request_thread.join();
if (responses_thread.joinable())
responses_thread.join();
}
if (server)
server->shutdown();
NuKeeperStorage::RequestForSession request_for_session;
while (requests_queue.tryPop(request_for_session))
{
auto response = request_for_session.request->makeResponse();
response->error = Coordination::Error::ZSESSIONEXPIRED;
setResponse(request_for_session.session_id, response);
}
session_to_response_callback.clear();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
LOG_DEBUG(log, "Dispatcher shut down");
}
/// Shuts the dispatcher down on destruction. shutdown() catches and logs its own exceptions
/// and returns early when it was already called.
NuKeeperStorageDispatcher::~NuKeeperStorageDispatcher()
{
shutdown();
}
/// Registers the response callback of a new session.
/// Throws LOGICAL_ERROR if a callback is already registered for `session_id`.
void NuKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback)
{
    std::lock_guard lock(session_to_response_callback_mutex);

    const bool inserted = session_to_response_callback.try_emplace(session_id, callback).second;
    if (inserted)
        return;

    throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id);
}
void NuKeeperStorageDispatcher::sessionCleanerTask()
{
while (true)
{
if (shutdown_called)
return;
try
{
if (isLeader())
{
auto dead_sessions = server->getDeadSessions();
for (int64_t dead_session : dead_sessions)
{
LOG_INFO(log, "Found dead session {}, will try to close it", dead_session);
Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close);
request->xid = Coordination::CLOSE_XID;
NuKeeperStorage::RequestForSession request_info;
request_info.request = request;
request_info.session_id = dead_session;
{
std::lock_guard lock(push_request_mutex);
requests_queue.push(std::move(request_info));
}
finishSession(dead_session);
LOG_INFO(log, "Dead session close request pushed");
}
}
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
std::this_thread::sleep_for(std::chrono::milliseconds(coordination_settings->dead_session_check_period_ms.totalMilliseconds()));
}
}
/// Forgets the response callback of `session_id`; does nothing if the session is not registered.
void NuKeeperStorageDispatcher::finishSession(int64_t session_id)
{
    std::lock_guard lock(session_to_response_callback_mutex);
    /// Erasing by key is a no-op for an unknown session.
    session_to_response_callback.erase(session_id);
}
}

View File

@ -0,0 +1,89 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
# include "config_core.h"
#endif
#if USE_NURAFT
#include <Common/ThreadPool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <functional>
#include <Coordination/NuKeeperServer.h>
#include <Coordination/CoordinationSettings.h>
namespace DB
{
using ZooKeeperResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr & response)>;
/// Sits between client sessions and NuKeeperServer: queues incoming requests for the server and
/// delivers responses to per-session callbacks. initialize() starts three background threads
/// (request, response, session cleaner); shutdown() joins them.
class NuKeeperStorageDispatcher
{
private:
/// Taken around pushes into requests_queue and around the shutdown_called transition in shutdown().
std::mutex push_request_mutex;
CoordinationSettingsPtr coordination_settings;
using RequestsQueue = ConcurrentBoundedQueue<NuKeeperStorage::RequestForSession>;
/// NOTE(review): constructed with 1, presumably the queue capacity - confirm against ConcurrentBoundedQueue.
RequestsQueue requests_queue{1};
/// Passed to NuKeeperServer on construction and drained by responseThread().
ResponsesQueue responses_queue;
std::atomic<bool> shutdown_called{false};
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
/// Guards session_to_response_callback.
std::mutex session_to_response_callback_mutex;
/// session id -> callback that receives the responses of that session.
SessionToResponseCallback session_to_response_callback;
ThreadFromGlobalPool request_thread;
ThreadFromGlobalPool responses_thread;
ThreadFromGlobalPool session_cleaner_thread;
/// Created in initialize(); null before that.
std::unique_ptr<NuKeeperServer> server;
Poco::Logger * log;
private:
/// Thread bodies, see the definitions.
void requestThread();
void responseThread();
void sessionCleanerTask();
/// Delivers `response` to the callback of `session_id`, if one is registered.
void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response);
public:
NuKeeperStorageDispatcher();
/// Creates and starts the server from `config`, then starts the background threads.
void initialize(const Poco::Util::AbstractConfiguration & config);
void shutdown();
~NuKeeperStorageDispatcher();
/// Returns false if `session_id` has no registered callback, true if the request was queued.
bool putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id);
/// The three accessors below dereference `server` and therefore require initialize() to have been called.
bool isLeader() const
{
return server->isLeader();
}
bool hasLeader() const
{
return server->isLeaderAlive();
}
int64_t getSessionID(long session_timeout_ms)
{
return server->getSessionID(session_timeout_ms);
}
/// Throws if a callback is already registered for `session_id`.
void registerSession(int64_t session_id, ZooKeeperResponseCallback callback);
/// Call when no more responses are needed for this session (e.g. the session has expired).
void finishSession(int64_t session_id);
};
}
#endif

View File

@ -0,0 +1,87 @@
#include <Coordination/NuKeeperStorageSerializer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
namespace DB
{
namespace
{
/// Writes the fields of `node` to `out` in a fixed order.
/// readNode() must read exactly the same fields in exactly the same order.
void writeNode(const NuKeeperStorage::Node & node, WriteBuffer & out)
{
Coordination::write(node.data, out);
Coordination::write(node.acls, out);
Coordination::write(node.is_ephemeral, out);
Coordination::write(node.is_sequental, out);
Coordination::write(node.stat, out);
Coordination::write(node.seq_num, out);
}
/// Counterpart of writeNode(): fills the same fields of `node` from `in`, in the same order.
/// Fields of Node that are not listed here are left untouched.
void readNode(NuKeeperStorage::Node & node, ReadBuffer & in)
{
Coordination::read(node.data, in);
Coordination::read(node.acls, in);
Coordination::read(node.is_ephemeral, in);
Coordination::read(node.is_sequental, in);
Coordination::read(node.stat, in);
Coordination::read(node.seq_num, in);
}
}
/// Writes a snapshot of `storage` to `out`:
/// zxid, session id counter, node count followed by (path, node) records,
/// then the number of sessions with ephemeral nodes followed by (session id, path count, paths) records.
void NuKeeperStorageSerializer::serialize(const NuKeeperStorage & storage, WriteBuffer & out)
{
    /// Counters.
    Coordination::write(storage.zxid, out);
    Coordination::write(storage.session_id_counter, out);

    /// Nodes.
    Coordination::write(storage.container.size(), out);
    for (auto node_it = storage.container.begin(); node_it != storage.container.end(); ++node_it)
    {
        Coordination::write(node_it->first, out);
        writeNode(node_it->second, out);
    }

    /// Ephemeral paths grouped by owning session.
    Coordination::write(storage.ephemerals.size(), out);
    for (auto session_it = storage.ephemerals.begin(); session_it != storage.ephemerals.end(); ++session_it)
    {
        const auto & session_paths = session_it->second;

        Coordination::write(session_it->first, out);
        Coordination::write(session_paths.size(), out);
        for (const auto & ephemeral_path : session_paths)
            Coordination::write(ephemeral_path, out);
    }
}
/// Reads a snapshot produced by serialize() from `in` into `storage`.
/// Exactly as many records are consumed as the stream announces, regardless of what `storage`
/// already contains, so the reader never gets out of step with the writer.
void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffer & in)
{
    int64_t zxid = 0;
    int64_t session_id_counter = 0;
    Coordination::read(zxid, in);
    Coordination::read(session_id_counter, in);
    storage.zxid = zxid;
    storage.session_id_counter = session_id_counter;

    size_t container_size = 0;
    Coordination::read(container_size, in);
    /// Count records read, not the resulting map size: the map size does not grow
    /// when a path is already present, which would make the loop read too much.
    for (size_t node_idx = 0; node_idx < container_size; ++node_idx)
    {
        std::string path;
        Coordination::read(path, in);

        NuKeeperStorage::Node node;
        readNode(node, in);
        storage.container[path] = node;
    }

    size_t ephemerals_size = 0;
    Coordination::read(ephemerals_size, in);
    for (size_t session_idx = 0; session_idx < ephemerals_size; ++session_idx)
    {
        int64_t session_id = 0;
        size_t ephemerals_for_session = 0;
        Coordination::read(session_id, in);
        Coordination::read(ephemerals_for_session, in);

        /// The entry is created even when the session has no paths, as serialize() wrote it.
        auto & session_paths = storage.ephemerals[session_id];
        for (size_t path_idx = 0; path_idx < ephemerals_for_session; ++path_idx)
        {
            std::string ephemeral_path;
            Coordination::read(ephemeral_path, in);
            session_paths.emplace(ephemeral_path);
        }
    }
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include <Coordination/NuKeeperStorage.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
namespace DB
{
/// Stateless helper that converts a NuKeeperStorage to and from a byte stream.
class NuKeeperStorageSerializer
{
public:
/// Writes the state of `storage` to `out`.
static void serialize(const NuKeeperStorage & storage, WriteBuffer & out);
/// Reads state previously written by serialize() from `in` into `storage`.
static void deserialize(NuKeeperStorage & storage, ReadBuffer & in);
};
}

View File

@ -0,0 +1,20 @@
#pragma once
#include <IO/ReadBufferFromMemory.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
namespace DB
{
/// ReadBuffer over the bytes of a nuraft::buffer, from data_begin() for size() bytes.
/// NOTE(review): neither constructor stores the nuraft buffer itself, only a pointer and a size are
/// passed to ReadBufferFromMemory - presumably the caller must keep the buffer alive; confirm.
class ReadBufferFromNuraftBuffer : public ReadBufferFromMemory
{
public:
explicit ReadBufferFromNuraftBuffer(nuraft::ptr<nuraft::buffer> buffer)
: ReadBufferFromMemory(buffer->data_begin(), buffer->size())
{}
explicit ReadBufferFromNuraftBuffer(nuraft::buffer & buffer)
: ReadBufferFromMemory(buffer.data_begin(), buffer.size())
{}
};
}

View File

@ -0,0 +1,83 @@
#include <Coordination/SessionExpiryQueue.h>
#include <common/logger_useful.h>
namespace DB
{
/// Stops tracking `session_id`.
/// Returns true if the session was known to the queue, false otherwise.
bool SessionExpiryQueue::remove(int64_t session_id)
{
    auto session_it = session_to_timeout.find(session_id);
    if (session_it == session_to_timeout.end())
        return false;

    auto set_it = expiry_to_sessions.find(session_it->second);
    if (set_it != expiry_to_sessions.end())
    {
        set_it->second.erase(session_id);

        /// Do not keep empty buckets around.
        if (set_it->second.empty())
            expiry_to_sessions.erase(set_it);
    }

    /// Forget the session itself as well; otherwise the entry stays forever and a later
    /// update() for the same id would be treated as a refresh of a live session.
    session_to_timeout.erase(session_it);
    return true;
}
/// Registers `session_id` or moves it to the bucket that expires `timeout_ms` from now
/// (rounded up to the next interval boundary).
/// Returns true only when an already known session was moved to a different bucket.
bool SessionExpiryQueue::update(int64_t session_id, int64_t timeout_ms)
{
    auto session_it = session_to_timeout.find(session_id);
    const int64_t new_expiry_time = roundToNextInterval(getNowMilliseconds() + timeout_ms);

    /// Unknown session: just register it.
    if (session_it == session_to_timeout.end())
    {
        session_to_timeout[session_id] = new_expiry_time;
        expiry_to_sessions[new_expiry_time].insert(session_id);
        return false;
    }

    const int64_t prev_expiry_time = session_it->second;

    /// Still in the same bucket, nothing to do.
    if (prev_expiry_time == new_expiry_time)
        return false;

    /// Put the session into the new bucket first, then take it out of the old one.
    expiry_to_sessions[new_expiry_time].insert(session_id);

    auto prev_set_it = expiry_to_sessions.find(prev_expiry_time);
    if (prev_set_it != expiry_to_sessions.end())
        prev_set_it->second.erase(session_id);

    session_it->second = new_expiry_time;
    return true;
}
/// Returns the sessions of the bucket at next_expiration_time if that time has been reached,
/// and advances next_expiration_time by one interval. Returns an empty set otherwise.
std::unordered_set<int64_t> SessionExpiryQueue::getExpiredSessions()
{
    if (getNowMilliseconds() < next_expiration_time)
        return {};

    /// Advance by exactly one interval per call, whether or not a bucket exists for the due time.
    const int64_t due_time = next_expiration_time;
    next_expiration_time = due_time + expiration_interval;

    std::unordered_set<int64_t> expired;
    auto bucket_it = expiry_to_sessions.find(due_time);
    if (bucket_it != expiry_to_sessions.end())
    {
        expired = bucket_it->second;
        expiry_to_sessions.erase(bucket_it);
    }
    return expired;
}
/// Drops all tracked sessions and all buckets. next_expiration_time is left as it is.
void SessionExpiryQueue::clear()
{
    expiry_to_sessions.clear();
    session_to_timeout.clear();
}
}

View File

@ -0,0 +1,45 @@
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <chrono>
namespace DB
{
/// Keeps sessions grouped into buckets by expiration time. Bucket keys are produced by
/// roundToNextInterval(), i.e. they are multiples of expiration_interval.
class SessionExpiryQueue
{
private:
/// session id -> expiration time of the bucket the session currently belongs to
/// (despite the name this is a point in time, not a timeout length).
std::unordered_map<int64_t, int64_t> session_to_timeout;
/// expiration time -> ids of the sessions that expire at that time.
std::unordered_map<int64_t, std::unordered_set<int64_t>> expiry_to_sessions;
/// Must stay declared before next_expiration_time: the constructor computes
/// next_expiration_time with roundToNextInterval(), which reads expiration_interval.
int64_t expiration_interval;
/// Bucket key that getExpiredSessions() will check next.
int64_t next_expiration_time;
/// Milliseconds since the epoch of std::chrono::system_clock.
static int64_t getNowMilliseconds()
{
using namespace std::chrono;
return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}
/// Smallest multiple of expiration_interval that is strictly greater than `time`
/// (for non-negative `time`). NOTE(review): divides by expiration_interval, so 0 is not a valid interval.
int64_t roundToNextInterval(int64_t time) const
{
return (time / expiration_interval + 1) * expiration_interval;
}
public:
explicit SessionExpiryQueue(int64_t expiration_interval_)
: expiration_interval(expiration_interval_)
, next_expiration_time(roundToNextInterval(getNowMilliseconds()))
{
}
/// Returns true if the session was known to the queue.
bool remove(int64_t session_id);
/// Adds the session or moves it to the bucket `timeout_ms` from now; returns true if a known session was moved.
bool update(int64_t session_id, int64_t timeout_ms);
/// Returns the sessions of the next due bucket, or an empty set if it is not due yet.
std::unordered_set<int64_t> getExpiredSessions();
void clear();
};
}

View File

@ -0,0 +1,167 @@
#include <Coordination/SummingStateMachine.h>
#include <iostream>
#include <cstring>
namespace DB
{
/// How many snapshots createSnapshotInternal() keeps.
static constexpr int MAX_SNAPSHOTS = 3;

/// Interprets the first sizeof(int64_t) bytes of `buffer` as an int64_t in host byte order.
static int64_t deserializeValue(nuraft::buffer & buffer)
{
    nuraft::buffer_serializer reader(buffer);
    void * raw_bytes = reader.get_raw(buffer.size());

    int64_t parsed;
    memcpy(&parsed, raw_bytes, sizeof(parsed));
    return parsed;
}
SummingStateMachine::SummingStateMachine()
: value(0)
, last_committed_idx(0)
{
}
/// Adds the int64 carried by `data` to the current value and remembers `log_idx` as the last committed index.
/// Returns a buffer that contains `log_idx`.
nuraft::ptr<nuraft::buffer> SummingStateMachine::commit(const size_t log_idx, nuraft::buffer & data)
{
    value += deserializeValue(data);
    last_committed_idx = log_idx;

    // Return Raft log number as a return result.
    nuraft::ptr<nuraft::buffer> reply = nuraft::buffer::alloc(sizeof(log_idx));
    nuraft::buffer_serializer writer(reply);
    writer.put_u64(log_idx);
    return reply;
}
/// Restores the value stored for the snapshot with the same last log index as `s`.
/// Returns false if no such snapshot is kept.
bool SummingStateMachine::apply_snapshot(nuraft::snapshot & s)
{
    std::lock_guard<std::mutex> guard(snapshots_lock);

    auto found = snapshots.find(s.get_last_log_idx());
    if (found != snapshots.end())
    {
        value = found->second->value;
        return true;
    }
    return false;
}
/// Returns the kept snapshot with the highest log index, or nullptr if there are none.
nuraft::ptr<nuraft::snapshot> SummingStateMachine::last_snapshot()
{
    std::lock_guard<std::mutex> guard(snapshots_lock);

    if (snapshots.empty())
        return nullptr;

    // The map is ordered by log index, so the last element is the latest snapshot.
    return snapshots.rbegin()->second->snapshot;
}
/// Stores a copy of `s` together with the current value and drops the oldest snapshots
/// so that at most MAX_SNAPSHOTS remain. Does not take snapshots_lock itself.
void SummingStateMachine::createSnapshotInternal(nuraft::snapshot & s)
{
    // Clone snapshot from `s`.
    nuraft::ptr<nuraft::buffer> serialized = s.serialize();
    nuraft::ptr<nuraft::snapshot> cloned = nuraft::snapshot::deserialize(*serialized);

    // Put into snapshot map.
    snapshots[s.get_last_log_idx()] = cs_new<SingleValueSnapshotContext>(cloned, value);

    // Maintain last 3 snapshots only: the map is ordered, so the oldest ones are at the front.
    while (snapshots.size() > static_cast<size_t>(MAX_SNAPSHOTS))
        snapshots.erase(snapshots.begin());
}
/// Receives one object of a snapshot.
/// Object 0 carries a dummy value and only creates the snapshot context;
/// any later object carries the actual value, which is stored into that context.
/// `obj_id` is incremented on return to request the next object.
void SummingStateMachine::save_logical_snp_obj(
    nuraft::snapshot & s,
    size_t & obj_id,
    nuraft::buffer & data,
    bool /*is_first_obj*/,
    bool /*is_last_obj*/)
{
    if (obj_id == 0)
    {
        // Object ID == 0: it contains dummy value, create snapshot context.
        // createSnapshotInternal() does not lock by itself, and `snapshots` is read
        // under snapshots_lock everywhere else, so take the lock here too.
        std::lock_guard<std::mutex> ll(snapshots_lock);
        createSnapshotInternal(s);
    }
    else
    {
        // Object ID > 0: actual snapshot value.
        nuraft::buffer_serializer bs(data);
        int64_t local_value = static_cast<int64_t>(bs.get_u64());

        std::lock_guard<std::mutex> ll(snapshots_lock);
        auto entry = snapshots.find(s.get_last_log_idx());
        assert(entry != snapshots.end());
        entry->second->value = local_value;
    }

    // Request next object.
    obj_id++;
}
/// Produces one object of the snapshot identified by the last log index of `s`.
/// Object 0 is a dummy 32-bit zero; any other object id yields the stored value and is the last one.
/// If the snapshot is not kept, `data_out` is set to nullptr and `is_last_obj` to true.
/// Always returns 0.
int SummingStateMachine::read_logical_snp_obj(
    nuraft::snapshot & s,
    void* & /*user_snp_ctx*/,
    size_t obj_id,
    nuraft::ptr<nuraft::buffer> & data_out,
    bool & is_last_obj)
{
    nuraft::ptr<SingleValueSnapshotContext> snapshot_ctx = nullptr;
    {
        std::lock_guard<std::mutex> guard(snapshots_lock);
        auto found = snapshots.find(s.get_last_log_idx());
        if (found == snapshots.end())
        {
            // Snapshot doesn't exist.
            data_out = nullptr;
            is_last_obj = true;
            return 0;
        }
        snapshot_ctx = found->second;
    }

    const bool is_first_object = (obj_id == 0);
    if (is_first_object)
    {
        // Object ID == 0: first object, put dummy data.
        data_out = nuraft::buffer::alloc(sizeof(Int32));
        nuraft::buffer_serializer writer(data_out);
        writer.put_i32(0);
    }
    else
    {
        // Object ID > 0: second object, put actual value.
        data_out = nuraft::buffer::alloc(sizeof(size_t));
        nuraft::buffer_serializer writer(data_out);
        writer.put_u64(snapshot_ctx->value);
    }

    is_last_obj = !is_first_object;
    return 0;
}
/// Stores a snapshot for `s` under snapshots_lock and then reports success through `when_done`
/// (result `true`, no exception).
void SummingStateMachine::create_snapshot(
    nuraft::snapshot & s,
    nuraft::async_result<bool>::handler_type & when_done)
{
    {
        std::lock_guard<std::mutex> guard(snapshots_lock);
        createSnapshotInternal(s);
    }

    bool succeeded = true;
    nuraft::ptr<std::exception> no_error(nullptr);
    when_done(succeeded, no_error);
}
}

View File

@ -0,0 +1,78 @@
#pragma once
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Core/Types.h>
#include <atomic>
#include <map>
#include <mutex>
namespace DB
{
/// Example trivial state machine.
/// Every committed log entry carries an int64 that is added to a single running value.
/// Up to a fixed number of recent snapshots of that value are kept in memory.
class SummingStateMachine : public nuraft::state_machine
{
public:
SummingStateMachine();
/// Nothing is done before commit.
nuraft::ptr<nuraft::buffer> pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }
/// Adds the value carried by `data` and returns a buffer containing `log_idx`.
nuraft::ptr<nuraft::buffer> commit(const size_t log_idx, nuraft::buffer & data) override;
/// Nothing to roll back because pre_commit() changes nothing.
void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {}
size_t last_commit_index() override { return last_committed_idx; }
/// Restores the value of a kept snapshot; returns false if the snapshot is not kept.
bool apply_snapshot(nuraft::snapshot & s) override;
/// Latest kept snapshot or nullptr.
nuraft::ptr<nuraft::snapshot> last_snapshot() override;
void create_snapshot(
nuraft::snapshot & s,
nuraft::async_result<bool>::handler_type & when_done) override;
/// Receives one snapshot object; increments `obj_id` to request the next one.
void save_logical_snp_obj(
nuraft::snapshot & s,
size_t & obj_id,
nuraft::buffer & data,
bool is_first_obj,
bool is_last_obj) override;
/// Produces one snapshot object; object 0 is a dummy, the following one holds the value.
int read_logical_snp_obj(
nuraft::snapshot & s,
void* & user_snp_ctx,
size_t obj_id,
nuraft::ptr<nuraft::buffer> & data_out,
bool & is_last_obj) override;
int64_t getValue() const { return value; }
private:
/// A snapshot together with the state machine value at the time it was taken.
struct SingleValueSnapshotContext
{
SingleValueSnapshotContext(nuraft::ptr<nuraft::snapshot> & s, int64_t v)
: snapshot(s)
, value(v)
{}
nuraft::ptr<nuraft::snapshot> snapshot;
int64_t value;
};
/// Adds a snapshot to `snapshots` and trims the oldest ones. Does not lock snapshots_lock itself.
void createSnapshotInternal(nuraft::snapshot & s);
// State machine's current value.
std::atomic<int64_t> value;
// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx;
// Keeps the last 3 snapshots, by their Raft log numbers.
std::map<uint64_t, nuraft::ptr<SingleValueSnapshotContext>> snapshots;
// Mutex for `snapshots`.
std::mutex snapshots_lock;
};
}

View File

@ -0,0 +1,45 @@
#pragma once
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <queue>
#include <utility>
namespace DB
{
/// Queue with mutex and condvar. As simple as possible.
/// Unbounded FIFO; every operation takes the same mutex.
template <typename T>
class ThreadSafeQueue
{
private:
    mutable std::mutex queue_mutex;
    std::condition_variable cv;
    std::queue<T> queue;

public:
    /// Appends a copy of `response` and wakes up one waiting consumer.
    void push(const T & response)
    {
        std::lock_guard lock(queue_mutex);
        queue.push(response);
        cv.notify_one();
    }

    /// Same as above, but moves `response` into the queue instead of copying it.
    void push(T && response)
    {
        std::lock_guard lock(queue_mutex);
        queue.push(std::move(response));
        cv.notify_one();
    }

    /// Waits up to `timeout_ms` milliseconds for an element.
    /// On success moves the oldest element into `response` and returns true;
    /// returns false (leaving `response` untouched) if the queue stayed empty.
    bool tryPop(T & response, int64_t timeout_ms = 0)
    {
        std::unique_lock lock(queue_mutex);
        if (!cv.wait_for(lock,
                std::chrono::milliseconds(timeout_ms), [this] { return !queue.empty(); }))
            return false;

        /// The element is removed right away, so move it out instead of copying.
        response = std::move(queue.front());
        queue.pop();
        return true;
    }

    /// Number of elements currently in the queue.
    size_t size() const
    {
        std::lock_guard lock(queue_mutex);
        return queue.size();
    }
};
}

View File

@ -0,0 +1,71 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
}
/// Makes room for more data: when the nuraft buffer is completely filled it is replaced by a
/// buffer `size_multiplier` times larger with the old contents copied over.
/// Throws CANNOT_WRITE_AFTER_END_OF_BUFFER if the buffer was already finalized.
void WriteBufferFromNuraftBuffer::nextImpl()
{
if (is_finished)
throw Exception("WriteBufferFromNuraftBuffer is finished", ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER);
/// pos may not be equal to buffer->data_begin() + old_size, because WriteBuffer::next() can be used to flush data
size_t pos_offset = pos - reinterpret_cast<Position>(buffer->data_begin());
size_t old_size = buffer->size();
/// Grow only when the buffer is completely full; otherwise the remaining tail is reused as is.
if (pos_offset == old_size)
{
nuraft::ptr<nuraft::buffer> new_buffer = nuraft::buffer::alloc(old_size * size_multiplier);
memcpy(new_buffer->data_begin(), buffer->data_begin(), buffer->size());
buffer = new_buffer;
}
/// The writable region starts right after the bytes written so far and ends at the end of the (possibly new) buffer.
internal_buffer = Buffer(reinterpret_cast<Position>(buffer->data_begin() + pos_offset), reinterpret_cast<Position>(buffer->data_begin() + buffer->size()));
working_buffer = internal_buffer;
}
/// Allocates a nuraft buffer of `initial_size` bytes and makes it the working buffer.
WriteBufferFromNuraftBuffer::WriteBufferFromNuraftBuffer()
    : WriteBuffer(nullptr, 0)
    , buffer(nuraft::buffer::alloc(initial_size))
{
    set(reinterpret_cast<Position>(buffer->data_begin()), buffer->size());
}
void WriteBufferFromNuraftBuffer::finalize()
{
if (is_finished)
return;
is_finished = true;
size_t real_size = pos - reinterpret_cast<Position>(buffer->data_begin());
nuraft::ptr<nuraft::buffer> new_buffer = nuraft::buffer::alloc(real_size);
memcpy(new_buffer->data_begin(), buffer->data_begin(), real_size);
buffer = new_buffer;
/// Prevent further writes.
set(nullptr, 0);
}
/// Finalizes the buffer (if not done yet) and returns it; after this call nothing more can be written.
nuraft::ptr<nuraft::buffer> WriteBufferFromNuraftBuffer::getBuffer()
{
finalize();
return buffer;
}
/// Finalizes on destruction; any exception is logged instead of escaping the destructor.
WriteBufferFromNuraftBuffer::~WriteBufferFromNuraftBuffer()
{
try
{
finalize();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}

View File

@ -0,0 +1,30 @@
#pragma once
#include <IO/WriteBuffer.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
namespace DB
{
/// WriteBuffer that accumulates data in a growing nuraft::buffer.
/// The result is obtained with getBuffer(), which also finalizes the buffer.
class WriteBufferFromNuraftBuffer : public WriteBuffer
{
private:
nuraft::ptr<nuraft::buffer> buffer;
/// Set by finalize(); nextImpl() throws once it is true.
bool is_finished = false;
/// Size in bytes of the buffer allocated by the constructor.
static constexpr size_t initial_size = 32;
/// Growth factor applied by nextImpl() when the buffer is full.
static constexpr size_t size_multiplier = 2;
void nextImpl() override;
public:
WriteBufferFromNuraftBuffer();
/// Trims the buffer to the written size and forbids further writes; repeated calls do nothing.
void finalize() override final;
/// Finalizes and returns the underlying buffer.
nuraft::ptr<nuraft::buffer> getBuffer();
bool isFinished() const { return is_finished; }
~WriteBufferFromNuraftBuffer() override;
};
}

View File

@ -0,0 +1,336 @@
#include <gtest/gtest.h>
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
# include "config_core.h"
#endif
#if USE_NURAFT
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStorageSerializer.h>
#include <Coordination/SummingStateMachine.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/LoggerWrapper.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/Exception.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <thread>
/// Smoke test: only checks that the log store and the example state machine can be constructed.
TEST(CoordinationTest, BuildTest)
{
DB::InMemoryLogStore store;
DB::SummingStateMachine machine;
EXPECT_EQ(1, 1);
}
/// Round trip of a Get request through WriteBufferFromNuraftBuffer and ReadBufferFromNuraftBuffer:
/// the request is written, then length, xid, opnum and body are read back and compared.
TEST(CoordinationTest, BufferSerde)
{
Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Get);
request->xid = 3;
dynamic_cast<Coordination::ZooKeeperGetRequest *>(request.get())->path = "/path/value";
DB::WriteBufferFromNuraftBuffer wbuf;
request->write(wbuf);
/// getBuffer() finalizes the write buffer, so the size below is the exact number of bytes written.
auto nuraft_buffer = wbuf.getBuffer();
EXPECT_EQ(nuraft_buffer->size(), 28);
DB::ReadBufferFromNuraftBuffer rbuf(nuraft_buffer);
/// The reads below must follow the order in which the request was written.
int32_t length;
Coordination::read(length, rbuf);
/// The length prefix does not count itself.
EXPECT_EQ(length + sizeof(length), nuraft_buffer->size());
int32_t xid;
Coordination::read(xid, rbuf);
EXPECT_EQ(xid, request->xid);
Coordination::OpNum opnum;
Coordination::read(opnum, rbuf);
/// Rebuild a request of the type that was read and let it parse the remaining bytes.
Coordination::ZooKeeperRequestPtr request_read = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_read->xid = xid;
request_read->readImpl(rbuf);
EXPECT_EQ(request_read->getOpNum(), Coordination::OpNum::Get);
EXPECT_EQ(request_read->xid, 3);
EXPECT_EQ(dynamic_cast<Coordination::ZooKeeperGetRequest *>(request_read.get())->path, "/path/value");
}
/// Test helper that starts one Raft node inside the current process.
/// The constructor creates the state machine and an in-memory state manager, launches the
/// node on the given port and polls (up to 20 times, 100 ms apart) until it reports itself
/// as initialized. If the launcher cannot be initialized, the process exits.
template <typename StateMachine>
struct SimpliestRaftServer
{
    SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_)
        : server_id(server_id_)
        , hostname(hostname_)
        , port(port_)
        , endpoint(hostname + ":" + std::to_string(port))
        , state_machine(nuraft::cs_new<StateMachine>())
        , state_manager(nuraft::cs_new<DB::InMemoryStateManager>(server_id, hostname, port))
    {
        /// Short timeouts keep the tests fast.
        nuraft::raft_params params;
        params.heart_beat_interval_ = 100;
        params.election_timeout_lower_bound_ = 200;
        params.election_timeout_upper_bound_ = 400;
        params.reserved_log_items_ = 5;
        params.snapshot_distance_ = 1; /// forcefully send snapshots
        params.client_req_timeout_ = 3000;
        params.return_method_ = nuraft::raft_params::blocking;

        raft_instance = launcher.init(
            state_machine,
            state_manager,
            nuraft::cs_new<DB::LoggerWrapper>("ToyRaftLogger", DB::LogsLevel::trace),
            port,
            nuraft::asio_service::options{},
            params);

        if (!raft_instance)
        {
            std::cerr << "Failed to initialize launcher (see the message in the log file)." << std::endl;
            exit(-1);
        }

        /// Poll until the node is initialized, printing one dot per unsuccessful check.
        std::cout << "init Raft instance " << server_id;
        for (size_t attempt = 0; attempt != 20; ++attempt)
        {
            if (!raft_instance->is_initialized())
            {
                std::cout << ".";
                fflush(stdout);
                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                continue;
            }

            std::cout << " done" << std::endl;
            break;
        }
    }

    /// Identifier of this node.
    int server_id;

    /// Host name of this node.
    std::string hostname;

    /// Port of this node.
    int port;

    /// "<hostname>:<port>".
    std::string endpoint;

    /// State machine instance of the requested type.
    nuraft::ptr<StateMachine> state_machine;

    /// In-memory state manager.
    nuraft::ptr<nuraft::state_mgr> state_manager;

    /// Launcher that creates raft_instance.
    nuraft::raft_launcher launcher;

    /// The running Raft server returned by the launcher.
    nuraft::ptr<nuraft::raft_server> raft_instance;
};
using SummingRaftServer = SimpliestRaftServer<DB::SummingStateMachine>;
/// Builds a log entry for the summing state machine: a buffer of sizeof(int64_t) bytes
/// holding the raw bytes of `number`.
nuraft::ptr<nuraft::buffer> getLogEntry(int64_t number)
{
    auto entry = nuraft::buffer::alloc(sizeof(number));

    /// The value is copied as-is, in host byte order; endianness is deliberately ignored here.
    nuraft::buffer_serializer serializer(entry);
    serializer.put_raw(&number, sizeof(number));

    return entry;
}
/// Single-node cluster: the node must see itself as leader, accept one entry
/// and eventually expose the entry's value through the state machine.
TEST(CoordinationTest, TestSummingRaft1)
{
    SummingRaftServer s1(1, "localhost", 44444);

    /// With only one node in the cluster, that node is the leader.
    EXPECT_EQ(s1.raft_instance->get_leader(), 1);

    auto entry1 = getLogEntry(143);
    auto ret = s1.raft_instance->append_entries({entry1});
    EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code();
    EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code();

    /// Poll until the state machine has applied the entry.
    for (;;)
    {
        if (s1.state_machine->getValue() == 143)
            break;

        std::cout << "Waiting s1 to apply entry\n";
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }

    EXPECT_EQ(s1.state_machine->getValue(), 143);

    s1.launcher.shutdown(5);
}
/// Three-node scenario on localhost ports 44444-44446.
/// s1 and s3 are added to the cluster through s2, the test waits until all three report
/// server 2 as leader, then checks that entries appended through s2 reach every state
/// machine while entries appended through s1 or s3 are not accepted.
/// NOTE(review): none of the wait loops below has a timeout, so a node that never
/// converges makes the test spin until it is killed externally.
TEST(CoordinationTest, TestSummingRaft3)
{
SummingRaftServer s1(1, "localhost", 44444);
SummingRaftServer s2(2, "localhost", 44445);
SummingRaftServer s3(3, "localhost", 44446);
/// Ask s2 to add s1 to the cluster.
nuraft::srv_config first_config(1, "localhost:44444");
auto ret1 = s2.raft_instance->add_srv(first_config);
if (!ret1->get_accepted())
{
std::cout << "failed to add server: "
<< ret1->get_result_str() << std::endl;
/// NOTE(review): this only records a failure; execution still continues into the wait loop below.
EXPECT_TRUE(false);
}
/// Wait until s1 reports server 2 as its leader.
while (s1.raft_instance->get_leader() != 2)
{
std::cout << "Waiting s1 to join to s2 quorum\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
/// Same procedure for s3.
nuraft::srv_config third_config(3, "localhost:44446");
auto ret3 = s2.raft_instance->add_srv(third_config);
if (!ret3->get_accepted())
{
std::cout << "failed to add server: "
<< ret3->get_result_str() << std::endl;
EXPECT_TRUE(false);
}
while (s3.raft_instance->get_leader() != 2)
{
std::cout << "Waiting s3 to join to s2 quorum\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
/// S2 is leader
EXPECT_EQ(s1.raft_instance->get_leader(), 2);
EXPECT_EQ(s2.raft_instance->get_leader(), 2);
EXPECT_EQ(s3.raft_instance->get_leader(), 2);
std::cerr << "Starting to add entries\n";
/// First entry (value 1) goes through the leader.
auto entry = getLogEntry(1);
auto ret = s2.raft_instance->append_entries({entry});
EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code();
EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code();
/// Wait until every node's state machine reports the value 1.
while (s1.state_machine->getValue() != 1)
{
std::cout << "Waiting s1 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s2.state_machine->getValue() != 1)
{
std::cout << "Waiting s2 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s3.state_machine->getValue() != 1)
{
std::cout << "Waiting s3 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
EXPECT_EQ(s1.state_machine->getValue(), 1);
EXPECT_EQ(s2.state_machine->getValue(), 1);
EXPECT_EQ(s3.state_machine->getValue(), 1);
/// Appending through a non-leader (s1 or s3) is expected not to be accepted.
auto non_leader_entry = getLogEntry(3);
auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry});
EXPECT_FALSE(ret_non_leader1->get_accepted());
auto ret_non_leader3 = s3.raft_instance->append_entries({non_leader_entry});
EXPECT_FALSE(ret_non_leader3->get_accepted());
/// Second accepted entry (value 77) through the leader; the expected state machine value becomes 78.
/// NOTE(review): the failure messages say "entry 78" although the appended entry is 77.
auto leader_entry = getLogEntry(77);
auto ret_leader = s2.raft_instance->append_entries({leader_entry});
EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code();
EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code();
/// Only s1 and s3 are polled here; s2 is checked directly by the expectations below.
while (s1.state_machine->getValue() != 78)
{
std::cout << "Waiting s1 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s3.state_machine->getValue() != 78)
{
std::cout << "Waiting s3 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
EXPECT_EQ(s1.state_machine->getValue(), 78);
EXPECT_EQ(s2.state_machine->getValue(), 78);
EXPECT_EQ(s3.state_machine->getValue(), 78);
/// Stop all three nodes.
s1.launcher.shutdown(5);
s2.launcher.shutdown(5);
s3.launcher.shutdown(5);
}
/// Serializes a log entry for the keeper state machine:
/// the binary session id followed by the serialized request.
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
    DB::WriteBufferFromNuraftBuffer out;

    DB::writeIntBinary(session_id, out);
    request->write(out);

    return out.getBuffer();
}
/// Parses every (session id, response) record found in `buffer`.
/// Each record consists of a binary session id, then length, xid, zxid and error code,
/// then the response body, which is decoded by a response object created from `request`.
/// The header fields are read only to advance the stream; their values are not used.
DB::NuKeeperStorage::ResponsesForSessions getZooKeeperResponses(nuraft::ptr<nuraft::buffer> & buffer, const Coordination::ZooKeeperRequestPtr & request)
{
    DB::NuKeeperStorage::ResponsesForSessions parsed;
    DB::ReadBufferFromNuraftBuffer in(buffer);

    for (; !in.eof();)
    {
        int64_t session_id;
        DB::readIntBinary(session_id, in);

        int32_t length;
        Coordination::read(length, in);

        Coordination::XID xid;
        Coordination::read(xid, in);

        int64_t zxid;
        Coordination::read(zxid, in);

        Coordination::Error err;
        Coordination::read(err, in);

        auto response = request->makeResponse();
        response->readImpl(in);

        parsed.push_back(DB::NuKeeperStorage::ResponseForSession{session_id, response});
    }

    return parsed;
}
/// Fills a NuKeeperStorage with two nodes, counters and ephemeral paths, serializes it,
/// deserializes into a fresh storage and checks that the restored state matches.
TEST(CoordinationTest, TestStorageSerialization)
{
    DB::NuKeeperStorage storage(500);
    storage.container["/hello"] = DB::NuKeeperStorage::Node{.data="world"};
    storage.container["/hello/somepath"] = DB::NuKeeperStorage::Node{.data="somedata"};
    storage.session_id_counter = 5;
    storage.zxid = 156;
    storage.ephemerals[3] = {"/hello", "/"};
    storage.ephemerals[1] = {"/hello/somepath"};

    DB::NuKeeperStorageSerializer serializer;

    /// Serialize into an in-memory string.
    DB::WriteBufferFromOwnString out;
    serializer.serialize(storage, out);
    std::string payload = out.str();
    EXPECT_NE(payload.size(), 0);

    /// Deserialize into a second, independent storage.
    DB::ReadBufferFromString in(payload);
    DB::NuKeeperStorage restored(500);
    serializer.deserialize(restored, in);

    /// The size is checked before any operator[] access on the container.
    /// NOTE(review): 3 = the two nodes above plus, presumably, a root node created by the storage itself — confirm.
    EXPECT_EQ(restored.container.size(), 3);
    EXPECT_EQ(restored.container["/hello"].data, "world");
    EXPECT_EQ(restored.container["/hello/somepath"].data, "somedata");

    EXPECT_EQ(restored.session_id_counter, 5);
    EXPECT_EQ(restored.zxid, 156);

    EXPECT_EQ(restored.ephemerals.size(), 2);
    EXPECT_EQ(restored.ephemerals[3].size(), 2);
    EXPECT_EQ(restored.ephemerals[1].size(), 1);
}
#endif

13
src/Coordination/ya.make Normal file
View File

@ -0,0 +1,13 @@
# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it.
OWNER(g:clickhouse)
LIBRARY()
PEERDIR(
clickhouse/src/Common
)
SRCS(
)
END()

View File

@ -0,0 +1,12 @@
OWNER(g:clickhouse)
LIBRARY()
PEERDIR(
clickhouse/src/Common
)
SRCS(
)
END()

View File

@ -21,7 +21,7 @@ namespace ErrorCodes
extern const int DECIMAL_OVERFLOW;
}
///
inline bool allowDecimalComparison(const DataTypePtr & left_type, const DataTypePtr & right_type)
{
if (isColumnedAsDecimal(left_type))
@ -30,7 +30,9 @@ inline bool allowDecimalComparison(const DataTypePtr & left_type, const DataType
return true;
}
else if (isNotDecimalButComparableToDecimal(left_type) && isColumnedAsDecimal(right_type))
{
return true;
}
return false;
}
@ -252,9 +254,9 @@ private:
else
{
if constexpr (scale_left)
x *= scale;
x = common::mulIgnoreOverflow(x, scale);
if constexpr (scale_right)
y *= scale;
y = common::mulIgnoreOverflow(y, scale);
}
return Op::apply(x, y);

View File

@ -13,3 +13,4 @@
#cmakedefine01 USE_LDAP
#cmakedefine01 USE_ROCKSDB
#cmakedefine01 USE_LIBPQXX
#cmakedefine01 USE_NURAFT

View File

@ -120,14 +120,17 @@ public:
return DecimalUtils::getFractionalPart(x, scale);
}
T maxWholeValue() const { return getScaleMultiplier(maxPrecision() - scale) - T(1); }
T maxWholeValue() const { return getScaleMultiplier(precision - scale) - T(1); }
bool canStoreWhole(T x) const
template<typename U>
bool canStoreWhole(U x) const
{
static_assert(std::is_signed_v<typename T::NativeType>);
T max = maxWholeValue();
if (x > max || x < -max)
return false;
return true;
if constexpr (std::is_signed_v<U>)
return -max <= x && x <= max;
else
return x <= static_cast<std::make_unsigned_t<typename T::NativeType>>(max.value);
}
/// @returns multiplier for U to become T with correct scale

View File

@ -7,6 +7,7 @@
#include "DiskS3.h"
#include "Disks/DiskCacheWrapper.h"
#include "Disks/DiskFactory.h"
#include "Storages/StorageS3Settings.h"
#include "ProxyConfiguration.h"
#include "ProxyListConfiguration.h"
#include "ProxyResolverConfiguration.h"
@ -137,6 +138,8 @@ void registerDiskS3(DiskFactory & factory)
uri.is_virtual_hosted_style,
config.getString(config_prefix + ".access_key_id", ""),
config.getString(config_prefix + ".secret_access_key", ""),
config.getString(config_prefix + ".server_side_encryption_customer_key_base64", ""),
{},
config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false))
);

View File

@ -3,6 +3,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int LOGICAL_ERROR;
}
std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size)
{
@ -15,10 +20,18 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D
while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast<size_t>(pos - in.position()) < min_chunk_size))
{
const auto current_object_size = memory.size() + static_cast<size_t>(pos - in.position());
if (current_object_size > 10 * min_chunk_size)
throw ParsingException("Size of JSON object is extremely large. Expected not greater than " +
std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) +
" bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA);
if (quotes)
{
pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end());
if (pos == in.buffer().end())
if (pos > in.buffer().end())
throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR);
else if (pos == in.buffer().end())
continue;
if (*pos == '\\')
{
@ -35,9 +48,11 @@ std::pair<bool, size_t> fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D
else
{
pos = find_first_symbols<'{', '}', '\\', '"'>(pos, in.buffer().end());
if (pos == in.buffer().end())
if (pos > in.buffer().end())
throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR);
else if (pos == in.buffer().end())
continue;
if (*pos == '{')
else if (*pos == '{')
{
++balance;
++pos;

View File

@ -6,11 +6,11 @@
#include <Common/NaNUtils.h>
#include <DataTypes/NumberTraits.h>
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
namespace DB
{
@ -90,17 +90,26 @@ struct DivideIntegralImpl
}
else
{
/// Comparisons are not strict to avoid rounding issues when operand is implicitly casted to float.
if constexpr (std::is_floating_point_v<A>)
if (isNaN(a) || a > std::numeric_limits<CastA>::max() || a < std::numeric_limits<CastA>::lowest())
if (isNaN(a) || a >= std::numeric_limits<CastA>::max() || a <= std::numeric_limits<CastA>::lowest())
throw Exception("Cannot perform integer division on infinite or too large floating point numbers",
ErrorCodes::ILLEGAL_DIVISION);
if constexpr (std::is_floating_point_v<B>)
if (isNaN(b) || b > std::numeric_limits<CastB>::max() || b < std::numeric_limits<CastB>::lowest())
if (isNaN(b) || b >= std::numeric_limits<CastB>::max() || b <= std::numeric_limits<CastB>::lowest())
throw Exception("Cannot perform integer division on infinite or too large floating point numbers",
ErrorCodes::ILLEGAL_DIVISION);
return static_cast<Result>(checkedDivision(CastA(a), CastB(b)));
auto res = checkedDivision(CastA(a), CastB(b));
if constexpr (std::is_floating_point_v<decltype(res)>)
if (isNaN(res) || res >= std::numeric_limits<Result>::max() || res <= std::numeric_limits<Result>::lowest())
throw Exception("Cannot perform integer division, because it will produce infinite or too large number",
ErrorCodes::ILLEGAL_DIVISION);
return static_cast<Result>(res);
}
}

View File

@ -47,6 +47,29 @@ struct ArrayDifferenceImpl
}
/// Writes the adjacent differences of src[begin, end) into dst[begin, end):
/// dst[begin] is zero and dst[i] = src[i] - src[i - 1] for every following position.
/// Nothing is written when begin >= end.
///
/// Both operands are converted to Result before subtracting. Subtracting in Element type
/// wraps around for unsigned elements that do not promote to int (for example UInt32 3 - 5
/// would be stored into an Int64 result as 4294967294 instead of -2).
/// Overflow of the subtraction itself is still tolerated, hence NO_SANITIZE_UNDEFINED.
template <typename Element, typename Result>
static void NO_SANITIZE_UNDEFINED impl(const Element * __restrict src, Result * __restrict dst, size_t begin, size_t end)
{
    /// First element is zero, then the differences of ith and i-1th elements.
    Element prev{};
    for (size_t pos = begin; pos < end; ++pos)
    {
        if (pos == begin)
        {
            dst[pos] = 0;
            prev = src[pos];
        }
        else
        {
            Element curr = src[pos];
            /// Widen first, subtract second.
            dst[pos] = static_cast<Result>(curr) - static_cast<Result>(prev);
            prev = curr;
        }
    }
}
template <typename Element, typename Result>
static bool executeType(const ColumnPtr & mapped, const ColumnArray & array, ColumnPtr & res_ptr)
{
@ -73,14 +96,10 @@ struct ArrayDifferenceImpl
size_t pos = 0;
for (auto offset : offsets)
{
// skip empty arrays
if (pos < offset)
{
res_values[pos] = 0;
for (++pos; pos < offset; ++pos)
res_values[pos] = static_cast<Result>(data[pos]) - static_cast<Result>(data[pos - 1]);
}
impl(data.data(), res_values.data(), pos, offset);
pos = offset;
}
res_ptr = ColumnArray::create(std::move(res_nested), array.getOffsetsPtr());
return true;
}

View File

@ -29,6 +29,7 @@ public:
}
bool useDefaultImplementationForNulls() const override { return false; }
bool isSuitableForConstantFolding() const override { return false; }
/// We should never return LowCardinality result, cause we declare that result is always constant zero.
/// (in getResultIfAlwaysReturnsConstantAndHasArguments)

View File

@ -45,6 +45,7 @@ void registerFunctionTimeZone(FunctionFactory &);
void registerFunctionRunningAccumulate(FunctionFactory &);
void registerFunctionRunningDifference(FunctionFactory &);
void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &);
void registerFunctionRunningConcurrency(FunctionFactory &);
void registerFunctionFinalizeAggregation(FunctionFactory &);
void registerFunctionToLowCardinality(FunctionFactory &);
void registerFunctionLowCardinalityIndices(FunctionFactory &);
@ -112,6 +113,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionRunningAccumulate(factory);
registerFunctionRunningDifference(factory);
registerFunctionRunningDifferenceStartingWithFirstValue(factory);
registerFunctionRunningConcurrency(factory);
registerFunctionFinalizeAggregation(factory);
registerFunctionToLowCardinality(factory);
registerFunctionLowCardinalityIndices(factory);

View File

@ -0,0 +1,223 @@
#include <Columns/ColumnVector.h>
#include <Core/callOnTypeIndex.h>
#include <DataTypes/IDataType.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeNullable.h>
#include <Formats/FormatSettings.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunctionImpl.h>
#include <IO/WriteBufferFromString.h>
#include <common/defines.h>
#include <set>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int INCORRECT_DATA;
}
/// Executable part of runningConcurrency.
/// For every row (begin, end) it reports how many of the intervals seen so far, the current
/// one included, have an end strictly greater than the current begin.
template <typename Name, typename ArgDataType, typename ConcurrencyDataType>
class ExecutableFunctionRunningConcurrency : public IExecutableFunctionImpl
{
public:
    String getName() const override { return Name::name; }

    bool useDefaultImplementationForConstants() const override { return true; }

    /// arguments[0] holds interval begins, arguments[1] interval ends; both must be full
    /// (non-constant) columns of ArgDataType's column type.
    /// Throws ILLEGAL_COLUMN otherwise and INCORRECT_DATA when a row has begin > end.
    ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        using ArgColumn = typename ArgDataType::ColumnType;
        using ResultColumn = typename ConcurrencyDataType::ColumnType;

        const ArgColumn * begin_column = checkAndGetColumn<ArgColumn>(arguments[0].column.get());
        const ArgColumn * end_column = checkAndGetColumn<ArgColumn>(arguments[1].column.get());
        if (begin_column == nullptr || end_column == nullptr)
            throw Exception("Constant columns are not supported at the moment", ErrorCodes::ILLEGAL_COLUMN);

        const typename ArgColumn::Container & begins = begin_column->getData();
        const typename ArgColumn::Container & ends = end_column->getData();

        typename ResultColumn::MutablePtr result_column = ResultColumn::create(input_rows_count);
        typename ResultColumn::Container & result = result_column->getData();

        /// End points of the intervals that are still running.
        std::multiset<typename ArgDataType::FieldType> active_ends;

        for (size_t row = 0; row < input_rows_count; ++row)
        {
            const auto begin = begins[row];
            const auto end = ends[row];

            if (unlikely(begin > end))
            {
                /// Render both values in their textual form for the error message.
                const FormatSettings default_format;
                WriteBufferFromOwnString buf_begin, buf_end;
                arguments[0].type->serializeAsTextQuoted(*(arguments[0].column), row, buf_begin, default_format);
                arguments[1].type->serializeAsTextQuoted(*(arguments[1].column), row, buf_end, default_format);
                throw Exception(
                    "Incorrect order of events: " + buf_begin.str() + " > " + buf_end.str(),
                    ErrorCodes::INCORRECT_DATA);
            }

            active_ends.insert(end);

            /// "begin" is inclusive and "end" is exclusive: every interval whose end is
            /// less than or equal to the current begin has already finished, drop it.
            const auto first_still_running = active_ends.upper_bound(begin);
            active_ends.erase(active_ends.begin(), first_still_running);

            result[row] = active_ends.size();
        }

        return result_column;
    }
};
/// Function-base part of runningConcurrency: remembers the resolved argument and result
/// types and creates the executable implementation for them.
template <typename Name, typename ArgDataType, typename ConcurrencyDataType>
class FunctionBaseRunningConcurrency : public IFunctionBaseImpl
{
public:
    explicit FunctionBaseRunningConcurrency(DataTypes argument_types_, DataTypePtr return_type_)
        : argument_types(std::move(argument_types_))
        , return_type(std::move(return_type_))
    {
    }

    String getName() const override { return Name::name; }

    const DataTypes & getArgumentTypes() const override { return argument_types; }

    const DataTypePtr & getResultType() const override { return return_type; }

    /// The sample columns are not needed to build the executable function.
    ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override
    {
        using Executable = ExecutableFunctionRunningConcurrency<Name, ArgDataType, ConcurrencyDataType>;
        return std::make_unique<Executable>();
    }

    /// Declared stateful.
    bool isStateful() const override { return true; }

private:
    DataTypes argument_types;
    DataTypePtr return_type;
};
template <typename Name, typename ConcurrencyDataType>
class RunningConcurrencyOverloadResolver : public IFunctionOverloadResolverImpl
{
template <typename T>
struct TypeTag
{
using Type = T;
};
/// Call a polymorphic lambda with a type tag of src_type.
template <typename F>
void dispatchForSourceType(const IDataType & src_type, F && f) const
{
WhichDataType which(src_type);
switch (which.idx)
{
case TypeIndex::Date: f(TypeTag<DataTypeDate>()); break;
case TypeIndex::DateTime: f(TypeTag<DataTypeDateTime>()); break;
case TypeIndex::DateTime64: f(TypeTag<DataTypeDateTime64>()); break;
default:
throw Exception(
"Arguments for function " + getName() + " must be Date, DateTime, or DateTime64.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
}
public:
static constexpr auto name = Name::name;
static FunctionOverloadResolverImplPtr create(const Context &)
{
return std::make_unique<RunningConcurrencyOverloadResolver<Name, ConcurrencyDataType>>();
}
String getName() const override
{
return Name::name;
}
FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
{
// The type of the second argument must match with that of the first one.
if (unlikely(!arguments[1].type->equals(*(arguments[0].type))))
{
throw Exception(
"Function " + getName() + " must be called with two arguments having the same type.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
DataTypes argument_types = { arguments[0].type, arguments[1].type };
FunctionBaseImplPtr base;
dispatchForSourceType(*(arguments[0].type), [&](auto arg_type_tag) // Throws when the type is inappropriate.
{
using Tag = decltype(arg_type_tag);
using ArgDataType = typename Tag::Type;
base = std::make_unique<FunctionBaseRunningConcurrency<Name, ArgDataType, ConcurrencyDataType>>(argument_types, return_type);
});
return base;
}
DataTypePtr getReturnType(const DataTypes &) const override
{
return std::make_shared<ConcurrencyDataType>();
}
size_t getNumberOfArguments() const override
{
return 2;
}
bool isInjective(const ColumnsWithTypeAndName &) const override
{
return false;
}
bool isStateful() const override
{
return true;
}
bool useDefaultImplementationForNulls() const override
{
return false;
}
};
/// Name tag supplied as the Name template argument of the runningConcurrency classes,
/// which return Name::name from getName().
struct NameRunningConcurrency
{
static constexpr auto name = "runningConcurrency";
};
/// Registers runningConcurrency in the given factory.
/// The result data type of the registered variant is DataTypeUInt32.
void registerFunctionRunningConcurrency(FunctionFactory & factory)
{
factory.registerFunction<RunningConcurrencyOverloadResolver<NameRunningConcurrency, DataTypeUInt32>>();
}
}

View File

@ -423,6 +423,7 @@ SRCS(
rowNumberInAllBlocks.cpp
rowNumberInBlock.cpp
runningAccumulate.cpp
runningConcurrency.cpp
runningDifference.cpp
runningDifferenceStartingWithFirstValue.cpp
sigmoid.cpp

View File

@ -35,10 +35,10 @@ struct Memory : boost::noncopyable, Allocator
char * m_data = nullptr;
size_t alignment = 0;
Memory() {}
Memory() = default;
/// If alignment != 0, then allocate memory aligned to specified value.
Memory(size_t size_, size_t alignment_ = 0) : m_capacity(size_), m_size(m_capacity), alignment(alignment_)
explicit Memory(size_t size_, size_t alignment_ = 0) : m_capacity(size_), m_size(m_capacity), alignment(alignment_)
{
alloc();
}
@ -140,7 +140,7 @@ protected:
Memory<> memory;
public:
/// If non-nullptr 'existing_memory' is passed, then buffer will not create its own memory and will use existing_memory without ownership.
BufferWithOwnMemory(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
explicit BufferWithOwnMemory(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0)
: Base(nullptr, 0), memory(existing_memory ? 0 : size, alignment)
{
Base::set(existing_memory ? existing_memory : memory.data(), size);

View File

@ -1104,9 +1104,9 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current)
assert(current >= in.position());
assert(current <= in.buffer().end());
const int old_bytes = memory.size();
const int additional_bytes = current - in.position();
const int new_bytes = old_bytes + additional_bytes;
const size_t old_bytes = memory.size();
const size_t additional_bytes = current - in.position();
const size_t new_bytes = old_bytes + additional_bytes;
/// There are no new bytes to add to memory.
/// No need to do extra stuff.
if (new_bytes == 0)

View File

@ -13,6 +13,7 @@
# include <aws/core/platform/Environment.h>
# include <aws/core/utils/logging/LogMacros.h>
# include <aws/core/utils/logging/LogSystemInterface.h>
# include <aws/core/utils/HashingUtils.h>
# include <aws/s3/S3Client.h>
# include <aws/core/http/HttpClientFactory.h>
# include <IO/S3/PocoHTTPClientFactory.h>
@ -273,56 +274,12 @@ namespace S3
return ret;
}
/// This method is not static because it requires ClientFactory to be initialized.
std::shared_ptr<Aws::S3::S3Client> ClientFactory::create( // NOLINT
const String & endpoint,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
bool use_environment_credentials,
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects)
{
PocoHTTPClientConfiguration client_configuration(remote_host_filter, s3_max_redirects);
if (!endpoint.empty())
client_configuration.endpointOverride = endpoint;
return create(client_configuration,
is_virtual_hosted_style,
access_key_id,
secret_access_key,
use_environment_credentials);
}
std::shared_ptr<Aws::S3::S3Client> ClientFactory::create( // NOLINT
const PocoHTTPClientConfiguration & cfg_,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
bool use_environment_credentials)
{
Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key);
PocoHTTPClientConfiguration client_configuration = cfg_;
client_configuration.updateSchemeAndRegion();
return std::make_shared<Aws::S3::S3Client>(
std::make_shared<S3CredentialsProviderChain>(
client_configuration,
credentials,
use_environment_credentials), // AWS credentials provider.
std::move(client_configuration), // Client configuration.
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, // Sign policy.
is_virtual_hosted_style || client_configuration.endpointOverride.empty() // Use virtual addressing if endpoint is not specified.
);
}
std::shared_ptr<Aws::S3::S3Client> ClientFactory::create( // NOLINT
const PocoHTTPClientConfiguration & cfg_,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const String & server_side_encryption_customer_key_base64,
HeaderCollection headers,
bool use_environment_credentials)
{
@ -331,7 +288,28 @@ namespace S3
Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key);
auto auth_signer = std::make_shared<S3AuthSigner>(client_configuration, std::move(credentials), std::move(headers), use_environment_credentials);
if (!server_side_encryption_customer_key_base64.empty())
{
/// See S3Client::GeneratePresignedUrlWithSSEC().
headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM,
Aws::S3::Model::ServerSideEncryptionMapper::GetNameForServerSideEncryption(Aws::S3::Model::ServerSideEncryption::AES256)});
headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY,
server_side_encryption_customer_key_base64});
Aws::Utils::ByteBuffer buffer = Aws::Utils::HashingUtils::Base64Decode(server_side_encryption_customer_key_base64);
String str_buffer(reinterpret_cast<char *>(buffer.GetUnderlyingData()), buffer.GetLength());
headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5,
Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateMD5(str_buffer))});
}
auto auth_signer = std::make_shared<S3AuthSigner>(
client_configuration,
std::move(credentials),
std::move(headers),
use_environment_credentials);
return std::make_shared<Aws::S3::S3Client>(
std::move(auth_signer),
std::move(client_configuration), // Client configuration.

View File

@ -31,27 +31,12 @@ public:
static ClientFactory & instance();
std::shared_ptr<Aws::S3::S3Client> create(
const String & endpoint,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
bool use_environment_credentials,
const RemoteHostFilter & remote_host_filter,
unsigned int s3_max_redirects);
std::shared_ptr<Aws::S3::S3Client> create(
const PocoHTTPClientConfiguration & cfg,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
bool use_environment_credentials);
std::shared_ptr<Aws::S3::S3Client> create(
const PocoHTTPClientConfiguration & cfg,
bool is_virtual_hosted_style,
const String & access_key_id,
const String & secret_access_key,
const String & server_side_encryption_customer_key_base64,
HeaderCollection headers,
bool use_environment_credentials);

View File

@ -95,8 +95,15 @@ public:
++pos;
}
virtual void sync() {}
virtual void finalize() {}
virtual void sync()
{
next();
}
virtual void finalize()
{
next();
}
private:
/** Write the data in the buffer (from the beginning of the buffer to the current position).

View File

@ -188,14 +188,14 @@ void WriteBufferFromHTTPServerResponse::onProgress(const Progress & progress)
void WriteBufferFromHTTPServerResponse::finalize()
{
if (offset())
next();
if (out)
{
next();
if (out)
out.reset();
out->next();
out.reset();
}
else
if (!offset())
{
/// If no remaining data, just send headers.
std::lock_guard lock(mutex);

Some files were not shown because too many files have changed in this diff Show More