Merge branch 'master' of https://github.com/ClickHouse/ClickHouse into hdfs-idisk

This commit is contained in:
kssenii 2021-04-19 08:49:58 +00:00
commit ce16f4bb3f
116 changed files with 3130 additions and 848 deletions

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit d2feb5978b979729a07c3ca76eaa4ab94cef4ceb
Subproject commit 377f8e77491d9f66ce8e32e88aae19dffe8dc4d7

@ -1 +1 @@
Subproject commit f915d35b2de676683493c86c585141a1e1c83334
Subproject commit 45885c0c8c0807bb9480886d60ca7042000a4c43

2
contrib/poco vendored

@ -1 +1 @@
Subproject commit 83beecccb09eec0c9fd2669cacea03ede1d9f138
Subproject commit b7d9ec16ee33ca76643d5fcd907ea9a33285640a

2
contrib/zlib-ng vendored

@ -1 +1 @@
Subproject commit 4039bb4623905e73c6e32a0c022f144bab87b2b3
Subproject commit 7f254522fd676ff4e906c6d4e9b30d4df4214c2d

View File

@ -312,6 +312,8 @@ function run_tests
01533_collate_in_nullable
01542_collate_in_array
01543_collate_in_tuple
01798_uniq_theta_sketch
01799_long_uniq_theta_sketch
_orc_
arrow
avro

View File

@ -1,6 +1,7 @@
<yandex>
<http_port remove="remove"/>
<mysql_port remove="remove"/>
<postgresql_port remove="remove"/>
<interserver_http_port remove="remove"/>
<tcp_with_proxy_port remove="remove"/>
<keeper_server remove="remove"/>

View File

@ -38,3 +38,4 @@ We recommend using this function in almost all scenarios.
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12)
- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
- [uniqThetaSketch](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch)
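For context, a minimal usage sketch of `uniqCombined` (the table and column names below are hypothetical):

``` sql
-- Approximate number of distinct users per day (table and column names are illustrative).
SELECT
    toDate(event_time) AS day,
    uniqCombined(user_id) AS users
FROM visits
GROUP BY day;
```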

View File

@ -49,3 +49,4 @@ Compared to the [uniq](../../../sql-reference/aggregate-functions/reference/uniq
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12)
- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
- [uniqThetaSketch](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch)

View File

@ -23,3 +23,4 @@ The function takes a variable number of parameters. Parameters can be `Tuple`, `
- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqcombined)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniqhll12)
- [uniqThetaSketch](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch)

View File

@ -37,3 +37,4 @@ We don't recommend using this function. In most cases, use the [uniq](../../..
- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)
- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)
- [uniqThetaSketch](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch)

View File

@ -0,0 +1,39 @@
---
toc_priority: 195
---
# uniqThetaSketch {#agg_function-uniqthetasketch}
Calculates the approximate number of different argument values, using the [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html).
``` sql
uniqThetaSketch(x[, ...])
```
**Arguments**
The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types.
**Returned value**
- A [UInt64](../../../sql-reference/data-types/int-uint.md)-type number.
**Implementation details**
Function:
- Calculates a hash for all parameters in the aggregate, then uses it in calculations.
- Uses the [KMV](https://datasketches.apache.org/docs/Theta/InverseEstimate.html) algorithm to approximate the number of different argument values.
A sketch of 4096 (2^12) 64-bit entries is used. The size of the state is about 41 KB.
- The relative error is 3.125% (95% confidence); see the [relative error table](https://datasketches.apache.org/docs/Theta/ThetaErrorTable.html) for details.
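For illustration, here is a minimal query sketch comparing the exact count with the Theta sketch estimate (the table and column names are hypothetical):

``` sql
-- Compare the exact distinct count with the approximate Theta sketch estimate
-- (table and column names are illustrative).
SELECT
    uniqExact(user_id) AS exact_count,
    uniqThetaSketch(user_id) AS approx_count
FROM visits;
```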
**See Also**
- [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq)
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md#agg_function-uniqcombined)
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md#agg_function-uniqhll12)
- [uniqExact](../../../sql-reference/aggregate-functions/reference/uniqexact.md#agg_function-uniqexact)

View File

@ -1213,62 +1213,6 @@ SELECT arrayFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5, 6, 14,
Note that `arrayFill` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can't be omitted.
## arrayFold(func, arr1, …, init) {#array-fold}
Returns the result of [folding](https://en.wikipedia.org/wiki/Fold_(higher-order_function)) arrays and the value `init` using the function `func`.
That is, the result of calculating `func(arr1[n], …, func(arr1[n - 1], …, func(…, func(arr1[2], …, func(arr1[1], …, init)))))`.
Note that `arrayFold` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can't be omitted.
**Arguments**
- `func` — The lambda function with `n+1` arguments (where `n` is the number of input arrays); the first `n` arguments are the current elements of the input arrays, and the last argument is the current value of the accumulator.
- `arr` — Any number of [arrays](../../sql-reference/data-types/array.md).
- `init` — Initial value of the accumulator.
**Returned value**
Final value of accumulator.
**Examples**
The following example shows how to compute the product and the sum of the elements of an array:
``` sql
SELECT arrayFold((x, accum) -> (accum.1 * x, accum.2 + x), [1, 2, 3, 4, 5], (1, 0)) as res;
```
``` text
┌─res───────┐
│ (120, 15) │
└───────────┘
```
The following example shows how to reverse the elements of an array:
``` sql
SELECT arrayFold((x, acc) -> arrayPushFront(acc, x), [1,2,3,4,5], emptyArrayUInt64()) as res;
```
``` text
┌─res─────────┐
│ [5,4,3,2,1] │
└─────────────┘
```
Folding can be used to access elements that have already been processed during the calculation, for example:
``` sql
SELECT arrayFold((x, acc) -> (x, concat(acc.2, toString(acc.1), ',')), [1,2], (0,'')) as res
```
``` text
┌─res────────┐
│ (2,'0,1,') │
└────────────┘
```
## arrayReverseFill(func, arr1, …) {#array-reverse-fill}
Scan through `arr1` from the last element to the first element and replace `arr1[i]` by `arr1[i + 1]` if `func` returns 0. The last element of `arr1` will not be replaced.
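For instance, a sketch of the behaviour described above (per the stated semantics, `NULL` elements are backfilled with the nearest non-`NULL` element to their right):

``` sql
SELECT arrayReverseFill(x -> not isNull(x), [1, null, null, 4]) AS res;
-- Expected result based on the description: [1, 4, 4, 4]
```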

View File

@ -91,7 +91,7 @@ Hierarchy of privileges:
- `ALTER ADD CONSTRAINT`
- `ALTER DROP CONSTRAINT`
- `ALTER TTL`
- `ALTER MATERIALIZE TTL`
- `ALTER MATERIALIZE TTL`
- `ALTER SETTINGS`
- `ALTER MOVE PARTITION`
- `ALTER FETCH PARTITION`
@ -102,9 +102,9 @@ Hierarchy of privileges:
- [CREATE](#grant-create)
- `CREATE DATABASE`
- `CREATE TABLE`
- `CREATE TEMPORARY TABLE`
- `CREATE VIEW`
- `CREATE DICTIONARY`
- `CREATE TEMPORARY TABLE`
- [DROP](#grant-drop)
- `DROP DATABASE`
- `DROP TABLE`
@ -150,7 +150,7 @@ Hierarchy of privileges:
- `SYSTEM RELOAD`
- `SYSTEM RELOAD CONFIG`
- `SYSTEM RELOAD DICTIONARY`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM MERGES`
- `SYSTEM TTL MERGES`
- `SYSTEM FETCHES`
@ -276,7 +276,7 @@ Allows executing [ALTER](../../sql-reference/statements/alter/index.md) queries
- `ALTER ADD CONSTRAINT`. Level: `TABLE`. Aliases: `ADD CONSTRAINT`
- `ALTER DROP CONSTRAINT`. Level: `TABLE`. Aliases: `DROP CONSTRAINT`
- `ALTER TTL`. Level: `TABLE`. Aliases: `ALTER MODIFY TTL`, `MODIFY TTL`
- `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL`
- `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL`
- `ALTER SETTINGS`. Level: `TABLE`. Aliases: `ALTER SETTING`, `ALTER MODIFY SETTING`, `MODIFY SETTING`
- `ALTER MOVE PARTITION`. Level: `TABLE`. Aliases: `ALTER MOVE PART`, `MOVE PARTITION`, `MOVE PART`
- `ALTER FETCH PARTITION`. Level: `TABLE`. Aliases: `ALTER FETCH PART`, `FETCH PARTITION`, `FETCH PART`
@ -304,9 +304,9 @@ Allows executing [CREATE](../../sql-reference/statements/create/index.md) and [A
- `CREATE`. Level: `GROUP`
- `CREATE DATABASE`. Level: `DATABASE`
- `CREATE TABLE`. Level: `TABLE`
- `CREATE TEMPORARY TABLE`. Level: `GLOBAL`
- `CREATE VIEW`. Level: `VIEW`
- `CREATE DICTIONARY`. Level: `DICTIONARY`
- `CREATE TEMPORARY TABLE`. Level: `GLOBAL`
**Notes**
@ -401,7 +401,7 @@ Allows a user to execute [SYSTEM](../../sql-reference/statements/system.md) quer
- `SYSTEM RELOAD`. Level: `GROUP`
- `SYSTEM RELOAD CONFIG`. Level: `GLOBAL`. Aliases: `RELOAD CONFIG`
- `SYSTEM RELOAD DICTIONARY`. Level: `GLOBAL`. Aliases: `SYSTEM RELOAD DICTIONARIES`, `RELOAD DICTIONARY`, `RELOAD DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: R`ELOAD EMBEDDED DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: `RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP MERGES`, `SYSTEM START MERGES`, `STOP MERGES`, `START MERGES`
- `SYSTEM TTL MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP TTL MERGES`, `SYSTEM START TTL MERGES`, `STOP TTL MERGES`, `START TTL MERGES`
- `SYSTEM FETCHES`. Level: `TABLE`. Aliases: `SYSTEM STOP FETCHES`, `SYSTEM START FETCHES`, `STOP FETCHES`, `START FETCHES`
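As a hypothetical illustration of granting one of the privileges listed above (assuming a user `john` already exists):

``` sql
-- SYSTEM RELOAD EMBEDDED DICTIONARIES is a global-level privilege, so it is granted on *.*
GRANT SYSTEM RELOAD EMBEDDED DICTIONARIES ON *.* TO john;
```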

View File

@ -121,6 +121,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe
- `--user, -u` — the username; defaults to `default`.
- `--password` — the password; defaults to an empty string.
- `--query, -q` — the query to execute, when running in non-interactive mode.
- `--queries-file, -qf` — path to a file with queries to execute. Only one of the `query` and `queries-file` options may be specified.
- `--database, -d` — select the current database. If not specified, the value is taken from the server settings (the `default` database by default).
- `--multiline, -m` — if specified, allow multiline queries (do not send the query on Enter).
- `--multiquery, -n` — if specified, allow running multiple queries separated by semicolons.
@ -130,6 +131,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe
- `--stacktrace` — if specified, also print the stack trace when an exception occurs.
- `--config-file` — the name of the configuration file.
- `--secure` — if specified, a secure channel is used.
- `--history_file` — path to the command history file.
- `--param_<name>` — the value of a parameter for a [query with parameters](#cli-queries-with-parameters).
Starting from version 20.5, `clickhouse-client` has automatic syntax highlighting (always enabled).

View File

@ -1147,62 +1147,6 @@ SELECT arrayReverseFill(x -> not isNull(x), [1, null, 3, 11, 12, null, null, 5,
The `arrayReverseFill` function is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions) — you must pass it a lambda function as the first argument, and this argument can't be omitted.
## arrayFold(func, arr1, …, init) {#array-fold}
Returns the result of [folding](https://ru.wikipedia.org/wiki/%D0%A1%D0%B2%D1%91%D1%80%D1%82%D0%BA%D0%B0_%D1%81%D0%BF%D0%B8%D1%81%D0%BA%D0%B0) arrays and the initial value `init` using the function `func`.
That is, the result of calculating `func(arr1[n], …, func(arr1[n - 1], …, func(…, func(arr1[2], …, func(arr1[1], …, init)))))`.
The `arrayFold` function is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions) — you must pass it a lambda function as the first argument, and this argument can't be omitted.
**Arguments**
- `func` — A lambda function with `n+1` parameters (where `n` is the number of input arrays); the first `n` parameters are the current elements of the input arrays, and the last one is the current value of the accumulator.
- `arr` — Any number of [arrays](../../sql-reference/data-types/array.md).
- `init` — Initial value of the accumulator.
**Returned value**
The final value of the accumulator.
**Examples**
The following example shows how to compute the product and the sum of the elements of an array:
``` sql
SELECT arrayFold((x, accum) -> (accum.1 * x, accum.2 + x), [1, 2, 3, 4, 5], (1, 0)) as res;
```
``` text
┌─res───────┐
│ (120, 15) │
└───────────┘
```
The following example shows how to reverse an array:
``` sql
SELECT arrayFold((x, acc) -> arrayPushFront(acc, x), [1,2,3,4,5], emptyArrayUInt64()) as res;
```
``` text
┌─res─────────┐
│ [5,4,3,2,1] │
└─────────────┘
```
Folding can be used to access elements that have already been processed during the calculation. For example:
``` sql
SELECT arrayFold((x, acc) -> (x, concat(acc.2, toString(acc.1), ',')), [1,2], (0,'')) as res
```
``` text
┌─res────────┐
│ (2,'0,1,') │
└────────────┘
```
## arraySplit(func, arr1, …) {#array-split}
Splits the array `arr1` into several arrays. If `func` returns something other than 0, the array is split and the element is placed into the left-hand part. The array is not split before the first element.
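For example (a sketch based on the description above; the array is split before each position where the second argument of the lambda is non-zero, except the first one):

``` sql
SELECT arraySplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res;
-- Expected result: [[1,2,3],[4,5]]
```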
@ -1239,7 +1183,6 @@ SELECT arrayReverseSplit((x, y) -> y, [1, 2, 3, 4, 5], [1, 0, 0, 1, 0]) AS res
The `arrayReverseSplit` function is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions) — you must pass it a lambda function as the first argument, and this argument can't be omitted.
## arrayExists(\[func,\] arr1, …) {#arrayexistsfunc-arr1}
Returns 1 if the array `arr` contains at least one element for which `func` returns something other than 0. Otherwise, it returns 0.
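A minimal sketch:

``` sql
SELECT arrayExists(x -> x > 2, [1, 2, 3]) AS res;
-- Expected result: 1, because the element 3 satisfies the condition
```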

View File

@ -93,7 +93,7 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
- `ALTER ADD CONSTRAINT`
- `ALTER DROP CONSTRAINT`
- `ALTER TTL`
- `ALTER MATERIALIZE TTL`
- `ALTER MATERIALIZE TTL`
- `ALTER SETTINGS`
- `ALTER MOVE PARTITION`
- `ALTER FETCH PARTITION`
@ -104,9 +104,9 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
- [CREATE](#grant-create)
- `CREATE DATABASE`
- `CREATE TABLE`
- `CREATE TEMPORARY TABLE`
- `CREATE VIEW`
- `CREATE DICTIONARY`
- `CREATE TEMPORARY TABLE`
- [DROP](#grant-drop)
- `DROP DATABASE`
- `DROP TABLE`
@ -152,7 +152,7 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
- `SYSTEM RELOAD`
- `SYSTEM RELOAD CONFIG`
- `SYSTEM RELOAD DICTIONARY`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM MERGES`
- `SYSTEM TTL MERGES`
- `SYSTEM FETCHES`
@ -279,7 +279,7 @@ GRANT INSERT(x,y) ON db.table TO john
- `ALTER ADD CONSTRAINT`. Level: `TABLE`. Aliases: `ADD CONSTRAINT`
- `ALTER DROP CONSTRAINT`. Level: `TABLE`. Aliases: `DROP CONSTRAINT`
- `ALTER TTL`. Level: `TABLE`. Aliases: `ALTER MODIFY TTL`, `MODIFY TTL`
- `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL`
- `ALTER MATERIALIZE TTL`. Level: `TABLE`. Aliases: `MATERIALIZE TTL`
- `ALTER SETTINGS`. Level: `TABLE`. Aliases: `ALTER SETTING`, `ALTER MODIFY SETTING`, `MODIFY SETTING`
- `ALTER MOVE PARTITION`. Level: `TABLE`. Aliases: `ALTER MOVE PART`, `MOVE PARTITION`, `MOVE PART`
- `ALTER FETCH PARTITION`. Level: `TABLE`. Aliases: `FETCH PARTITION`
@ -307,9 +307,9 @@ GRANT INSERT(x,y) ON db.table TO john
- `CREATE`. Level: `GROUP`
- `CREATE DATABASE`. Level: `DATABASE`
- `CREATE TABLE`. Level: `TABLE`
- `CREATE TEMPORARY TABLE`. Level: `GLOBAL`
- `CREATE VIEW`. Level: `VIEW`
- `CREATE DICTIONARY`. Level: `DICTIONARY`
- `CREATE TEMPORARY TABLE`. Level: `GLOBAL`
**Notes**
@ -407,7 +407,7 @@ GRANT INSERT(x,y) ON db.table TO john
- `SYSTEM RELOAD`. Level: `GROUP`
- `SYSTEM RELOAD CONFIG`. Level: `GLOBAL`. Aliases: `RELOAD CONFIG`
- `SYSTEM RELOAD DICTIONARY`. Level: `GLOBAL`. Aliases: `SYSTEM RELOAD DICTIONARIES`, `RELOAD DICTIONARY`, `RELOAD DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: `RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM RELOAD EMBEDDED DICTIONARIES`. Level: `GLOBAL`. Aliases: `RELOAD EMBEDDED DICTIONARIES`
- `SYSTEM MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP MERGES`, `SYSTEM START MERGES`, `STOP MERGES`, `START MERGES`
- `SYSTEM TTL MERGES`. Level: `TABLE`. Aliases: `SYSTEM STOP TTL MERGES`, `SYSTEM START TTL MERGES`, `STOP TTL MERGES`, `START TTL MERGES`
- `SYSTEM FETCHES`. Level: `TABLE`. Aliases: `SYSTEM STOP FETCHES`, `SYSTEM START FETCHES`, `STOP FETCHES`, `START FETCHES`

View File

@ -7,6 +7,8 @@
#include <IO/ConnectionTimeouts.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/Exception.h>
#include <Common/isLocalAddress.h>
#include <Common/DNSResolver.h>
#include <common/setTerminalEcho.h>
#include <ext/scope_guard.h>
@ -60,7 +62,9 @@ ConnectionParameters::ConnectionParameters(const Poco::Util::AbstractConfigurati
#endif
}
compression = config.getBool("compression", true) ? Protocol::Compression::Enable : Protocol::Compression::Disable;
/// By default compression is disabled if address looks like localhost.
compression = config.getBool("compression", !isLocalAddress(DNSResolver::instance().resolveHost(host)))
? Protocol::Compression::Enable : Protocol::Compression::Disable;
timeouts = ConnectionTimeouts(
Poco::Timespan(config.getInt("connect_timeout", DBMS_DEFAULT_CONNECT_TIMEOUT_SEC), 0),

View File

@ -54,9 +54,10 @@ void ODBCHandler::processError(HTTPServerResponse & response, const std::string
void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response)
{
HTMLForm params(request);
LOG_TRACE(log, "Request URI: {}", request.getURI());
if (mode == "read")
params.read(request.getStream());
LOG_TRACE(log, "Request URI: {}", request.getURI());
if (mode == "read" && !params.has("query"))
{
@ -64,11 +65,6 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
return;
}
if (!params.has("columns"))
{
processError(response, "No 'columns' in request URL");
return;
}
if (!params.has("connection_string"))
{
@ -76,6 +72,16 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
return;
}
if (!params.has("sample_block"))
{
processError(response, "No 'sample_block' in request URL");
return;
}
std::string format = params.get("format", "RowBinary");
std::string connection_string = params.get("connection_string");
LOG_TRACE(log, "Connection string: '{}'", connection_string);
UInt64 max_block_size = DEFAULT_BLOCK_SIZE;
if (params.has("max_block_size"))
{
@ -88,24 +94,19 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
max_block_size = parse<size_t>(max_block_size_str);
}
std::string columns = params.get("columns");
std::string sample_block_string = params.get("sample_block");
std::unique_ptr<Block> sample_block;
try
{
sample_block = parseColumns(std::move(columns));
sample_block = parseColumns(std::move(sample_block_string));
}
catch (const Exception & ex)
{
processError(response, "Invalid 'columns' parameter in request body '" + ex.message() + "'");
LOG_WARNING(log, ex.getStackTraceString());
processError(response, "Invalid 'sample_block' parameter in request body '" + ex.message() + "'");
LOG_ERROR(log, ex.getStackTraceString());
return;
}
std::string format = params.get("format", "RowBinary");
std::string connection_string = params.get("connection_string");
LOG_TRACE(log, "Connection string: '{}'", connection_string);
WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
try

View File

@ -19,7 +19,18 @@ namespace ErrorCodes
std::string getIdentifierQuote(nanodbc::connection & connection)
{
return connection.get_info<std::string>(SQL_IDENTIFIER_QUOTE_CHAR);
std::string quote;
try
{
quote = connection.get_info<std::string>(SQL_IDENTIFIER_QUOTE_CHAR);
}
catch (...)
{
LOG_WARNING(&Poco::Logger::get("ODBCGetIdentifierQuote"), "Cannot fetch identifier quote. Default double quote is used. Reason: {}", getCurrentExceptionMessage(false));
return "\"";
}
return quote;
}

View File

@ -130,6 +130,7 @@ enum class AccessType
M(SYSTEM_RELOAD_CONFIG, "RELOAD CONFIG", GLOBAL, SYSTEM_RELOAD) \
M(SYSTEM_RELOAD_SYMBOLS, "RELOAD SYMBOLS", GLOBAL, SYSTEM_RELOAD) \
M(SYSTEM_RELOAD_DICTIONARY, "SYSTEM RELOAD DICTIONARIES, RELOAD DICTIONARY, RELOAD DICTIONARIES", GLOBAL, SYSTEM_RELOAD) \
M(SYSTEM_RELOAD_MODEL, "SYSTEM RELOAD MODELS, RELOAD MODEL, RELOAD MODELS", GLOBAL, SYSTEM_RELOAD) \
M(SYSTEM_RELOAD_EMBEDDED_DICTIONARIES, "RELOAD EMBEDDED DICTIONARIES", GLOBAL, SYSTEM_RELOAD) /* implicitly enabled by the grant SYSTEM_RELOAD_DICTIONARY ON *.* */\
M(SYSTEM_RELOAD, "", GROUP, SYSTEM) \
M(SYSTEM_MERGES, "SYSTEM STOP MERGES, SYSTEM START MERGES, STOP_MERGES, START MERGES", TABLE, SYSTEM) \

View File

@ -132,6 +132,12 @@ void registerAggregateFunctionsUniq(AggregateFunctionFactory & factory)
factory.registerFunction("uniqExact",
{createAggregateFunctionUniq<true, AggregateFunctionUniqExactData, AggregateFunctionUniqExactData<String>>, properties});
#if USE_DATASKETCHES
factory.registerFunction("uniqThetaSketch",
{createAggregateFunctionUniq<AggregateFunctionUniqThetaSketchData, AggregateFunctionUniqThetaSketchData>, properties});
#endif
}
}

View File

@ -22,6 +22,7 @@
#include <AggregateFunctions/UniquesHashSet.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/ThetaSketchData.h>
#include <AggregateFunctions/UniqVariadicHash.h>
@ -124,6 +125,19 @@ struct AggregateFunctionUniqExactData<String>
};
/// uniqThetaSketch
#if USE_DATASKETCHES
struct AggregateFunctionUniqThetaSketchData
{
using Set = ThetaSketchData<UInt64>;
Set set;
static String getName() { return "uniqThetaSketch"; }
};
#endif
namespace detail
{
@ -189,6 +203,12 @@ struct OneAdder
data.set.insert(key);
}
}
#if USE_DATASKETCHES
else if constexpr (std::is_same_v<Data, AggregateFunctionUniqThetaSketchData>)
{
data.set.insertOriginal(column.getDataAt(row_num));
}
#endif
}
};

View File

@ -0,0 +1,119 @@
#pragma once
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
#if USE_DATASKETCHES
#include <boost/noncopyable.hpp>
#include <memory>
#include <theta_sketch.hpp> // Y_IGNORE
#include <theta_union.hpp> // Y_IGNORE
namespace DB
{
template <typename Key>
class ThetaSketchData : private boost::noncopyable
{
private:
std::unique_ptr<datasketches::update_theta_sketch> sk_update;
std::unique_ptr<datasketches::theta_union> sk_union;
inline datasketches::update_theta_sketch * getSkUpdate()
{
if (!sk_update)
sk_update = std::make_unique<datasketches::update_theta_sketch>(datasketches::update_theta_sketch::builder().build());
return sk_update.get();
}
inline datasketches::theta_union * getSkUnion()
{
if (!sk_union)
sk_union = std::make_unique<datasketches::theta_union>(datasketches::theta_union::builder().build());
return sk_union.get();
}
public:
using value_type = Key;
ThetaSketchData() = default;
~ThetaSketchData() = default;
/// Insert the original value without hashing, since `datasketches::update_theta_sketch.update` does the hashing internally.
void insertOriginal(const StringRef & value)
{
getSkUpdate()->update(value.data, value.size);
}
/// Note that `datasketches::update_theta_sketch.update` will do the hash again.
void insert(Key value)
{
getSkUpdate()->update(value);
}
UInt64 size() const
{
if (sk_union)
return static_cast<UInt64>(sk_union->get_result().get_estimate());
else if (sk_update)
return static_cast<UInt64>(sk_update->get_estimate());
else
return 0;
}
void merge(const ThetaSketchData & rhs)
{
datasketches::theta_union * u = getSkUnion();
if (sk_update)
{
u->update(*sk_update);
sk_update.reset(nullptr);
}
if (rhs.sk_update)
u->update(*rhs.sk_update);
else if (rhs.sk_union)
u->update(rhs.sk_union->get_result());
}
/// Can only be called on an empty object.
void read(DB::ReadBuffer & in)
{
datasketches::compact_theta_sketch::vector_bytes bytes;
readVectorBinary(bytes, in);
if (!bytes.empty())
{
auto sk = datasketches::compact_theta_sketch::deserialize(bytes.data(), bytes.size());
getSkUnion()->update(sk);
}
}
void write(DB::WriteBuffer & out) const
{
if (sk_update)
{
auto bytes = sk_update->compact().serialize();
writeVectorBinary(bytes, out);
}
else if (sk_union)
{
auto bytes = sk_union->get_result().serialize();
writeVectorBinary(bytes, out);
}
else
{
datasketches::compact_theta_sketch::vector_bytes bytes;
writeVectorBinary(bytes, out);
}
}
};
}
#endif

View File

@ -37,7 +37,7 @@ class IXDBCBridgeHelper : public IBridgeHelper
public:
explicit IXDBCBridgeHelper(ContextPtr context_) : IBridgeHelper(context_) {}
virtual std::vector<std::pair<std::string, std::string>> getURLParams(const std::string & cols, UInt64 max_block_size) const = 0;
virtual std::vector<std::pair<std::string, std::string>> getURLParams(UInt64 max_block_size) const = 0;
virtual Poco::URI getColumnsInfoURI() const = 0;
@ -138,12 +138,11 @@ protected:
return uri;
}
URLParams getURLParams(const std::string & cols, UInt64 max_block_size) const override
URLParams getURLParams(UInt64 max_block_size) const override
{
std::vector<std::pair<std::string, std::string>> result;
result.emplace_back("connection_string", connection_string); /// already validated
result.emplace_back("columns", cols);
result.emplace_back("max_block_size", std::to_string(max_block_size));
return result;

View File

@ -139,6 +139,8 @@ public:
UInt16 getPort() const;
const String & getDefaultDatabase() const;
Protocol::Compression getCompression() const { return compression; }
/// If the last flag is true, you need to call sendExternalTablesData afterwards.
void sendQuery(
const ConnectionTimeouts & timeouts,

View File

@ -116,6 +116,7 @@ struct Request
virtual ~Request() = default;
virtual String getPath() const = 0;
virtual void addRootPath(const String & /* root_path */) {}
virtual size_t bytesSize() const { return 0; }
};
struct Response;
@ -131,6 +132,7 @@ struct Response
Response & operator=(const Response &) = default;
virtual ~Response() = default;
virtual void removeRootPath(const String & /* root_path */) {}
virtual size_t bytesSize() const { return 0; }
};
struct WatchResponse : virtual Response
@ -140,6 +142,8 @@ struct WatchResponse : virtual Response
String path;
void removeRootPath(const String & root_path) override;
size_t bytesSize() const override { return path.size() + sizeof(type) + sizeof(state); }
};
using WatchCallback = std::function<void(const WatchResponse &)>;
@ -154,6 +158,9 @@ struct CreateRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size() + data.size()
+ sizeof(is_ephemeral) + sizeof(is_sequential) + acls.size() * sizeof(ACL); }
};
struct CreateResponse : virtual Response
@ -161,6 +168,8 @@ struct CreateResponse : virtual Response
String path_created;
void removeRootPath(const String & root_path) override;
size_t bytesSize() const override { return path_created.size(); }
};
struct RemoveRequest : virtual Request
@ -170,6 +179,8 @@ struct RemoveRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size() + sizeof(version); }
};
struct RemoveResponse : virtual Response
@ -182,11 +193,15 @@ struct ExistsRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size(); }
};
struct ExistsResponse : virtual Response
{
Stat stat;
size_t bytesSize() const override { return sizeof(Stat); }
};
struct GetRequest : virtual Request
@ -195,12 +210,16 @@ struct GetRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size(); }
};
struct GetResponse : virtual Response
{
String data;
Stat stat;
size_t bytesSize() const override { return data.size() + sizeof(stat); }
};
struct SetRequest : virtual Request
@ -211,11 +230,15 @@ struct SetRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size() + data.size() + sizeof(version); }
};
struct SetResponse : virtual Response
{
Stat stat;
size_t bytesSize() const override { return sizeof(stat); }
};
struct ListRequest : virtual Request
@ -224,12 +247,22 @@ struct ListRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size(); }
};
struct ListResponse : virtual Response
{
std::vector<String> names;
Stat stat;
size_t bytesSize() const override
{
size_t size = sizeof(stat);
for (const auto & name : names)
size += name.size();
return size;
}
};
struct CheckRequest : virtual Request
@ -239,6 +272,8 @@ struct CheckRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return path; }
size_t bytesSize() const override { return path.size() + sizeof(version); }
};
struct CheckResponse : virtual Response
@ -251,6 +286,14 @@ struct MultiRequest : virtual Request
void addRootPath(const String & root_path) override;
String getPath() const override { return {}; }
size_t bytesSize() const override
{
size_t size = 0;
for (const auto & request : requests)
size += request->bytesSize();
return size;
}
};
struct MultiResponse : virtual Response
@ -258,6 +301,14 @@ struct MultiResponse : virtual Response
Responses responses;
void removeRootPath(const String & root_path) override;
size_t bytesSize() const override
{
size_t size = 0;
for (const auto & response : responses)
size += response->bytesSize();
return size;
}
};
/// This response may be received only as an element of responses in MultiResponse.

View File

@ -455,6 +455,39 @@ ZooKeeperResponsePtr ZooKeeperCheckRequest::makeResponse() const { return std::m
ZooKeeperResponsePtr ZooKeeperMultiRequest::makeResponse() const { return std::make_shared<ZooKeeperMultiResponse>(requests); }
ZooKeeperResponsePtr ZooKeeperCloseRequest::makeResponse() const { return std::make_shared<ZooKeeperCloseResponse>(); }
void ZooKeeperSessionIDRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(internal_id, out);
Coordination::write(session_timeout_ms, out);
Coordination::write(server_id, out);
}
void ZooKeeperSessionIDRequest::readImpl(ReadBuffer & in)
{
Coordination::read(internal_id, in);
Coordination::read(session_timeout_ms, in);
Coordination::read(server_id, in);
}
Coordination::ZooKeeperResponsePtr ZooKeeperSessionIDRequest::makeResponse() const
{
return std::make_shared<ZooKeeperSessionIDResponse>();
}
void ZooKeeperSessionIDResponse::readImpl(ReadBuffer & in)
{
Coordination::read(internal_id, in);
Coordination::read(session_id, in);
Coordination::read(server_id, in);
}
void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const
{
Coordination::write(internal_id, out);
Coordination::write(session_id, out);
Coordination::write(server_id, out);
}
void ZooKeeperRequestFactory::registerRequest(OpNum op_num, Creator creator)
{
if (!op_num_to_request.try_emplace(op_num, creator).second)
@ -511,6 +544,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
registerZooKeeperRequest<OpNum::List, ZooKeeperListRequest>(*this);
registerZooKeeperRequest<OpNum::Check, ZooKeeperCheckRequest>(*this);
registerZooKeeperRequest<OpNum::Multi, ZooKeeperMultiRequest>(*this);
registerZooKeeperRequest<OpNum::SessionID, ZooKeeperSessionIDRequest>(*this);
}
}

View File

@ -84,6 +84,8 @@ struct ZooKeeperSyncRequest final : ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + path.size(); }
};
struct ZooKeeperSyncResponse final : ZooKeeperResponse
@ -92,6 +94,8 @@ struct ZooKeeperSyncResponse final : ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Sync; }
size_t bytesSize() const override { return path.size(); }
};
struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse
@ -128,6 +132,9 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + sizeof(xid) +
sizeof(type) + scheme.size() + data.size(); }
};
struct ZooKeeperAuthResponse final : ZooKeeperResponse
@ -136,6 +143,8 @@ struct ZooKeeperAuthResponse final : ZooKeeperResponse
void writeImpl(WriteBuffer &) const override {}
OpNum getOpNum() const override { return OpNum::Auth; }
size_t bytesSize() const override { return ZooKeeperResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperCloseRequest final : ZooKeeperRequest
@ -172,6 +181,8 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
size_t bytesSize() const override { return CreateRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse
@ -181,6 +192,8 @@ struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Create; }
size_t bytesSize() const override { return CreateResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
@ -194,6 +207,8 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
size_t bytesSize() const override { return RemoveRequest::bytesSize() + sizeof(xid); }
};
struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse
@ -201,6 +216,8 @@ struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
OpNum getOpNum() const override { return OpNum::Remove; }
size_t bytesSize() const override { return RemoveResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
@ -211,6 +228,8 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
size_t bytesSize() const override { return ExistsRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse
@ -218,6 +237,8 @@ struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Exists; }
size_t bytesSize() const override { return ExistsResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
@ -228,6 +249,8 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
size_t bytesSize() const override { return GetRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse
@ -235,6 +258,8 @@ struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Get; }
size_t bytesSize() const override { return GetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
@ -247,6 +272,8 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
size_t bytesSize() const override { return SetRequest::bytesSize() + sizeof(xid); }
};
struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse
@ -254,6 +281,8 @@ struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Set; }
size_t bytesSize() const override { return SetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
@ -263,6 +292,8 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
void readImpl(ReadBuffer & in) override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
size_t bytesSize() const override { return ListRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperSimpleListRequest final : ZooKeeperListRequest
@ -275,6 +306,8 @@ struct ZooKeeperListResponse : ListResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::List; }
size_t bytesSize() const override { return ListResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperSimpleListResponse final : ZooKeeperListResponse
@ -293,6 +326,8 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return !has_watch; }
size_t bytesSize() const override { return CheckRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse
@ -300,6 +335,8 @@ struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
OpNum getOpNum() const override { return OpNum::Check; }
size_t bytesSize() const override { return CheckResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
/// This response may be received only as an element of responses in MultiResponse.
@ -309,6 +346,8 @@ struct ZooKeeperErrorResponse final : ErrorResponse, ZooKeeperResponse
void writeImpl(WriteBuffer & out) const override;
OpNum getOpNum() const override { return OpNum::Error; }
size_t bytesSize() const override { return ErrorResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
@ -323,6 +362,8 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override;
size_t bytesSize() const override { return MultiRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); }
};
struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse
@ -346,6 +387,41 @@ struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse
void writeImpl(WriteBuffer & out) const override;
size_t bytesSize() const override { return MultiResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
/// Fake internal coordination (keeper) request. Never received from a client
/// and never sent to a client.
struct ZooKeeperSessionIDRequest final : ZooKeeperRequest
{
int64_t internal_id;
int64_t session_timeout_ms;
/// Who requested this session
int32_t server_id;
Coordination::OpNum getOpNum() const override { return OpNum::SessionID; }
String getPath() const override { return {}; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
Coordination::ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
};
/// Fake internal coordination (keeper) response. Never received from a client
/// and never sent to a client.
struct ZooKeeperSessionIDResponse final : ZooKeeperResponse
{
int64_t internal_id;
int64_t session_id;
/// Who requested this session
int32_t server_id;
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
Coordination::OpNum getOpNum() const override { return OpNum::SessionID; }
};
class ZooKeeperRequestFactory final : private boost::noncopyable

View File

@ -21,6 +21,7 @@ static const std::unordered_set<int32_t> VALID_OPERATIONS =
static_cast<int32_t>(OpNum::Check),
static_cast<int32_t>(OpNum::Multi),
static_cast<int32_t>(OpNum::Auth),
static_cast<int32_t>(OpNum::SessionID),
};
std::string toString(OpNum op_num)
@ -55,6 +56,8 @@ std::string toString(OpNum op_num)
return "Heartbeat";
case OpNum::Auth:
return "Auth";
case OpNum::SessionID:
return "SessionID";
}
int32_t raw_op = static_cast<int32_t>(op_num);
throw Exception("Operation " + std::to_string(raw_op) + " is unknown", Error::ZUNIMPLEMENTED);

View File

@ -30,6 +30,7 @@ enum class OpNum : int32_t
Check = 13,
Multi = 14,
Auth = 100,
SessionID = 997, /// Special internal request
};
std::string toString(OpNum op_num);

View File

@ -1012,6 +1012,16 @@ void ZooKeeper::pushRequest(RequestInfo && info)
ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions);
}
void ZooKeeper::executeGenericRequest(
const ZooKeeperRequestPtr & request,
ResponseCallback callback)
{
RequestInfo request_info;
request_info.request = request;
request_info.callback = callback;
pushRequest(std::move(request_info));
}
void ZooKeeper::create(
const String & path,

View File

@ -121,6 +121,9 @@ public:
/// Useful to check owner of ephemeral node.
int64_t getSessionID() const override { return session_id; }
void executeGenericRequest(
const ZooKeeperRequestPtr & request,
ResponseCallback callback);
/// See the documentation about semantics of these methods in IKeeper class.

View File

@ -15,3 +15,4 @@
#cmakedefine01 USE_GRPC
#cmakedefine01 USE_STATS
#cmakedefine01 CLICKHOUSE_SPLIT_BINARY
#cmakedefine01 USE_DATASKETCHES

View File

@ -80,7 +80,7 @@ public:
{}
off_t appendRecord(ChangelogRecord && record, bool sync)
off_t appendRecord(ChangelogRecord && record)
{
off_t result = plain_buf.count();
writeIntBinary(computeRecordChecksum(record), plain_buf);
@ -96,23 +96,21 @@ public:
entries_written++;
if (sync)
plain_buf.sync();
else
plain_buf.next();
return result;
}
void truncateToLength(off_t new_length)
{
flush();
plain_buf.next();
plain_buf.truncate(new_length);
plain_buf.seek(new_length, SEEK_SET);
}
void flush()
void flush(bool force_fsync)
{
plain_buf.sync();
plain_buf.next();
if (force_fsync)
plain_buf.sync();
}
uint64_t getEntriesWritten() const
@ -247,9 +245,14 @@ private:
ReadBufferFromFile read_buf;
};
Changelog::Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, Poco::Logger * log_)
Changelog::Changelog(
const std::string & changelogs_dir_,
uint64_t rotate_interval_,
bool force_sync_,
Poco::Logger * log_)
: changelogs_dir(changelogs_dir_)
, rotate_interval(rotate_interval_)
, force_sync(force_sync_)
, log(log_)
{
namespace fs = std::filesystem;
@ -357,6 +360,9 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
void Changelog::rotate(uint64_t new_start_log_index)
{
/// Flush previous log
flush();
ChangelogFileDescription new_description;
new_description.prefix = DEFAULT_PREFIX;
new_description.from_log_index = new_start_log_index;
@ -387,7 +393,7 @@ ChangelogRecord Changelog::buildRecord(uint64_t index, const LogEntryPtr & log_e
return record;
}
void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool force_sync)
void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
{
if (!current_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records");
@ -398,14 +404,14 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool
if (current_writer->getEntriesWritten() == rotate_interval)
rotate(index);
auto offset = current_writer->appendRecord(buildRecord(index, log_entry), force_sync);
auto offset = current_writer->appendRecord(buildRecord(index, log_entry));
if (!index_to_start_pos.try_emplace(index, offset).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index);
logs[index] = makeClone(log_entry);
}
void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry, bool force_sync)
void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
{
if (index_to_start_pos.count(index) == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index);
@ -451,7 +457,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry, bool forc
current_writer->setEntriesWritten(entries_written);
appendEntry(index, log_entry, force_sync);
appendEntry(index, log_entry);
}
void Changelog::compact(uint64_t up_to_log_index)
@ -540,7 +546,7 @@ nuraft::ptr<nuraft::buffer> Changelog::serializeEntriesToBuffer(uint64_t index,
return buf_out;
}
void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer, bool force_sync)
void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer)
{
buffer.pos(0);
int num_logs = buffer.get_int();
@ -555,23 +561,23 @@ void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer,
LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local);
if (i == 0 && logs.count(cur_index))
writeAt(cur_index, log_entry, force_sync);
writeAt(cur_index, log_entry);
else
appendEntry(cur_index, log_entry, force_sync);
appendEntry(cur_index, log_entry);
}
}
void Changelog::flush()
{
current_writer->flush();
if (current_writer)
current_writer->flush(force_sync);
}
Changelog::~Changelog()
{
try
{
if (current_writer)
current_writer->flush();
flush();
}
catch (...)
{

View File

@ -63,17 +63,17 @@ class Changelog
{
public:
Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, Poco::Logger * log_);
Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, bool force_sync_, Poco::Logger * log_);
/// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index
/// Truncate broken entries, remove files after broken entries.
void readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep);
/// Add entry to log with index. Call fsync if force_sync true.
void appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool force_sync);
/// Add entry to log with index.
void appendEntry(uint64_t index, const LogEntryPtr & log_entry);
/// Write entry at index and truncate all subsequent entries.
void writeAt(uint64_t index, const LogEntryPtr & log_entry, bool force_sync);
void writeAt(uint64_t index, const LogEntryPtr & log_entry);
/// Remove log files with to_log_index <= up_to_log_index.
void compact(uint64_t up_to_log_index);
@ -101,9 +101,9 @@ public:
BufferPtr serializeEntriesToBuffer(uint64_t index, int32_t count);
/// Apply entries from buffer overriding existing entries
void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer, bool force_sync);
void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer);
/// Fsync log to disk
/// Fsync latest log to disk and flush buffer
void flush();
uint64_t size() const
@ -124,6 +124,7 @@ private:
private:
const std::string changelogs_dir;
const uint64_t rotate_interval;
const bool force_sync;
Poco::Logger * log;
std::map<uint64_t, ChangelogFileDescription> existing_changelogs;

View File

@ -22,18 +22,19 @@ struct Settings;
M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \
M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too frequent leader elections)", 0) \
M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Upper bound of election timer (avoid too frequent leader elections)", 0) \
M(UInt64, reserved_log_items, 10000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 10000, "How many log items we have to collect to write new snapshot", 0) \
M(UInt64, reserved_log_items, 100000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \
M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \
M(Milliseconds, shutdown_timeout, 5000, "How long to wait for RAFT shutdown", 0) \
M(Milliseconds, startup_timeout, 30000, "How long to wait for RAFT to start", 0) \
M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \
M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \
M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
M(UInt64, max_requests_batch_size, 100, "Max size of batch in requests count before it will be sent to RAFT", 0) \
M(Bool, quorum_reads, false, "Execute read requests as writes through the whole RAFT consensus with similar speed", 0) \
M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)
M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -5,9 +5,12 @@ namespace DB
KeeperLogStore::KeeperLogStore(const std::string & changelogs_path, uint64_t rotate_interval_, bool force_sync_)
: log(&Poco::Logger::get("KeeperLogStore"))
, changelog(changelogs_path, rotate_interval_, log)
, force_sync(force_sync_)
, changelog(changelogs_path, rotate_interval_, force_sync_, log)
{
if (force_sync_)
LOG_INFO(log, "force_sync enabled");
else
LOG_INFO(log, "force_sync disabled");
}
uint64_t KeeperLogStore::start_index() const
@ -38,7 +41,7 @@ uint64_t KeeperLogStore::append(nuraft::ptr<nuraft::log_entry> & entry)
{
std::lock_guard lock(changelog_lock);
uint64_t idx = changelog.getNextEntryIndex();
changelog.appendEntry(idx, entry, force_sync);
changelog.appendEntry(idx, entry);
return idx;
}
@ -46,7 +49,7 @@ uint64_t KeeperLogStore::append(nuraft::ptr<nuraft::log_entry> & entry)
void KeeperLogStore::write_at(uint64_t index, nuraft::ptr<nuraft::log_entry> & entry)
{
std::lock_guard lock(changelog_lock);
changelog.writeAt(index, entry, force_sync);
changelog.writeAt(index, entry);
}
nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> KeeperLogStore::log_entries(uint64_t start, uint64_t end)
@ -93,7 +96,7 @@ bool KeeperLogStore::flush()
void KeeperLogStore::apply_pack(uint64_t index, nuraft::buffer & pack)
{
std::lock_guard lock(changelog_lock);
changelog.applyEntriesFromBuffer(index, pack, force_sync);
changelog.applyEntriesFromBuffer(index, pack);
}
uint64_t KeeperLogStore::size() const
@ -102,4 +105,10 @@ uint64_t KeeperLogStore::size() const
return changelog.size();
}
void KeeperLogStore::end_of_append_batch(uint64_t /*start_index*/, uint64_t /*count*/)
{
std::lock_guard lock(changelog_lock);
changelog.flush();
}
}

View File

@ -42,11 +42,12 @@ public:
uint64_t size() const;
void end_of_append_batch(uint64_t start_index, uint64_t count) override;
private:
mutable std::mutex changelog_lock;
Poco::Logger * log;
Changelog changelog;
bool force_sync;
};
}

View File

@ -24,6 +24,7 @@ namespace ErrorCodes
extern const int RAFT_ERROR;
extern const int NO_ELEMENTS_IN_CONFIG;
extern const int SUPPORT_IS_DISABLED;
extern const int LOGICAL_ERROR;
}
namespace
@ -73,7 +74,6 @@ KeeperServer::KeeperServer(
config.getString("keeper_server.snapshot_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/snapshots"),
coordination_settings))
, state_manager(nuraft::cs_new<KeeperStateManager>(server_id, "keeper_server", config, coordination_settings))
, responses_queue(responses_queue_)
, log(&Poco::Logger::get("KeeperServer"))
{
if (coordination_settings->quorum_reads)
@ -111,7 +111,7 @@ void KeeperServer::startup()
params.auto_forwarding_ = coordination_settings->auto_forwarding;
params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
params.return_method_ = nuraft::raft_params::blocking;
params.return_method_ = nuraft::raft_params::async_handler;
nuraft::asio_service::options asio_opts{};
if (state_manager->isSecure())
@ -222,75 +222,26 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coord
}
void KeeperServer::putRequest(const KeeperStorage::RequestForSession & request_for_session)
void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & request_for_session)
{
auto [session_id, request] = request_for_session;
if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest())
{
state_machine->processReadRequest(request_for_session);
}
else
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
entries.push_back(getZooKeeperLogEntry(session_id, request));
if (!request_for_session.request->isReadRequest())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot process non-read request locally");
std::lock_guard lock(append_entries_mutex);
auto result = raft_instance->append_entries(entries);
if (!result->get_accepted())
{
KeeperStorage::ResponsesForSessions responses;
auto response = request->makeResponse();
response->xid = request->xid;
response->zxid = 0;
response->error = Coordination::Error::ZOPERATIONTIMEOUT;
responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response});
}
if (result->get_result_code() == nuraft::cmd_result_code::TIMEOUT)
{
KeeperStorage::ResponsesForSessions responses;
auto response = request->makeResponse();
response->xid = request->xid;
response->zxid = 0;
response->error = Coordination::Error::ZOPERATIONTIMEOUT;
responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response});
}
else if (result->get_result_code() != nuraft::cmd_result_code::OK)
throw Exception(ErrorCodes::RAFT_ERROR, "Requests result failed with code {} and message: '{}'", result->get_result_code(), result->get_result_str());
}
state_machine->processReadRequest(request_for_session);
}
int64_t KeeperServer::getSessionID(int64_t session_timeout_ms)
RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions)
{
/// Just some sanity check. We don't want to make a lot of clients wait with lock.
if (active_session_id_requests > 10)
throw Exception(ErrorCodes::RAFT_ERROR, "Too many concurrent SessionID requests already in flight");
++active_session_id_requests;
SCOPE_EXIT({ --active_session_id_requests; });
std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & [session_id, request] : requests_for_sessions)
entries.push_back(getZooKeeperLogEntry(session_id, request));
auto entry = nuraft::buffer::alloc(sizeof(int64_t));
/// Just special session request
nuraft::buffer_serializer bs(entry);
bs.put_i64(session_timeout_ms);
std::lock_guard lock(append_entries_mutex);
auto result = raft_instance->append_entries({entry});
if (!result->get_accepted())
throw Exception(ErrorCodes::RAFT_ERROR, "Cannot send session_id request to RAFT");
if (result->get_result_code() != nuraft::cmd_result_code::OK)
throw Exception(ErrorCodes::RAFT_ERROR, "session_id request failed to RAFT");
auto resp = result->get();
if (resp == nullptr)
throw Exception(ErrorCodes::RAFT_ERROR, "Received nullptr as session_id");
nuraft::buffer_serializer bs_resp(resp);
return bs_resp.get_i64();
{
std::lock_guard lock(append_entries_mutex);
return raft_instance->append_entries(entries);
}
}
bool KeeperServer::isLeader() const

View File

@ -12,10 +12,12 @@
namespace DB
{
using RaftAppendResult = nuraft::ptr<nuraft::cmd_result<nuraft::ptr<nuraft::buffer>>>;
class KeeperServer
{
private:
int server_id;
const int server_id;
CoordinationSettingsPtr coordination_settings;
@ -29,13 +31,10 @@ private:
std::mutex append_entries_mutex;
ResponsesQueue & responses_queue;
std::mutex initialized_mutex;
std::atomic<bool> initialized_flag = false;
std::condition_variable initialized_cv;
std::atomic<bool> initial_batch_committed = false;
std::atomic<size_t> active_session_id_requests = 0;
Poco::Logger * log;
@ -60,9 +59,9 @@ public:
void startup();
void putRequest(const KeeperStorage::RequestForSession & request);
void putLocalReadRequest(const KeeperStorage::RequestForSession & request);
int64_t getSessionID(int64_t session_timeout_ms);
RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests);
std::unordered_set<int64_t> getDeadSessions();
@ -73,6 +72,8 @@ public:
void waitInit();
void shutdown();
int getServerID() const { return server_id; }
};
}

View File

@ -90,25 +90,29 @@ void KeeperStateMachine::init()
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
if (data.size() == sizeof(int64_t))
auto request_for_session = parseRequest(data);
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
{
nuraft::buffer_serializer timeout_data(data);
int64_t session_timeout_ms = timeout_data.get_i64();
auto response = nuraft::buffer::alloc(sizeof(int64_t));
const Coordination::ZooKeeperSessionIDRequest & session_id_request = dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
int64_t session_id;
nuraft::buffer_serializer bs(response);
{
std::lock_guard lock(storage_lock);
session_id = storage->getSessionID(session_timeout_ms);
bs.put_i64(session_id);
session_id = storage->getSessionID(session_id_request.session_timeout_ms);
}
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms);
last_committed_idx = log_idx;
return response;
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms);
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
response->session_id = session_id;
response->server_id = session_id_request.server_id;
KeeperStorage::ResponseForSession response_for_session;
response_for_session.session_id = -1;
response_for_session.response = response;
responses_queue.push(response_for_session);
}
else
{
auto request_for_session = parseRequest(data);
KeeperStorage::ResponsesForSessions responses_for_sessions;
{
std::lock_guard lock(storage_lock);
@ -116,10 +120,10 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
for (auto & response_for_session : responses_for_sessions)
responses_queue.push(response_for_session);
}
last_committed_idx = log_idx;
return nullptr;
}
last_committed_idx = log_idx;
return nullptr;
}
bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)

View File

@ -405,8 +405,6 @@ struct KeeperStorageListRequest final : public KeeperStorageRequest
response.names.insert(response.names.end(), it->value.children.begin(), it->value.children.end());
std::sort(response.names.begin(), response.names.end());
response.stat = it->value.stat;
response.error = Coordination::Error::ZOK;
}

View File

@ -1,5 +1,9 @@
#include <Coordination/KeeperStorageDispatcher.h>
#include <Common/setThreadName.h>
#include <Common/Stopwatch.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <future>
#include <chrono>
namespace DB
{
@ -17,29 +21,116 @@ KeeperStorageDispatcher::KeeperStorageDispatcher()
{
}
void KeeperStorageDispatcher::requestThread()
{
setThreadName("KeeperReqT");
/// Result of requests batch from previous iteration
RaftAppendResult prev_result = nullptr;
/// Requests from previous iteration. We store them to be able
/// to send errors to the client.
KeeperStorage::RequestsForSessions prev_batch;
while (!shutdown_called)
{
KeeperStorage::RequestForSession request;
UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds());
uint64_t max_batch_size = coordination_settings->max_requests_batch_size;
if (requests_queue.tryPop(request, max_wait))
/// The code below does a very simple thing: it batches all write (quorum) requests into a vector
/// until the previous write batch finishes or the max batch size is reached. The main complexity
/// comes from the ability to process read requests without quorum (from local state). So when we are
/// collecting requests into a batch we must check that the new request is not a read request. Otherwise
/// we have to process all already accumulated write requests, wait for them synchronously, and only after
/// that process the read request. So reads act as a kind of "separator" for writes.
try
{
if (shutdown_called)
break;
if (requests_queue->tryPop(request, max_wait))
{
if (shutdown_called)
break;
try
{
server->putRequest(request);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
KeeperStorage::RequestsForSessions current_batch;
bool has_read_request = false;
/// If the new request is not a read request, or we must process it through quorum.
/// Otherwise we will process it locally.
if (coordination_settings->quorum_reads || !request.request->isReadRequest())
{
current_batch.emplace_back(request);
/// Wait until the previous append succeeds, or until the batch is big enough.
/// has_result == false && get_result_code == OK means that our request is still not processed.
/// Sometimes NuRaft sets the error code without setting the result, so we check both here.
while (prev_result && (!prev_result->has_result() && prev_result->get_result_code() == nuraft::cmd_result_code::OK) && current_batch.size() <= max_batch_size)
{
/// Trying to get batch requests as fast as possible
if (requests_queue->tryPop(request, 1))
{
/// Don't append read requests into the batch, we have to process them separately
if (!coordination_settings->quorum_reads && request.request->isReadRequest())
{
has_read_request = true;
break;
}
else
{
current_batch.emplace_back(request);
}
}
if (shutdown_called)
break;
}
}
else
has_read_request = true;
if (shutdown_called)
break;
/// Forcefully process all previous pending requests
if (prev_result)
forceWaitAndProcessResult(prev_result, prev_batch);
/// Process collected write requests batch
if (!current_batch.empty())
{
auto result = server->putRequestBatch(current_batch);
if (result)
{
if (has_read_request) /// If we will execute a read request next, then we have to process the result now
forceWaitAndProcessResult(result, current_batch);
}
else
{
addErrorResponses(current_batch, Coordination::Error::ZRUNTIMEINCONSISTENCY);
current_batch.clear();
}
prev_batch = current_batch;
prev_result = result;
}
/// Read request always goes after write batch (last request)
if (has_read_request)
{
if (server->isLeaderAlive())
server->putLocalReadRequest(request);
else
addErrorResponses({request}, Coordination::Error::ZRUNTIMEINCONSISTENCY);
}
}
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
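
A minimal standalone sketch of the batching scheme described in the comment above, with hypothetical stand-in types instead of the real request and queue classes: writes are accumulated into a batch, and a read forces the accumulated writes to be flushed first, so reads act as separators between write batches.

#include <deque>
#include <iostream>
#include <string>
#include <vector>

struct Request { std::string name; bool is_read; };

// Pretend "append to Raft": here we just report the batch.
static void appendBatch(const std::vector<Request> & batch)
{
    std::cout << "append batch of " << batch.size() << " writes\n";
}

// Pretend "local read": served from local state, no quorum.
static void serveLocally(const Request & r)
{
    std::cout << "serve read " << r.name << " locally\n";
}

int main()
{
    std::deque<Request> queue = {{"w1", false}, {"w2", false}, {"r1", true}, {"w3", false}};
    const size_t max_batch_size = 100;   // flush earlier if the batch grows this large

    std::vector<Request> batch;
    while (!queue.empty())
    {
        Request req = queue.front();
        queue.pop_front();

        if (!req.is_read)
        {
            batch.push_back(req);
            if (batch.size() >= max_batch_size)
            {
                appendBatch(batch);
                batch.clear();
            }
            continue;
        }

        /// A read is a "separator": flush pending writes, then read locally.
        if (!batch.empty())
        {
            appendBatch(batch);
            batch.clear();
        }
        serveLocally(req);
    }
    if (!batch.empty())
        appendBatch(batch);
}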
@ -94,14 +185,32 @@ void KeeperStorageDispatcher::snapshotThread()
void KeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response)
{
std::lock_guard lock(session_to_response_callback_mutex);
auto session_writer = session_to_response_callback.find(session_id);
if (session_writer == session_to_response_callback.end())
return;
if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::SessionID)
{
const Coordination::ZooKeeperSessionIDResponse & session_id_resp = dynamic_cast<const Coordination::ZooKeeperSessionIDResponse &>(*response);
session_writer->second(response);
/// Session closed, no more writes
if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close)
session_to_response_callback.erase(session_writer);
/// Nobody waits for this session id
if (session_id_resp.server_id != server->getServerID() || !new_session_id_response_callback.count(session_id_resp.internal_id))
return;
auto callback = new_session_id_response_callback[session_id_resp.internal_id];
callback(response);
new_session_id_response_callback.erase(session_id_resp.internal_id);
}
else
{
auto session_writer = session_to_response_callback.find(session_id);
if (session_writer == session_to_response_callback.end())
return;
session_writer->second(response);
/// Session closed, no more writes
if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close)
{
session_to_response_callback.erase(session_writer);
}
}
}
bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id)
@ -119,8 +228,8 @@ bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr
std::lock_guard lock(push_request_mutex);
/// Put close requests without timeouts
if (request->getOpNum() == Coordination::OpNum::Close)
requests_queue.push(std::move(request_info));
else if (!requests_queue.tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds()))
requests_queue->push(std::move(request_info));
else if (!requests_queue->tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds()))
throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED);
return true;
}
@ -131,6 +240,7 @@ void KeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration
int myid = config.getInt("keeper_server.server_id");
coordination_settings->loadFromConfig("keeper_server.coordination_settings", config);
requests_queue = std::make_unique<RequestsQueue>(coordination_settings->max_requests_batch_size);
request_thread = ThreadFromGlobalPool([this] { requestThread(); });
responses_thread = ThreadFromGlobalPool([this] { responseThread(); });
@ -175,7 +285,7 @@ void KeeperStorageDispatcher::shutdown()
session_cleaner_thread.join();
/// FIXME not the best way to notify
requests_queue.push({});
requests_queue->push({});
if (request_thread.joinable())
request_thread.join();
@ -192,7 +302,7 @@ void KeeperStorageDispatcher::shutdown()
server->shutdown();
KeeperStorage::RequestForSession request_for_session;
while (requests_queue.tryPop(request_for_session))
while (requests_queue->tryPop(request_for_session))
{
if (request_for_session.request)
{
@ -249,7 +359,7 @@ void KeeperStorageDispatcher::sessionCleanerTask()
request_info.session_id = dead_session;
{
std::lock_guard lock(push_request_mutex);
requests_queue.push(std::move(request_info));
requests_queue->push(std::move(request_info));
}
finishSession(dead_session);
LOG_INFO(log, "Dead session close request pushed");
@ -273,4 +383,79 @@ void KeeperStorageDispatcher::finishSession(int64_t session_id)
session_to_response_callback.erase(session_it);
}
void KeeperStorageDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error)
{
for (const auto & [session_id, request] : requests_for_sessions)
{
KeeperStorage::ResponsesForSessions responses;
auto response = request->makeResponse();
response->xid = request->xid;
response->zxid = 0;
response->error = error;
responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response});
}
}
void KeeperStorageDispatcher::forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions)
{
if (!result->has_result())
result->get();
/// If we get some errors, then send them to the clients
if (!result->get_accepted() || result->get_result_code() == nuraft::cmd_result_code::TIMEOUT)
addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT);
else if (result->get_result_code() != nuraft::cmd_result_code::OK)
addErrorResponses(requests_for_sessions, Coordination::Error::ZRUNTIMEINCONSISTENCY);
result = nullptr;
requests_for_sessions.clear();
}
int64_t KeeperStorageDispatcher::getSessionID(int64_t session_timeout_ms)
{
KeeperStorage::RequestForSession request_info;
std::shared_ptr<Coordination::ZooKeeperSessionIDRequest> request = std::make_shared<Coordination::ZooKeeperSessionIDRequest>();
request->internal_id = internal_session_id_counter.fetch_add(1);
request->session_timeout_ms = session_timeout_ms;
request->server_id = server->getServerID();
request_info.request = request;
request_info.session_id = -1;
auto promise = std::make_shared<std::promise<int64_t>>();
auto future = promise->get_future();
{
std::lock_guard lock(session_to_response_callback_mutex);
new_session_id_response_callback[request->internal_id] = [promise, internal_id = request->internal_id] (const Coordination::ZooKeeperResponsePtr & response)
{
if (response->getOpNum() != Coordination::OpNum::SessionID)
promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR,
"Incorrect response of type {} instead of SessionID response", Coordination::toString(response->getOpNum()))));
auto session_id_response = dynamic_cast<const Coordination::ZooKeeperSessionIDResponse &>(*response);
if (session_id_response.internal_id != internal_id)
{
promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR,
"Incorrect response with internal id {} instead of {}", session_id_response.internal_id, internal_id)));
}
if (response->error != Coordination::Error::ZOK)
promise->set_exception(std::make_exception_ptr(zkutil::KeeperException("SessionID request failed with error", response->error)));
promise->set_value(session_id_response.session_id);
};
}
{
std::lock_guard lock(push_request_mutex);
if (!requests_queue->tryPush(std::move(request_info), session_timeout_ms))
throw Exception("Cannot push session id request to queue within session timeout", ErrorCodes::TIMEOUT_EXCEEDED);
}
if (future.wait_for(std::chrono::milliseconds(session_timeout_ms)) != std::future_status::ready)
throw Exception("Cannot receive session id within session timeout", ErrorCodes::TIMEOUT_EXCEEDED);
return future.get();
}
}
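
A sketch of the promise/future handshake getSessionID() relies on, using placeholder types (SessionResponse below is a hypothetical stand-in for the ZooKeeper session-id response): a callback is registered under a temporary internal id, the request is pushed, and the caller blocks on the future until the commit path delivers the response or the timeout expires.

#include <chrono>
#include <cstdint>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
#include <unordered_map>

struct SessionResponse { int64_t internal_id; int64_t session_id; };

std::mutex callbacks_mutex;
std::unordered_map<int64_t, std::function<void(const SessionResponse &)>> callbacks;

// Called by the "commit" side when a session-id response is ready.
void deliver(const SessionResponse & resp)
{
    std::lock_guard lock(callbacks_mutex);
    auto it = callbacks.find(resp.internal_id);
    if (it == callbacks.end())
        return;   /// nobody waits for this internal id
    it->second(resp);
    callbacks.erase(it);
}

int64_t getSessionID(int64_t internal_id, int64_t timeout_ms)
{
    auto promise = std::make_shared<std::promise<int64_t>>();
    auto future = promise->get_future();
    {
        std::lock_guard lock(callbacks_mutex);
        callbacks[internal_id] = [promise](const SessionResponse & resp) { promise->set_value(resp.session_id); };
    }

    /// The real dispatcher would push a SessionID request into the request queue here;
    /// this sketch simulates the commit path answering from another thread.
    std::thread commit_thread([internal_id] { deliver({internal_id, 12345}); });

    if (future.wait_for(std::chrono::milliseconds(timeout_ms)) != std::future_status::ready)
    {
        commit_thread.join();
        throw std::runtime_error("Cannot receive session id within timeout");
    }
    commit_thread.join();
    return future.get();
}

int main()
{
    std::cout << "session id: " << getSessionID(/*internal_id=*/ 1, /*timeout_ms=*/ 1000) << "\n";
}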

View File

@ -32,24 +32,42 @@ private:
using RequestsQueue = ConcurrentBoundedQueue<KeeperStorage::RequestForSession>;
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
RequestsQueue requests_queue{1};
/// Size depends on coordination settings
std::unique_ptr<RequestsQueue> requests_queue;
ResponsesQueue responses_queue;
SnapshotsQueue snapshots_queue{1};
std::atomic<bool> shutdown_called{false};
std::mutex session_to_response_callback_mutex;
/// These two maps look similar, but serve different purposes.
/// The first map is a subscription map for normal responses
/// (get, set, list, etc.). The dispatcher determines the callback for each response
/// using the session id from this map.
SessionToResponseCallback session_to_response_callback;
/// But when a client connects to the server for the first time, it doesn't
/// have a session_id yet. It requests one from the server. We assign a temporary
/// internal id to such requests just to match the client with its response.
SessionToResponseCallback new_session_id_response_callback;
/// Reading and batching new requests from client handlers
ThreadFromGlobalPool request_thread;
/// Pushing responses back to client handlers
/// using session_id.
ThreadFromGlobalPool responses_thread;
/// Cleaning old dead sessions
ThreadFromGlobalPool session_cleaner_thread;
/// Dumping new snapshots to disk
ThreadFromGlobalPool snapshot_thread;
/// RAFT wrapper. Most important class.
std::unique_ptr<KeeperServer> server;
Poco::Logger * log;
/// Counter for new session_id requests.
std::atomic<int64_t> internal_session_id_counter{0};
private:
void requestThread();
void responseThread();
@ -57,6 +75,14 @@ private:
void snapshotThread();
void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response);
/// Add error responses for requests to responses queue.
/// Clears requests.
void addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error);
/// Forcefully wait for the result and set errors if something went wrong.
/// Clears both arguments.
void forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions);
public:
KeeperStorageDispatcher();
@ -78,10 +104,7 @@ public:
return server->isLeaderAlive();
}
int64_t getSessionID(long session_timeout_ms)
{
return server->getSessionID(session_timeout_ms);
}
int64_t getSessionID(int64_t session_timeout_ms);
void registerSession(int64_t session_id, ZooKeeperResponseCallback callback);
/// Call if we don't need any responses for this session no more (session was expired)

View File

@ -211,6 +211,8 @@ TEST(CoordinationTest, ChangelogTestSimple)
changelog.init(1, 0);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.next_slot(), 2);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.last_entry()->get_term(), 77);
@ -225,6 +227,7 @@ TEST(CoordinationTest, ChangelogTestFile)
changelog.init(1, 0);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
for (const auto & p : fs::directory_iterator("./logs"))
EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin");
@ -234,6 +237,7 @@ TEST(CoordinationTest, ChangelogTestFile)
changelog.append(entry);
changelog.append(entry);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -249,6 +253,8 @@ TEST(CoordinationTest, ChangelogReadWrite)
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 10);
DB::KeeperLogStore changelog_reader("./logs", 1000, true);
changelog_reader.init(1, 0);
@ -276,10 +282,14 @@ TEST(CoordinationTest, ChangelogWriteAt)
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 10);
auto entry = getLogEntry("writer", 77);
changelog.write_at(7, entry);
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 7);
EXPECT_EQ(changelog.last_entry()->get_term(), 77);
EXPECT_EQ(changelog.entry_at(7)->get_term(), 77);
@ -305,6 +315,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead)
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 7);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
@ -319,6 +330,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead)
auto entry = getLogEntry("hello world", i * 10);
changelog_reader.append(entry);
}
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -331,6 +343,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead)
auto entry = getLogEntry("someentry", 77);
changelog_reader.append(entry);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 11);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -354,6 +367,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 3);
@ -373,6 +387,7 @@ TEST(CoordinationTest, ChangelogTestCompaction)
changelog.append(e3);
auto e4 = getLogEntry("hello world", 60);
changelog.append(e4);
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -405,6 +420,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 10);
@ -420,6 +436,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations)
EXPECT_EQ(apply_changelog.size(), 10);
apply_changelog.apply_pack(8, *entries);
apply_changelog.end_of_append_batch(0, 0);
EXPECT_EQ(apply_changelog.size(), 12);
EXPECT_EQ(apply_changelog.start_index(), 1);
@ -447,6 +464,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 10);
@ -458,6 +476,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
EXPECT_EQ(changelog_new.size(), 0);
changelog_new.apply_pack(5, *entries);
changelog_new.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_new.size(), 5);
EXPECT_EQ(changelog_new.start_index(), 5);
@ -468,6 +487,8 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
auto e = getLogEntry("hello_world", 110);
changelog_new.append(e);
changelog_new.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_new.size(), 6);
EXPECT_EQ(changelog_new.start_index(), 5);
EXPECT_EQ(changelog_new.next_slot(), 11);
@ -488,6 +509,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -501,6 +523,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile)
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(7, e1);
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 7);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 8);
@ -534,6 +557,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -547,6 +571,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder)
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(11, e1);
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 11);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 12);
@ -580,6 +605,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -593,6 +619,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles)
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(1, e1);
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 1);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 2);
@ -619,6 +646,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 35);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -635,6 +663,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead)
auto entry = getLogEntry("36_hello_world", 360);
changelog_reader.append(entry);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 36);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
@ -660,6 +689,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_EQ(changelog.size(), 35);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
@ -674,6 +704,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
DB::KeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1, 0);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90);
@ -689,6 +720,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
auto entry = getLogEntry("h", 7777);
changelog_reader.append(entry);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 11);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777);
@ -719,6 +751,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin"));
@ -735,6 +768,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin"));
auto entry = getLogEntry("hello_world", 7777);
changelog_reader.append(entry);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 3);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777);
@ -757,6 +791,7 @@ TEST(CoordinationTest, ChangelogTestLostFiles)
auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin"));
@ -1105,6 +1140,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
state_machine->commit(i, changelog.entry_at(i)->get_buf());
bool snapshot_created = false;

View File

@ -103,7 +103,10 @@ inline DecimalType decimalFromComponentsWithMultiplier(
if (common::mulOverflow(whole, scale_multiplier, whole_scaled))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
const T value = whole_scaled + fractional_sign * (fractional % scale_multiplier);
T value;
if (common::addOverflow(whole_scaled, fractional_sign * (fractional % scale_multiplier), value))
throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW);
return DecimalType(value);
}
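
The fix mirrors the existing mulOverflow() check with an overflow-checked addition. A standalone sketch of the same pattern; __builtin_add_overflow is used here as a stand-in for common::addOverflow() and assumes a GCC/Clang toolchain.

#include <cstdint>
#include <iostream>

// Overflow-checked addition: returns true if the result does not fit in Int64.
bool addOverflow(int64_t x, int64_t y, int64_t & result)
{
    return __builtin_add_overflow(x, y, &result);
}

int main()
{
    int64_t whole_scaled = 9'200'000'000'000'000'000LL;    // close to the Int64 maximum
    int64_t fractional_part = 400'000'000'000'000'000LL;

    int64_t value;
    if (addOverflow(whole_scaled, fractional_part, value))
        std::cout << "Decimal math overflow\n";    /// this branch is taken
    else
        std::cout << "value = " << value << "\n";
}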

View File

@ -57,7 +57,7 @@ class IColumn;
M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \
M(Milliseconds, hedged_connection_timeout_ms, DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS, "Connection timeout for establishing connection with replica for Hedged requests", 0) \
M(Milliseconds, receive_data_timeout_ms, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_MS, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \
M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \
M(Bool, use_hedged_requests, false, "Use hedged requests for distributed queries", 0) \
M(Bool, allow_changing_replica_until_first_data_packet, false, "Allow HedgedConnections to change replica until receiving first data packet", 0) \
M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \
M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \
@ -432,7 +432,7 @@ class IColumn;
M(Bool, allow_experimental_map_type, false, "Allow data type Map", 0) \
M(Bool, allow_experimental_window_functions, false, "Allow experimental window functions", 0) \
M(Bool, use_antlr_parser, false, "Parse incoming queries using ANTLR-generated experimental parser", 0) \
M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \
M(Bool, async_socket_for_remote, false, "Asynchronously read from socket executing remote query", 0) \
\
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \

View File

@ -16,12 +16,9 @@
#include "DictionarySourceFactory.h"
#include "DictionaryStructure.h"
#include "readInvalidateQuery.h"
#include "registerDictionaries.h"
#include <Common/escapeForFileName.h>
#if USE_ODBC
# include <Poco/Data/ODBC/Connector.h> // Y_IGNORE
#endif
namespace DB
{
@ -125,7 +122,7 @@ XDBCDictionarySource::XDBCDictionarySource(
{
bridge_url = bridge_helper->getMainURI();
auto url_params = bridge_helper->getURLParams(sample_block_.getNamesAndTypesList().toString(), max_block_size);
auto url_params = bridge_helper->getURLParams(max_block_size);
for (const auto & [name, value] : url_params)
bridge_url.addQueryParameter(name, value);
}
@ -151,6 +148,7 @@ XDBCDictionarySource::XDBCDictionarySource(const XDBCDictionarySource & other)
{
}
std::string XDBCDictionarySource::getUpdateFieldAndDate()
{
if (update_time != std::chrono::system_clock::from_time_t(0))
@ -167,52 +165,61 @@ std::string XDBCDictionarySource::getUpdateFieldAndDate()
}
}
BlockInputStreamPtr XDBCDictionarySource::loadAll()
{
LOG_TRACE(log, load_all_query);
return loadBase(load_all_query);
return loadFromQuery(bridge_url, sample_block, load_all_query);
}
BlockInputStreamPtr XDBCDictionarySource::loadUpdatedAll()
{
std::string load_query_update = getUpdateFieldAndDate();
LOG_TRACE(log, load_query_update);
return loadBase(load_query_update);
return loadFromQuery(bridge_url, sample_block, load_query_update);
}
BlockInputStreamPtr XDBCDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
const auto query = query_builder.composeLoadIdsQuery(ids);
return loadBase(query);
return loadFromQuery(bridge_url, sample_block, query);
}
BlockInputStreamPtr XDBCDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
{
const auto query = query_builder.composeLoadKeysQuery(key_columns, requested_rows, ExternalQueryBuilder::AND_OR_CHAIN);
return loadBase(query);
return loadFromQuery(bridge_url, sample_block, query);
}
bool XDBCDictionarySource::supportsSelectiveLoad() const
{
return true;
}
bool XDBCDictionarySource::hasUpdateField() const
{
return !update_field.empty();
}
DictionarySourcePtr XDBCDictionarySource::clone() const
{
return std::make_unique<XDBCDictionarySource>(*this);
}
std::string XDBCDictionarySource::toString() const
{
return bridge_helper->getName() + ": " + db + '.' + table + (where.empty() ? "" : ", where: " + where);
}
bool XDBCDictionarySource::isModified() const
{
if (!invalidate_query.empty())
@ -235,41 +242,38 @@ std::string XDBCDictionarySource::doInvalidateQuery(const std::string & request)
bridge_helper->startBridgeSync();
auto invalidate_url = bridge_helper->getMainURI();
auto url_params = bridge_helper->getURLParams(invalidate_sample_block.getNamesAndTypesList().toString(), max_block_size);
auto url_params = bridge_helper->getURLParams(max_block_size);
for (const auto & [name, value] : url_params)
invalidate_url.addQueryParameter(name, value);
XDBCBridgeBlockInputStream stream(
invalidate_url,
[request](std::ostream & os) { os << "query=" << request; },
invalidate_sample_block,
getContext(),
max_block_size,
timeouts,
bridge_helper->getName() + "BlockInputStream");
return readInvalidateQuery(stream);
return readInvalidateQuery(*loadFromQuery(invalidate_url, invalidate_sample_block, request));
}
BlockInputStreamPtr XDBCDictionarySource::loadBase(const std::string & query) const
BlockInputStreamPtr XDBCDictionarySource::loadFromQuery(const Poco::URI url, const Block & required_sample_block, const std::string & query) const
{
bridge_helper->startBridgeSync();
auto write_body_callback = [required_sample_block, query](std::ostream & os)
{
os << "sample_block=" << escapeForFileName(required_sample_block.getNamesAndTypesList().toString());
os << "&";
os << "query=" << escapeForFileName(query);
};
return std::make_shared<XDBCBridgeBlockInputStream>(
bridge_url,
[query](std::ostream & os) { os << "query=" << query; },
sample_block,
url,
write_body_callback,
required_sample_block,
getContext(),
max_block_size,
timeouts,
bridge_helper->getName() + "BlockInputStream");
}
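
With this change both the sample block and the query are sent in the POST body, each escaped. A rough sketch of what the body callback produces; escape() below is a simplified, hypothetical stand-in for escapeForFileName(), and the column list and query are made up for illustration.

#include <cctype>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

// Simplified stand-in for escapeForFileName(): percent-encode everything
// that is not alphanumeric. The real function keeps a few more characters.
std::string escape(const std::string & s)
{
    std::ostringstream out;
    for (unsigned char c : s)
    {
        if (std::isalnum(c))
            out << c;
        else
            out << '%' << std::uppercase << std::hex << std::setw(2) << std::setfill('0') << int(c)
                << std::dec << std::nouppercase;
    }
    return out.str();
}

int main()
{
    std::string sample_block = "id UInt64, name String";     // what NamesAndTypesList::toString() could look like
    std::string query = "SELECT id, name FROM dict_table";   // hypothetical dictionary query

    std::ostringstream body;
    body << "sample_block=" << escape(sample_block) << "&query=" << escape(query);
    std::cout << body.str() << "\n";
}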
void registerDictionarySourceXDBC(DictionarySourceFactory & factory)
{
#if USE_ODBC
Poco::Data::ODBC::Connector::registerConnector();
#endif
auto create_table_source = [=](const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
@ -294,6 +298,7 @@ void registerDictionarySourceXDBC(DictionarySourceFactory & factory)
factory.registerSource("odbc", create_table_source);
}
void registerDictionarySourceJDBC(DictionarySourceFactory & factory)
{
auto create_table_source = [=](const DictionaryStructure & /* dict_struct */,

View File

@ -62,7 +62,7 @@ private:
// execute invalidate_query. expects single cell in result
std::string doInvalidateQuery(const std::string & request) const;
BlockInputStreamPtr loadBase(const std::string & query) const;
BlockInputStreamPtr loadFromQuery(const Poco::URI url, const Block & required_sample_block, const std::string & query) const;
Poco::Logger * log;

View File

@ -307,7 +307,22 @@ void buildPrimaryKeyConfiguration(
AutoPtr<Element> name_element(doc->createElement("name"));
id_element->appendChild(name_element);
const ASTDictionaryAttributeDeclaration * dict_attr = children.front()->as<const ASTDictionaryAttributeDeclaration>();
auto identifier_name = key_names.front();
auto it = std::find_if(children.begin(), children.end(), [&](const ASTPtr & node)
{
const ASTDictionaryAttributeDeclaration * dict_attr = node->as<const ASTDictionaryAttributeDeclaration>();
return dict_attr->name == identifier_name;
});
if (it == children.end())
{
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION,
"Primary key field '{}' not found among attributes",
identifier_name);
}
const ASTDictionaryAttributeDeclaration * dict_attr = (*it)->as<const ASTDictionaryAttributeDeclaration>();
AutoPtr<Text> name(doc->createTextNode(dict_attr->name));
name_element->appendChild(name);
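
A small sketch of the lookup introduced above: the primary key name is matched against the declared attributes, and a missing match is reported as an error. AttributeDeclaration is a hypothetical stand-in for ASTDictionaryAttributeDeclaration.

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

struct AttributeDeclaration { std::string name; std::string type; };

int main()
{
    std::vector<AttributeDeclaration> children{{"id", "UInt64"}, {"value", "String"}};
    std::string identifier_name = "id";   // first of key_names

    auto it = std::find_if(children.begin(), children.end(),
        [&](const AttributeDeclaration & attr) { return attr.name == identifier_name; });

    if (it == children.end())
        throw std::runtime_error("Primary key field '" + identifier_name + "' not found among attributes");

    std::cout << "primary key attribute: " << it->name << " " << it->type << "\n";
}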

View File

@ -1,7 +1,6 @@
#include "DiskCacheWrapper.h"
#include <IO/copyData.h>
#include <Common/quoteString.h>
#include <common/logger_useful.h>
#include <condition_variable>
namespace DB
@ -114,7 +113,7 @@ DiskCacheWrapper::readFile(
if (!cache_file_predicate(path))
return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache);
LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Read file {} from cache", backQuote(path));
LOG_DEBUG(log, "Read file {} from cache", backQuote(path));
if (cache_disk->exists(path))
return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache);
@ -128,11 +127,11 @@ DiskCacheWrapper::readFile(
{
/// This thread will be responsible for downloading the file to the cache.
metadata->status = DOWNLOADING;
LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} doesn't exist in cache. Will download it", backQuote(path));
LOG_DEBUG(log, "File {} doesn't exist in cache. Will download it", backQuote(path));
}
else if (metadata->status == DOWNLOADING)
{
LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Waiting for file {} download to cache", backQuote(path));
LOG_DEBUG(log, "Waiting for file {} download to cache", backQuote(path));
metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; });
}
}
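
A compact model of the download coordination used in readFile() above: the first thread that misses the cache becomes the downloader, the others wait on a condition variable until the status flips to DOWNLOADED. Types and paths here are simplified stand-ins, not the real DiskCacheWrapper members.

#include <condition_variable>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

enum class Status { DOWNLOADING, DOWNLOADED };

struct DownloadMetadata
{
    Status status = Status::DOWNLOADING;
    std::condition_variable condition;
};

std::mutex cache_mutex;
std::unordered_map<std::string, std::shared_ptr<DownloadMetadata>> file_downloads;

void readThroughCache(const std::string & path)
{
    std::shared_ptr<DownloadMetadata> metadata;
    bool downloader = false;
    {
        std::unique_lock lock(cache_mutex);
        auto it = file_downloads.find(path);
        if (it == file_downloads.end())
        {
            /// This thread is responsible for downloading the file to the cache.
            metadata = std::make_shared<DownloadMetadata>();
            file_downloads[path] = metadata;
            downloader = true;
        }
        else
        {
            /// Another thread is already downloading: wait for it to finish.
            metadata = it->second;
            metadata->condition.wait(lock, [&] { return metadata->status == Status::DOWNLOADED; });
        }
    }

    if (downloader)
    {
        std::cout << "downloading " << path << " to cache\n";   /// the real code copies from the wrapped disk
        std::lock_guard lock(cache_mutex);
        metadata->status = Status::DOWNLOADED;
        metadata->condition.notify_all();
    }
    std::cout << "reading " << path << " from cache\n";
}

int main()
{
    std::vector<std::thread> threads;
    for (int i = 0; i < 3; ++i)
        threads.emplace_back([] { readThroughCache("part/data.bin"); });
    for (auto & t : threads)
        t.join();
}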
@ -157,7 +156,7 @@ DiskCacheWrapper::readFile(
}
cache_disk->moveFile(tmp_path, path);
LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} downloaded to cache", backQuote(path));
LOG_DEBUG(log, "File {} downloaded to cache", backQuote(path));
}
catch (...)
{
@ -186,7 +185,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode
if (!cache_file_predicate(path))
return DiskDecorator::writeFile(path, buf_size, mode);
LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Write file {} to cache", backQuote(path));
LOG_DEBUG(log, "Write file {} to cache", backQuote(path));
auto dir_path = directoryPath(path);
if (!cache_disk->exists(dir_path))

View File

@ -1,6 +1,7 @@
#pragma once
#include <unordered_map>
#include <common/logger_useful.h>
#include "DiskDecorator.h"
#include "DiskLocal.h"
@ -63,6 +64,8 @@ private:
mutable std::unordered_map<String, std::weak_ptr<FileDownloadMetadata>> file_downloads;
/// Protects concurrent downloading files to cache.
mutable std::mutex mutex;
Poco::Logger * log = &Poco::Logger::get("DiskCache");
};
}

View File

@ -2,13 +2,13 @@
#include <Common/createHardLink.h>
#include "DiskFactory.h"
#include <Disks/LocalDirectorySyncGuard.h>
#include <Interpreters/Context.h>
#include <Common/filesystemHelpers.h>
#include <Common/quoteString.h>
#include <Disks/LocalDirectorySyncGuard.h>
#include <IO/createReadBufferFromFileBase.h>
#include <common/logger_useful.h>
#include <unistd.h>
@ -96,7 +96,7 @@ bool DiskLocal::tryReserve(UInt64 bytes)
std::lock_guard lock(DiskLocal::reservation_mutex);
if (bytes == 0)
{
LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving 0 bytes on disk {}", backQuote(name));
LOG_DEBUG(log, "Reserving 0 bytes on disk {}", backQuote(name));
++reservation_count;
return true;
}
@ -105,7 +105,7 @@ bool DiskLocal::tryReserve(UInt64 bytes)
UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes);
if (unreserved_space >= bytes)
{
LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving {} on disk {}, having unreserved {}.",
LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.",
ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space));
++reservation_count;
reserved_bytes += bytes;
@ -339,7 +339,7 @@ DiskLocalReservation::~DiskLocalReservation()
if (disk->reserved_bytes < size)
{
disk->reserved_bytes = 0;
LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservations size for disk '{}'.", disk->getName());
LOG_ERROR(disk->log, "Unbalanced reservations size for disk '{}'.", disk->getName());
}
else
{
@ -347,7 +347,7 @@ DiskLocalReservation::~DiskLocalReservation()
}
if (disk->reservation_count == 0)
LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservation count for disk '{}'.", disk->getName());
LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName());
else
--disk->reservation_count;
}

View File

@ -1,5 +1,6 @@
#pragma once
#include <common/logger_useful.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromFileBase.h>
@ -115,6 +116,8 @@ private:
UInt64 reservation_count = 0;
static std::mutex reservation_mutex;
Poco::Logger * log = &Poco::Logger::get("DiskLocal");
};
}

View File

@ -2,9 +2,10 @@
#include "Disks/DiskFactory.h"
#include <bitset>
#include <random>
#include <utility>
#include <optional>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromS3.h>
#include <IO/ReadHelpers.h>
@ -18,7 +19,6 @@
#include <Common/quoteString.h>
#include <Common/thread_local_rng.h>
#include <Common/ThreadPool.h>
#include <common/logger_useful.h>
#include <aws/s3/model/CopyObjectRequest.h>
#include <aws/s3/model/DeleteObjectsRequest.h>
@ -305,7 +305,7 @@ public:
}
catch (...)
{
tryLogCurrentException(&Poco::Logger::get("DiskS3"), "Failed to run async task");
tryLogCurrentException("DiskS3", "Failed to run async task");
try
{
@ -386,7 +386,7 @@ std::unique_ptr<ReadBufferFromFileBase> DiskS3::readFile(const String & path, si
{
auto metadata = readMeta(path);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Read from file by path: {}. Existing S3 objects: {}",
LOG_DEBUG(log, "Read from file by path: {}. Existing S3 objects: {}",
backQuote(metadata_path + path), metadata.remote_fs_objects.size());
auto reader = std::make_unique<ReadIndirectBufferFromS3>(client, bucket, metadata, buf_size);
@ -422,7 +422,7 @@ std::unique_ptr<WriteBufferFromFileBase> DiskS3::writeFile(const String & path,
/// Save empty metadata to disk to have ability to get file size while buffer is not finalized.
metadata.save();
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Write to file by path: {}. New S3 path: {}", backQuote(metadata_path + path), remote_fs_root_path + s3_path);
LOG_DEBUG(log, "Write to file by path: {}. New S3 path: {}", backQuote(metadata_path + path), remote_fs_root_path + s3_path);
return std::make_unique<WriteIndirectBufferFromS3>(
client, bucket, metadata, s3_path, object_metadata, min_upload_part_size, max_single_part_upload_size, buf_size);
@ -431,7 +431,7 @@ std::unique_ptr<WriteBufferFromFileBase> DiskS3::writeFile(const String & path,
{
auto metadata = readMeta(path);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Append to file by path: {}. New S3 path: {}. Existing S3 objects: {}.",
LOG_DEBUG(log, "Append to file by path: {}. New S3 path: {}. Existing S3 objects: {}.",
backQuote(metadata_path + path), remote_fs_root_path + s3_path, metadata.remote_fs_objects.size());
return std::make_unique<WriteIndirectBufferFromS3>(
@ -441,7 +441,7 @@ std::unique_ptr<WriteBufferFromFileBase> DiskS3::writeFile(const String & path,
void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys)
{
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Remove file by path: {}", backQuote(metadata_path + path));
LOG_DEBUG(log, "Remove file by path: {}", backQuote(metadata_path + path));
Poco::File file(metadata_path + path);
@ -473,7 +473,7 @@ void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys)
if (e.code() == ErrorCodes::UNKNOWN_FORMAT)
{
LOG_WARNING(
&Poco::Logger::get("DiskS3"),
log,
"Metadata file {} can't be read by reason: {}. Removing it forcibly.",
backQuote(path),
e.nested() ? e.nested()->message() : e.message());
@ -591,40 +591,36 @@ void DiskS3::startup()
if (!send_metadata)
return;
LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting up disk {}", disk_name);
LOG_INFO(log, "Starting up disk {}", disk_name);
if (readSchemaVersion(bucket, remote_fs_root_path) < RESTORABLE_SCHEMA_VERSION)
migrateToRestorableSchema();
findLastRevision();
LOG_INFO(&Poco::Logger::get("DiskS3"), "Disk {} started up", disk_name);
LOG_INFO(log, "Disk {} started up", disk_name);
}
void DiskS3::findLastRevision()
{
UInt64 l = 0, r = LATEST_REVISION;
while (l < r)
/// Construct revision number from high to low bits.
String revision;
revision.reserve(64);
for (int bit = 0; bit < 64; bit++)
{
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check revision in bounds {}-{}", l, r);
auto revision_prefix = revision + "1";
auto revision = l + (r - l + 1) / 2;
if (revision == 0)
break;
LOG_DEBUG(log, "Check object exists with revision prefix {}", revision_prefix);
auto revision_str = revisionToString(revision);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check object with revision {}", revision);
/// Check file or operation with such revision exists.
if (checkObjectExists(bucket, remote_fs_root_path + "r" + revision_str)
|| checkObjectExists(bucket, remote_fs_root_path + "operations/r" + revision_str))
l = revision;
/// Check whether a file or operation with such a revision prefix exists.
if (checkObjectExists(bucket, remote_fs_root_path + "r" + revision_prefix)
|| checkObjectExists(bucket, remote_fs_root_path + "operations/r" + revision_prefix))
revision += "1";
else
r = revision - 1;
revision += "0";
}
revision_counter = l;
LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {} for disk {}", revision_counter, disk_name);
revision_counter = static_cast<UInt64>(std::bitset<64>(revision).to_ullong());
LOG_INFO(log, "Found last revision number {} for disk {}", revision_counter, disk_name);
}
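
Since revisions are now stored as fixed-width 64-character binary strings, the latest revision can be reconstructed with at most 64 prefix-existence checks, one per bit from the most significant to the least significant. A self-contained illustration of that probe, using an in-memory set of revision strings in place of checkObjectExists() (values are hypothetical).

#include <bitset>
#include <iostream>
#include <set>
#include <string>

// Stand-in for checkObjectExists(): does any stored revision start with this prefix?
bool prefixExists(const std::set<std::string> & stored, const std::string & prefix)
{
    auto it = stored.lower_bound(prefix);
    return it != stored.end() && it->compare(0, prefix.size(), prefix) == 0;
}

int main()
{
    /// Revisions written as 64-bit binary strings, like revisionToString() now does.
    std::set<std::string> stored = {
        std::bitset<64>(5).to_string(),
        std::bitset<64>(42).to_string(),
        std::bitset<64>(1000).to_string()};

    /// Reconstruct the largest revision from the high bit to the low bit.
    std::string revision;
    for (int bit = 0; bit < 64; ++bit)
    {
        std::string candidate = revision + "1";
        revision += prefixExists(stored, candidate) ? "1" : "0";
    }

    std::cout << std::bitset<64>(revision).to_ullong() << "\n";   /// prints 1000
}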
int DiskS3::readSchemaVersion(const String & source_bucket, const String & source_path)
@ -661,7 +657,7 @@ void DiskS3::updateObjectMetadata(const String & key, const ObjectMetadata & met
void DiskS3::migrateFileToRestorableSchema(const String & path)
{
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Migrate file {} to restorable schema", metadata_path + path);
LOG_DEBUG(log, "Migrate file {} to restorable schema", metadata_path + path);
auto meta = readMeta(path);
@ -678,7 +674,7 @@ void DiskS3::migrateToRestorableSchemaRecursive(const String & path, Futures & r
{
checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks.
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Migrate directory {} to restorable schema", metadata_path + path);
LOG_DEBUG(log, "Migrate directory {} to restorable schema", metadata_path + path);
bool dir_contains_only_files = true;
for (auto it = iterateDirectory(path); it->isValid(); it->next())
@ -721,7 +717,7 @@ void DiskS3::migrateToRestorableSchema()
{
try
{
LOG_INFO(&Poco::Logger::get("DiskS3"), "Start migration to restorable schema for disk {}", disk_name);
LOG_INFO(log, "Start migration to restorable schema for disk {}", disk_name);
Futures results;
@ -736,9 +732,9 @@ void DiskS3::migrateToRestorableSchema()
saveSchemaVersion(RESTORABLE_SCHEMA_VERSION);
}
catch (const Exception & e)
catch (const Exception &)
{
LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to migrate to restorable schema. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString());
tryLogCurrentException(log, fmt::format("Failed to migrate to restorable schema for disk {}", disk_name));
throw;
}
@ -824,6 +820,7 @@ struct DiskS3::RestoreInformation
UInt64 revision = LATEST_REVISION;
String source_bucket;
String source_path;
bool detached = false;
};
void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_information)
@ -831,33 +828,50 @@ void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_informa
ReadBufferFromFile buffer(metadata_path + RESTORE_FILE_NAME, 512);
buffer.next();
/// Empty file - just restore all metadata.
if (!buffer.hasPendingData())
return;
try
{
readIntText(restore_information.revision, buffer);
assertChar('\n', buffer);
std::map<String, String> properties;
if (!buffer.hasPendingData())
return;
while (buffer.hasPendingData())
{
String property;
readText(property, buffer);
assertChar('\n', buffer);
readText(restore_information.source_bucket, buffer);
assertChar('\n', buffer);
auto pos = property.find('=');
if (pos == String::npos || pos == 0 || pos == property.length())
throw Exception(fmt::format("Invalid property {} in restore file", property), ErrorCodes::UNKNOWN_FORMAT);
if (!buffer.hasPendingData())
return;
auto key = property.substr(0, pos);
auto value = property.substr(pos + 1);
readText(restore_information.source_path, buffer);
assertChar('\n', buffer);
auto it = properties.find(key);
if (it != properties.end())
throw Exception(fmt::format("Property key duplication {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT);
if (buffer.hasPendingData())
throw Exception("Extra information at the end of restore file", ErrorCodes::UNKNOWN_FORMAT);
properties[key] = value;
}
for (const auto & [key, value] : properties)
{
ReadBufferFromString value_buffer (value);
if (key == "revision")
readIntText(restore_information.revision, value_buffer);
else if (key == "source_bucket")
readText(restore_information.source_bucket, value_buffer);
else if (key == "source_path")
readText(restore_information.source_path, value_buffer);
else if (key == "detached")
readBoolTextWord(restore_information.detached, value_buffer);
else
throw Exception(fmt::format("Unknown key {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT);
}
}
catch (const Exception & e)
catch (const Exception &)
{
throw Exception("Failed to read restore information", e, ErrorCodes::UNKNOWN_FORMAT);
tryLogCurrentException(log, "Failed to read restore information");
throw;
}
}
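
With this change the restore file is parsed as newline-separated key=value pairs instead of a fixed sequence of fields, and unknown or duplicated keys are rejected. An illustrative contents of the restore file (all values here are hypothetical):

revision=42
source_bucket=my-backup-bucket
source_path=data/backup/
detached=true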
@ -890,43 +904,43 @@ void DiskS3::restore()
throw Exception("Restoring to the same bucket is allowed only if source path is not a sub-path of configured path in S3 disk", ErrorCodes::BAD_ARGUMENTS);
}
LOG_INFO(log, "Starting to restore disk {}. Revision: {}, Source bucket: {}, Source path: {}",
disk_name, information.revision, information.source_bucket, information.source_path);
if (readSchemaVersion(information.source_bucket, information.source_path) < RESTORABLE_SCHEMA_VERSION)
throw Exception("Source bucket doesn't have restorable schema.", ErrorCodes::BAD_ARGUMENTS);
LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting to restore disk {}. Revision: {}, Source bucket: {}, Source path: {}",
disk_name, information.revision, information.source_bucket, information.source_path);
LOG_INFO(&Poco::Logger::get("DiskS3"), "Removing old metadata...");
LOG_INFO(log, "Removing old metadata...");
bool cleanup_s3 = information.source_bucket != bucket || information.source_path != remote_fs_root_path;
for (const auto & root : data_roots)
if (exists(root))
removeSharedRecursive(root + '/', !cleanup_s3);
restoreFiles(information.source_bucket, information.source_path, information.revision);
restoreFileOperations(information.source_bucket, information.source_path, information.revision);
restoreFiles(information);
restoreFileOperations(information);
Poco::File restore_file(metadata_path + RESTORE_FILE_NAME);
restore_file.remove();
saveSchemaVersion(RESTORABLE_SCHEMA_VERSION);
LOG_INFO(&Poco::Logger::get("DiskS3"), "Restore disk {} finished", disk_name);
LOG_INFO(log, "Restore disk {} finished", disk_name);
}
catch (const Exception & e)
catch (const Exception &)
{
LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to restore disk. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString());
tryLogCurrentException(log, fmt::format("Failed to restore disk {}", disk_name));
throw;
}
}
void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision)
void DiskS3::restoreFiles(const RestoreInformation & restore_information)
{
LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore files for disk {}", disk_name);
LOG_INFO(log, "Starting restore files for disk {}", disk_name);
std::vector<std::future<void>> results;
listObjects(source_bucket, source_path, [this, &source_bucket, &source_path, &target_revision, &results](auto list_result)
auto restore_files = [this, &restore_information, &results](auto list_result)
{
std::vector<String> keys;
for (const auto & row : list_result.GetContents())
@ -939,7 +953,7 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa
const auto [revision, _] = extractRevisionAndOperationFromKey(key);
/// Filter early if it's possible to get revision from key.
if (revision > target_revision)
if (revision > restore_information.revision)
continue;
keys.push_back(key);
@ -947,23 +961,26 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa
if (!keys.empty())
{
auto result = getExecutor().execute([this, &source_bucket, &source_path, keys]()
auto result = getExecutor().execute([this, &restore_information, keys]()
{
processRestoreFiles(source_bucket, source_path, keys);
processRestoreFiles(restore_information.source_bucket, restore_information.source_path, keys);
});
results.push_back(std::move(result));
}
return true;
});
};
/// Execute.
listObjects(restore_information.source_bucket, restore_information.source_path, restore_files);
for (auto & result : results)
result.wait();
for (auto & result : results)
result.get();
LOG_INFO(&Poco::Logger::get("DiskS3"), "Files are restored for disk {}", disk_name);
LOG_INFO(log, "Files are restored for disk {}", disk_name);
}
void DiskS3::processRestoreFiles(const String & source_bucket, const String & source_path, Strings keys)
@ -978,7 +995,7 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so
if (path_entry == object_metadata.end())
{
/// Such keys can remain after migration, we can skip them.
LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' in metadata", key);
LOG_WARNING(log, "Skip key {} because it doesn't have 'path' in metadata", key);
continue;
}
@ -995,18 +1012,19 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so
metadata.addObject(relative_key, head_result.GetContentLength());
metadata.save();
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored file {}", path);
LOG_DEBUG(log, "Restored file {}", path);
}
}
void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision)
void DiskS3::restoreFileOperations(const RestoreInformation & restore_information)
{
LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", disk_name);
LOG_INFO(log, "Starting restore file operations for disk {}", disk_name);
/// Enable recording file operations if we restore to a different bucket / path.
send_metadata = bucket != source_bucket || remote_fs_root_path != source_path;
send_metadata = bucket != restore_information.source_bucket || remote_fs_root_path != restore_information.source_path;
listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result)
std::set<String> renames;
auto restore_file_operations = [this, &restore_information, &renames](auto list_result)
{
const String rename = "rename";
const String hardlink = "hardlink";
@ -1018,20 +1036,20 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String &
const auto [revision, operation] = extractRevisionAndOperationFromKey(key);
if (revision == UNKNOWN_REVISION)
{
LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", key);
LOG_WARNING(log, "Skip key {} with unknown revision", key);
continue;
}
/// S3 ensures that keys will be listed in ascending UTF-8 bytes order (revision order).
/// We can stop processing if the revision of the object is already greater than required.
if (revision > target_revision)
if (revision > restore_information.revision)
return false;
/// Keep the original revision if restoring to a different bucket / path.
if (send_metadata)
revision_counter = revision - 1;
auto object_metadata = headObject(source_bucket, key).GetMetadata();
auto object_metadata = headObject(restore_information.source_bucket, key).GetMetadata();
if (operation == rename)
{
auto from_path = object_metadata["from_path"];
@ -1039,7 +1057,23 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String &
if (exists(from_path))
{
moveFile(from_path, to_path);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored rename {} -> {}", revision, from_path, to_path);
LOG_DEBUG(log, "Revision {}. Restored rename {} -> {}", revision, from_path, to_path);
if (restore_information.detached && isDirectory(to_path))
{
/// Sometimes directory paths are passed without a trailing '/'. We should keep them in one consistent form.
if (!from_path.ends_with('/'))
from_path += '/';
if (!to_path.ends_with('/'))
to_path += '/';
/// Always keep the latest actual directory path to avoid 'detaching' non-existent paths.
auto it = renames.find(from_path);
if (it != renames.end())
renames.erase(it);
renames.insert(to_path);
}
}
}
else if (operation == hardlink)
@ -1050,27 +1084,55 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String &
{
createDirectories(directoryPath(dst_path));
createHardLink(src_path, dst_path);
LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path);
LOG_DEBUG(log, "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path);
}
}
}
return true;
});
};
/// Execute.
listObjects(restore_information.source_bucket, restore_information.source_path + "operations/", restore_file_operations);
if (restore_information.detached)
{
Strings not_finished_prefixes{"tmp_", "delete_tmp_", "attaching_", "deleting_"};
for (const auto & path : renames)
{
/// Skip already detached parts.
if (path.find("/detached/") != std::string::npos)
continue;
/// Skip unfinished parts. They shouldn't be in the 'detached' directory, because ClickHouse wouldn't be able to finish processing them.
Poco::Path directory_path (path);
auto directory_name = directory_path.directory(directory_path.depth() - 1);
auto predicate = [&directory_name](String & prefix) { return directory_name.starts_with(prefix); };
if (std::any_of(not_finished_prefixes.begin(), not_finished_prefixes.end(), predicate))
continue;
auto detached_path = pathToDetached(path);
LOG_DEBUG(log, "Move directory to 'detached' {} -> {}", path, detached_path);
Poco::File(metadata_path + path).moveTo(metadata_path + detached_path);
}
}
send_metadata = true;
LOG_INFO(&Poco::Logger::get("DiskS3"), "File operations restored for disk {}", disk_name);
LOG_INFO(log, "File operations restored for disk {}", disk_name);
}
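
A compact illustration of the filtering applied to the collected rename targets: paths already under detached/ are skipped, directory names carrying an "unfinished" prefix are skipped, and everything else would be moved under detached/. Paths below are hypothetical.

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Extract the last directory component of a path like "db/table/part_1_1_0/".
std::string lastDirectory(const std::string & path)
{
    auto end = path.find_last_not_of('/');
    auto begin = path.find_last_of('/', end);
    return path.substr(begin + 1, end - begin);
}

int main()
{
    const std::vector<std::string> not_finished_prefixes{"tmp_", "delete_tmp_", "attaching_", "deleting_"};
    const std::vector<std::string> renames{
        "db/table/part_1_1_0/",
        "db/table/tmp_part_2_2_0/",
        "db/table/detached/part_3_3_0/"};

    for (const auto & path : renames)
    {
        if (path.find("/detached/") != std::string::npos)
            continue;   /// already detached

        const auto name = lastDirectory(path);
        auto has_prefix = [&name](const std::string & prefix) { return name.rfind(prefix, 0) == 0; };
        if (std::any_of(not_finished_prefixes.begin(), not_finished_prefixes.end(), has_prefix))
            continue;   /// unfinished part, it must not end up in 'detached'

        std::cout << "would detach: " << path << "\n";   /// only db/table/part_1_1_0/
    }
}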
std::tuple<UInt64, String> DiskS3::extractRevisionAndOperationFromKey(const String & key)
{
UInt64 revision = UNKNOWN_REVISION;
String revision_str;
String operation;
re2::RE2::FullMatch(key, key_regexp, &revision, &operation);
re2::RE2::FullMatch(key, key_regexp, &revision_str, &operation);
return {revision, operation};
return {(revision_str.empty() ? UNKNOWN_REVISION : static_cast<UInt64>(std::bitset<64>(revision_str).to_ullong())), operation};
}
String DiskS3::shrinkKey(const String & path, const String & key)
@ -1083,15 +1145,12 @@ String DiskS3::shrinkKey(const String & path, const String & key)
String DiskS3::revisionToString(UInt64 revision)
{
static constexpr size_t max_digits = 19; /// UInt64 max digits in decimal representation.
return std::bitset<64>(revision).to_string();
}
/// Align revision number with leading zeroes to have strict lexicographical order of them.
auto revision_str = std::to_string(revision);
auto digits_to_align = max_digits - revision_str.length();
for (size_t i = 0; i < digits_to_align; ++i)
revision_str = "0" + revision_str;
return revision_str;
String DiskS3::pathToDetached(const String & source_path)
{
return Poco::Path(source_path).parent().append(Poco::Path("detached")).toString() + '/';
}
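revisionToString now produces a fixed-width 64-character binary string, and extractRevisionAndOperationFromKey parses it back through std::bitset, so lexicographic order of the S3 operation keys coincides with numeric revision order (the property the listing loop above relies on). A small self-contained round-trip check of that encoding:
#include <bitset>
#include <cassert>
#include <cstdint>
#include <string>
/// Same encoding as the new DiskS3::revisionToString: 64-bit binary with leading zeroes.
static std::string encodeRevision(uint64_t revision)
{
    return std::bitset<64>(revision).to_string();
}
/// Inverse transform, mirroring the std::bitset parse in extractRevisionAndOperationFromKey.
static uint64_t decodeRevision(const std::string & s)
{
    return static_cast<uint64_t>(std::bitset<64>(s).to_ullong());
}
int main()
{
    const uint64_t a = 9, b = 10;
    const std::string sa = encodeRevision(a);
    const std::string sb = encodeRevision(b);
    assert(sa.size() == 64 && sb.size() == 64); /// fixed width
    assert(sa < sb);                            /// string order matches numeric order
    assert(decodeRevision(sb) == b);            /// lossless round trip
}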
void DiskS3::onFreeze(const String & path)

View File

@ -1,6 +1,7 @@
#pragma once
#include <atomic>
#include <common/logger_useful.h>
#include "Disks/DiskFactory.h"
#include "Disks/Executor.h"
#include "ProxyConfiguration.h"
@ -103,6 +104,7 @@ private:
void removeAws(const AwsS3KeyKeeper & keys);
void createFileOperationObject(const String & operation_name, UInt64 revision, const ObjectMetadata & metadata);
/// Converts revision to binary string with leading zeroes (64 bit).
static String revisionToString(UInt64 revision);
bool checkObjectExists(const String & source_bucket, const String & prefix);
@ -120,15 +122,18 @@ private:
void copyObject(const String & src_bucket, const String & src_key, const String & dst_bucket, const String & dst_key);
void readRestoreInformation(RestoreInformation & restore_information);
void restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision);
void restoreFiles(const RestoreInformation & restore_information);
void processRestoreFiles(const String & source_bucket, const String & source_path, std::vector<String> keys);
void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision);
void restoreFileOperations(const RestoreInformation & restore_information);
/// Remove 'path' prefix from 'key' to get relative key.
/// It's needed to store keys to metadata files in RELATIVE_PATHS version.
static String shrinkKey(const String & path, const String & key);
std::tuple<UInt64, String> extractRevisionAndOperationFromKey(const String & key);
/// Forms detached path '../../detached/part_name/' from '../../part_name/'
static String pathToDetached(const String & source_path);
std::shared_ptr<Aws::S3::S3Client> client;
std::shared_ptr<S3::ProxyConfiguration> proxy_configuration;
const String bucket;

View File

@ -20,7 +20,7 @@ void registerFunctionsCoding(FunctionFactory & factory)
factory.registerFunction<FunctionUUIDNumToString>();
factory.registerFunction<FunctionUUIDStringToNum>();
factory.registerFunction<FunctionHex>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionUnhex>();
factory.registerFunction<FunctionUnhex>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionChar>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionBitmaskToArray>();
factory.registerFunction<FunctionToIPv4>();

View File

@ -1,187 +0,0 @@
#include "FunctionArrayMapped.h"
#include <Functions/FunctionFactory.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int SIZES_OF_ARRAYS_DOESNT_MATCH;
extern const int TYPE_MISMATCH;
}
/** arrayFold(x1,...,xn,accum -> expression, array1,...,arrayn, init_accum) - apply the expression to each element of the array (or set of parallel arrays).
*/
class FunctionArrayFold : public IFunction
{
public:
static constexpr auto name = "arrayFold";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayFold>(); }
String getName() const override { return name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
void getLambdaArgumentTypes(DataTypes & arguments) const override
{
if (arguments.size() < 3)
throw Exception("Function " + getName() + " needs lambda function, at least one array argument and one accumulator argument.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
DataTypes nested_types(arguments.size() - 1);
for (size_t i = 0; i < nested_types.size() - 1; ++i)
{
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(&*arguments[i + 1]);
if (!array_type)
throw Exception("Argument " + toString(i + 2) + " of function " + getName() + " must be array. Found "
+ arguments[i + 1]->getName() + " instead.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
nested_types[i] = recursiveRemoveLowCardinality(array_type->getNestedType());
}
nested_types[nested_types.size() - 1] = arguments[arguments.size() - 1];
const DataTypeFunction * function_type = checkAndGetDataType<DataTypeFunction>(arguments[0].get());
if (!function_type || function_type->getArgumentTypes().size() != nested_types.size())
throw Exception("First argument for this overload of " + getName() + " must be a function with "
+ toString(nested_types.size()) + " arguments. Found "
+ arguments[0]->getName() + " instead.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
arguments[0] = std::make_shared<DataTypeFunction>(nested_types);
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() < 2)
throw Exception("Function " + getName() + " needs at least 2 arguments; passed "
+ toString(arguments.size()) + ".",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
const auto * data_type_function = checkAndGetDataType<DataTypeFunction>(arguments[0].type.get());
if (!data_type_function)
throw Exception("First argument for function " + getName() + " must be a function.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto const accumulator_type = arguments.back().type;
auto const lambda_type = data_type_function->getReturnType();
if (! accumulator_type->equals(*lambda_type))
throw Exception("Return type of lambda function must be the same as the accumulator type. "
"Inferred type of lambda " + lambda_type->getName() + ", "
+ "inferred type of accumulator " + accumulator_type->getName() + ".",
ErrorCodes::TYPE_MISMATCH);
return DataTypePtr(accumulator_type);
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
{
const auto & column_with_type_and_name = arguments[0];
if (!column_with_type_and_name.column)
throw Exception("First argument for function " + getName() + " must be a function.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const auto * column_function = typeid_cast<const ColumnFunction *>(column_with_type_and_name.column.get());
if (!column_function)
throw Exception("First argument for function " + getName() + " must be a function.",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
ColumnPtr offsets_column;
ColumnPtr column_first_array_ptr;
const ColumnArray * column_first_array = nullptr;
ColumnsWithTypeAndName arrays;
arrays.reserve(arguments.size() - 1);
for (size_t i = 1; i < arguments.size() - 1; ++i)
{
const auto & array_with_type_and_name = arguments[i];
ColumnPtr column_array_ptr = array_with_type_and_name.column;
const auto * column_array = checkAndGetColumn<ColumnArray>(column_array_ptr.get());
const DataTypePtr & array_type_ptr = array_with_type_and_name.type;
const auto * array_type = checkAndGetDataType<DataTypeArray>(array_type_ptr.get());
if (!column_array)
{
const ColumnConst * column_const_array = checkAndGetColumnConst<ColumnArray>(column_array_ptr.get());
if (!column_const_array)
throw Exception("Expected array column, found " + column_array_ptr->getName(), ErrorCodes::ILLEGAL_COLUMN);
column_array_ptr = recursiveRemoveLowCardinality(column_const_array->convertToFullColumn());
column_array = checkAndGetColumn<ColumnArray>(column_array_ptr.get());
}
if (!array_type)
throw Exception("Expected array type, found " + array_type_ptr->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!offsets_column)
{
offsets_column = column_array->getOffsetsPtr();
}
else
{
/// The first condition is optimization: do not compare data if the pointers are equal.
if (column_array->getOffsetsPtr() != offsets_column
&& column_array->getOffsets() != typeid_cast<const ColumnArray::ColumnOffsets &>(*offsets_column).getData())
throw Exception("Arrays passed to " + getName() + " must have equal size", ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
}
if (i == 1)
{
column_first_array_ptr = column_array_ptr;
column_first_array = column_array;
}
arrays.emplace_back(ColumnWithTypeAndName(column_array->getDataPtr(),
recursiveRemoveLowCardinality(array_type->getNestedType()),
array_with_type_and_name.name));
}
arrays.emplace_back(arguments.back());
MutableColumnPtr result = arguments.back().column->convertToFullColumnIfConst()->cloneEmpty();
size_t arr_cursor = 0;
for (size_t irow = 0; irow < column_first_array->size(); ++irow) // for each row of result
{
// Make accumulator column for this row. We initialize it
// with the starting value given as the last argument.
ColumnWithTypeAndName accumulator_column = arguments.back();
ColumnPtr acc(accumulator_column.column->cut(irow, 1));
auto accumulator = ColumnWithTypeAndName(acc,
accumulator_column.type,
accumulator_column.name);
ColumnPtr res(acc);
size_t const arr_next = column_first_array->getOffsets()[irow]; // when we do folding
for (size_t iter = 0; arr_cursor < arr_next; ++iter, ++arr_cursor)
{
// Make slice of input arrays and accumulator for lambda
ColumnsWithTypeAndName iter_arrays;
iter_arrays.reserve(arrays.size() + 1);
for (size_t icolumn = 0; icolumn < arrays.size() - 1; ++icolumn)
{
auto const & arr = arrays[icolumn];
iter_arrays.emplace_back(ColumnWithTypeAndName(arr.column->cut(arr_cursor, 1),
arr.type,
arr.name));
}
iter_arrays.emplace_back(accumulator);
// Calculate function on arguments
auto replicated_column_function_ptr = IColumn::mutate(column_function->replicate(ColumnArray::Offsets(column_first_array->getOffsets().size(), 1)));
auto * replicated_column_function = typeid_cast<ColumnFunction *>(replicated_column_function_ptr.get());
replicated_column_function->appendArguments(iter_arrays);
auto lambda_result = replicated_column_function->reduce().column;
if (lambda_result->lowCardinality())
lambda_result = lambda_result->convertToFullColumnIfLowCardinality();
res = lambda_result->cut(0, 1);
accumulator.column = res;
}
result->insert((*res)[0]);
}
return result;
}
};
void registerFunctionArrayFold(FunctionFactory & factory)
{
factory.registerFunction<FunctionArrayFold>();
}
}

View File

@ -4,7 +4,6 @@ namespace DB
class FunctionFactory;
void registerFunctionArrayMap(FunctionFactory & factory);
void registerFunctionArrayFold(FunctionFactory & factory);
void registerFunctionArrayFilter(FunctionFactory & factory);
void registerFunctionArrayCount(FunctionFactory & factory);
void registerFunctionArrayExists(FunctionFactory & factory);
@ -23,7 +22,6 @@ void registerFunctionArrayDifference(FunctionFactory & factory);
void registerFunctionsHigherOrder(FunctionFactory & factory)
{
registerFunctionArrayMap(factory);
registerFunctionArrayFold(factory);
registerFunctionArrayFilter(factory);
registerFunctionArrayCount(factory);
registerFunctionArrayExists(factory);

View File

@ -144,7 +144,6 @@ SRCS(
array/arrayFirst.cpp
array/arrayFirstIndex.cpp
array/arrayFlatten.cpp
array/arrayFold.cpp
array/arrayIntersect.cpp
array/arrayJoin.cpp
array/arrayMap.cpp

View File

@ -103,10 +103,14 @@ Cluster::Address::Address(
password = config.getString(config_prefix + ".password", "");
default_database = config.getString(config_prefix + ".default_database", "");
secure = config.getBool(config_prefix + ".secure", false) ? Protocol::Secure::Enable : Protocol::Secure::Disable;
compression = config.getBool(config_prefix + ".compression", true) ? Protocol::Compression::Enable : Protocol::Compression::Disable;
priority = config.getInt(config_prefix + ".priority", 1);
const char * port_type = secure == Protocol::Secure::Enable ? "tcp_port_secure" : "tcp_port";
is_local = isLocal(config.getInt(port_type, 0));
/// By default compression is disabled if the address looks like localhost.
/// NOTE: it's still enabled when interacting with servers on a different port, but we don't want to complicate the logic.
compression = config.getBool(config_prefix + ".compression", !is_local)
? Protocol::Compression::Enable : Protocol::Compression::Disable;
}
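The constructor now derives the default for <compression> from is_local: replicas that resolve to the local server get compression disabled unless the config sets it explicitly, which avoids spending CPU on compressing loopback traffic (an explicit setting still wins, since it is passed as the default to config.getBool). A tiny sketch of just that default, under the assumption that only the is_local flag matters when the key is absent:
#include <iostream>
enum class Compression { Disable, Enable };
/// Default applied when <compression> is absent from the replica config;
/// mirrors the `!is_local` default introduced above.
static Compression defaultCompression(bool is_local)
{
    return is_local ? Compression::Disable : Compression::Enable;
}
int main()
{
    std::cout << (defaultCompression(/*is_local=*/ true)  == Compression::Disable) << '\n'; /// 1: localhost replica -> off by default
    std::cout << (defaultCompression(/*is_local=*/ false) == Compression::Enable)  << '\n'; /// 1: remote replica -> on by default
}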

View File

@ -20,9 +20,14 @@ public:
/// Models will be loaded immediately and then will be updated in separate thread, each 'reload_period' seconds.
explicit ExternalModelsLoader(ContextPtr context_);
ModelPtr getModel(const std::string & name) const
ModelPtr getModel(const std::string & model_name) const
{
return std::static_pointer_cast<const IModel>(load(name));
return std::static_pointer_cast<const IModel>(load(model_name));
}
void reloadModel(const std::string & model_name) const
{
loadOrReload(model_name);
}
protected:

View File

@ -10,6 +10,7 @@
#include <Interpreters/Context.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Interpreters/ExternalDictionariesLoader.h>
#include <Interpreters/ExternalModelsLoader.h>
#include <Interpreters/EmbeddedDictionaries.h>
#include <Interpreters/ActionLocksManager.h>
#include <Interpreters/InterpreterDropQuery.h>
@ -286,6 +287,7 @@ BlockIO InterpreterSystemQuery::execute()
auto & external_dictionaries_loader = system_context->getExternalDictionariesLoader();
external_dictionaries_loader.reloadDictionary(query.target_dictionary, getContext());
ExternalDictionariesLoader::resetAll();
break;
}
@ -299,6 +301,22 @@ BlockIO InterpreterSystemQuery::execute()
ExternalDictionariesLoader::resetAll();
break;
}
case Type::RELOAD_MODEL:
{
getContext()->checkAccess(AccessType::SYSTEM_RELOAD_MODEL);
auto & external_models_loader = system_context->getExternalModelsLoader();
external_models_loader.reloadModel(query.target_model);
break;
}
case Type::RELOAD_MODELS:
{
getContext()->checkAccess(AccessType::SYSTEM_RELOAD_MODEL);
auto & external_models_loader = system_context->getExternalModelsLoader();
external_models_loader.reloadAllTriedToLoad();
break;
}
case Type::RELOAD_EMBEDDED_DICTIONARIES:
getContext()->checkAccess(AccessType::SYSTEM_RELOAD_EMBEDDED_DICTIONARIES);
system_context->getEmbeddedDictionaries().reload();
@ -652,6 +670,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_RELOAD_DICTIONARY);
break;
}
case Type::RELOAD_MODEL: [[fallthrough]];
case Type::RELOAD_MODELS:
{
required_access.emplace_back(AccessType::SYSTEM_RELOAD_MODEL);
break;
}
case Type::RELOAD_CONFIG:
{
required_access.emplace_back(AccessType::SYSTEM_RELOAD_CONFIG);

View File

@ -12,7 +12,8 @@ namespace DB
static bool isUniq(const ASTFunction & func)
{
return func.name == "uniq" || func.name == "uniqExact" || func.name == "uniqHLL12"
|| func.name == "uniqCombined" || func.name == "uniqCombined64";
|| func.name == "uniqCombined" || func.name == "uniqCombined64"
|| func.name == "uniqThetaSketch";
}
/// Remove injective functions of one argument: replace with a child

View File

@ -54,6 +54,10 @@ const char * ASTSystemQuery::typeToString(Type type)
return "RELOAD DICTIONARY";
case Type::RELOAD_DICTIONARIES:
return "RELOAD DICTIONARIES";
case Type::RELOAD_MODEL:
return "RELOAD MODEL";
case Type::RELOAD_MODELS:
return "RELOAD MODELS";
case Type::RELOAD_EMBEDDED_DICTIONARIES:
return "RELOAD EMBEDDED DICTIONARIES";
case Type::RELOAD_CONFIG:

View File

@ -36,6 +36,8 @@ public:
SYNC_REPLICA,
RELOAD_DICTIONARY,
RELOAD_DICTIONARIES,
RELOAD_MODEL,
RELOAD_MODELS,
RELOAD_EMBEDDED_DICTIONARIES,
RELOAD_CONFIG,
RELOAD_SYMBOLS,
@ -63,6 +65,7 @@ public:
Type type = Type::UNKNOWN;
String target_dictionary;
String target_model;
String database;
String table;
String replica;

View File

@ -57,7 +57,35 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
return false;
break;
}
case Type::RELOAD_MODEL:
{
String cluster_str;
if (ParserKeyword{"ON"}.ignore(pos, expected))
{
if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected))
return false;
}
res->cluster = cluster_str;
ASTPtr ast;
if (ParserStringLiteral{}.parse(pos, ast, expected))
{
res->target_model = ast->as<ASTLiteral &>().value.safeGet<String>();
}
else
{
ParserIdentifier model_parser;
ASTPtr model;
if (!model_parser.parse(pos, model, expected))
return false;
if (!tryGetIdentifierNameInto(model, res->target_model))
return false;
}
break;
}
case Type::DROP_REPLICA:
{
ASTPtr ast;

View File

@ -231,7 +231,12 @@ namespace
block_in.readSuffix();
}
void writeRemoteConvert(const DistributedHeader & distributed_header, RemoteBlockOutputStream & remote, ReadBufferFromFile & in, Poco::Logger * log)
void writeRemoteConvert(
const DistributedHeader & distributed_header,
RemoteBlockOutputStream & remote,
bool compression_expected,
ReadBufferFromFile & in,
Poco::Logger * log)
{
if (!remote.getHeader())
{
@ -262,6 +267,14 @@ namespace
return;
}
/// If connection does not use compression, we have to uncompress the data.
if (!compression_expected)
{
writeAndConvert(remote, in);
return;
}
/// Otherwise write data as it was already prepared (more efficient path).
CheckingCompressedReadBuffer checking_in(in);
remote.writePrepared(checking_in);
}
@ -545,7 +558,8 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
distributed_header.insert_settings,
distributed_header.client_info};
remote.writePrefix();
writeRemoteConvert(distributed_header, remote, in, log);
bool compression_expected = connection->getCompression() == Protocol::Compression::Enable;
writeRemoteConvert(distributed_header, remote, compression_expected, in, log);
remote.writeSuffix();
}
catch (const Exception & e)
@ -690,7 +704,8 @@ struct StorageDistributedDirectoryMonitor::Batch
distributed_header.client_info);
remote->writePrefix();
}
writeRemoteConvert(distributed_header, *remote, in, parent.log);
bool compression_expected = connection->getCompression() == Protocol::Compression::Enable;
writeRemoteConvert(distributed_header, *remote, compression_expected, in, parent.log);
}
if (remote)

View File

@ -14,6 +14,8 @@
#include <Storages/StorageURL.h>
#include <Storages/transformQueryForExternalDatabase.h>
#include <common/logger_useful.h>
#include <Common/escapeForFileName.h>
namespace DB
{
@ -53,24 +55,18 @@ std::string StorageXDBC::getReadMethod() const
}
std::vector<std::pair<std::string, std::string>> StorageXDBC::getReadURIParams(
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
const Names & /* column_names */,
const StorageMetadataPtr & /* metadata_snapshot */,
const SelectQueryInfo & /*query_info*/,
ContextPtr /*context*/,
QueryProcessingStage::Enum & /*processed_stage*/,
size_t max_block_size) const
{
NamesAndTypesList cols;
for (const String & name : column_names)
{
auto column_data = metadata_snapshot->getColumns().getPhysical(name);
cols.emplace_back(column_data.name, column_data.type);
}
return bridge_helper->getURLParams(cols.toString(), max_block_size);
return bridge_helper->getURLParams(max_block_size);
}
std::function<void(std::ostream &)> StorageXDBC::getReadPOSTDataCallback(
const Names & /*column_names*/,
const Names & column_names,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
ContextPtr local_context,
@ -84,7 +80,21 @@ std::function<void(std::ostream &)> StorageXDBC::getReadPOSTDataCallback(
remote_table_name,
local_context);
return [query](std::ostream & os) { os << "query=" << query; };
NamesAndTypesList cols;
for (const String & name : column_names)
{
auto column_data = metadata_snapshot->getColumns().getPhysical(name);
cols.emplace_back(column_data.name, column_data.type);
}
auto write_body_callback = [query, cols](std::ostream & os)
{
os << "sample_block=" << escapeForFileName(cols.toString());
os << "&";
os << "query=" << escapeForFileName(query);
};
return write_body_callback;
}
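With this change the ODBC/JDBC bridge no longer receives the column list through URL parameters; getURLParams carries only max_block_size, while the sample block and the query travel in the POST body, escaped with escapeForFileName. A rough sketch of what the callback writes, using a hypothetical escapeValue() in place of escapeForFileName (the exact escaping rules are not shown in this diff):
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
/// Hypothetical stand-in for escapeForFileName: percent-escape everything that is
/// not alphanumeric or '_' so the value survives inside the form-style body.
static std::string escapeValue(const std::string & s)
{
    static const char * hex = "0123456789ABCDEF";
    std::string out;
    for (unsigned char c : s)
    {
        if (std::isalnum(c) || c == '_')
            out += static_cast<char>(c);
        else
        {
            out += '%';
            out += hex[c >> 4];
            out += hex[c & 0xF];
        }
    }
    return out;
}
int main()
{
    /// Illustrative values: a NamesAndTypesList-style column description and a pushed-down query.
    const std::string sample_block = "id Int64, name String";
    const std::string query = "SELECT id, name FROM t WHERE id > 0";
    std::ostringstream body;
    body << "sample_block=" << escapeValue(sample_block)
         << "&"
         << "query=" << escapeValue(query);
    std::cout << body.str() << '\n';
}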
Pipe StorageXDBC::read(
@ -106,20 +116,17 @@ BlockOutputStreamPtr StorageXDBC::write(const ASTPtr & /*query*/, const StorageM
{
bridge_helper->startBridgeSync();
NamesAndTypesList cols;
Poco::URI request_uri = uri;
request_uri.setPath("/write");
for (const String & name : metadata_snapshot->getSampleBlock().getNames())
{
auto column_data = metadata_snapshot->getColumns().getPhysical(name);
cols.emplace_back(column_data.name, column_data.type);
}
auto url_params = bridge_helper->getURLParams(cols.toString(), 65536);
auto url_params = bridge_helper->getURLParams(65536);
for (const auto & [param, value] : url_params)
request_uri.addQueryParameter(param, value);
request_uri.addQueryParameter("db_name", remote_database_name);
request_uri.addQueryParameter("table_name", remote_table_name);
request_uri.addQueryParameter("format_name", format_name);
request_uri.addQueryParameter("sample_block", metadata_snapshot->getSampleBlock().getNamesAndTypesList().toString());
return std::make_shared<StorageURLBlockOutputStream>(
request_uri,

View File

@ -1124,23 +1124,28 @@ class ClickHouseInstance:
return self.http_query(sql=sql, data=data, params=params, user=user, password=password,
expect_fail_and_get_error=True)
def stop_clickhouse(self, start_wait_sec=5, kill=False):
def stop_clickhouse(self, stop_wait_sec=30, kill=False):
if not self.stay_alive:
raise Exception("clickhouse can be stopped only with stay_alive=True instance")
self.exec_in_container(["bash", "-c", "pkill {} clickhouse".format("-9" if kill else "")], user='root')
time.sleep(start_wait_sec)
deadline = time.time() + stop_wait_sec
while time.time() < deadline:
time.sleep(0.5)
if self.get_process_pid("clickhouse") is None:
break
assert self.get_process_pid("clickhouse") is None, "ClickHouse was not stopped"
def start_clickhouse(self, stop_wait_sec=5):
def start_clickhouse(self, start_wait_sec=30):
if not self.stay_alive:
raise Exception("clickhouse can be started again only with stay_alive=True instance")
self.exec_in_container(["bash", "-c", "{} --daemon".format(CLICKHOUSE_START_COMMAND)], user=str(os.getuid()))
# wait start
from helpers.test_tools import assert_eq_with_retry
assert_eq_with_retry(self, "select 1", "1", retry_count=int(stop_wait_sec / 0.5), sleep_time=0.5)
assert_eq_with_retry(self, "select 1", "1", retry_count=int(start_wait_sec / 0.5), sleep_time=0.5)
def restart_clickhouse(self, stop_start_wait_sec=5, kill=False):
def restart_clickhouse(self, stop_start_wait_sec=30, kill=False):
self.stop_clickhouse(stop_start_wait_sec, kill)
self.start_clickhouse(stop_start_wait_sec)

View File

@ -0,0 +1,3 @@
<yandex>
<catboost_dynamic_library_path>/etc/clickhouse-server/model/libcatboostmodel.so</catboost_dynamic_library_path>
</yandex>

View File

@ -0,0 +1,3 @@
<yandex>
<models_config>/etc/clickhouse-server/model/model_config.xml</models_config>
</yandex>

View File

@ -0,0 +1,8 @@
<models>
<model>
<type>catboost</type>
<name>model</name>
<path>/etc/clickhouse-server/model/model.cbm</path>
<lifetime>0</lifetime>
</model>
</models>

View File

@ -0,0 +1,74 @@
import os
import sys
import time
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance('node', stay_alive=True, main_configs=['config/models_config.xml', 'config/catboost_lib.xml'])
def copy_file_to_container(local_path, dist_path, container_id):
os.system("docker cp {local} {cont_id}:{dist}".format(local=local_path, cont_id=container_id, dist=dist_path))
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
copy_file_to_container(os.path.join(SCRIPT_DIR, 'model/.'), '/etc/clickhouse-server/model', node.docker_id)
node.query("CREATE TABLE binary (x UInt64, y UInt64) ENGINE = TinyLog()")
node.query("INSERT INTO binary VALUES (1, 1), (1, 0), (0, 1), (0, 0)")
node.restart_clickhouse()
yield cluster
finally:
cluster.shutdown()
def test_model_reload(started_cluster):
node.exec_in_container(["bash", "-c", "rm -f /etc/clickhouse-server/model/model.cbm"])
node.exec_in_container(["bash", "-c", "ln /etc/clickhouse-server/model/conjunction.cbm /etc/clickhouse-server/model/model.cbm"])
node.query("SYSTEM RELOAD MODEL model")
result = node.query("""
WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability
SELECT if(probability > 0.5, 1, 0) FROM binary;
""")
assert result == '1\n0\n0\n0\n'
node.exec_in_container(["bash", "-c", "rm /etc/clickhouse-server/model/model.cbm"])
node.exec_in_container(["bash", "-c", "ln /etc/clickhouse-server/model/disjunction.cbm /etc/clickhouse-server/model/model.cbm"])
node.query("SYSTEM RELOAD MODEL model")
result = node.query("""
WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability
SELECT if(probability > 0.5, 1, 0) FROM binary;
""")
assert result == '1\n1\n1\n0\n'
def test_models_reload(started_cluster):
node.exec_in_container(["bash", "-c", "rm -f /etc/clickhouse-server/model/model.cbm"])
node.exec_in_container(["bash", "-c", "ln /etc/clickhouse-server/model/conjunction.cbm /etc/clickhouse-server/model/model.cbm"])
node.query("SYSTEM RELOAD MODELS")
result = node.query("""
WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability
SELECT if(probability > 0.5, 1, 0) FROM binary;
""")
assert result == '1\n0\n0\n0\n'
node.exec_in_container(["bash", "-c", "rm /etc/clickhouse-server/model/model.cbm"])
node.exec_in_container(["bash", "-c", "ln /etc/clickhouse-server/model/disjunction.cbm /etc/clickhouse-server/model/model.cbm"])
node.query("SYSTEM RELOAD MODELS")
result = node.query("""
WITH modelEvaluate('model', toFloat64(x), toFloat64(y)) as prediction, exp(prediction) / (1 + exp(prediction)) as probability
SELECT if(probability > 0.5, 1, 0) FROM binary;
""")
assert result == '1\n1\n1\n0\n'

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<node>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>node</host>
<port>9000</port>
</replica>
</shard>
</node>
<node_another_bucket>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>node_another_bucket</host>
<port>9000</port>
</replica>
</shard>
</node_another_bucket>
</remote_servers>
</yandex>

View File

@ -7,20 +7,21 @@ import time
import pytest
from helpers.cluster import ClickHouseCluster
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml')
NOT_RESTORABLE_CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml')
COMMON_CONFIGS = ["configs/config.d/bg_processing_pool_conf.xml", "configs/config.d/log_conf.xml", "configs/config.d/clusters.xml"]
def replace_config(old, new):
config = open(CONFIG_PATH, 'r')
config = open(NOT_RESTORABLE_CONFIG_PATH, 'r')
config_lines = config.readlines()
config.close()
config_lines = [line.replace(old, new) for line in config_lines]
config = open(CONFIG_PATH, 'w')
config = open(NOT_RESTORABLE_CONFIG_PATH, 'w')
config.writelines(config_lines)
config.close()
@ -29,22 +30,22 @@ def replace_config(old, new):
def cluster():
try:
cluster = ClickHouseCluster(__file__)
cluster.add_instance("node", main_configs=[
"configs/config.d/storage_conf.xml",
"configs/config.d/bg_processing_pool_conf.xml",
"configs/config.d/log_conf.xml"], user_configs=[], with_minio=True, stay_alive=True)
cluster.add_instance("node_another_bucket", main_configs=[
"configs/config.d/storage_conf_another_bucket.xml",
"configs/config.d/bg_processing_pool_conf.xml",
"configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True)
cluster.add_instance("node_another_bucket_path", main_configs=[
"configs/config.d/storage_conf_another_bucket_path.xml",
"configs/config.d/bg_processing_pool_conf.xml",
"configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True)
cluster.add_instance("node_not_restorable", main_configs=[
"configs/config.d/storage_conf_not_restorable.xml",
"configs/config.d/bg_processing_pool_conf.xml",
"configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True)
cluster.add_instance("node",
main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf.xml"],
macros={"cluster": "node", "replica": "0"},
with_minio=True, with_zookeeper=True, stay_alive=True)
cluster.add_instance("node_another_bucket",
main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_another_bucket.xml"],
macros={"cluster": "node_another_bucket", "replica": "0"},
with_zookeeper=True, stay_alive=True)
cluster.add_instance("node_another_bucket_path",
main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_another_bucket_path.xml"],
stay_alive=True)
cluster.add_instance("node_not_restorable",
main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_not_restorable.xml"],
stay_alive=True)
logging.info("Starting cluster...")
cluster.start()
logging.info("Cluster started")
@ -65,28 +66,26 @@ def generate_values(date_str, count, sign=1):
return ",".join(["('{}',{},'{}',{})".format(x, y, z, 0) for x, y, z in data])
def create_table(node, table_name, additional_settings=None):
def create_table(node, table_name, replicated=False):
node.query("CREATE DATABASE IF NOT EXISTS s3 ENGINE = Ordinary")
create_table_statement = """
CREATE TABLE s3.{} (
CREATE TABLE s3.{table_name} {on_cluster} (
dt Date,
id Int64,
data String,
counter Int64,
INDEX min_max (id) TYPE minmax GRANULARITY 3
) ENGINE=MergeTree()
) ENGINE={engine}
PARTITION BY dt
ORDER BY (dt, id)
SETTINGS
storage_policy='s3',
old_parts_lifetime=600,
index_granularity=512
""".format(table_name)
if additional_settings:
create_table_statement += ","
create_table_statement += additional_settings
""".format(table_name=table_name,
on_cluster="ON CLUSTER '{}'".format(node.name) if replicated else "",
engine="ReplicatedMergeTree('/clickhouse/tables/{cluster}/test', '{replica}')" if replicated else "MergeTree()")
node.query(create_table_statement)
@ -107,17 +106,23 @@ def drop_shadow_information(node):
node.exec_in_container(['bash', '-c', 'rm -rf /var/lib/clickhouse/shadow/*'], user='root')
def create_restore_file(node, revision=0, bucket=None, path=None):
add_restore_option = 'echo -en "{}\n" >> /var/lib/clickhouse/disks/s3/restore'
node.exec_in_container(['bash', '-c', add_restore_option.format(revision)], user='root')
def create_restore_file(node, revision=None, bucket=None, path=None, detached=None):
node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/disks/s3/restore'], user='root')
add_restore_option = 'echo -en "{}={}\n" >> /var/lib/clickhouse/disks/s3/restore'
if revision:
node.exec_in_container(['bash', '-c', add_restore_option.format('revision', revision)], user='root')
if bucket:
node.exec_in_container(['bash', '-c', add_restore_option.format(bucket)], user='root')
node.exec_in_container(['bash', '-c', add_restore_option.format('source_bucket', bucket)], user='root')
if path:
node.exec_in_container(['bash', '-c', add_restore_option.format(path)], user='root')
node.exec_in_container(['bash', '-c', add_restore_option.format('source_path', path)], user='root')
if detached:
node.exec_in_container(['bash', '-c', add_restore_option.format('detached', 'true')], user='root')
def get_revision_counter(node, backup_number):
return int(node.exec_in_container(['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root'))
return int(node.exec_in_container(
['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root'))
@pytest.fixture(autouse=True)
@ -128,7 +133,8 @@ def drop_table(cluster):
for node_name in node_names:
node = cluster.instances[node_name]
node.query("DROP TABLE IF EXISTS s3.test NO DELAY")
node.query("DROP TABLE IF EXISTS s3.test SYNC")
node.query("DROP DATABASE IF EXISTS s3 SYNC")
drop_s3_metadata(node)
drop_shadow_information(node)
@ -138,32 +144,23 @@ def drop_table(cluster):
purge_s3(cluster, bucket)
def test_full_restore(cluster):
@pytest.mark.parametrize(
"replicated", [False, True]
)
def test_full_restore(cluster, replicated):
node = cluster.instances["node"]
create_table(node, "test")
create_table(node, "test", replicated)
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096, -1)))
# To ensure parts have merged
node.query("OPTIMIZE TABLE s3.test")
assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
node.stop_clickhouse()
drop_s3_metadata(node)
node.start_clickhouse()
# All data is removed.
assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(0)
node.stop_clickhouse()
create_restore_file(node)
node.start_clickhouse(10)
node.start_clickhouse()
assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -191,7 +188,7 @@ def test_restore_another_bucket_path(cluster):
node_another_bucket.stop_clickhouse()
create_restore_file(node_another_bucket, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -202,7 +199,7 @@ def test_restore_another_bucket_path(cluster):
node_another_bucket_path.stop_clickhouse()
create_restore_file(node_another_bucket_path, bucket="root2", path="data")
node_another_bucket_path.start_clickhouse(10)
node_another_bucket_path.start_clickhouse()
assert node_another_bucket_path.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node_another_bucket_path.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -244,7 +241,7 @@ def test_restore_different_revisions(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision1, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -255,7 +252,7 @@ def test_restore_different_revisions(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision2, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -266,7 +263,7 @@ def test_restore_different_revisions(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision3, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -298,7 +295,7 @@ def test_restore_mutations(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision_before_mutation, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -309,7 +306,7 @@ def test_restore_mutations(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision_after_mutation, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@ -323,7 +320,7 @@ def test_restore_mutations(cluster):
purge_s3(cluster, cluster.minio_bucket_2)
revision = (revision_before_mutation + revision_after_mutation) // 2
create_restore_file(node_another_bucket, revision=revision, bucket="root")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
# Wait for unfinished mutation completion.
time.sleep(3)
@ -365,7 +362,57 @@ def test_migrate_to_restorable_schema(cluster):
drop_s3_metadata(node_another_bucket)
purge_s3(cluster, cluster.minio_bucket_2)
create_restore_file(node_another_bucket, revision=revision, bucket="root", path="another_data")
node_another_bucket.start_clickhouse(10)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 6)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
@pytest.mark.parametrize(
"replicated", [False, True]
)
def test_restore_to_detached(cluster, replicated):
node = cluster.instances["node"]
create_table(node, "test", replicated)
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-06', 4096, -1)))
node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-07', 4096, 0)))
# Add some mutation.
node.query("ALTER TABLE s3.test UPDATE counter = 1 WHERE 1", settings={"mutations_sync": 2})
# Detach some partition.
node.query("ALTER TABLE s3.test DETACH PARTITION '2020-01-07'")
node.query("ALTER TABLE s3.test FREEZE")
revision = get_revision_counter(node, 1)
node_another_bucket = cluster.instances["node_another_bucket"]
create_table(node_another_bucket, "test", replicated)
node_another_bucket.stop_clickhouse()
create_restore_file(node_another_bucket, revision=revision, bucket="root", path="data", detached=True)
node_another_bucket.start_clickhouse()
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(0)
node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-03'")
node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-04'")
node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-05'")
node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-06'")
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 4)
# Attach partition that was already detached before backup-restore.
node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-07'")
assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 5)
assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0)
assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 5)

View File

@ -505,3 +505,59 @@ def test_concurrent_queries(started_cluster):
node1.query('DROP TABLE test_pg_table;')
cursor.execute('DROP TABLE clickhouse.test_pg_table;')
def test_odbc_long_column_names(started_cluster):
conn = get_postgres_conn();
cursor = conn.cursor()
column_name = "column" * 8
create_table = "CREATE TABLE clickhouse.test_long_column_names ("
for i in range(1000):
if i != 0:
create_table += ", "
create_table += "{} integer".format(column_name + str(i))
create_table += ")"
cursor.execute(create_table)
insert = "INSERT INTO clickhouse.test_long_column_names SELECT i" + ", i" * 999 + " FROM generate_series(0, 99) as t(i)"
cursor.execute(insert)
conn.commit()
create_table = "CREATE TABLE test_long_column_names ("
for i in range(1000):
if i != 0:
create_table += ", "
create_table += "{} UInt32".format(column_name + str(i))
create_table += ") ENGINE=ODBC('DSN=postgresql_odbc; Servername=postgre-sql.local', 'clickhouse', 'test_long_column_names')"
result = node1.query(create_table);
result = node1.query('SELECT * FROM test_long_column_names');
expected = node1.query("SELECT number" + ", number" * 999 + " FROM numbers(100)")
assert(result == expected)
cursor.execute("DROP TABLE IF EXISTS clickhouse.test_long_column_names")
node1.query("DROP TABLE IF EXISTS test_long_column_names")
def test_odbc_long_text(started_cluster):
conn = get_postgres_conn()
cursor = conn.cursor()
cursor.execute("drop table if exists clickhouse.test_long_text")
cursor.execute("create table clickhouse.test_long_text(flen int, field1 text)");
# sample test from issue 9363
text_from_issue = """BEGIN These examples only show the order that data is arranged in. The values from different columns are stored separately, and data from the same column is stored together. Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+. Different orders for storing data are better suited to different scenarios. The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. The higher the load on the system, the more important it is to customize the system set up to match the requirements of the usage scenario, and the more fine grained this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one or few of possible scenarios. Key Properties of OLAP Scenario¶ The vast majority of requests are for read access. Data is updated in fairly large batches (> 1000 rows), not by single rows; or it is not updated at all. Data is added to the DB but is not modified. For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. Tables are "wide," meaning they contain a large number of columns. Queries are relatively rare (usually hundreds of queries per server or less per second). For simple queries, latencies around 50 ms are allowed. Column values are fairly small: numbers and short strings (for example, 60 bytes per URL). Requires high throughput when processing a single query (up to billions of rows per second per server). Transactions are not necessary. Low requirements for data consistency. There is one large table per query. All tables are small, except for one. A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a single server"s RAM. It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it doesn"t make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. Why Column-Oriented Databases Work Better in the OLAP Scenario¶ Column-oriented databases are better suited to OLAP scenarios: they are at least 100 times faster in processing most queries. The reasons are explained in detail below, but the fact is easier to demonstrate visually. END"""
cursor.execute("""insert into clickhouse.test_long_text (flen, field1) values (3248, '{}')""".format(text_from_issue));
node1.query('''
DROP TABLE IF EXISTS test_long_text;
CREATE TABLE test_long_text (flen UInt32, field1 String)
ENGINE = ODBC('DSN=postgresql_odbc; Servername=postgre-sql.local', 'clickhouse', 'test_long_text')''')
result = node1.query("select field1 from test_long_text;")
assert(result.strip() == text_from_issue)
long_text = "text" * 1000000
cursor.execute("""insert into clickhouse.test_long_text (flen, field1) values (400000, '{}')""".format(long_text));
result = node1.query("select field1 from test_long_text where flen=400000;")
assert(result.strip() == long_text)

View File

@ -0,0 +1,23 @@
# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
dataDir=/var/lib/zookeeper
# Place the dataLogDir to a separate physical disc for better performance
# dataLogDir=/disk2/zookeeper
# the port at which the clients will connect
clientPort=2181
# Leader accepts client connections. Default value is "yes". The leader machine
# coordinates updates. For higher update throughput at the slight expense of
# read throughput the leader can be configured to not accept clients and focus
# on coordination.
leaderServes=yes

View File

@ -0,0 +1,39 @@
(ns jepsen.clickhouse-keeper.bench
(:require [clojure.tools.logging :refer :all]
[jepsen
[client :as client]])
(:import (java.lang ProcessBuilder)
(java.lang ProcessBuilder$Redirect)))
(defn exec-process-builder
[command & args]
(let [pbuilder (ProcessBuilder. (into-array (cons command args)))]
(.redirectOutput pbuilder ProcessBuilder$Redirect/INHERIT)
(.redirectError pbuilder ProcessBuilder$Redirect/INHERIT)
(let [p (.start pbuilder)]
(.waitFor p))))
(defrecord BenchClient [port]
client/Client
(open! [this test node]
this)
(setup! [this test]
this)
(invoke! [this test op]
(let [bench-opts (into [] (clojure.string/split (:bench-opts op) #" "))
bench-path (:bench-path op)
nodes (into [] (flatten (map (fn [x] (identity ["-h" (str x ":" port)])) (:nodes test))))
all-args (concat [bench-path] bench-opts nodes)]
(info "Running cmd" all-args)
(apply exec-process-builder all-args)
(assoc op :type :ok :value "ok")))
(teardown! [_ test])
(close! [_ test]))
(defn bench-client
[port]
(BenchClient. port))

View File

@ -19,12 +19,17 @@
[url]
(non-precise-cached-wget! url))
(defn get-clickhouse-scp
[path]
(c/upload path (str common-prefix "/clickhouse")))
(defn download-clickhouse
[source]
(info "Downloading clickhouse from" source)
(cond
(clojure.string/starts-with? source "rbtorrent:") (get-clickhouse-sky source)
(clojure.string/starts-with? source "http") (get-clickhouse-url source)
(.exists (io/file source)) (get-clickhouse-scp source)
:else (throw (Exception. (str "Don't know how to download clickhouse from " source)))))
(defn unpack-deb
@ -128,11 +133,11 @@
db/LogFiles
(log-files [_ test node]
(c/su
(if (cu/exists? pid-file-path)
(do
(info node "Collecting traces")
(collect-traces test node))
(info node "Pid files doesn't exists"))
;(if (cu/exists? pid-file-path)
;(do
; (info node "Collecting traces")
; (collect-traces test node))
;(info node "Pid files doesn't exists"))
(kill-clickhouse! node test)
(if (cu/exists? coordination-data-dir)
(do

View File

@ -4,11 +4,13 @@
[clojure.pprint :refer [pprint]]
[jepsen.clickhouse-keeper.set :as set]
[jepsen.clickhouse-keeper.db :refer :all]
[jepsen.clickhouse-keeper.zookeeperdb :refer :all]
[jepsen.clickhouse-keeper.nemesis :as custom-nemesis]
[jepsen.clickhouse-keeper.register :as register]
[jepsen.clickhouse-keeper.unique :as unique]
[jepsen.clickhouse-keeper.queue :as queue]
[jepsen.clickhouse-keeper.counter :as counter]
[jepsen.clickhouse-keeper.bench :as bench]
[jepsen.clickhouse-keeper.constants :refer :all]
[clojure.string :as str]
[jepsen
@ -72,12 +74,29 @@
:validate [pos? "Must be a positive integer."]]
[nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"]
[nil, "--reuse-binary" "Use already downloaded binary if it exists, don't remove it on shutdown"]
[nil, "--bench" "Run perf-test mode"]
[nil, "--zookeeper-version VERSION" "Run zookeeper with version"
:default ""]
[nil, "--bench-opts STR" "Run perf-test mode"
:default "--generator list_medium_nodes -c 30 -i 1000"]
["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package"
:default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]])
:default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]
[nil "--bench-path path" "Path to keeper-bench util"
:default "/home/alesap/code/cpp/BuildCH/utils/keeper-bench/keeper-bench"]])
(defn clickhouse-keeper-test
"Given an options map from the command line runner (e.g. :nodes, :ssh,
:concurrency, ...), constructs a test map."
(defn get-db
[opts]
(if (empty? (:zookeeper-version opts))
(db (:clickhouse-source opts) (boolean (:reuse-binary opts)))
(zookeeper-db (:zookeeper-version opts))))
(defn get-port
[opts]
(if (empty? (:zookeeper-version opts))
9181
2181))
(defn clickhouse-func-tests
[opts]
(info "Test opts\n" (with-out-str (pprint opts)))
(let [quorum (boolean (:quorum opts))
@ -87,7 +106,7 @@
opts
{:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts)))
:os ubuntu/os
:db (db (:clickhouse-source opts) (boolean (:reuse-binary opts)))
:db (get-db opts)
:pure-generators true
:client (:client workload)
:nemesis (:nemesis current-nemesis)
@ -105,6 +124,30 @@
(gen/sleep 10)
(gen/clients (:final-generator workload)))})))
(defn clickhouse-perf-test
[opts]
(info "Starting performance test")
(let [dct {:type :invoke :bench-opts (:bench-opts opts) :bench-path (:bench-path opts)}]
(merge tests/noop-test
opts
{:name (str "clickhouse-keeper-perf")
:os ubuntu/os
:db (get-db opts)
:pure-generators true
:client (bench/bench-client (get-port opts))
:nemesis nemesis/noop
:generator (->> dct
(gen/stagger 1)
(gen/nemesis nil))})))
(defn clickhouse-keeper-test
"Given an options map from the command line runner (e.g. :nodes, :ssh,
:concurrency, ...), constructs a test map."
[opts]
(if (boolean (:bench opts))
(clickhouse-perf-test opts)
(clickhouse-func-tests opts)))
(def all-nemesises (keys custom-nemesis/custom-nemesises))
(def all-workloads (keys workloads))

View File

@ -45,7 +45,7 @@
(defn zk-connect
[host port timeout]
(exec-with-retries 15 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout))))
(exec-with-retries 30 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout))))
(defn zk-create-range
[conn n]

View File

@ -0,0 +1,64 @@
(ns jepsen.clickhouse-keeper.zookeeperdb
(:require [clojure.tools.logging :refer :all]
[jepsen.clickhouse-keeper.utils :refer :all]
[clojure.java.io :as io]
[jepsen
[control :as c]
[db :as db]]
[jepsen.os.ubuntu :as ubuntu]))
(defn zk-node-ids
"Returns a map of node names to node ids."
[test]
(->> test
:nodes
(map-indexed (fn [i node] [node (inc i)]))
(into {})))
(defn zk-node-id
"Given a test and a node name from that test, returns the ID for that node."
[test node]
((zk-node-ids test) node))
(defn zoo-cfg-servers
"Constructs a zoo.cfg fragment for servers."
[test mynode]
(->> (zk-node-ids test)
(map (fn [[node id]]
(str "server." id "=" (if (= (name node) mynode) "0.0.0.0" (name node)) ":2888:3888")))
(clojure.string/join "\n")))
(defn zookeeper-db
"Zookeeper DB for a particular version."
[version]
(reify db/DB
(setup! [_ test node]
(c/su
(info node "Installing ZK" version)
(c/exec :apt-get :update)
(c/exec :apt-get :install (str "zookeeper=" version))
(c/exec :apt-get :install (str "zookeeperd=" version))
(c/exec :echo (zk-node-id test node) :> "/etc/zookeeper/conf/myid")
(c/exec :echo (str (slurp (io/resource "zoo.cfg"))
"\n"
(zoo-cfg-servers test node))
:> "/etc/zookeeper/conf/zoo.cfg")
(info node "ZK restarting")
(c/exec :service :zookeeper :restart)
(info "Connecting to zk" (name node))
(zk-connect (name node) 2181 1000)
(info node "ZK ready")))
(teardown! [_ test node]
(info node "tearing down ZK")
(c/su
(c/exec :service :zookeeper :stop :|| true)
(c/exec :rm :-rf
(c/lit "/var/lib/zookeeper/version-*")
(c/lit "/var/log/zookeeper/*"))))
db/LogFiles
(log-files [_ test node]
["/var/log/zookeeper/zookeeper.log"])))

View File

@ -1,4 +0,0 @@
<test>
<query>SELECT arrayFold(x, acc -> acc + 1, range(100000), toUInt64(0))</query> <!-- count -->
<query>SELECT arrayFold(x, acc -> acc + x, range(100000), toUInt64(0))</query> <!-- sum -->
</test>

View File

@ -82,6 +82,7 @@ SYSTEM DROP CACHE ['DROP CACHE'] \N SYSTEM
SYSTEM RELOAD CONFIG ['RELOAD CONFIG'] GLOBAL SYSTEM RELOAD
SYSTEM RELOAD SYMBOLS ['RELOAD SYMBOLS'] GLOBAL SYSTEM RELOAD
SYSTEM RELOAD DICTIONARY ['SYSTEM RELOAD DICTIONARIES','RELOAD DICTIONARY','RELOAD DICTIONARIES'] GLOBAL SYSTEM RELOAD
SYSTEM RELOAD MODEL ['SYSTEM RELOAD MODELS','RELOAD MODEL','RELOAD MODELS'] GLOBAL SYSTEM RELOAD
SYSTEM RELOAD EMBEDDED DICTIONARIES ['RELOAD EMBEDDED DICTIONARIES'] GLOBAL SYSTEM RELOAD
SYSTEM RELOAD [] \N SYSTEM
SYSTEM MERGES ['SYSTEM STOP MERGES','SYSTEM START MERGES','STOP_MERGES','START MERGES'] TABLE SYSTEM

View File

@ -1,2 +1,2 @@
SET max_memory_usage = 1;
SET max_memory_usage = 1, max_untracked_memory = 1000000;
select 'test', count(*) from zeros_mt(1000000) where not ignore(zero); -- { serverError 241 }

View File

@ -0,0 +1,230 @@
uniqThetaSketch many args
10 10 100 100 1000 1000
17 10 10 100 100 610 610 766
52 10 10 100 100 608 608 766
5 10 10 100 100 608 608 765
9 10 10 100 100 608 608 765
13 10 10 100 100 607 607 765
46 10 10 100 100 607 607 765
48 10 10 100 100 609 609 765
50 10 10 100 100 608 608 765
54 10 10 100 100 609 609 765
56 10 10 100 100 608 608 765
uniqThetaSketch distinct
123
143
uniqThetaSketch arrays
2
3
3
uniqThetaSketch complex types
3
3
3
3
3
3
3
3
3
uniqThetaSketch decimals
(0,0,0)
(101,101,101)
uniqThetaSketch remove injective
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x + y)
FROM
(
SELECT
number % 2 AS x,
number % 3 AS y
FROM numbers(10)
)
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(x + y)
FROM
(
SELECT
number % 2 AS x,
number % 3 AS y
FROM numbers(10)
)
SELECT uniqThetaSketch(-x)
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(bitNot(x))
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(bitNot(-x))
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
SELECT uniqThetaSketch(-bitNot(-x))
FROM
(
SELECT number % 2 AS x
FROM numbers(10)
)
1000 1000
2014-06-01 1000 1000
1000 1000
2014-06-01 1000 1000
2014-06-01 0 0 7 7
2014-06-01 0 1 7 7
2014-06-01 0 2 7 7
2014-06-01 0 3 7 7
2014-06-01 0 4 7 7
2014-06-01 0 5 7 7
2014-06-01 0 6 7 7
2014-06-01 0 7 7 7
2014-06-01 0 8 7 7
2014-06-01 0 9 7 7
2014-06-01 1 10 7 7
2014-06-01 1 11 7 7
2014-06-01 1 12 7 7
2014-06-01 1 13 7 7
2014-06-01 1 14 7 7
2014-06-01 1 15 7 7
2014-06-01 1 16 7 7
2014-06-01 1 17 7 7
2014-06-01 1 18 7 7
2014-06-01 1 19 7 7
2014-06-01 2 20 7 7
2014-06-01 2 21 7 7
2014-06-01 2 22 7 7
2014-06-01 2 23 7 7
2014-06-01 2 24 7 7
2014-06-01 2 25 7 7
2014-06-01 2 26 7 7
2014-06-01 2 27 7 7
2014-06-01 2 28 7 7
2014-06-01 2 29 7 7
2014-06-01 3 30 7 7
2014-06-01 3 31 7 7
2014-06-01 3 32 7 7
2014-06-01 3 33 7 7
2014-06-01 3 34 7 7
2014-06-01 3 35 7 7
2014-06-01 3 36 7 7
2014-06-01 3 37 7 7
2014-06-01 3 38 7 7
2014-06-01 3 39 7 7
2014-06-01 4 40 7 7
2014-06-01 4 41 7 7
2014-06-01 4 42 7 7
2014-06-01 4 43 7 7
2014-06-01 4 44 7 7
2014-06-01 4 45 7 7
2014-06-01 4 46 7 7
2014-06-01 4 47 7 7
2014-06-01 4 48 7 7
2014-06-01 4 49 7 7
2014-06-01 5 50 7 7
2014-06-01 5 51 7 7
2014-06-01 5 52 7 7
2014-06-01 5 53 7 7
2014-06-01 5 54 7 7
2014-06-01 5 55 7 7
2014-06-01 5 56 7 7
2014-06-01 5 57 7 7
2014-06-01 5 58 7 7
2014-06-01 5 59 7 7
2014-06-01 6 60 7 7
2014-06-01 6 61 7 7
2014-06-01 6 62 7 7
2014-06-01 6 63 7 7
2014-06-01 6 64 7 7
2014-06-01 6 65 7 7
2014-06-01 6 66 7 7
2014-06-01 6 67 7 7
2014-06-01 6 68 7 7
2014-06-01 6 69 7 7
2014-06-01 7 70 7 7
2014-06-01 7 71 7 7
2014-06-01 7 72 7 7
2014-06-01 7 73 7 7
2014-06-01 7 74 7 7
2014-06-01 7 75 7 7
2014-06-01 7 76 7 7
2014-06-01 7 77 7 7
2014-06-01 7 78 7 7
2014-06-01 7 79 7 7
2014-06-01 8 80 7 7
2014-06-01 8 81 7 7
2014-06-01 8 82 7 7
2014-06-01 8 83 7 7
2014-06-01 8 84 7 7
2014-06-01 8 85 7 7
2014-06-01 8 86 7 7
2014-06-01 8 87 7 7
2014-06-01 8 88 7 7
2014-06-01 8 89 7 7
2014-06-01 9 90 7 7
2014-06-01 9 91 7 7
2014-06-01 9 92 7 7
2014-06-01 9 93 7 7
2014-06-01 9 94 7 7
2014-06-01 9 95 7 7
2014-06-01 9 96 7 7
2014-06-01 9 97 7 7
2014-06-01 9 98 7 7
2014-06-01 9 99 7 7
2014-06-01 0 7 7
2014-06-01 1 7 7
2014-06-01 2 7 7
2014-06-01 3 7 7
2014-06-01 4 7 7
2014-06-01 5 7 7
2014-06-01 6 7 7
2014-06-01 7 7 7
2014-06-01 8 7 7
2014-06-01 9 7 7
2014-06-01 7 7
0 333333 53 53
1 333333 53 53
2 333333 53 53
0 333333 53 53
1 333333 53 53
2 333333 53 53

View File

@ -0,0 +1,211 @@
SELECT 'uniqThetaSketch many args';
SELECT
uniqThetaSketch(x), uniqThetaSketch((x)), uniqThetaSketch(x, y), uniqThetaSketch((x, y)), uniqThetaSketch(x, y, z), uniqThetaSketch((x, y, z))
FROM
(
SELECT
number % 10 AS x,
intDiv(number, 10) % 10 AS y,
toString(intDiv(number, 100) % 10) AS z
FROM system.numbers LIMIT 1000
);
SELECT k,
uniqThetaSketch(x), uniqThetaSketch((x)), uniqThetaSketch(x, y), uniqThetaSketch((x, y)), uniqThetaSketch(x, y, z), uniqThetaSketch((x, y, z)),
count() AS c
FROM
(
SELECT
(number + 0x8ffcbd8257219a26) * 0x66bb3430c06d2353 % 131 AS k,
number % 10 AS x,
intDiv(number, 10) % 10 AS y,
toString(intDiv(number, 100) % 10) AS z
FROM system.numbers LIMIT 100000
)
GROUP BY k
ORDER BY c DESC, k ASC
LIMIT 10;
SELECT 'uniqThetaSketch distinct';
SET count_distinct_implementation = 'uniqThetaSketch';
SELECT count(DISTINCT x) FROM (SELECT number % 123 AS x FROM system.numbers LIMIT 1000);
SELECT count(DISTINCT x, y) FROM (SELECT number % 11 AS x, number % 13 AS y FROM system.numbers LIMIT 1000);
SELECT 'uniqThetaSketch arrays';
SELECT uniqThetaSketchArray([0, 1, 1], [0, 1, 1], [0, 1, 1]);
SELECT uniqThetaSketchArray([0, 1, 1], [0, 1, 1], [0, 1, 0]);
SELECT uniqThetaSketch(x) FROM (SELECT arrayJoin([[1, 2], [1, 2], [1, 2, 3], []]) AS x);
SELECT 'uniqThetaSketch complex types';
SELECT uniqThetaSketch(x) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch(x) FROM (SELECT arrayJoin([[[]], [['a', 'b']], [['a'], ['b']], [['a', 'b']]]) AS x);
SELECT uniqThetaSketch(x, x) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch(x, arrayMap(elem -> [elem, elem], x)) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch(x, toString(x)) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch((x, x)) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch((x, arrayMap(elem -> [elem, elem], x))) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch((x, toString(x))) FROM (SELECT arrayJoin([[], ['a'], ['a', 'b'], []]) AS x);
SELECT uniqThetaSketch(x) FROM (SELECT arrayJoin([[], ['a'], ['a', NULL, 'b'], []]) AS x);
SELECT 'uniqThetaSketch decimals';
DROP TABLE IF EXISTS decimal;
CREATE TABLE decimal
(
a Decimal32(4),
b Decimal64(8),
c Decimal128(8)
) ENGINE = Memory;
SELECT (uniqThetaSketch(a), uniqThetaSketch(b), uniqThetaSketch(c))
FROM (SELECT * FROM decimal ORDER BY a);
INSERT INTO decimal (a, b, c)
SELECT toDecimal32(number - 50, 4), toDecimal64(number - 50, 8) / 3, toDecimal128(number - 50, 8) / 5
FROM system.numbers LIMIT 101;
SELECT (uniqThetaSketch(a), uniqThetaSketch(b), uniqThetaSketch(c))
FROM (SELECT * FROM decimal ORDER BY a);
DROP TABLE decimal;
SELECT 'uniqThetaSketch remove injective';
set optimize_injective_functions_inside_uniq = 1;
EXPLAIN SYNTAX select uniqThetaSketch(x) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(x + y) from (select number % 2 as x, number % 3 y from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(-x) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(bitNot(x)) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(bitNot(-x)) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(-bitNot(-x)) from (select number % 2 as x from numbers(10));
set optimize_injective_functions_inside_uniq = 0;
EXPLAIN SYNTAX select uniqThetaSketch(x) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(x + y) from (select number % 2 as x, number % 3 y from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(-x) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(bitNot(x)) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(bitNot(-x)) from (select number % 2 as x from numbers(10));
EXPLAIN SYNTAX select uniqThetaSketch(-bitNot(-x)) from (select number % 2 as x from numbers(10));
DROP TABLE IF EXISTS stored_aggregates;
-- simple
CREATE TABLE stored_aggregates
(
d Date,
Uniq AggregateFunction(uniq, UInt64),
UniqThetaSketch AggregateFunction(uniqThetaSketch, UInt64)
)
ENGINE = AggregatingMergeTree(d, d, 8192);
INSERT INTO stored_aggregates
SELECT
toDate('2014-06-01') AS d,
uniqState(number) AS Uniq,
uniqThetaSketchState(number) AS UniqThetaSketch
FROM
(
SELECT * FROM system.numbers LIMIT 1000
);
SELECT uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch) FROM stored_aggregates;
SELECT d, uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch) FROM stored_aggregates GROUP BY d ORDER BY d;
OPTIMIZE TABLE stored_aggregates;
SELECT uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch) FROM stored_aggregates;
SELECT d, uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch) FROM stored_aggregates GROUP BY d ORDER BY d;
DROP TABLE stored_aggregates;
-- complex
CREATE TABLE stored_aggregates
(
d Date,
k1 UInt64,
k2 String,
Uniq AggregateFunction(uniq, UInt64),
UniqThetaSketch AggregateFunction(uniqThetaSketch, UInt64)
)
ENGINE = AggregatingMergeTree(d, (d, k1, k2), 8192);
INSERT INTO stored_aggregates
SELECT
toDate('2014-06-01') AS d,
intDiv(number, 100) AS k1,
toString(intDiv(number, 10)) AS k2,
uniqState(toUInt64(number % 7)) AS Uniq,
uniqThetaSketchState(toUInt64(number % 7)) AS UniqThetaSketch
FROM
(
SELECT * FROM system.numbers LIMIT 1000
)
GROUP BY d, k1, k2
ORDER BY d, k1, k2;
SELECT d, k1, k2,
uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch)
FROM stored_aggregates
GROUP BY d, k1, k2
ORDER BY d, k1, k2;
SELECT d, k1,
uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch)
FROM stored_aggregates
GROUP BY d, k1
ORDER BY d, k1;
SELECT d,
uniqMerge(Uniq), uniqThetaSketchMerge(UniqThetaSketch)
FROM stored_aggregates
GROUP BY d
ORDER BY d;
DROP TABLE stored_aggregates;
---- sum + uniq with more data
drop table if exists summing_merge_tree_null;
drop table if exists summing_merge_tree_aggregate_function;
create table summing_merge_tree_null (
d materialized today(),
k UInt64,
c UInt64,
u UInt64
) engine=Null;
create materialized view summing_merge_tree_aggregate_function (
d Date,
k UInt64,
c UInt64,
un AggregateFunction(uniq, UInt64),
ut AggregateFunction(uniqThetaSketch, UInt64)
) engine=SummingMergeTree(d, k, 8192)
as select d, k, sum(c) as c, uniqState(u) as un, uniqThetaSketchState(u) as ut
from summing_merge_tree_null
group by d, k;
-- prime number 53 to avoid resonance between %3 and %53
insert into summing_merge_tree_null select number % 3, 1, number % 53 from numbers(999999);
select k, sum(c), uniqMerge(un), uniqThetaSketchMerge(ut) from summing_merge_tree_aggregate_function group by k order by k;
optimize table summing_merge_tree_aggregate_function;
select k, sum(c), uniqMerge(un), uniqThetaSketchMerge(ut) from summing_merge_tree_aggregate_function group by k order by k;
drop table summing_merge_tree_aggregate_function;
drop table summing_merge_tree_null;

View File

@ -0,0 +1,219 @@
uniqThetaSketch
1 1
3 1
6 1
7 1
9 1
11 1
14 1
17 1
19 1
20 2
26 1
31 1
35 1
36 1
0 162
1 162
3 162
6 162
7 163
9 163
10 81
11 163
13 162
14 162
17 162
19 162
20 162
21 162
22 162
26 162
31 162
35 162
36 162
0 55018
1 54020
3 53774
6 53947
7 53839
9 54408
10 26876
11 54985
13 53479
14 53516
17 53331
19 53680
20 54211
21 53054
22 54690
26 53716
31 54139
35 52331
36 53766
uniqThetaSketch round(float)
0.125 1
0.5 1
0.05 1
0.143 1
0.056 1
0.048 2
0.083 1
0.25 1
0.1 1
0.028 1
0.027 1
0.031 1
0.067 1
0.037 1
0.045 162
0.125 163
0.5 162
0.05 162
0.143 162
0.091 81
0.056 162
0.048 162
0.083 163
0.25 162
1 162
0.1 163
0.028 162
0.027 162
0.031 162
0.067 162
0.043 162
0.037 162
0.071 162
0.045 53054
0.125 53839
0.5 54020
0.05 53680
0.143 53947
0.091 26876
0.056 53331
0.048 54211
0.083 54985
0.25 53774
1 55018
0.1 54408
0.028 52331
0.027 53766
0.031 54139
0.067 53516
0.043 54690
0.037 53716
0.071 53479
uniqThetaSketch round(toFloat32())
0.5 1
0.05 1
0.25 1
0.048 2
0.083 1
0.125 1
0.031 1
0.143 1
0.028 1
0.067 1
0.027 1
0.056 1
0.037 1
0.1 1
0.5 162
0.05 162
0.25 162
0.048 162
0.091 81
0.043 162
0.071 162
0.083 163
0.125 163
0.031 162
0.143 162
0.028 162
0.067 162
0.045 162
0.027 162
0.056 162
0.037 162
0.1 163
1 162
0.5 54020
0.05 53680
0.25 53774
0.048 54211
0.091 26876
0.043 54690
0.071 53479
0.083 54985
0.125 53839
0.031 54139
0.143 53947
0.028 52331
0.067 53516
0.045 53054
0.027 53766
0.056 53331
0.037 53716
0.1 54408
1 55018
uniqThetaSketch IPv4NumToString
1 1
3 1
6 1
7 1
9 1
11 1
14 1
17 1
19 1
20 2
26 1
31 1
35 1
36 1
0 162
1 162
3 162
6 162
7 163
9 163
10 81
11 163
13 162
14 162
17 162
19 162
20 162
21 162
22 162
26 162
31 162
35 162
36 162
0 54929
1 53802
3 54706
6 54700
7 53592
9 54036
10 27392
11 53768
13 54566
14 53104
17 54243
19 55003
20 53398
21 53831
22 54603
26 54607
31 54012
35 54826
36 54910
uniqThetaSketch remote()
1
uniqThetaSketch precise
10000000
10021957
10021969
10094819

View File

@ -0,0 +1,35 @@
SELECT 'uniqThetaSketch';
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 15) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 3000) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 1000000) GROUP BY Y;
SELECT 'uniqThetaSketch round(float)';
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(1/(1 + (3*X*X - 7*X + 11) % 37), 3) AS Y FROM system.numbers LIMIT 15) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(1/(1 + (3*X*X - 7*X + 11) % 37), 3) AS Y FROM system.numbers LIMIT 3000) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(1/(1 + (3*X*X - 7*X + 11) % 37), 3) AS Y FROM system.numbers LIMIT 1000000) GROUP BY Y;
SELECT 'uniqThetaSketch round(toFloat32())';
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(toFloat32(1/(1 + (3*X*X - 7*X + 11) % 37)), 3) AS Y FROM system.numbers LIMIT 15) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(toFloat32(1/(1 + (3*X*X - 7*X + 11) % 37)), 3) AS Y FROM system.numbers LIMIT 3000) GROUP BY Y;
SELECT Y, uniqThetaSketch(X) FROM (SELECT number AS X, round(toFloat32(1/(1 + (3*X*X - 7*X + 11) % 37)), 3) AS Y FROM system.numbers LIMIT 1000000) GROUP BY Y;
SELECT 'uniqThetaSketch IPv4NumToString';
SELECT Y, uniqThetaSketch(Z) FROM (SELECT number AS X, IPv4NumToString(toUInt32(X)) AS Z, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 15) GROUP BY Y;
SELECT Y, uniqThetaSketch(Z) FROM (SELECT number AS X, IPv4NumToString(toUInt32(X)) AS Z, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 3000) GROUP BY Y;
SELECT Y, uniqThetaSketch(Z) FROM (SELECT number AS X, IPv4NumToString(toUInt32(X)) AS Z, (3*X*X - 7*X + 11) % 37 AS Y FROM system.numbers LIMIT 1000000) GROUP BY Y;
SELECT 'uniqThetaSketch remote()';
SELECT uniqThetaSketch(dummy) FROM remote('127.0.0.{2,3}', system.one);
SELECT 'uniqThetaSketch precise';
SELECT uniqExact(number) FROM numbers(1e7);
SELECT uniqCombined(number) FROM numbers(1e7);
SELECT uniqCombined64(number) FROM numbers(1e7);
SELECT uniqThetaSketch(number) FROM numbers(1e7);

View File

@ -1,8 +0,0 @@
23
3
101
269
[1,2,3,4]
[4,3,2,1]
([4,3,2,1],[1,2,3,4])
([1,3,5],[2,4,6])

View File

@ -1,8 +0,0 @@
SELECT arrayFold(x,acc -> acc + x * 2, [1,2,3,4], toInt64(3));
SELECT arrayFold(x,acc -> acc + x * 2, emptyArrayInt64(), toInt64(3));
SELECT arrayFold(x,y,acc -> acc + x * 2 + y * 3, [1,2,3,4], [5,6,7,8], toInt64(3));
SELECT arrayFold(x,y,z,acc -> acc + x * 2 + y * 3 + z * 4, [1,2,3,4], [5,6,7,8], [9,10,11,12], toInt64(3));
SELECT arrayFold(x,acc -> arrayPushBack(acc,x), [1,2,3,4], emptyArrayInt64());
SELECT arrayFold(x,acc -> arrayPushFront(acc,x), [1,2,3,4], emptyArrayInt64());
SELECT arrayFold(x,acc -> (arrayPushFront(acc.1,x), arrayPushBack(acc.2,x)), [1,2,3,4], (emptyArrayInt64(), emptyArrayInt64()));
SELECT arrayFold(x,acc -> x % 2 ? (arrayPushBack(acc.1,x), acc.2): (acc.1, arrayPushBack(acc.2,x)), [1,2,3,4,5,6], (emptyArrayInt64(), emptyArrayInt64()));

View File

@ -1,80 +0,0 @@
0
0
1
3
6
10
15
21
28
36
0
1
3
6
10
15
21
28
36
45
[]
[0]
[1,0]
[2,1,0]
[3,2,1,0]
[4,3,2,1,0]
[5,4,3,2,1,0]
[6,5,4,3,2,1,0]
[7,6,5,4,3,2,1,0]
[8,7,6,5,4,3,2,1,0]
[]
[0]
[1,0]
[1,0,2]
[3,1,0,2]
[3,1,0,2,4]
[5,3,1,0,2,4]
[5,3,1,0,2,4,6]
[7,5,3,1,0,2,4,6]
[7,5,3,1,0,2,4,6,8]
(0,0)
(0,0)
(1,-1)
(3,-3)
(6,-6)
(10,-10)
(15,-15)
(21,-21)
(28,-28)
(36,-36)
(0,0)
(0,0)
(1,-1)
(3,-3)
(6,-6)
(10,-10)
(15,-15)
(21,-21)
(28,-28)
(36,-36)
[(0,0)]
[(0,1),(0,0)]
[(1,2),(0,1),(0,0)]
[(2,3),(1,2),(0,1),(0,0)]
[(3,4),(2,3),(1,2),(0,1),(0,0)]
[(4,5),(3,4),(2,3),(1,2),(0,1),(0,0)]
[(5,6),(4,5),(3,4),(2,3),(1,2),(0,1),(0,0)]
[(6,7),(5,6),(4,5),(3,4),(2,3),(1,2),(0,1),(0,0)]
[(7,8),(6,7),(5,6),(4,5),(3,4),(2,3),(1,2),(0,1),(0,0)]
[(8,9),(7,8),(6,7),(5,6),(4,5),(3,4),(2,3),(1,2),(0,1),(0,0)]
[]
['0']
['0','1']
['0','1','2']
['0','1','2','3']
['0','1','2','3','4']
['0','1','2','3','4','5']
['0','1','2','3','4','5','6']
['0','1','2','3','4','5','6','7']
['0','1','2','3','4','5','6','7','8']

View File

@ -1,8 +0,0 @@
SELECT arrayFold(x,acc -> acc+x, range(number), toInt64(0)) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> acc+x, range(number), number) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> arrayPushFront(acc, x), range(number), emptyArrayUInt64()) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> x % 2 ? arrayPushFront(acc, x) : arrayPushBack(acc, x), range(number), emptyArrayUInt64()) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> (acc.1+x, acc.2-x), range(number), (toInt64(0), toInt64(0))) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> (acc.1+x.1, acc.2-x.2), arrayZip(range(number), range(number)), (toInt64(0), toInt64(0))) FROM system.numbers LIMIT 10;
SELECT arrayFold(x,acc -> arrayPushFront(acc, (x, x+1)), range(number), [(toUInt64(0),toUInt64(0))]) FROM system.numbers LIMIT 10;
SELECT arrayFold(x, acc -> concat(acc, arrayMap(z -> toString(x), [number])) , range(number), CAST([] as Array(String))) FROM system.numbers LIMIT 10;

View File

@ -1,12 +0,0 @@
SELECT arrayFold([]); -- { serverError 42 }
SELECT arrayFold([1,2,3]); -- { serverError 42 }
SELECT arrayFold([1,2,3], [4,5,6]); -- { serverError 43 }
SELECT arrayFold(1234); -- { serverError 42 }
SELECT arrayFold(x, acc -> acc + x, 10, 20); -- { serverError 43 }
SELECT arrayFold(x, acc -> acc + x, 10, [20, 30, 40]); -- { serverError 43 }
SELECT arrayFold(x -> x * 2, [1,2,3,4], toInt64(3)); -- { serverError 43 }
SELECT arrayFold(x,acc -> acc+x, number, toInt64(0)) FROM system.numbers LIMIT 10; -- { serverError 43 }
SELECT arrayFold(x,y,acc -> acc + x * 2 + y * 3, [1,2,3,4], [5,6,7], toInt64(3)); -- { serverError 190 }
SELECT arrayFold(x,acc -> acc + x * 2 + y * 3, [1,2,3,4], [5,6,7,8], toInt64(3)); -- { serverError 47 }
SELECT arrayFold(x,acc -> acc + x * 2, [1,2,3,4], [5,6,7,8], toInt64(3)); -- { serverError 43 }
SELECT arrayFold(x,acc -> concat(acc,', ', x), [1, 2, 3, 4], '0') -- { serverError 44 }

Some files were not shown because too many files have changed in this diff.