Merge branch 'master' of https://github.com/ClickHouse/ClickHouse into testflows_windows_functions_add_lag_and_lead_in_frame_tests

Vitaliy Zakaznikov 2021-06-09 09:00:16 -04:00
commit 5564691d8c
100 changed files with 2788 additions and 660 deletions

.gitignore

@ -14,6 +14,11 @@
/build-*
/tests/venv
# logs
*.log
*.stderr
*.stdout
/docs/build
/docs/publish
/docs/edit


@ -2,6 +2,7 @@
#### Upgrade Notes
* Do not upgrade if you have partition key with `UUID`.
* `zstd` compression library is updated to v1.5.0. You may get messages about "checksum does not match" in replication. These messages are expected due to the update of the compression algorithm and you can ignore them. They are informational and do not indicate any kind of undesired behaviour.
* The setting `compile_expressions` is enabled by default. Although it has been heavily tested on a variety of scenarios, if you notice undesired behaviour on your servers, you can try turning this setting off.
* Values of the `UUID` type cannot be compared with integers. For example, instead of writing `uuid != 0`, write `uuid != '00000000-0000-0000-0000-000000000000'`.
@ -763,6 +764,7 @@
* Allow using extended integer types (`Int128`, `Int256`, `UInt256`) in `avg` and `avgWeighted` functions. Also allow using different types (integer, decimal, floating point) for value and for weight in `avgWeighted` function. This is a backward-incompatible change: now the `avg` and `avgWeighted` functions always return `Float64` (as documented). Before this change the return type for `Decimal` arguments was also `Decimal`. [#15419](https://github.com/ClickHouse/ClickHouse/pull/15419) ([Mike](https://github.com/myrrc)).
* Expression `toUUID(N)` no longer works. Replace with `toUUID('00000000-0000-0000-0000-000000000000')`. This change is motivated by non-obvious results of `toUUID(N)` where N is non-zero.
* SSL certificates with incorrect "key usage" are rejected. In previous versions they used to work. See [#19262](https://github.com/ClickHouse/ClickHouse/issues/19262).
* `incl` references to substitutions file (`/etc/metrika.xml`) were removed from the default config (`<remote_servers>`, `<zookeeper>`, `<macros>`, `<compression>`, `<networks>`). If you were using substitutions file and were relying on those implicit references, you should put them back manually and explicitly by adding corresponding sections with `incl="..."` attributes before the update. See [#18740](https://github.com/ClickHouse/ClickHouse/pull/18740) ([alexey-milovidov](https://github.com/alexey-milovidov)).
#### New Feature


@ -4,12 +4,14 @@
#include <Core/Block.h>
#include <Interpreters/InternalTextLogsQueue.h>
#include <Interpreters/TextLog.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <sys/time.h>
#include <Poco/Message.h>
#include <Common/CurrentThread.h>
#include <Common/DNSResolver.h>
#include <common/getThreadId.h>
#include <Common/SensitiveDataMasker.h>
#include <Common/IO.h>
namespace DB
{
@ -26,16 +28,48 @@ void OwnSplitChannel::log(const Poco::Message & msg)
auto matches = masker->wipeSensitiveData(message_text);
if (matches > 0)
{
logSplit({msg, message_text}); // we will continue with the copy of original message with text modified
tryLogSplit({msg, message_text}); // we will continue with the copy of original message with text modified
return;
}
}
logSplit(msg);
tryLogSplit(msg);
}
void OwnSplitChannel::tryLogSplit(const Poco::Message & msg)
{
try
{
logSplit(msg);
}
/// It is better to catch the errors here in order to avoid
/// breaking some functionality because of an unexpected "File not
/// found" (or similar) error.
///
/// For example, StorageDistributedDirectoryMonitor will mark the batch
/// as broken, and some MergeTree code can also be affected.
///
/// Also note that we cannot log the exception here with the regular
/// tryLogCurrentException(), since this would lead to recursion;
/// let's log it to stderr at least.
catch (...)
{
MemoryTracker::LockExceptionInThread lock_memory_tracker(VariableContext::Global);
const std::string & exception_message = getCurrentExceptionMessage(true);
const std::string & message = msg.getText();
/// NOTE: errors are ignored, since nothing can be done.
writeRetry(STDERR_FILENO, "Cannot add message to the log: ");
writeRetry(STDERR_FILENO, message.data(), message.size());
writeRetry(STDERR_FILENO, "\n");
writeRetry(STDERR_FILENO, exception_message.data(), exception_message.size());
writeRetry(STDERR_FILENO, "\n");
}
}
void OwnSplitChannel::logSplit(const Poco::Message & msg)
{
ExtendedLogMessage msg_ext = ExtendedLogMessage::getFrom(msg);


@ -24,6 +24,7 @@ public:
private:
void logSplit(const Poco::Message & msg);
void tryLogSplit(const Poco::Message & msg);
using ChannelPtr = Poco::AutoPtr<Poco::Channel>;
/// Handler and its pointer casted to extended interface


@ -229,6 +229,7 @@ status()
case "$1" in
status)
status
exit 0
;;
esac


@ -8,6 +8,7 @@ RUN apt-get update -y && \
python3-wheel \
brotli \
netcat-openbsd \
postgresql-client \
zstd
RUN python3 -m pip install \


@ -32,7 +32,7 @@ CREATE TABLE `ontime`
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` Int32,
`Tail_Number` String,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,


@ -0,0 +1,107 @@
---
toc_priority: 146
toc_title: intervalLengthSum
---
# intervalLengthSum {#agg_function-intervallengthsum}
Calculates the total length of the union of all ranges (segments on a numeric axis).
**Syntax**
``` sql
intervalLengthSum(start, end)
```
**Arguments**
- `start` — The starting value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date).
- `end` — The ending value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date).
!!! info "Note"
Arguments must be of the same data type. Otherwise, an exception will be thrown.
**Returned value**
- Total length of the union of all ranges (segments on a numeric axis). Depending on the type of the argument, the return value may be of the [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64) type.
**Examples**
1. Input table:
``` text
┌─id─┬─start─┬─end─┐
│ a │ 1.1 │ 2.9 │
│ a │ 2.5 │ 3.2 │
│ a │ 4 │ 5 │
└────┴───────┴─────┘
```
In this example, the arguments of the Float32 type are used. The function returns a value of the Float64 type.
The result is the sum of the lengths of the intervals `[1.1, 3.2]` (the union of `[1.1, 2.9]` and `[2.5, 3.2]`) and `[4, 5]`.
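The page does not show how the `fl_interval` table from this example is created; a minimal sketch that reproduces the input above (the `Memory` engine is an assumption, not part of the original):
``` sql
CREATE TABLE fl_interval (`id` String, `start` Float32, `end` Float32) ENGINE = Memory;
INSERT INTO fl_interval VALUES ('a', 1.1, 2.9), ('a', 2.5, 3.2), ('a', 4, 5);
```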
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM fl_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 3.1 │ Float64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```
2. Input table:
``` text
┌─id─┬───────────────start─┬─────────────────end─┐
│ a │ 2020-01-01 01:12:30 │ 2020-01-01 02:10:10 │
│ a │ 2020-01-01 02:05:30 │ 2020-01-01 02:50:31 │
│ a │ 2020-01-01 03:11:22 │ 2020-01-01 03:23:31 │
└────┴─────────────────────┴─────────────────────┘
```
In this example, the arguments of the DateTime type are used. The function returns a value in seconds.
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM dt_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 6610 │ UInt64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```
3. Input table:
``` text
┌─id─┬──────start─┬────────end─┐
│ a │ 2020-01-01 │ 2020-01-04 │
│ a │ 2020-01-12 │ 2020-01-18 │
└────┴────────────┴────────────┘
```
In this example, the arguments of the Date type are used. The function returns a value in days.
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM date_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 9 │ UInt64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```


@ -159,7 +159,7 @@ Configuration fields:
| Tag | Description | Required |
|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
| `name` | Column name. | Yes |
| `type` | ClickHouse data type.<br/>ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. In [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes |
| `type` | ClickHouse data type: [UInt8](../../../sql-reference/data-types/int-uint.md), [UInt16](../../../sql-reference/data-types/int-uint.md), [UInt32](../../../sql-reference/data-types/int-uint.md), [UInt64](../../../sql-reference/data-types/int-uint.md), [Int8](../../../sql-reference/data-types/int-uint.md), [Int16](../../../sql-reference/data-types/int-uint.md), [Int32](../../../sql-reference/data-types/int-uint.md), [Int64](../../../sql-reference/data-types/int-uint.md), [Float32](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md), [UUID](../../../sql-reference/data-types/uuid.md), [Decimal32](../../../sql-reference/data-types/decimal.md), [Decimal64](../../../sql-reference/data-types/decimal.md), [Decimal128](../../../sql-reference/data-types/decimal.md), [Decimal256](../../../sql-reference/data-types/decimal.md), [String](../../../sql-reference/data-types/string.md).<br/>ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. In [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes |
| `null_value` | Default value for a non-existing element.<br/>In the example, it is an empty string. [NULL](../../syntax.md#null-literal) value can be used only for the `Nullable` types (see the previous line with types description). | Yes |
| `expression` | [Expression](../../../sql-reference/syntax.md#syntax-expressions) that ClickHouse executes on the value.<br/>The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.<br/><br/>Default value: no expression. | No |
| <a name="hierarchical-dict-attr"></a> `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md).<br/><br/>Default value: `false`. | No |
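The attribute fields in this table map directly onto the DDL form of a dictionary. A minimal hedged sketch (the dictionary name, source, and layout below are invented for illustration):
``` sql
CREATE DICTIONARY country_dict
(
    id UInt64,
    -- `name`, `type` and `null_value` correspond to the attribute name, its type and the DEFAULT clause
    country_name String DEFAULT '',
    -- `hierarchical` corresponds to the HIERARCHICAL keyword (an EXPRESSION clause would map to `expression`)
    parent_id UInt64 DEFAULT 0 HIERARCHICAL
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 DB 'default' TABLE 'countries' USER 'default'))
LAYOUT(HASHED())
LIFETIME(MIN 0 MAX 300);
```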


@ -0,0 +1,107 @@
---
toc_priority: 146
toc_title: intervalLengthSum
---
# intervalLengthSum {#agg_function-intervallengthsum}
Calculates the total length of the union of all intervals (segments on a numeric axis).
**Syntax**
``` sql
intervalLengthSum(start, end)
```
**Arguments**
- `start` — The starting value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date).
- `end` — The ending value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date).
!!! info "Note"
Arguments must be of the same data type. Otherwise, ClickHouse will throw an exception.
**Returned value**
- Total length of the union of all intervals (segments on a numeric axis). Depending on the type of the argument, the return value may be of the [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64) type.
**Examples**
1. Input table:
``` text
┌─id─┬─start─┬─end─┐
│ a │ 1.1 │ 2.9 │
│ a │ 2.5 │ 3.2 │
│ a │ 4 │ 5 │
└────┴───────┴─────┘
```
In this example, the arguments of the Float32 type are used. The function returns a value of the Float64 type.
The result is the sum of the lengths of the intervals `[1.1, 3.2]` (the union of `[1.1, 2.9]` and `[2.5, 3.2]`) and `[4, 5]`.
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM fl_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 3.1 │ Float64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```
2. Input table:
``` text
┌─id─┬───────────────start─┬─────────────────end─┐
│ a │ 2020-01-01 01:12:30 │ 2020-01-01 02:10:10 │
│ a │ 2020-01-01 02:05:30 │ 2020-01-01 02:50:31 │
│ a │ 2020-01-01 03:11:22 │ 2020-01-01 03:23:31 │
└────┴─────────────────────┴─────────────────────┘
```
In this example, the arguments of the DateTime type are used. The function returns a value expressed in seconds.
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM dt_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 6610 │ UInt64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```
3. Input table:
``` text
┌─id─┬──────start─┬────────end─┐
│ a │ 2020-01-01 │ 2020-01-04 │
│ a │ 2020-01-12 │ 2020-01-18 │
└────┴────────────┴────────────┘
```
In this example, the arguments of the Date type are used. The function returns a value expressed in days.
Query:
``` sql
SELECT id, intervalLengthSum(start, end), toTypeName(intervalLengthSum(start, end)) FROM date_interval GROUP BY id ORDER BY id;
```
Result:
``` text
┌─id─┬─intervalLengthSum(start, end)─┬─toTypeName(intervalLengthSum(start, end))─┐
│ a │ 9 │ UInt64 │
└────┴───────────────────────────────┴───────────────────────────────────────────┘
```


@ -159,7 +159,7 @@ CREATE DICTIONARY somename (
| Tag | Description | Required |
|------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------|
| `name` | Column name. | Yes |
| `type` | ClickHouse data type.<br/>ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. For [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes |
| `type` | ClickHouse data type: [UInt8](../../../sql-reference/data-types/int-uint.md), [UInt16](../../../sql-reference/data-types/int-uint.md), [UInt32](../../../sql-reference/data-types/int-uint.md), [UInt64](../../../sql-reference/data-types/int-uint.md), [Int8](../../../sql-reference/data-types/int-uint.md), [Int16](../../../sql-reference/data-types/int-uint.md), [Int32](../../../sql-reference/data-types/int-uint.md), [Int64](../../../sql-reference/data-types/int-uint.md), [Float32](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md), [UUID](../../../sql-reference/data-types/uuid.md), [Decimal32](../../../sql-reference/data-types/decimal.md), [Decimal64](../../../sql-reference/data-types/decimal.md), [Decimal128](../../../sql-reference/data-types/decimal.md), [Decimal256](../../../sql-reference/data-types/decimal.md), [String](../../../sql-reference/data-types/string.md).<br/>ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>[Nullable](../../../sql-reference/data-types/nullable.md) is currently supported for [Flat](external-dicts-dict-layout.md#flat), [Hashed](external-dicts-dict-layout.md#dicts-external_dicts_dict_layout-hashed), [ComplexKeyHashed](external-dicts-dict-layout.md#complex-key-hashed), [Direct](external-dicts-dict-layout.md#direct), [ComplexKeyDirect](external-dicts-dict-layout.md#complex-key-direct), [RangeHashed](external-dicts-dict-layout.md#range-hashed), [Polygon](external-dicts-dict-polygon.md), [Cache](external-dicts-dict-layout.md#cache), [ComplexKeyCache](external-dicts-dict-layout.md#complex-key-cache), [SSDCache](external-dicts-dict-layout.md#ssd-cache), [SSDComplexKeyCache](external-dicts-dict-layout.md#complex-key-ssd-cache) dictionaries. For [IPTrie](external-dicts-dict-layout.md#ip-trie) dictionaries `Nullable` types are not supported. | Yes |
| `null_value` | Default value for a non-existing element.<br/>In the example, it is an empty string. The [NULL](../../syntax.md#null-literal) value can be used only for the `Nullable` types (see the previous line with the description of types). | Yes |
| `expression` | [Expression](../../syntax.md#syntax-expressions) that ClickHouse executes on the value.<br/>The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.<br/><br/>Default value: no expression. | No |
| <a name="hierarchical-dict-attr"></a> `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](external-dicts-dict-hierarchical.md).<br/><br/>Default value: `false`. | No |

File diff suppressed because it is too large.


@ -18,12 +18,13 @@ public:
ClusterCopier(const String & task_path_,
const String & host_id_,
const String & proxy_database_name_,
ContextMutablePtr context_)
ContextMutablePtr context_,
Poco::Logger * log_)
: WithMutableContext(context_),
task_zookeeper_path(task_path_),
host_id(host_id_),
working_database_name(proxy_database_name_),
log(&Poco::Logger::get("ClusterCopier")) {}
log(log_) {}
void init();
@ -117,14 +118,14 @@ protected:
TaskStatus tryMoveAllPiecesToDestinationTable(const TaskTable & task_table, const String & partition_name);
/// Removes MATERIALIZED and ALIAS columns from create table query
static ASTPtr removeAliasColumnsFromCreateQuery(const ASTPtr & query_ast);
static ASTPtr removeAliasMaterializedAndTTLColumnsFromCreateQuery(const ASTPtr & query_ast, bool allow_to_copy_alias_and_materialized_columns);
bool tryDropPartitionPiece(ShardPartition & task_partition, size_t current_piece_number,
const zkutil::ZooKeeperPtr & zookeeper, const CleanStateClock & clean_state_clock);
static constexpr UInt64 max_table_tries = 3;
static constexpr UInt64 max_shard_partition_tries = 3;
static constexpr UInt64 max_shard_partition_piece_tries_for_alter = 3;
static constexpr UInt64 max_shard_partition_piece_tries_for_alter = 10;
bool tryProcessTable(const ConnectionTimeouts & timeouts, TaskTable & task_table);
@ -189,9 +190,7 @@ protected:
const ClusterPtr & cluster,
const String & query,
const Settings & current_settings,
PoolMode pool_mode = PoolMode::GET_ALL,
ClusterExecutionMode execution_mode = ClusterExecutionMode::ON_EACH_SHARD,
UInt64 max_successful_executions_per_shard = 0) const;
ClusterExecutionMode execution_mode = ClusterExecutionMode::ON_EACH_SHARD) const;
private:
String task_zookeeper_path;
@ -208,7 +207,6 @@ private:
ConfigurationPtr task_cluster_initial_config;
ConfigurationPtr task_cluster_current_config;
Coordination::Stat task_description_current_stat{};
std::unique_ptr<TaskCluster> task_cluster;


@ -22,8 +22,9 @@ void ClusterCopierApp::initialize(Poco::Util::Application & self)
config_xml_path = config().getString("config-file");
task_path = config().getString("task-path");
log_level = config().getString("log-level", "trace");
log_level = config().getString("log-level", "info");
is_safe_mode = config().has("safe-mode");
is_status_mode = config().has("status");
if (config().has("copy-fault-probability"))
copy_fault_probability = std::max(std::min(config().getDouble("copy-fault-probability"), 1.0), 0.0);
if (config().has("move-fault-probability"))
@ -97,6 +98,7 @@ void ClusterCopierApp::defineOptions(Poco::Util::OptionSet & options)
.argument("base-dir").binding("base-dir"));
options.addOption(Poco::Util::Option("experimental-use-sample-offset", "", "Use SAMPLE OFFSET query instead of cityHash64(PRIMARY KEY) % n == k")
.argument("experimental-use-sample-offset").binding("experimental-use-sample-offset"));
options.addOption(Poco::Util::Option("status", "", "Get the status of the current execution").binding("status"));
using Me = std::decay_t<decltype(*this)>;
options.addOption(Poco::Util::Option("help", "", "produce this help message").binding("help")
@ -106,6 +108,25 @@ void ClusterCopierApp::defineOptions(Poco::Util::OptionSet & options)
void ClusterCopierApp::mainImpl()
{
/// Status command
{
if (is_status_mode)
{
SharedContextHolder shared_context = Context::createShared();
auto context = Context::createGlobal(shared_context.get());
context->makeGlobalContext();
SCOPE_EXIT_SAFE(context->shutdown());
auto zookeeper = context->getZooKeeper();
auto status_json = zookeeper->get(task_path + "/status");
LOG_INFO(&logger(), "{}", status_json);
std::cout << status_json << std::endl;
context->resetZooKeeper();
return;
}
}
StatusFile status_file(process_path + "/status", StatusFile::write_full_info);
ThreadStatus thread_status;
@ -136,7 +157,7 @@ void ClusterCopierApp::mainImpl()
/// Initialize query scope just in case.
CurrentThread::QueryScope query_scope(context);
auto copier = std::make_unique<ClusterCopier>(task_path, host_id, default_database, context);
auto copier = std::make_unique<ClusterCopier>(task_path, host_id, default_database, context, log);
copier->setSafeMode(is_safe_mode);
copier->setCopyFaultProbability(copy_fault_probability);
copier->setMoveFaultProbability(move_fault_probability);


@ -76,8 +76,9 @@ private:
std::string config_xml_path;
std::string task_path;
std::string log_level = "trace";
std::string log_level = "info";
bool is_safe_mode = false;
bool is_status_mode = false;
double copy_fault_probability = 0.0;
double move_fault_probability = 0.0;
bool is_help = false;


@ -0,0 +1,65 @@
#pragma once
#include <Poco/JSON/Parser.h>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Stringifier.h>
#include <unordered_map>
#include <memory>
#include <string>
#include <iostream>
namespace DB
{
class StatusAccumulator
{
public:
struct TableStatus
{
size_t all_partitions_count;
size_t processed_partitions_count;
};
using Map = std::unordered_map<std::string, TableStatus>;
using MapPtr = std::shared_ptr<Map>;
static MapPtr fromJSON(std::string state_json)
{
Poco::JSON::Parser parser;
auto state = parser.parse(state_json).extract<Poco::JSON::Object::Ptr>();
MapPtr result_ptr = std::make_shared<Map>();
for (const auto & table_name : state->getNames())
{
auto table_status_json = state->getValue<std::string>(table_name);
auto table_status = parser.parse(table_status_json).extract<Poco::JSON::Object::Ptr>();
/// Map entry will be created if it is absent
auto & map_table_status = (*result_ptr)[table_name];
map_table_status.all_partitions_count += table_status->getValue<size_t>("all_partitions_count");
map_table_status.processed_partitions_count += table_status->getValue<size_t>("processed_partitions_count");
}
return result_ptr;
}
static std::string serializeToJSON(MapPtr statuses)
{
Poco::JSON::Object result_json;
for (const auto & [table_name, table_status] : *statuses)
{
Poco::JSON::Object status_json;
status_json.set("all_partitions_count", table_status.all_partitions_count);
status_json.set("processed_partitions_count", table_status.processed_partitions_count);
result_json.set(table_name, status_json);
}
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
Poco::JSON::Stringifier::stringify(result_json, oss);
auto result = oss.str();
return result;
}
};
}


@ -77,6 +77,8 @@ inline void DB::TaskCluster::reloadSettings(const Poco::Util::AbstractConfigurat
if (config.has(prefix + "settings"))
settings_common.loadSettingsFromConfig(prefix + "settings", config);
settings_common.prefer_localhost_replica = 0;
settings_pull = settings_common;
if (config.has(prefix + "settings_pull"))
settings_pull.loadSettingsFromConfig(prefix + "settings_pull", config);
@ -92,11 +94,15 @@ inline void DB::TaskCluster::reloadSettings(const Poco::Util::AbstractConfigurat
/// Override important settings
settings_pull.readonly = 1;
settings_push.insert_distributed_sync = 1;
settings_pull.prefer_localhost_replica = false;
settings_push.insert_distributed_sync = true;
settings_push.prefer_localhost_replica = false;
set_default_value(settings_pull.load_balancing, LoadBalancing::NEAREST_HOSTNAME);
set_default_value(settings_pull.max_threads, 1);
set_default_value(settings_pull.max_block_size, 8192UL);
set_default_value(settings_pull.preferred_block_size_bytes, 0);
set_default_value(settings_push.insert_distributed_timeout, 0);
set_default_value(settings_push.replication_alter_partitions_sync, 2);
}


@ -36,27 +36,33 @@ struct TaskTable
String getPartitionAttachIsDonePath(const String & partition_name) const;
String getPartitionPiecePath(const String & partition_name, const size_t piece_number) const;
String getPartitionPiecePath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionIsDirtyPath(const String & partition_name) const;
String getCertainPartitionPieceIsDirtyPath(const String & partition_name, const size_t piece_number) const;
String getCertainPartitionPieceIsDirtyPath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionIsCleanedPath(const String & partition_name) const;
String getCertainPartitionPieceIsCleanedPath(const String & partition_name, const size_t piece_number) const;
String getCertainPartitionPieceIsCleanedPath(const String & partition_name, size_t piece_number) const;
String getCertainPartitionTaskStatusPath(const String & partition_name) const;
String getCertainPartitionPieceTaskStatusPath(const String & partition_name, const size_t piece_number) const;
String getCertainPartitionPieceTaskStatusPath(const String & partition_name, size_t piece_number) const;
bool isReplicatedTable() const { return is_replicated_table; }
/// These nodes are used for check-status option
String getStatusAllPartitionCount() const;
String getStatusProcessedPartitionsCount() const;
/// Partitions will be split into number-of-splits pieces.
/// Each piece will be copied independently. (10 by default)
size_t number_of_splits;
bool allow_to_copy_alias_and_materialized_columns{false};
bool allow_to_drop_target_partitions{false};
String name_in_config;
/// Used as task ID
@ -83,7 +89,7 @@ struct TaskTable
String engine_push_zk_path;
bool is_replicated_table;
ASTPtr rewriteReplicatedCreateQueryToPlain();
ASTPtr rewriteReplicatedCreateQueryToPlain() const;
/*
* A Distributed table definition used to split data
@ -181,6 +187,7 @@ struct TaskShard
/// Last CREATE TABLE query of the table of the shard
ASTPtr current_pull_table_create_query;
ASTPtr current_push_table_create_query;
/// Internal distributed tables
DatabaseAndTableName table_read_shard;
@ -242,6 +249,16 @@ inline String TaskTable::getCertainPartitionPieceTaskStatusPath(const String & p
return getPartitionPiecePath(partition_name, piece_number) + "/shards";
}
inline String TaskTable::getStatusAllPartitionCount() const
{
return task_cluster.task_zookeeper_path + "/status/all_partitions_count";
}
inline String TaskTable::getStatusProcessedPartitionsCount() const
{
return task_cluster.task_zookeeper_path + "/status/processed_partitions_count";
}
inline TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConfiguration & config,
const String & prefix_, const String & table_key)
: task_cluster(parent)
@ -250,7 +267,10 @@ inline TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConf
name_in_config = table_key;
number_of_splits = config.getUInt64(table_prefix + "number_of_splits", 10);
number_of_splits = config.getUInt64(table_prefix + "number_of_splits", 3);
allow_to_copy_alias_and_materialized_columns = config.getBool(table_prefix + "allow_to_copy_alias_and_materialized_columns", false);
allow_to_drop_target_partitions = config.getBool(table_prefix + "allow_to_drop_target_partitions", false);
cluster_pull_name = config.getString(table_prefix + "cluster_pull");
cluster_push_name = config.getString(table_prefix + "cluster_push");
@ -343,7 +363,7 @@ inline void TaskTable::initShards(RandomEngine && random_engine)
std::uniform_int_distribution<UInt8> get_urand(0, std::numeric_limits<UInt8>::max());
// Compute the priority
for (auto & shard_info : cluster_pull->getShardsInfo())
for (const auto & shard_info : cluster_pull->getShardsInfo())
{
TaskShardPtr task_shard = std::make_shared<TaskShard>(*this, shard_info);
const auto & replicas = cluster_pull->getShardsAddresses().at(task_shard->indexInCluster());
@ -369,7 +389,7 @@ inline void TaskTable::initShards(RandomEngine && random_engine)
local_shards.assign(all_shards.begin(), it_first_remote);
}
inline ASTPtr TaskTable::rewriteReplicatedCreateQueryToPlain()
inline ASTPtr TaskTable::rewriteReplicatedCreateQueryToPlain() const
{
ASTPtr prev_engine_push_ast = engine_push_ast->clone();
@ -383,9 +403,15 @@ inline ASTPtr TaskTable::rewriteReplicatedCreateQueryToPlain()
{
auto & replicated_table_arguments = new_engine_ast.arguments->children;
/// Delete first two arguments of Replicated...MergeTree() table.
replicated_table_arguments.erase(replicated_table_arguments.begin());
replicated_table_arguments.erase(replicated_table_arguments.begin());
/// In some cases of Atomic database engine usage ReplicatedMergeTree tables
/// could be created without arguments.
if (!replicated_table_arguments.empty())
{
/// Delete first two arguments of Replicated...MergeTree() table.
replicated_table_arguments.erase(replicated_table_arguments.begin());
replicated_table_arguments.erase(replicated_table_arguments.begin());
}
}
return new_storage_ast.clone();
@ -400,7 +426,7 @@ inline String DB::TaskShard::getDescription() const
inline String DB::TaskShard::getHostNameExample() const
{
auto & replicas = task_table.cluster_pull->getShardsAddresses().at(indexInCluster());
const auto & replicas = task_table.cluster_pull->getShardsAddresses().at(indexInCluster());
return replicas.at(0).readableString();
}


@ -19,6 +19,7 @@
#include <Common/StringUtils/StringUtils.h>
#include <Common/getHashOfLoadedBinary.h>
#include <Common/IO.h>
#include <common/phdr_cache.h>
#include <ext/scope_guard.h>
@ -172,11 +173,11 @@ enum class InstructionFail
AVX512 = 8
};
std::pair<const char *, size_t> instructionFailToString(InstructionFail fail)
auto instructionFailToString(InstructionFail fail)
{
switch (fail)
{
#define ret(x) return std::make_pair(x, ARRAY_SIZE(x) - 1)
#define ret(x) return std::make_tuple(STDERR_FILENO, x, ARRAY_SIZE(x) - 1)
case InstructionFail::NONE:
ret("NONE");
case InstructionFail::SSE3:
@ -260,28 +261,12 @@ void checkRequiredInstructionsImpl(volatile InstructionFail & fail)
fail = InstructionFail::NONE;
}
/// This function is safe to use in static initializers.
void writeErrorLen(const char * data, size_t size)
{
while (size != 0)
{
ssize_t res = ::write(STDERR_FILENO, data, size);
if ((-1 == res || 0 == res) && errno != EINTR)
_Exit(1);
if (res > 0)
{
data += res;
size -= res;
}
}
}
/// Macros to avoid using strlen(), since it may fail if SSE is not supported.
#define writeError(data) do \
{ \
static_assert(__builtin_constant_p(data)); \
writeErrorLen(data, ARRAY_SIZE(data) - 1); \
if (!writeRetry(STDERR_FILENO, data, ARRAY_SIZE(data) - 1)) \
_Exit(1); \
} while (false)
/// Check SSE and others instructions availability. Calls exit on fail.
@ -310,7 +295,8 @@ void checkRequiredInstructions()
if (sigsetjmp(jmpbuf, 1))
{
writeError("Instruction check fail. The CPU does not support ");
std::apply(writeErrorLen, instructionFailToString(fail));
if (!std::apply(writeRetry, instructionFailToString(fail)))
_Exit(1);
writeError(" instruction set.\n");
_Exit(1);
}


@ -637,7 +637,7 @@ struct AggregateFunctionAnyLastData : Data
template <typename Data>
struct AggregateFunctionAnyHeavyData : Data
{
size_t counter = 0;
UInt64 counter = 0;
using Self = AggregateFunctionAnyHeavyData;

src/Common/IO.cpp (new file)

@ -0,0 +1,27 @@
#include <Common/IO.h>
#include <unistd.h>
#include <errno.h>
#include <cstring>
bool writeRetry(int fd, const char * data, size_t size)
{
if (!size)
size = strlen(data);
while (size != 0)
{
ssize_t res = ::write(fd, data, size);
if ((-1 == res || 0 == res) && errno != EINTR)
return false;
if (res > 0)
{
data += res;
size -= res;
}
}
return true;
}

src/Common/IO.h (new file)

@ -0,0 +1,13 @@
#pragma once
#include <cstddef>
/// IO helpers
/// Write loop with EINTR handling.
///
/// This function is safe to use in static initializers.
///
/// @param size - length of @data, or 0 to use strlen()
/// @return true if the write succeeded, otherwise false.
bool writeRetry(int fd, const char * data, size_t size = 0);


@ -46,6 +46,7 @@ SRCS(
ExternalLoaderStatus.cpp
FieldVisitors.cpp
FileChecker.cpp
IO.cpp
IPv6ToBinary.cpp
IntervalKind.cpp
JSONBuilder.cpp


@ -80,7 +80,7 @@ class IColumn;
M(UInt64, background_pool_size, 16, "Number of threads performing background work for tables (for example, merging in merge tree). Only has meaning at server startup.", 0) \
M(UInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \
M(UInt64, background_fetches_pool_size, 8, "Number of threads performing background fetches for replicated tables. Only has meaning at server startup.", 0) \
M(UInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, dns cache updates. Only has meaning at server startup.", 0) \
M(UInt64, background_schedule_pool_size, 128, "Number of threads performing background tasks for replicated tables, dns cache updates. Only has meaning at server startup.", 0) \
M(UInt64, background_message_broker_schedule_pool_size, 16, "Number of threads performing background tasks for message streaming. Only has meaning at server startup.", 0) \
M(UInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \
M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited. Only has meaning at server startup.", 0) \
@ -403,6 +403,7 @@ class IColumn;
M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \
M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default because it could make an inconsistent change in a distributed query that would lead to its failure.", 0) \
M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \
M(Bool, optimize_functions_to_subcolumns, false, "Transform functions to subcolumns, if possible, to reduce amount of read data. E.g. 'length(arr)' -> 'arr.size0', 'col IS NULL' -> 'col.null' ", 0) \
M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \
M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \
M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \


@ -182,18 +182,20 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
{
const ColumnMap * col_map = typeid_cast<const ColumnMap *>(arguments[0].column.get());
bool is_const = isColumnConst(*arguments[0].column);
const ColumnMap * col_map = is_const ? checkAndGetColumnConstData<ColumnMap>(arguments[0].column.get()) : checkAndGetColumn<ColumnMap>(arguments[0].column.get());
if (!col_map)
return nullptr;
throw Exception{"First argument for function " + getName() + " must be a map", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
const auto & nested_column = col_map->getNestedColumn();
const auto & keys_data = col_map->getNestedData().getColumn(0);
/// Prepare arguments to call arrayIndex to check whether the array has the element.
ColumnPtr column_array = ColumnArray::create(keys_data.getPtr(), nested_column.getOffsetsPtr());
ColumnsWithTypeAndName new_arguments =
{
{
ColumnArray::create(keys_data.getPtr(), nested_column.getOffsetsPtr()),
is_const ? ColumnConst::create(std::move(column_array), keys_data.size()) : std::move(column_array),
std::make_shared<DataTypeArray>(result_type),
""
},


@ -1894,11 +1894,11 @@ void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl(
res = place;
}
if (block.rows() > 0)
for (size_t row = 0, rows = block.rows(); row < rows; ++row)
{
/// Adding Values
for (size_t i = 0; i < params.aggregates_size; ++i)
aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i], (*aggregate_columns[i])[0], result.aggregates_pool);
aggregate_functions[i]->merge(res + offsets_of_aggregate_states[i], (*aggregate_columns[i])[row], result.aggregates_pool);
}
/// Early release memory.


@ -85,7 +85,8 @@ public:
/// If this is already an external table, you do not need to add anything. Just remember its presence.
auto temporary_table_name = getIdentifierName(subquery_or_table_name);
bool exists_in_local_map = external_tables.end() != external_tables.find(temporary_table_name);
bool exists_in_context = getContext()->tryResolveStorageID(StorageID("", temporary_table_name), Context::ResolveExternal);
bool exists_in_context = static_cast<bool>(getContext()->tryResolveStorageID(
StorageID("", temporary_table_name), Context::ResolveExternal));
if (exists_in_local_map || exists_in_context)
return;
}


@ -44,7 +44,8 @@ BlockInputStreamPtr InterpreterExistsQuery::executeImpl()
{
if (exists_query->temporary)
{
result = getContext()->tryResolveStorageID({"", exists_query->table}, Context::ResolveExternal);
result = static_cast<bool>(getContext()->tryResolveStorageID(
{"", exists_query->table}, Context::ResolveExternal));
}
else
{


@ -80,6 +80,9 @@ void QueryNormalizer::visit(ASTIdentifier & node, ASTPtr & ast, Data & data)
/// If it is an alias, but not a parent alias (for constructs like "SELECT column + 1 AS column").
auto it_alias = data.aliases.find(node.name());
if (!data.allow_self_aliases && current_alias == node.name())
throw Exception(ErrorCodes::CYCLIC_ALIASES, "Self referencing of {} to {}. Cyclic alias", backQuote(current_alias), backQuote(node.name()));
if (it_alias != data.aliases.end() && current_alias != node.name())
{
if (!IdentifierSemantic::canBeAlias(node))


@ -48,18 +48,22 @@ public:
MapOfASTs finished_asts; /// already processed vertices (and by what they replaced)
SetOfASTs current_asts; /// vertices in the current call stack of this method
std::string current_alias; /// the alias referencing to the ancestor of ast (the deepest ancestor with aliases)
bool ignore_alias; /// normalize query without any aliases
const bool ignore_alias; /// normalize query without any aliases
Data(const Aliases & aliases_, const NameSet & source_columns_set_, bool ignore_alias_, ExtractedSettings && settings_)
/// It's Ok to have "c + 1 AS c" in queries, but not in table definition
const bool allow_self_aliases; /// for constructs like "SELECT column + 1 AS column"
Data(const Aliases & aliases_, const NameSet & source_columns_set_, bool ignore_alias_, ExtractedSettings && settings_, bool allow_self_aliases_)
: aliases(aliases_)
, source_columns_set(source_columns_set_)
, settings(settings_)
, level(0)
, ignore_alias(ignore_alias_)
, allow_self_aliases(allow_self_aliases_)
{}
};
QueryNormalizer(Data & data)
explicit QueryNormalizer(Data & data)
: visitor_data(data)
{}


@ -0,0 +1,80 @@
#include <Interpreters/RewriteFunctionToSubcolumnVisitor.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
namespace DB
{
namespace
{
ASTPtr transformToSubcolumn(const String & name_in_storage, const String & subcolumn_name)
{
return std::make_shared<ASTIdentifier>(Nested::concatenateName(name_in_storage, subcolumn_name));
}
ASTPtr transformEmptyToSubcolumn(const String & name_in_storage, const String & subcolumn_name)
{
auto ast = transformToSubcolumn(name_in_storage, subcolumn_name);
return makeASTFunction("equals", ast, std::make_shared<ASTLiteral>(0u));
}
ASTPtr transformNotEmptyToSubcolumn(const String & name_in_storage, const String & subcolumn_name)
{
auto ast = transformToSubcolumn(name_in_storage, subcolumn_name);
return makeASTFunction("notEquals", ast, std::make_shared<ASTLiteral>(0u));
}
ASTPtr transformIsNotNullToSubcolumn(const String & name_in_storage, const String & subcolumn_name)
{
auto ast = transformToSubcolumn(name_in_storage, subcolumn_name);
return makeASTFunction("not", ast);
}
ASTPtr transformCountNullableToSubcolumn(const String & name_in_storage, const String & subcolumn_name)
{
auto ast = transformToSubcolumn(name_in_storage, subcolumn_name);
return makeASTFunction("sum", makeASTFunction("not", ast));
}
const std::unordered_map<String, std::tuple<TypeIndex, String, decltype(&transformToSubcolumn)>> function_to_subcolumn =
{
{"length", {TypeIndex::Array, "size0", transformToSubcolumn}},
{"empty", {TypeIndex::Array, "size0", transformEmptyToSubcolumn}},
{"notEmpty", {TypeIndex::Array, "size0", transformNotEmptyToSubcolumn}},
{"isNull", {TypeIndex::Nullable, "null", transformToSubcolumn}},
{"isNotNull", {TypeIndex::Nullable, "null", transformIsNotNullToSubcolumn}},
{"count", {TypeIndex::Nullable, "null", transformCountNullableToSubcolumn}},
{"mapKeys", {TypeIndex::Map, "keys", transformToSubcolumn}},
{"mapValues", {TypeIndex::Map, "values", transformToSubcolumn}},
};
}
void RewriteFunctionToSubcolumnData::visit(ASTFunction & function, ASTPtr & ast) const
{
const auto & arguments = function.arguments->children;
if (arguments.size() != 1)
return;
const auto * identifier = arguments[0]->as<ASTIdentifier>();
if (!identifier)
return;
auto it = function_to_subcolumn.find(function.name);
if (it == function_to_subcolumn.end())
return;
const auto & [type_id, subcolumn_name, transformer] = it->second;
const auto & columns = metadata_snapshot->getColumns();
const auto & name_in_storage = identifier->name();
if (columns.has(name_in_storage)
&& columns.get(name_in_storage).type->getTypeId() == type_id)
{
ast = transformer(name_in_storage, subcolumn_name);
}
}
}


@ -0,0 +1,24 @@
#pragma once
#include <Parsers/ASTFunction.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Storages/StorageInMemoryMetadata.h>
namespace DB
{
/// Rewrites functions to subcolumns, if possible, to reduce amount of read data.
/// E.g. 'length(arr)' -> 'arr.size0', 'col IS NULL' -> 'col.null'
class RewriteFunctionToSubcolumnData
{
public:
using TypeToVisit = ASTFunction;
void visit(ASTFunction & function, ASTPtr & ast) const;
StorageMetadataPtr metadata_snapshot;
};
using RewriteFunctionToSubcolumnMatcher = OneTypeMatcher<RewriteFunctionToSubcolumnData>;
using RewriteFunctionToSubcolumnVisitor = InDepthNodeVisitor<RewriteFunctionToSubcolumnMatcher, true>;
}
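As an illustration of the rewrite implemented by this visitor (enabled by the newly added `optimize_functions_to_subcolumns` setting), queries roughly like the following are transformed; the table and column names here are hypothetical:
``` sql
SET optimize_functions_to_subcolumns = 1;
-- Assuming a table t with columns `arr Array(UInt8)`, `n Nullable(String)` and `m Map(String, UInt64)`,
-- the functions below are rewritten to read only the lightweight subcolumns:
SELECT length(arr) FROM t;   -- becomes: SELECT arr.size0 FROM t
SELECT n IS NULL FROM t;     -- becomes: SELECT n.null FROM t
SELECT mapKeys(m) FROM t;    -- becomes: SELECT m.keys FROM t
```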


@ -79,6 +79,15 @@ bool StorageID::operator<(const StorageID & rhs) const
return !hasUUID();
}
bool StorageID::operator==(const StorageID & rhs) const
{
assertNotEmpty();
if (hasUUID() && rhs.hasUUID())
return uuid == rhs.uuid;
else
return std::tie(database_name, table_name) == std::tie(rhs.database_name, rhs.table_name);
}
String StorageID::getFullTableName() const
{
return backQuoteIfNeed(getDatabaseName()) + "." + backQuoteIfNeed(table_name);


@ -54,7 +54,7 @@ struct StorageID
String getNameForLogs() const;
operator bool () const
explicit operator bool () const
{
return !empty();
}
@ -70,6 +70,7 @@ struct StorageID
}
bool operator<(const StorageID & rhs) const;
bool operator==(const StorageID & rhs) const;
void assertNotEmpty() const
{


@ -1,6 +1,7 @@
#include <Core/Settings.h>
#include <Interpreters/TreeOptimizer.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/OptimizeIfChains.h>
#include <Interpreters/OptimizeIfWithConstantConditionVisitor.h>
#include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h>
@ -14,6 +15,7 @@
#include <Interpreters/MonotonicityCheckVisitor.h>
#include <Interpreters/ConvertStringsToEnumVisitor.h>
#include <Interpreters/PredicateExpressionsOptimizer.h>
#include <Interpreters/RewriteFunctionToSubcolumnVisitor.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExternalDictionariesLoader.h>
@ -27,7 +29,7 @@
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Functions/FunctionFactory.h>
#include <Storages/StorageInMemoryMetadata.h>
#include <Storages/IStorage.h>
#include <Interpreters/RewriteSumIfFunctionVisitor.h>
@ -579,6 +581,12 @@ void transformIfStringsIntoEnum(ASTPtr & query)
ConvertStringsToEnumVisitor(convert_data).visit(query);
}
void optimizeFunctionsToSubcolumns(ASTPtr & query, const StorageMetadataPtr & metadata_snapshot)
{
RewriteFunctionToSubcolumnVisitor::Data data{metadata_snapshot};
RewriteFunctionToSubcolumnVisitor(data).visit(query);
}
}
void TreeOptimizer::optimizeIf(ASTPtr & query, Aliases & aliases, bool if_chain_to_multiif)
@ -590,10 +598,8 @@ void TreeOptimizer::optimizeIf(ASTPtr & query, Aliases & aliases, bool if_chain_
OptimizeIfChainsVisitor().visit(query);
}
void TreeOptimizer::apply(ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set,
const std::vector<TableWithColumnNamesAndTypes> & tables_with_columns,
ContextConstPtr context, const StorageMetadataPtr & metadata_snapshot,
bool & rewrite_subqueries)
void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result,
const std::vector<TableWithColumnNamesAndTypes> & tables_with_columns, ContextConstPtr context)
{
const auto & settings = context->getSettingsRef();
@ -601,17 +607,21 @@ void TreeOptimizer::apply(ASTPtr & query, Aliases & aliases, const NameSet & sou
if (!select_query)
throw Exception("Select analyze for not select asts.", ErrorCodes::LOGICAL_ERROR);
optimizeIf(query, aliases, settings.optimize_if_chain_to_multiif);
if (settings.optimize_functions_to_subcolumns && result.storage
&& result.storage->supportsSubcolumns() && result.metadata_snapshot)
optimizeFunctionsToSubcolumns(query, result.metadata_snapshot);
optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif);
/// Move arithmetic operations out of aggregation functions
if (settings.optimize_arithmetic_operations_in_aggregate_functions)
optimizeAggregationFunctions(query);
/// Push the predicate expression down to the subqueries.
rewrite_subqueries = PredicateExpressionsOptimizer(context, tables_with_columns, settings).optimize(*select_query);
result.rewrite_subqueries = PredicateExpressionsOptimizer(context, tables_with_columns, settings).optimize(*select_query);
/// GROUP BY injective function elimination.
optimizeGroupBy(select_query, source_columns_set, context);
optimizeGroupBy(select_query, result.source_columns_set, context);
/// GROUP BY functions of other keys elimination.
if (settings.optimize_group_by_function_keys)
@ -658,7 +668,7 @@ void TreeOptimizer::apply(ASTPtr & query, Aliases & aliases, const NameSet & sou
/// Replace monotonous functions with its argument
if (settings.optimize_monotonous_functions_in_order_by)
optimizeMonotonousFunctionsInOrderBy(select_query, context, tables_with_columns,
metadata_snapshot ? metadata_snapshot->getSortingKeyColumns() : Names{});
result.metadata_snapshot ? result.metadata_snapshot->getSortingKeyColumns() : Names{});
/// Remove duplicate items from ORDER BY.
/// Execute it after all order by optimizations,


@ -8,8 +8,7 @@
namespace DB
{
struct StorageInMemoryMetadata;
using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
struct TreeRewriterResult;
/// Part of Tree Rewriter (SyntaxAnalyzer) that optimizes the AST.
/// The query should be ready to execute both before and after it, but the resulting query could be faster.
@ -18,12 +17,9 @@ class TreeOptimizer
public:
static void apply(
ASTPtr & query,
Aliases & aliases,
const NameSet & source_columns_set,
TreeRewriterResult & result,
const std::vector<TableWithColumnNamesAndTypes> & tables_with_columns,
ContextConstPtr context,
const StorageMetadataPtr & metadata_snapshot,
bool & rewrite_subqueries);
ContextConstPtr context);
static void optimizeIf(ASTPtr & query, Aliases & aliases, bool if_chain_to_multiif);
};


@ -913,7 +913,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
all_source_columns_set.insert(name);
}
normalize(query, result.aliases, all_source_columns_set, select_options.ignore_alias, settings);
normalize(query, result.aliases, all_source_columns_set, select_options.ignore_alias, settings, /* allow_self_aliases = */ true);
/// Remove unneeded columns according to 'required_result_columns'.
/// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
@ -924,8 +924,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
/// Executing scalar subqueries - replacing them with constant values.
executeScalarSubqueries(query, getContext(), subquery_depth, result.scalars, select_options.only_analyze);
TreeOptimizer::apply(
query, result.aliases, source_columns_set, tables_with_columns, getContext(), result.metadata_snapshot, result.rewrite_subqueries);
TreeOptimizer::apply(query, result, tables_with_columns, getContext());
/// array_join_alias_to_name, array_join_result_to_source.
getArrayJoinedColumns(query, result, select_query, result.source_columns, source_columns_set);
@ -959,7 +958,8 @@ TreeRewriterResultPtr TreeRewriter::analyze(
const NamesAndTypesList & source_columns,
ConstStoragePtr storage,
const StorageMetadataPtr & metadata_snapshot,
bool allow_aggregations) const
bool allow_aggregations,
bool allow_self_aliases) const
{
if (query->as<ASTSelectQuery>())
throw Exception("Not select analyze for select asts.", ErrorCodes::LOGICAL_ERROR);
@ -968,7 +968,7 @@ TreeRewriterResultPtr TreeRewriter::analyze(
TreeRewriterResult result(source_columns, storage, metadata_snapshot, false);
normalize(query, result.aliases, result.source_columns_set, false, settings);
normalize(query, result.aliases, result.source_columns_set, false, settings, allow_self_aliases);
/// Executing scalar subqueries. Column defaults could be a scalar subquery.
executeScalarSubqueries(query, getContext(), 0, result.scalars, false);
@ -994,7 +994,7 @@ TreeRewriterResultPtr TreeRewriter::analyze(
}
void TreeRewriter::normalize(
ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings)
ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings, bool allow_self_aliases)
{
CustomizeCountDistinctVisitor::Data data_count_distinct{settings.count_distinct_implementation};
CustomizeCountDistinctVisitor(data_count_distinct).visit(query);
@ -1054,7 +1054,7 @@ void TreeRewriter::normalize(
FunctionNameNormalizer().visit(query.get());
/// Common subexpression elimination. Rewrite rules.
QueryNormalizer::Data normalizer_data(aliases, source_columns_set, ignore_alias, settings);
QueryNormalizer::Data normalizer_data(aliases, source_columns_set, ignore_alias, settings, allow_self_aliases);
QueryNormalizer(normalizer_data).visit(query);
}

View File

@ -103,7 +103,8 @@ public:
const NamesAndTypesList & source_columns_,
ConstStoragePtr storage = {},
const StorageMetadataPtr & metadata_snapshot = {},
bool allow_aggregations = false) const;
bool allow_aggregations = false,
bool allow_self_aliases = true) const;
/// Analyze and rewrite select query
TreeRewriterResultPtr analyzeSelect(
@ -115,7 +116,7 @@ public:
std::shared_ptr<TableJoin> table_join = {}) const;
private:
static void normalize(ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings);
static void normalize(ASTPtr & query, Aliases & aliases, const NameSet & source_columns_set, bool ignore_alias, const Settings & settings, bool allow_self_aliases);
};
}

View File

@ -9,6 +9,21 @@
using namespace DB;
TEST(QueryNormalizer, SimpleLoopAlias)
{
String query = "a as a";
ParserExpressionList parser(false);
ASTPtr ast = parseQuery(parser, query, 0, 0);
Aliases aliases;
aliases["a"] = parseQuery(parser, "a as a", 0, 0)->children[0];
Settings settings;
QueryNormalizer::Data normalizer_data(aliases, {}, false, settings, false);
EXPECT_THROW(QueryNormalizer(normalizer_data).visit(ast), Exception);
}
TEST(QueryNormalizer, SimpleCycleAlias)
{
String query = "a as b, b as a";
@ -20,6 +35,6 @@ TEST(QueryNormalizer, SimpleCycleAlias)
aliases["b"] = parseQuery(parser, "a as b", 0, 0)->children[0];
Settings settings;
QueryNormalizer::Data normalizer_data(aliases, {}, false, settings);
QueryNormalizer::Data normalizer_data(aliases, {}, false, settings, true);
EXPECT_THROW(QueryNormalizer(normalizer_data).visit(ast), Exception);
}

View File

@ -590,7 +590,7 @@ Block validateColumnsDefaultsAndGetSampleBlock(ASTPtr default_expr_list, const N
try
{
auto syntax_analyzer_result = TreeRewriter(context).analyze(default_expr_list, all_columns);
auto syntax_analyzer_result = TreeRewriter(context).analyze(default_expr_list, all_columns, {}, {}, false, /* allow_self_aliases = */ false);
const auto actions = ExpressionAnalyzer(default_expr_list, syntax_analyzer_result, context).getActions(true);
for (const auto & action : actions->getActions())
if (action.node->type == ActionsDAG::ActionType::ARRAY_JOIN)
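This hunk passes allow_self_aliases = false when validating column defaults, matching the new SimpleLoopAlias unit test above: an expression that aliases itself, such as the test's `a as a`, now makes the normalizer throw instead of being accepted. A hedged sketch of the guarded call with the parameters spelled out (variable names are the ones visible in the hunk; the fragment is not compilable on its own):

// Reject self-referencing default expressions during normalization.
auto syntax_analyzer_result = TreeRewriter(context).analyze(
    default_expr_list, all_columns,
    /* storage = */ {}, /* metadata_snapshot = */ {},
    /* allow_aggregations = */ false,
    /* allow_self_aliases = */ false);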

View File

@ -8,9 +8,8 @@
namespace DB
{
MergeListElement::MergeListElement(const std::string & database_, const std::string & table_, const FutureMergedMutatedPart & future_part)
: database{database_}
, table{table_}
MergeListElement::MergeListElement(const StorageID & table_id_, const FutureMergedMutatedPart & future_part)
: table_id{table_id_}
, partition_id{future_part.part_info.partition_id}
, result_part_name{future_part.name}
, result_part_path{future_part.path}
@ -60,8 +59,8 @@ MergeListElement::MergeListElement(const std::string & database_, const std::str
MergeInfo MergeListElement::getInfo() const
{
MergeInfo res;
res.database = database;
res.table = table;
res.database = table_id.getDatabaseName();
res.table = table_id.getTableName();
res.result_part_name = result_part_name;
res.result_part_path = result_part_path;
res.partition_id = partition_id;

View File

@ -8,6 +8,7 @@
#include <Storages/MergeTree/MergeType.h>
#include <Storages/MergeTree/MergeAlgorithm.h>
#include <Storages/MergeTree/BackgroundProcessList.h>
#include <Interpreters/StorageID.h>
#include <boost/noncopyable.hpp>
#include <memory>
#include <list>
@ -54,8 +55,7 @@ struct FutureMergedMutatedPart;
struct MergeListElement : boost::noncopyable
{
const std::string database;
const std::string table;
const StorageID table_id;
std::string partition_id;
const std::string result_part_name;
@ -94,7 +94,7 @@ struct MergeListElement : boost::noncopyable
/// Detected after merge already started
std::atomic<MergeAlgorithm> merge_algorithm;
MergeListElement(const std::string & database, const std::string & table, const FutureMergedMutatedPart & future_part);
MergeListElement(const StorageID & table_id_, const FutureMergedMutatedPart & future_part);
MergeInfo getInfo() const;
@ -122,12 +122,13 @@ public:
--merges_with_ttl_counter;
}
void cancelPartMutations(const String & partition_id, Int64 mutation_version)
void cancelPartMutations(const StorageID & table_id, const String & partition_id, Int64 mutation_version)
{
std::lock_guard lock{mutex};
for (auto & merge_element : entries)
{
if ((partition_id.empty() || merge_element.partition_id == partition_id)
&& merge_element.table_id == table_id
&& merge_element.source_data_version < mutation_version
&& merge_element.result_data_version >= mutation_version)
merge_element.is_cancelled = true;
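MergeListElement now stores a StorageID instead of separate database and table strings, and cancelPartMutations() additionally filters by that id, so killing a mutation cancels only merges that belong to the same table. A short sketch assembled from the StorageMergeTree and StorageReplicatedMergeTree call sites shown below (future_part, partition_id and block_number are taken from those hunks; the fragment is illustrative, not a standalone program):

// Registering a merge: one StorageID replaces the database/table string pair.
auto merge_list_entry = getContext()->getMergeList().insert(getStorageID(), future_part);

// Killing a mutation: only entries of this table can be cancelled now.
getContext()->getMergeList().cancelPartMutations(getStorageID(), partition_id, block_number);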

View File

@ -234,7 +234,6 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read(
select.setExpression(ASTSelectQuery::Expression::WHERE, given_select.where()->clone());
if (given_select.prewhere())
select.setExpression(ASTSelectQuery::Expression::WHERE, given_select.prewhere()->clone());
// TODO will row policy filter work?
// After overriding the group by clause, we finish the possible aggregations directly
if (processed_stage >= QueryProcessingStage::Enum::WithMergeableState && given_select.groupBy())

View File

@ -1042,7 +1042,7 @@ ClusterPtr StorageDistributed::skipUnusedShards(
if (!limit)
{
LOG_TRACE(log,
LOG_DEBUG(log,
"Number of values for sharding key exceeds optimize_skip_unused_shards_limit={}, "
"try to increase it, but note that this may increase query processing time.",
local_context->getSettingsRef().optimize_skip_unused_shards_limit);

View File

@ -625,7 +625,7 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id)
if (!to_kill)
return CancellationCode::NotFound;
getContext()->getMergeList().cancelPartMutations({}, to_kill->block_number);
getContext()->getMergeList().cancelPartMutations(getStorageID(), {}, to_kill->block_number);
to_kill->removeFile();
LOG_TRACE(log, "Cancelled part mutations and removed mutation file {}", mutation_id);
{
@ -817,9 +817,8 @@ bool StorageMergeTree::mergeSelectedParts(
auto & future_part = merge_mutate_entry.future_part;
Stopwatch stopwatch;
MutableDataPartPtr new_part;
auto table_id = getStorageID();
auto merge_list_entry = getContext()->getMergeList().insert(table_id.database_name, table_id.table_name, future_part);
auto merge_list_entry = getContext()->getMergeList().insert(getStorageID(), future_part);
auto write_part_log = [&] (const ExecutionStatus & execution_status)
{
@ -964,9 +963,8 @@ std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::se
bool StorageMergeTree::mutateSelectedPart(const StorageMetadataPtr & metadata_snapshot, MergeMutateSelectedEntry & merge_mutate_entry, TableLockHolder & table_lock_holder)
{
auto & future_part = merge_mutate_entry.future_part;
auto table_id = getStorageID();
auto merge_list_entry = getContext()->getMergeList().insert(table_id.database_name, table_id.table_name, future_part);
auto merge_list_entry = getContext()->getMergeList().insert(getStorageID(), future_part);
Stopwatch stopwatch;
MutableDataPartPtr new_part;

View File

@ -1726,7 +1726,7 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
auto table_id = getStorageID();
/// Add merge to list
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(table_id.database_name, table_id.table_name, future_merged_part);
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(getStorageID(), future_merged_part);
Transaction transaction(*this);
MutableDataPartPtr part;
@ -1871,9 +1871,7 @@ bool StorageReplicatedMergeTree::tryExecutePartMutation(const StorageReplicatedM
future_mutated_part.updatePath(*this, reserved_space);
future_mutated_part.type = source_part->getType();
auto table_id = getStorageID();
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(
table_id.database_name, table_id.table_name, future_mutated_part);
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(getStorageID(), future_mutated_part);
Stopwatch stopwatch;
@ -5934,7 +5932,7 @@ CancellationCode StorageReplicatedMergeTree::killMutation(const String & mutatio
{
const String & partition_id = pair.first;
Int64 block_number = pair.second;
getContext()->getMergeList().cancelPartMutations(partition_id, block_number);
getContext()->getMergeList().cancelPartMutations(getStorageID(), partition_id, block_number);
}
return CancellationCode::CancelSent;
}

View File

@ -2188,6 +2188,7 @@ class ClickHouseInstance:
odbc_bridge_volume = "- " + self.odbc_bridge_bin_path + ":/usr/share/clickhouse-odbc-bridge_fresh"
library_bridge_volume = "- " + self.library_bridge_bin_path + ":/usr/share/clickhouse-library-bridge_fresh"
with open(self.docker_compose_path, 'w') as docker_compose:
docker_compose.write(DOCKER_COMPOSE_TEMPLATE.format(
image=self.image,

View File

@ -0,0 +1,21 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source_trivial_cluster>
<shard>
<replica>
<host>first_trivial</host>
<port>9000</port>
</replica>
</shard>
</source_trivial_cluster>
<destination_trivial_cluster>
<shard>
<replica>
<host>second_trivial</host>
<port>9000</port>
</replica>
</shard>
</destination_trivial_cluster>
</remote_servers>
</yandex>

View File

@ -1,6 +1,6 @@
<yandex>
<logger>
<level>trace</level>
<level>information</level>
<log>/var/log/clickhouse-server/copier/log.log</log>
<errorlog>/var/log/clickhouse-server/copier/log.err.log</errorlog>
<size>1000M</size>

View File

@ -0,0 +1,28 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<events>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>third</host>
<port>9000</port>
</replica>
</shard>
</events>
</remote_servers>
</yandex>

View File

@ -0,0 +1,6 @@
<?xml version="1.0"?>
<yandex>
<distributed_ddl>
<path>/clickhouse/task_queue/ddl</path>
</distributed_ddl>
</yandex>

View File

@ -0,0 +1,28 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>information</level>
<log>/var/log/clickhouse-server/copier/log.log</log>
<errorlog>/var/log/clickhouse-server/copier/log.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<stderr>/var/log/clickhouse-server/copier/stderr.log</stderr>
<stdout>/var/log/clickhouse-server/copier/stdout.log</stdout>
</logger>
<zookeeper>
<node index="1">
<host>zoo1</host>
<port>2181</port>
</node>
<node index="2">
<host>zoo2</host>
<port>2181</port>
</node>
<node index="3">
<host>zoo3</host>
<port>2181</port>
</node>
<session_timeout_ms>2000</session_timeout_ms>
</zookeeper>
</yandex>

View File

@ -0,0 +1,32 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
<log_queries>1</log_queries>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
<dbuser>
<password>12345678</password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</dbuser>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -0,0 +1,23 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
</yandex>

View File

@ -0,0 +1,6 @@
<?xml version="1.0"?>
<yandex>
<distributed_ddl>
<path>/clickhouse/task_queue/ddl</path>
</distributed_ddl>
</yandex>

View File

@ -0,0 +1,34 @@
<yandex>
<storage_configuration>
<disks>
<default>
</default>
<jbod1>
<path>/jbod1/</path>
</jbod1>
<jbod2>
<path>/jbod2/</path>
</jbod2>
<external>
<path>/external/</path>
</external>
</disks>
<policies>
<external_with_jbods>
<volumes>
<external>
<disk>external</disk>
</external>
<main>
<disk>jbod1</disk>
<disk>jbod2</disk>
</main>
</volumes>
</external_with_jbods>
</policies>
</storage_configuration>
</yandex>

View File

@ -0,0 +1,20 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>information</level>
<log>/var/log/clickhouse-server/copier/log.log</log>
<errorlog>/var/log/clickhouse-server/copier/log.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<stderr>/var/log/clickhouse-server/copier/stderr.log</stderr>
<stdout>/var/log/clickhouse-server/copier/stdout.log</stdout>
</logger>
<zookeeper>
<node index="1">
<host>zoo1</host>
<port>2181</port>
</node>
<session_timeout_ms>2000</session_timeout_ms>
</zookeeper>
</yandex>

View File

@ -0,0 +1,32 @@
<?xml version="1.0"?>
<yandex>
<profiles>
<default>
<log_queries>1</log_queries>
</default>
</profiles>
<users>
<default>
<password></password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</default>
<dbuser>
<password>12345678</password>
<networks incl="networks" replace="replace">
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
</dbuser>
</users>
<quotas>
<default>
</default>
</quotas>
</yandex>

View File

@ -0,0 +1,42 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>source</cluster_pull>
<database_pull>db_drop_target_partition</database_pull>
<table_pull>source</table_pull>
<cluster_push>destination</cluster_push>
<database_push>db_drop_target_partition</database_push>
<table_push>destination</table_push>
<allow_to_drop_target_partitions>true</allow_to_drop_target_partitions>
<engine>ENGINE = MergeTree() PARTITION BY toYYYYMMDD(Column3) ORDER BY (Column3, Column2, Column1)</engine>
<sharding_key>rand()</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -0,0 +1,40 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>source</cluster_pull>
<database_pull>db_skip_index</database_pull>
<table_pull>source</table_pull>
<cluster_push>destination</cluster_push>
<database_push>db_skip_index</database_push>
<table_push>destination</table_push>
<engine>ENGINE = MergeTree() PARTITION BY toYYYYMMDD(Column3) ORDER BY (Column3, Column2, Column1)</engine>
<sharding_key>rand()</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -0,0 +1,43 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<events>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>third</host>
<port>9000</port>
</replica>
</shard>
</events>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>events</cluster_pull>
<database_pull>dailyhistory</database_pull>
<table_pull>yellow_tripdata_staging</table_pull>
<cluster_push>events</cluster_push>
<database_push>monthlyhistory</database_push>
<table_push>yellow_tripdata_staging</table_push>
<engine>Engine=ReplacingMergeTree() PRIMARY KEY (tpep_pickup_datetime, id) ORDER BY (tpep_pickup_datetime, id) PARTITION BY (pickup_location_id, toYYYYMM(tpep_pickup_datetime))</engine>
<sharding_key>sipHash64(id) % 3</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -44,7 +44,7 @@
<source_trivial_cluster>
<shard>
<replica>
<host>s0_0_0</host>
<host>first_trivial</host>
<port>9000</port>
</replica>
</shard>
@ -54,11 +54,11 @@
<destination_trivial_cluster>
<shard>
<replica>
<host>s1_0_0</host>
<host>second_trivial</host>
<port>9000</port>
</replica>
</shard>
</destination_trivial_cluster>
</remote_servers>
</yandex>
</yandex>

View File

@ -0,0 +1,64 @@
<?xml version="1.0"?>
<yandex>
<!-- How many simultaneous workers are possible -->
<max_workers>3</max_workers>
<!-- Common setting for pull and push operations -->
<settings>
<connect_timeout>1</connect_timeout>
</settings>
<!-- Setting used to fetch data -->
<settings_pull>
<max_rows_in_distinct>0</max_rows_in_distinct>
</settings_pull>
<!-- Setting used to insert data -->
<settings_push>
</settings_push>
<!-- Tasks -->
<tables>
<hits>
<cluster_pull>source_trivial_cluster</cluster_pull>
<database_pull>default</database_pull>
<table_pull>trivial_without_arguments</table_pull>
<cluster_push>destination_trivial_cluster</cluster_push>
<database_push>default</database_push>
<table_push>trivial_without_arguments</table_push>
<!-- Engine of destination tables -->
<engine>ENGINE=ReplicatedMergeTree() PARTITION BY d % 5 ORDER BY (d, sipHash64(d)) SAMPLE BY sipHash64(d) SETTINGS index_granularity = 16</engine>
<!-- Which sharding key to use while copying -->
<sharding_key>d + 1</sharding_key>
<!-- Optional expression that filters the copied data -->
<where_condition>d - d = 0</where_condition>
</hits>
</tables>
<!-- Configuration of clusters -->
<remote_servers>
<source_trivial_cluster>
<shard>
<replica>
<host>first_trivial</host>
<port>9000</port>
</replica>
</shard>
</source_trivial_cluster>
<destination_trivial_cluster>
<shard>
<replica>
<host>second_trivial</host>
<port>9000</port>
</replica>
</shard>
</destination_trivial_cluster>
</remote_servers>
</yandex>

View File

@ -0,0 +1,40 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>source</cluster_pull>
<database_pull>db_ttl_columns</database_pull>
<table_pull>source</table_pull>
<cluster_push>destination</cluster_push>
<database_push>db_ttl_columns</database_push>
<table_push>destination</table_push>
<engine>ENGINE = MergeTree() PARTITION BY toYYYYMMDD(Column3) ORDER BY (Column3, Column2, Column1)</engine>
<sharding_key>rand()</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -0,0 +1,40 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>source</cluster_pull>
<database_pull>db_move_to_volume</database_pull>
<table_pull>source</table_pull>
<cluster_push>destination</cluster_push>
<database_push>db_move_to_volume</database_push>
<table_push>destination</table_push>
<engine>ENGINE = MergeTree() PARTITION BY toYYYYMMDD(Column3) ORDER BY (Column3, Column2, Column1) TTL Column3 + INTERVAL 1 MONTH TO VOLUME 'external' SETTINGS storage_policy = 'external_with_jbods'</engine>
<sharding_key>rand()</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -0,0 +1,40 @@
<?xml version="1.0"?>
<yandex>
<remote_servers>
<source>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>first_of_two</host>
<port>9000</port>
</replica>
</shard>
</source>
<destination>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>second_of_two</host>
<port>9000</port>
</replica>
</shard>
</destination>
</remote_servers>
<max_workers>2</max_workers>
<tables>
<table_events>
<cluster_pull>source</cluster_pull>
<database_pull>db_different_schema</database_pull>
<table_pull>source</table_pull>
<cluster_push>destination</cluster_push>
<database_push>db_different_schema</database_push>
<table_push>destination</table_push>
<engine>ENGINE = MergeTree() PARTITION BY toYYYYMMDD(Column3) ORDER BY (Column9, Column1, Column2, Column3, Column4)</engine>
<sharding_key>rand()</sharding_key>
</table_events>
</tables>
</yandex>

View File

@ -2,21 +2,26 @@ import os
import random
import sys
import time
from contextlib import contextmanager
import docker
import kazoo
import pytest
import string
import random
from contextlib import contextmanager
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
import docker
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(CURRENT_TEST_DIR))
COPYING_FAIL_PROBABILITY = 0.2
MOVING_FAIL_PROBABILITY = 0.2
cluster = ClickHouseCluster(__file__)
cluster = ClickHouseCluster(__file__, name='copier_test')
def generateRandomString(count):
return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(count))
def check_all_hosts_sucesfully_executed(tsv_content, num_hosts):
@ -72,8 +77,13 @@ class Task1:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_simple"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task0_description.xml'), 'r').read()
self.zk_task_path = "/clickhouse-copier/task_simple_" + generateRandomString(10)
self.container_task_file = "/task0_description.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task0_description.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
@ -112,9 +122,14 @@ class Task2:
def __init__(self, cluster, unique_zk_path):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_month_to_week_partition"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_month_to_week_description.xml'), 'r').read()
self.unique_zk_path = unique_zk_path
self.zk_task_path = "/clickhouse-copier/task_month_to_week_partition_" + generateRandomString(5)
self.unique_zk_path = generateRandomString(10)
self.container_task_file = "/task_month_to_week_description.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_month_to_week_description.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
@ -163,9 +178,14 @@ class Task_test_block_size:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_test_block_size"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_test_block_size.xml'), 'r').read()
self.zk_task_path = "/clickhouse-copier/task_test_block_size_" + generateRandomString(5)
self.rows = 1000000
self.container_task_file = "/task_test_block_size.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_test_block_size.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
@ -192,13 +212,19 @@ class Task_no_index:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_no_index"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_no_index.xml'), 'r').read()
self.zk_task_path = "/clickhouse-copier/task_no_index_" + generateRandomString(5)
self.rows = 1000000
self.container_task_file = "/task_no_index.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_no_index.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
instance.query("create table ontime (Year UInt16, FlightDate String) ENGINE = Memory")
instance.query("DROP TABLE IF EXISTS ontime SYNC")
instance.query("create table IF NOT EXISTS ontime (Year UInt16, FlightDate String) ENGINE = Memory")
instance.query("insert into ontime values (2016, 'test6'), (2017, 'test7'), (2018, 'test8')")
def check(self):
@ -214,32 +240,44 @@ class Task_no_arg:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_no_arg"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_no_arg.xml'), 'r').read()
self.rows = 1000000
self.container_task_file = "/task_no_arg.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_no_arg.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
instance.query("DROP TABLE IF EXISTS copier_test1 SYNC")
instance.query(
"create table copier_test1 (date Date, id UInt32) engine = MergeTree PARTITION BY date ORDER BY date SETTINGS index_granularity = 8192")
"create table if not exists copier_test1 (date Date, id UInt32) engine = MergeTree PARTITION BY date ORDER BY date SETTINGS index_granularity = 8192")
instance.query("insert into copier_test1 values ('2016-01-01', 10);")
def check(self):
assert TSV(self.cluster.instances['s1_1_0'].query("SELECT date FROM copier_test1_1")) == TSV("2016-01-01\n")
instance = cluster.instances['s0_0_0']
instance.query("DROP TABLE copier_test1")
instance.query("DROP TABLE copier_test1 SYNC")
instance = cluster.instances['s1_1_0']
instance.query("DROP TABLE copier_test1_1")
instance.query("DROP TABLE copier_test1_1 SYNC")
class Task_non_partitioned_table:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_non_partitoned_table"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_non_partitioned_table.xml'), 'r').read()
self.rows = 1000000
self.container_task_file = "/task_non_partitioned_table.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_non_partitioned_table.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
instance.query("DROP TABLE IF EXISTS copier_test1 SYNC")
instance.query(
"create table copier_test1 (date Date, id UInt32) engine = MergeTree ORDER BY date SETTINGS index_granularity = 8192")
instance.query("insert into copier_test1 values ('2016-01-01', 10);")
@ -256,16 +294,23 @@ class Task_self_copy:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_self_copy"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_self_copy.xml'), 'r').read()
self.container_task_file = "/task_self_copy.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_self_copy.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
instance = cluster.instances['s0_0_0']
instance.query("CREATE DATABASE db1;")
instance.query("DROP DATABASE IF EXISTS db1 SYNC")
instance.query("DROP DATABASE IF EXISTS db2 SYNC")
instance.query("CREATE DATABASE IF NOT EXISTS db1;")
instance.query(
"CREATE TABLE db1.source_table (`a` Int8, `b` String, `c` Int8) ENGINE = MergeTree PARTITION BY a ORDER BY a SETTINGS index_granularity = 8192")
instance.query("CREATE DATABASE db2;")
"CREATE TABLE IF NOT EXISTS db1.source_table (`a` Int8, `b` String, `c` Int8) ENGINE = MergeTree PARTITION BY a ORDER BY a SETTINGS index_granularity = 8192")
instance.query("CREATE DATABASE IF NOT EXISTS db2;")
instance.query(
"CREATE TABLE db2.destination_table (`a` Int8, `b` String, `c` Int8) ENGINE = MergeTree PARTITION BY a ORDER BY a SETTINGS index_granularity = 8192")
"CREATE TABLE IF NOT EXISTS db2.destination_table (`a` Int8, `b` String, `c` Int8) ENGINE = MergeTree PARTITION BY a ORDER BY a SETTINGS index_granularity = 8192")
instance.query("INSERT INTO db1.source_table VALUES (1, 'ClickHouse', 1);")
instance.query("INSERT INTO db1.source_table VALUES (2, 'Copier', 2);")
@ -273,8 +318,8 @@ class Task_self_copy:
instance = cluster.instances['s0_0_0']
assert TSV(instance.query("SELECT * FROM db2.destination_table ORDER BY a")) == TSV(instance.query("SELECT * FROM db1.source_table ORDER BY a"))
instance = cluster.instances['s0_0_0']
instance.query("DROP DATABASE db1 SYNC")
instance.query("DROP DATABASE db2 SYNC")
instance.query("DROP DATABASE IF EXISTS db1 SYNC")
instance.query("DROP DATABASE IF EXISTS db2 SYNC")
def execute_task(started_cluster, task, cmd_options):
@ -283,26 +328,27 @@ def execute_task(started_cluster, task, cmd_options):
zk = started_cluster.get_kazoo_client('zoo1')
print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1]))
try:
zk.delete("/clickhouse-copier", recursive=True)
except kazoo.exceptions.NoNodeError:
print("No node /clickhouse-copier. It is Ok in first test.")
zk_task_path = task.zk_task_path
zk.ensure_path(zk_task_path)
zk.create(zk_task_path + "/description", task.copier_task_config.encode())
# Run cluster-copier processes on each node
docker_api = started_cluster.docker_client.api
copiers_exec_ids = []
cmd = ['/usr/bin/clickhouse', 'copier',
'--config', '/etc/clickhouse-server/config-copier.xml',
'--task-path', zk_task_path,
'--task-path', task.zk_task_path,
'--task-file', task.container_task_file,
'--task-upload-force', 'true',
'--base-dir', '/var/log/clickhouse-server/copier']
cmd += cmd_options
copiers = random.sample(list(cluster.instances.keys()), 3)
print(cmd)
copiers = random.sample(list(started_cluster.instances.keys()), 3)
for instance_name in copiers:
instance = started_cluster.instances[instance_name]
@ -330,18 +376,12 @@ def execute_task(started_cluster, task, cmd_options):
try:
task.check()
finally:
zk.delete(zk_task_path, recursive=True)
zk.delete(task.zk_task_path, recursive=True)
# Tests
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
@pytest.mark.parametrize(('use_sample_offset'), [False, True])
def test_copy_simple(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, Task1(started_cluster), ['--experimental-use-sample-offset', '1'])
@ -349,13 +389,7 @@ def test_copy_simple(started_cluster, use_sample_offset):
execute_task(started_cluster, Task1(started_cluster), [])
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
@pytest.mark.parametrize(('use_sample_offset'),[False, True])
def test_copy_with_recovering(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, Task1(started_cluster), ['--copy-fault-probability', str(COPYING_FAIL_PROBABILITY),
@ -364,13 +398,7 @@ def test_copy_with_recovering(started_cluster, use_sample_offset):
execute_task(started_cluster, Task1(started_cluster), ['--copy-fault-probability', str(COPYING_FAIL_PROBABILITY)])
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
@pytest.mark.parametrize(('use_sample_offset'),[False, True])
def test_copy_with_recovering_after_move_faults(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, Task1(started_cluster), ['--move-fault-probability', str(MOVING_FAIL_PROBABILITY),
@ -412,9 +440,3 @@ def test_non_partitioned_table(started_cluster):
def test_self_copy(started_cluster):
execute_task(started_cluster, Task_self_copy(started_cluster), [])
if __name__ == '__main__':
with contextmanager(started_cluster)() as cluster:
for name, instance in list(cluster.instances.items()):
print(name, instance.ip_address)
input("Cluster created, press any key to destroy...")

View File

@ -0,0 +1,238 @@
import os
import sys
import time
import logging
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
import docker
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(CURRENT_TEST_DIR))
cluster = ClickHouseCluster(__file__, name='copier_test_three_nodes')
@pytest.fixture(scope="module")
def started_cluster():
global cluster
try:
for name in ["first", "second", "third"]:
cluster.add_instance(name,
main_configs=["configs_three_nodes/conf.d/clusters.xml", "configs_three_nodes/conf.d/ddl.xml"], user_configs=["configs_three_nodes/users.xml"],
with_zookeeper=True)
cluster.start()
yield cluster
finally:
cluster.shutdown()
class Task:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task'
self.container_task_file = "/task_taxi_data.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_taxi_data.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
for name in ["first", "second", "third"]:
node = cluster.instances[name]
node.query("DROP DATABASE IF EXISTS dailyhistory SYNC;")
node.query("DROP DATABASE IF EXISTS monthlyhistory SYNC;")
instance = cluster.instances['first']
# daily partition database
instance.query("CREATE DATABASE IF NOT EXISTS dailyhistory on cluster events;")
instance.query("""CREATE TABLE dailyhistory.yellow_tripdata_staging ON CLUSTER events
(
id UUID DEFAULT generateUUIDv4(),
vendor_id String,
tpep_pickup_datetime DateTime('UTC'),
tpep_dropoff_datetime DateTime('UTC'),
passenger_count Nullable(Float64),
trip_distance String,
pickup_longitude Float64,
pickup_latitude Float64,
rate_code_id String,
store_and_fwd_flag String,
dropoff_longitude Float64,
dropoff_latitude Float64,
payment_type String,
fare_amount String,
extra String,
mta_tax String,
tip_amount String,
tolls_amount String,
improvement_surcharge String,
total_amount String,
pickup_location_id String,
dropoff_location_id String,
congestion_surcharge String,
junk1 String, junk2 String
)
Engine = ReplacingMergeTree()
PRIMARY KEY (tpep_pickup_datetime, id)
ORDER BY (tpep_pickup_datetime, id)
PARTITION BY (toYYYYMMDD(tpep_pickup_datetime))""")
instance.query("""CREATE TABLE dailyhistory.yellow_tripdata
ON CLUSTER events
AS dailyhistory.yellow_tripdata_staging
ENGINE = Distributed('events', 'dailyhistory', yellow_tripdata_staging, sipHash64(id) % 3);""")
instance.query("""INSERT INTO dailyhistory.yellow_tripdata
SELECT * FROM generateRandom(
'id UUID DEFAULT generateUUIDv4(),
vendor_id String,
tpep_pickup_datetime DateTime(\\'UTC\\'),
tpep_dropoff_datetime DateTime(\\'UTC\\'),
passenger_count Nullable(Float64),
trip_distance String,
pickup_longitude Float64,
pickup_latitude Float64,
rate_code_id String,
store_and_fwd_flag String,
dropoff_longitude Float64,
dropoff_latitude Float64,
payment_type String,
fare_amount String,
extra String,
mta_tax String,
tip_amount String,
tolls_amount String,
improvement_surcharge String,
total_amount String,
pickup_location_id String,
dropoff_location_id String,
congestion_surcharge String,
junk1 String,
junk2 String',
1, 10, 2) LIMIT 50;""")
# monthly partition database
instance.query("create database IF NOT EXISTS monthlyhistory on cluster events;")
instance.query("""CREATE TABLE monthlyhistory.yellow_tripdata_staging ON CLUSTER events
(
id UUID DEFAULT generateUUIDv4(),
vendor_id String,
tpep_pickup_datetime DateTime('UTC'),
tpep_dropoff_datetime DateTime('UTC'),
passenger_count Nullable(Float64),
trip_distance String,
pickup_longitude Float64,
pickup_latitude Float64,
rate_code_id String,
store_and_fwd_flag String,
dropoff_longitude Float64,
dropoff_latitude Float64,
payment_type String,
fare_amount String,
extra String,
mta_tax String,
tip_amount String,
tolls_amount String,
improvement_surcharge String,
total_amount String,
pickup_location_id String,
dropoff_location_id String,
congestion_surcharge String,
junk1 String,
junk2 String
)
Engine = ReplacingMergeTree()
PRIMARY KEY (tpep_pickup_datetime, id)
ORDER BY (tpep_pickup_datetime, id)
PARTITION BY (pickup_location_id, toYYYYMM(tpep_pickup_datetime))""")
instance.query("""CREATE TABLE monthlyhistory.yellow_tripdata
ON CLUSTER events
AS monthlyhistory.yellow_tripdata_staging
ENGINE = Distributed('events', 'monthlyhistory', yellow_tripdata_staging, sipHash64(id) % 3);""")
def check(self):
instance = cluster.instances["first"]
a = TSV(instance.query("SELECT count() from dailyhistory.yellow_tripdata"))
b = TSV(instance.query("SELECT count() from monthlyhistory.yellow_tripdata"))
assert a == b, "Distributed tables"
for instance_name, instance in cluster.instances.items():
instance = cluster.instances[instance_name]
a = instance.query("SELECT count() from dailyhistory.yellow_tripdata_staging")
b = instance.query("SELECT count() from monthlyhistory.yellow_tripdata_staging")
assert a == b, "MergeTree tables on each shard"
a = TSV(instance.query("SELECT sipHash64(*) from dailyhistory.yellow_tripdata_staging ORDER BY id"))
b = TSV(instance.query("SELECT sipHash64(*) from monthlyhistory.yellow_tripdata_staging ORDER BY id"))
assert a == b, "Data on each shard"
for name in ["first", "second", "third"]:
node = cluster.instances[name]
node.query("DROP DATABASE IF EXISTS dailyhistory SYNC;")
node.query("DROP DATABASE IF EXISTS monthlyhistory SYNC;")
def execute_task(started_cluster, task, cmd_options):
task.start()
zk = started_cluster.get_kazoo_client('zoo1')
print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1]))
# Run cluster-copier processes on each node
docker_api = docker.from_env().api
copiers_exec_ids = []
cmd = ['/usr/bin/clickhouse', 'copier',
'--config', '/etc/clickhouse-server/config-copier.xml',
'--task-path', task.zk_task_path,
'--task-file', task.container_task_file,
'--task-upload-force', 'true',
'--base-dir', '/var/log/clickhouse-server/copier']
cmd += cmd_options
print(cmd)
for instance_name, instance in started_cluster.instances.items():
instance = started_cluster.instances[instance_name]
container = instance.get_docker_handle()
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, "configs_three_nodes/config-copier.xml"), "/etc/clickhouse-server/config-copier.xml")
logging.info("Copied copier config to {}".format(instance.name))
exec_id = docker_api.exec_create(container.id, cmd, stderr=True)
output = docker_api.exec_start(exec_id).decode('utf8')
logging.info(output)
copiers_exec_ids.append(exec_id)
logging.info("Copier for {} ({}) has started".format(instance.name, instance.ip_address))
# time.sleep(1000)
# Wait for copiers stopping and check their return codes
for exec_id, instance in zip(copiers_exec_ids, iter(started_cluster.instances.values())):
while True:
res = docker_api.exec_inspect(exec_id)
if not res['Running']:
break
time.sleep(1)
assert res['ExitCode'] == 0, "Instance: {} ({}). Info: {}".format(instance.name, instance.ip_address, repr(res))
try:
task.check()
finally:
zk.delete(task.zk_task_path, recursive=True)
# Tests
@pytest.mark.timeout(600)
def test(started_cluster):
execute_task(started_cluster, Task(started_cluster), [])

View File

@ -0,0 +1,182 @@
import os
import sys
import time
import random
import string
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
import kazoo
import pytest
import docker
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(CURRENT_TEST_DIR))
COPYING_FAIL_PROBABILITY = 0.1
MOVING_FAIL_PROBABILITY = 0.1
cluster = ClickHouseCluster(__file__, name='copier_test_trivial')
def generateRandomString(count):
return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(count))
@pytest.fixture(scope="module")
def started_cluster():
global cluster
try:
for name in ["first_trivial", "second_trivial"]:
instance = cluster.add_instance(name,
main_configs=["configs/conf.d/clusters_trivial.xml"],
user_configs=["configs_two_nodes/users.xml"],
macros={"cluster" : name, "shard" : "the_only_shard", "replica" : "the_only_replica"},
with_zookeeper=True)
cluster.start()
yield cluster
finally:
cluster.shutdown()
class TaskTrivial:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_trivial"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_trivial.xml'), 'r').read()
def start(self):
source = cluster.instances['first_trivial']
destination = cluster.instances['second_trivial']
for node in [source, destination]:
node.query("DROP DATABASE IF EXISTS default")
node.query("CREATE DATABASE IF NOT EXISTS default")
source.query("CREATE TABLE trivial (d UInt64, d1 UInt64 MATERIALIZED d+1)"
"ENGINE=ReplicatedMergeTree('/clickhouse/tables/source_trivial_cluster/1/trivial/{}', '1') "
"PARTITION BY d % 5 ORDER BY (d, sipHash64(d)) SAMPLE BY sipHash64(d) SETTINGS index_granularity = 16".format(generateRandomString(10)))
source.query("INSERT INTO trivial SELECT * FROM system.numbers LIMIT 1002",
settings={"insert_distributed_sync": 1})
def check(self):
zk = cluster.get_kazoo_client('zoo1')
status_data, _ = zk.get(self.zk_task_path + "/status")
assert status_data == b'{"hits":{"all_partitions_count":5,"processed_partitions_count":5}}'
source = cluster.instances['first_trivial']
destination = cluster.instances['second_trivial']
assert TSV(source.query("SELECT count() FROM trivial")) == TSV("1002\n")
assert TSV(destination.query("SELECT count() FROM trivial")) == TSV("1002\n")
for node in [source, destination]:
node.query("DROP TABLE trivial")
class TaskReplicatedWithoutArguments:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = "/clickhouse-copier/task_trivial_without_arguments"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_trivial_without_arguments.xml'), 'r').read()
def start(self):
source = cluster.instances['first_trivial']
destination = cluster.instances['second_trivial']
for node in [source, destination]:
node.query("DROP DATABASE IF EXISTS default")
node.query("CREATE DATABASE IF NOT EXISTS default")
source.query("CREATE TABLE trivial_without_arguments ON CLUSTER source_trivial_cluster (d UInt64, d1 UInt64 MATERIALIZED d+1) "
"ENGINE=ReplicatedMergeTree() "
"PARTITION BY d % 5 ORDER BY (d, sipHash64(d)) SAMPLE BY sipHash64(d) SETTINGS index_granularity = 16")
source.query("INSERT INTO trivial_without_arguments SELECT * FROM system.numbers LIMIT 1002",
settings={"insert_distributed_sync": 1})
def check(self):
zk = cluster.get_kazoo_client('zoo1')
status_data, _ = zk.get(self.zk_task_path + "/status")
assert status_data == b'{"hits":{"all_partitions_count":5,"processed_partitions_count":5}}'
source = cluster.instances['first_trivial']
destination = cluster.instances['second_trivial']
assert TSV(source.query("SELECT count() FROM trivial_without_arguments")) == TSV("1002\n")
assert TSV(destination.query("SELECT count() FROM trivial_without_arguments")) == TSV("1002\n")
for node in [source, destination]:
node.query("DROP TABLE trivial_without_arguments")
def execute_task(started_cluster, task, cmd_options):
task.start()
zk = started_cluster.get_kazoo_client('zoo1')
print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1]))
try:
zk.delete("/clickhouse-copier", recursive=True)
except kazoo.exceptions.NoNodeError:
print("No node /clickhouse-copier. It is Ok in first test.")
zk_task_path = task.zk_task_path
zk.ensure_path(zk_task_path)
zk.create(zk_task_path + "/description", task.copier_task_config.encode())
# Run cluster-copier processes on each node
docker_api = started_cluster.docker_client.api
copiers_exec_ids = []
cmd = ['/usr/bin/clickhouse', 'copier',
'--config', '/etc/clickhouse-server/config-copier.xml',
'--task-path', zk_task_path,
'--base-dir', '/var/log/clickhouse-server/copier']
cmd += cmd_options
copiers = list(started_cluster.instances.keys())
for instance_name in copiers:
instance = started_cluster.instances[instance_name]
container = instance.get_docker_handle()
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, "configs/config-copier.xml"),
"/etc/clickhouse-server/config-copier.xml")
print("Copied copier config to {}".format(instance.name))
exec_id = docker_api.exec_create(container.id, cmd, stderr=True)
output = docker_api.exec_start(exec_id).decode('utf8')
print(output)
copiers_exec_ids.append(exec_id)
print("Copier for {} ({}) has started".format(instance.name, instance.ip_address))
# Wait for copiers stopping and check their return codes
for exec_id, instance_name in zip(copiers_exec_ids, copiers):
instance = started_cluster.instances[instance_name]
while True:
res = docker_api.exec_inspect(exec_id)
if not res['Running']:
break
time.sleep(0.5)
assert res['ExitCode'] == 0, "Instance: {} ({}). Info: {}".format(instance.name, instance.ip_address, repr(res))
try:
task.check()
finally:
zk.delete(zk_task_path, recursive=True)
# Tests
def test_trivial_copy(started_cluster):
execute_task(started_cluster, TaskTrivial(started_cluster), [])
def test_trivial_without_arguments(started_cluster):
execute_task(started_cluster, TaskReplicatedWithoutArguments(started_cluster), [])

View File

@ -0,0 +1,493 @@
import os
import sys
import time
import logging
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
import docker
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(CURRENT_TEST_DIR))
cluster = ClickHouseCluster(__file__, name='copier_test_two_nodes')
@pytest.fixture(scope="module")
def started_cluster():
global cluster
try:
for name in ["first_of_two", "second_of_two"]:
instance = cluster.add_instance(name,
main_configs=[
"configs_two_nodes/conf.d/clusters.xml",
"configs_two_nodes/conf.d/ddl.xml",
"configs_two_nodes/conf.d/storage_configuration.xml"],
user_configs=["configs_two_nodes/users.xml"],
with_zookeeper=True)
cluster.start()
for name in ["first_of_two", "second_of_two"]:
instance = cluster.instances[name]
instance.exec_in_container(['bash', '-c', 'mkdir /jbod1'])
instance.exec_in_container(['bash', '-c', 'mkdir /jbod2'])
instance.exec_in_container(['bash', '-c', 'mkdir /external'])
yield cluster
finally:
cluster.shutdown()
# Will copy table from `first` node to `second`
class TaskWithDifferentSchema:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task_with_different_schema'
self.container_task_file = "/task_with_different_schema.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_with_different_schema.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
first.query("DROP DATABASE IF EXISTS db_different_schema SYNC")
second.query("DROP DATABASE IF EXISTS db_different_schema SYNC")
first.query("CREATE DATABASE IF NOT EXISTS db_different_schema;")
first.query("""CREATE TABLE db_different_schema.source
(
Column1 String,
Column2 UInt32,
Column3 Date,
Column4 DateTime,
Column5 UInt16,
Column6 String,
Column7 String,
Column8 String,
Column9 String,
Column10 String,
Column11 String,
Column12 Decimal(3, 1),
Column13 DateTime,
Column14 UInt16
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMMDD(Column3), Column3)
PRIMARY KEY (Column1, Column2, Column3, Column4, Column6, Column7, Column8, Column9)
ORDER BY (Column1, Column2, Column3, Column4, Column6, Column7, Column8, Column9)
SETTINGS index_granularity = 8192""")
first.query("""INSERT INTO db_different_schema.source SELECT * FROM generateRandom(
'Column1 String, Column2 UInt32, Column3 Date, Column4 DateTime, Column5 UInt16,
Column6 String, Column7 String, Column8 String, Column9 String, Column10 String,
Column11 String, Column12 Decimal(3, 1), Column13 DateTime, Column14 UInt16', 1, 10, 2) LIMIT 50;""")
second.query("CREATE DATABASE IF NOT EXISTS db_different_schema;")
second.query("""CREATE TABLE db_different_schema.destination
(
Column1 LowCardinality(String) CODEC(LZ4),
Column2 UInt32 CODEC(LZ4),
Column3 Date CODEC(DoubleDelta, LZ4),
Column4 DateTime CODEC(DoubleDelta, LZ4),
Column5 UInt16 CODEC(LZ4),
Column6 LowCardinality(String) CODEC(ZSTD),
Column7 LowCardinality(String) CODEC(ZSTD),
Column8 LowCardinality(String) CODEC(ZSTD),
Column9 LowCardinality(String) CODEC(ZSTD),
Column10 String CODEC(ZSTD(6)),
Column11 LowCardinality(String) CODEC(LZ4),
Column12 Decimal(3,1) CODEC(LZ4),
Column13 DateTime CODEC(DoubleDelta, LZ4),
Column14 UInt16 CODEC(LZ4)
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(Column3)
ORDER BY (Column9, Column1, Column2, Column3, Column4);""")
print("Preparation completed")
def check(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
a = first.query("SELECT count() from db_different_schema.source")
b = second.query("SELECT count() from db_different_schema.destination")
assert a == b, "Count"
a = TSV(first.query("""SELECT sipHash64(*) from db_different_schema.source
ORDER BY (Column1, Column2, Column3, Column4, Column5, Column6, Column7, Column8, Column9, Column10, Column11, Column12, Column13, Column14)"""))
b = TSV(second.query("""SELECT sipHash64(*) from db_different_schema.destination
ORDER BY (Column1, Column2, Column3, Column4, Column5, Column6, Column7, Column8, Column9, Column10, Column11, Column12, Column13, Column14)"""))
assert a == b, "Data"
first.query("DROP DATABASE IF EXISTS db_different_schema SYNC")
second.query("DROP DATABASE IF EXISTS db_different_schema SYNC")
# Just simple copying, but the table schema has TTL on columns.
# Also the table will have a slightly different schema.
class TaskTTL:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task_ttl_columns'
self.container_task_file = "/task_ttl_columns.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_ttl_columns.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
first.query("DROP DATABASE IF EXISTS db_ttl_columns SYNC")
second.query("DROP DATABASE IF EXISTS db_ttl_columns SYNC")
first.query("CREATE DATABASE IF NOT EXISTS db_ttl_columns;")
first.query("""CREATE TABLE db_ttl_columns.source
(
Column1 String,
Column2 UInt32,
Column3 Date,
Column4 DateTime,
Column5 UInt16,
Column6 String TTL now() + INTERVAL 1 MONTH,
Column7 Decimal(3, 1) TTL now() + INTERVAL 1 MONTH,
Column8 Tuple(Float64, Float64) TTL now() + INTERVAL 1 MONTH
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMMDD(Column3), Column3)
PRIMARY KEY (Column1, Column2, Column3)
ORDER BY (Column1, Column2, Column3)
SETTINGS index_granularity = 8192""")
first.query("""INSERT INTO db_ttl_columns.source SELECT * FROM generateRandom(
'Column1 String, Column2 UInt32, Column3 Date, Column4 DateTime, Column5 UInt16,
Column6 String, Column7 Decimal(3, 1), Column8 Tuple(Float64, Float64)', 1, 10, 2) LIMIT 50;""")
second.query("CREATE DATABASE IF NOT EXISTS db_ttl_columns;")
second.query("""CREATE TABLE db_ttl_columns.destination
(
Column1 String,
Column2 UInt32,
Column3 Date,
Column4 DateTime TTL now() + INTERVAL 1 MONTH,
Column5 UInt16 TTL now() + INTERVAL 1 MONTH,
Column6 String TTL now() + INTERVAL 1 MONTH,
Column7 Decimal(3, 1) TTL now() + INTERVAL 1 MONTH,
Column8 Tuple(Float64, Float64)
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(Column3)
ORDER BY (Column3, Column2, Column1);""")
print("Preparation completed")
def check(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
a = first.query("SELECT count() from db_ttl_columns.source")
b = second.query("SELECT count() from db_ttl_columns.destination")
assert a == b, "Count"
a = TSV(first.query("""SELECT sipHash64(*) from db_ttl_columns.source
ORDER BY (Column1, Column2, Column3, Column4, Column5, Column6, Column7, Column8)"""))
b = TSV(second.query("""SELECT sipHash64(*) from db_ttl_columns.destination
ORDER BY (Column1, Column2, Column3, Column4, Column5, Column6, Column7, Column8)"""))
assert a == b, "Data"
first.query("DROP DATABASE IF EXISTS db_ttl_columns SYNC")
second.query("DROP DATABASE IF EXISTS db_ttl_columns SYNC")
class TaskSkipIndex:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task_skip_index'
self.container_task_file = "/task_skip_index.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_skip_index.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
first.query("DROP DATABASE IF EXISTS db_skip_index SYNC")
second.query("DROP DATABASE IF EXISTS db_skip_index SYNC")
first.query("CREATE DATABASE IF NOT EXISTS db_skip_index;")
first.query("""CREATE TABLE db_skip_index.source
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String,
INDEX a (Column1 * Column2, Column5) TYPE minmax GRANULARITY 3,
INDEX b (Column1 * length(Column5)) TYPE set(1000) GRANULARITY 4
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMMDD(Column3), Column3)
PRIMARY KEY (Column1, Column2, Column3)
ORDER BY (Column1, Column2, Column3)
SETTINGS index_granularity = 8192""")
first.query("""INSERT INTO db_skip_index.source SELECT * FROM generateRandom(
'Column1 UInt64, Column2 Int32, Column3 Date, Column4 DateTime, Column5 String', 1, 10, 2) LIMIT 100;""")
second.query("CREATE DATABASE IF NOT EXISTS db_skip_index;")
second.query("""CREATE TABLE db_skip_index.destination
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String,
INDEX a (Column1 * Column2, Column5) TYPE minmax GRANULARITY 3,
INDEX b (Column1 * length(Column5)) TYPE set(1000) GRANULARITY 4
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(Column3)
ORDER BY (Column3, Column2, Column1);""")
print("Preparation completed")
def check(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
a = first.query("SELECT count() from db_skip_index.source")
b = second.query("SELECT count() from db_skip_index.destination")
assert a == b, "Count"
a = TSV(first.query("""SELECT sipHash64(*) from db_skip_index.source
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
b = TSV(second.query("""SELECT sipHash64(*) from db_skip_index.destination
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
assert a == b, "Data"
first.query("DROP DATABASE IF EXISTS db_skip_index SYNC")
second.query("DROP DATABASE IF EXISTS db_skip_index SYNC")
class TaskTTLMoveToVolume:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task_ttl_move_to_volume'
self.container_task_file = "/task_ttl_move_to_volume.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_ttl_move_to_volume.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["first_of_two"]
first.query("DROP DATABASE IF EXISTS db_move_to_volume SYNC")
second.query("DROP DATABASE IF EXISTS db_move_to_volume SYNC")
first.query("CREATE DATABASE IF NOT EXISTS db_move_to_volume;")
first.query("""CREATE TABLE db_move_to_volume.source
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMMDD(Column3), Column3)
PRIMARY KEY (Column1, Column2, Column3)
ORDER BY (Column1, Column2, Column3)
TTL Column3 + INTERVAL 1 MONTH TO VOLUME 'external'
SETTINGS storage_policy = 'external_with_jbods';""")
first.query("""INSERT INTO db_move_to_volume.source SELECT * FROM generateRandom(
'Column1 UInt64, Column2 Int32, Column3 Date, Column4 DateTime, Column5 String', 1, 10, 2) LIMIT 100;""")
second.query("CREATE DATABASE IF NOT EXISTS db_move_to_volume;")
second.query("""CREATE TABLE db_move_to_volume.destination
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(Column3)
ORDER BY (Column3, Column2, Column1)
TTL Column3 + INTERVAL 1 MONTH TO VOLUME 'external'
SETTINGS storage_policy = 'external_with_jbods';""")
print("Preparation completed")
def check(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
a = first.query("SELECT count() from db_move_to_volume.source")
b = second.query("SELECT count() from db_move_to_volume.destination")
assert a == b, "Count"
a = TSV(first.query("""SELECT sipHash64(*) from db_move_to_volume.source
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
b = TSV(second.query("""SELECT sipHash64(*) from db_move_to_volume.destination
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
assert a == b, "Data"
first.query("DROP DATABASE IF EXISTS db_move_to_volume SYNC")
second.query("DROP DATABASE IF EXISTS db_move_to_volume SYNC")
class TaskDropTargetPartition:
def __init__(self, cluster):
self.cluster = cluster
self.zk_task_path = '/clickhouse-copier/task_drop_target_partition'
self.container_task_file = "/task_drop_target_partition.xml"
for instance_name, _ in cluster.instances.items():
instance = cluster.instances[instance_name]
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, './task_drop_target_partition.xml'), self.container_task_file)
print("Copied task file to container of '{}' instance. Path {}".format(instance_name, self.container_task_file))
def start(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
first.query("DROP DATABASE IF EXISTS db_drop_target_partition SYNC")
second.query("DROP DATABASE IF EXISTS db_drop_target_partition SYNC")
first.query("CREATE DATABASE IF NOT EXISTS db_drop_target_partition;")
first.query("""CREATE TABLE db_drop_target_partition.source
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String
)
ENGINE = MergeTree()
PARTITION BY (toYYYYMMDD(Column3), Column3)
PRIMARY KEY (Column1, Column2, Column3)
ORDER BY (Column1, Column2, Column3);""")
first.query("""INSERT INTO db_drop_target_partition.source SELECT * FROM generateRandom(
'Column1 UInt64, Column2 Int32, Column3 Date, Column4 DateTime, Column5 String', 1, 10, 2) LIMIT 100;""")
second.query("CREATE DATABASE IF NOT EXISTS db_drop_target_partition;")
second.query("""CREATE TABLE db_drop_target_partition.destination
(
Column1 UInt64,
Column2 Int32,
Column3 Date,
Column4 DateTime,
Column5 String
) ENGINE = MergeTree()
PARTITION BY toYYYYMMDD(Column3)
ORDER BY (Column3, Column2, Column1);""")
# Insert data into the target too. It has to be dropped.
first.query("""INSERT INTO db_drop_target_partition.destination SELECT * FROM db_drop_target_partition.source;""")
print("Preparation completed")
def check(self):
first = cluster.instances["first_of_two"]
second = cluster.instances["second_of_two"]
a = first.query("SELECT count() from db_drop_target_partition.source")
b = second.query("SELECT count() from db_drop_target_partition.destination")
assert a == b, "Count"
a = TSV(first.query("""SELECT sipHash64(*) from db_drop_target_partition.source
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
b = TSV(second.query("""SELECT sipHash64(*) from db_drop_target_partition.destination
ORDER BY (Column1, Column2, Column3, Column4, Column5)"""))
assert a == b, "Data"
first.query("DROP DATABASE IF EXISTS db_drop_target_partition SYNC")
second.query("DROP DATABASE IF EXISTS db_drop_target_partition SYNC")
def execute_task(started_cluster, task, cmd_options):
task.start()
zk = started_cluster.get_kazoo_client('zoo1')
print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1]))
# Run cluster-copier processes on each node
docker_api = docker.from_env().api
copiers_exec_ids = []
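# As the option names suggest, the copier reads the task description from
# --task-file, uploads it to the ZooKeeper node given by --task-path, and
# --task-upload-force makes it re-upload even if a description node already exists.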
cmd = ['/usr/bin/clickhouse', 'copier',
'--config', '/etc/clickhouse-server/config-copier.xml',
'--task-path', task.zk_task_path,
'--task-file', task.container_task_file,
'--task-upload-force', 'true',
'--base-dir', '/var/log/clickhouse-server/copier']
cmd += cmd_options
print(cmd)
for instance_name, instance in started_cluster.instances.items():
container = instance.get_docker_handle()
instance.copy_file_to_container(os.path.join(CURRENT_TEST_DIR, "configs_two_nodes/config-copier.xml"), "/etc/clickhouse-server/config-copier.xml")
logging.info("Copied copier config to {}".format(instance.name))
exec_id = docker_api.exec_create(container.id, cmd, stderr=True)
output = docker_api.exec_start(exec_id).decode('utf8')
logging.info(output)
copiers_exec_ids.append(exec_id)
logging.info("Copier for {} ({}) has started".format(instance.name, instance.ip_address))
# time.sleep(1000)
# Wait for the copiers to stop and check their return codes
for exec_id, instance in zip(copiers_exec_ids, iter(started_cluster.instances.values())):
while True:
res = docker_api.exec_inspect(exec_id)
if not res['Running']:
break
time.sleep(1)
assert res['ExitCode'] == 0, "Instance: {} ({}). Info: {}".format(instance.name, instance.ip_address, repr(res))
try:
task.check()
finally:
zk.delete(task.zk_task_path, recursive=True)
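# the task node is deleted in the finally block so that a failed check does
# not leave stale state in ZooKeeper for reruns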
# Tests
@pytest.mark.timeout(600)
def test_different_schema(started_cluster):
execute_task(started_cluster, TaskWithDifferentSchema(started_cluster), [])
@pytest.mark.timeout(600)
def test_ttl_columns(started_cluster):
execute_task(started_cluster, TaskTTL(started_cluster), [])
@pytest.mark.timeout(600)
def test_skip_index(started_cluster):
execute_task(started_cluster, TaskSkipIndex(started_cluster), [])
@pytest.mark.skip(reason="Too flaky :(")
def test_ttl_move_to_volume(started_cluster):
execute_task(started_cluster, TaskTTLMoveToVolume(started_cluster), [])

View File

@ -1,180 +0,0 @@
import os
import sys
import time
from contextlib import contextmanager
import docker
import pytest
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(CURRENT_TEST_DIR))
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
COPYING_FAIL_PROBABILITY = 0.33
MOVING_FAIL_PROBABILITY = 0.1
cluster = None
@pytest.fixture(scope="function")
def started_cluster():
global cluster
try:
clusters_schema = {
"0": {"0": ["0"]},
"1": {"0": ["0"]}
}
cluster = ClickHouseCluster(__file__)
for cluster_name, shards in clusters_schema.items():
for shard_name, replicas in shards.items():
for replica_name in replicas:
name = "s{}_{}_{}".format(cluster_name, shard_name, replica_name)
cluster.add_instance(name,
main_configs=[], user_configs=[],
macros={"cluster": cluster_name, "shard": shard_name, "replica": replica_name},
with_zookeeper=True)
cluster.start()
yield cluster
finally:
pass
cluster.shutdown()
class TaskTrivial:
def __init__(self, cluster, use_sample_offset):
self.cluster = cluster
if use_sample_offset:
self.zk_task_path = "/clickhouse-copier/task_trivial_use_sample_offset"
else:
self.zk_task_path = "/clickhouse-copier/task_trivial"
self.copier_task_config = open(os.path.join(CURRENT_TEST_DIR, 'task_trivial.xml'), 'r').read()
def start(self):
source = cluster.instances['s0_0_0']
destination = cluster.instances['s1_0_0']
for node in [source, destination]:
node.query("DROP DATABASE IF EXISTS default")
node.query("CREATE DATABASE IF NOT EXISTS default")
source.query("CREATE TABLE trivial (d UInt64, d1 UInt64 MATERIALIZED d+1) "
"ENGINE=ReplicatedMergeTree('/clickhouse/tables/source_trivial_cluster/1/trivial', '1') "
"PARTITION BY d % 5 ORDER BY (d, sipHash64(d)) SAMPLE BY sipHash64(d) SETTINGS index_granularity = 16")
source.query("INSERT INTO trivial SELECT * FROM system.numbers LIMIT 1002",
settings={"insert_distributed_sync": 1})
def check(self):
source = cluster.instances['s0_0_0']
destination = cluster.instances['s1_0_0']
assert TSV(source.query("SELECT count() FROM trivial")) == TSV("1002\n")
assert TSV(destination.query("SELECT count() FROM trivial")) == TSV("1002\n")
for node in [source, destination]:
node.query("DROP TABLE trivial")
def execute_task(started_cluster, task, cmd_options):
task.start()
zk = started_cluster.get_kazoo_client('zoo1')
print("Use ZooKeeper server: {}:{}".format(zk.hosts[0][0], zk.hosts[0][1]))
zk_task_path = task.zk_task_path
zk.ensure_path(zk_task_path)
zk.create(zk_task_path + "/description", task.copier_task_config)
# Run cluster-copier processes on each node
docker_api = started_cluster.docker_client.api
copiers_exec_ids = []
cmd = ['/usr/bin/clickhouse', 'copier',
'--config', '/etc/clickhouse-server/config-copier.xml',
'--task-path', zk_task_path,
'--base-dir', '/var/log/clickhouse-server/copier']
cmd += cmd_options
print(cmd)
for instance_name, instance in started_cluster.instances.items():
container = instance.get_docker_handle()
exec_id = docker_api.exec_create(container.id, cmd, stderr=True)
docker_api.exec_start(exec_id, detach=True)
copiers_exec_ids.append(exec_id)
print("Copier for {} ({}) has started".format(instance.name, instance.ip_address))
# Wait for copiers stopping and check their return codes
for exec_id, instance in zip(copiers_exec_ids, iter(started_cluster.instances.values())):
while True:
res = docker_api.exec_inspect(exec_id)
if not res['Running']:
break
time.sleep(1)
assert res['ExitCode'] == 0, "Instance: {} ({}). Info: {}".format(instance.name, instance.ip_address, repr(res))
try:
task.check()
finally:
zk.delete(zk_task_path, recursive=True)
# Tests
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
def test_trivial_copy(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, TaskTrivial(started_cluster, use_sample_offset), ['--experimental-use-sample-offset', '1'])
else:
print("AAAAA")
execute_task(started_cluster, TaskTrivial(started_cluster, use_sample_offset), [])
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
def test_trivial_copy_with_copy_fault(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, TaskTrivial(started_cluster), ['--copy-fault-probability', str(COPYING_FAIL_PROBABILITY),
'--experimental-use-sample-offset', '1'])
else:
execute_task(started_cluster, TaskTrivial(started_cluster), ['--copy-fault-probability', str(COPYING_FAIL_PROBABILITY)])
@pytest.mark.parametrize(
('use_sample_offset'),
[
False,
True
]
)
def test_trivial_copy_with_move_fault(started_cluster, use_sample_offset):
if use_sample_offset:
execute_task(started_cluster, TaskTrivial(started_cluster), ['--move-fault-probability', str(MOVING_FAIL_PROBABILITY),
'--experimental-use-sample-offset', '1'])
else:
execute_task(started_cluster, TaskTrivial(started_cluster), ['--move-fault-probability', str(MOVING_FAIL_PROBABILITY)])
if __name__ == '__main__':
with contextmanager(started_cluster)() as cluster:
for name, instance in list(cluster.instances.items()):
print(name, instance.ip_address)
input("Cluster created, press any key to destroy...")

View File

@ -65,8 +65,6 @@ def started_cluster():
finally:
cluster.shutdown()
# since it includes started_cluster fixture at first start
@pytest.mark.timeout(60)
@pytest.mark.parametrize('table,settings', itertools.product(
[ # tables
'dist_one',

View File

@ -70,7 +70,6 @@ def rabbitmq_setup_teardown():
# Tests
@pytest.mark.timeout(240)
def test_rabbitmq_select(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -106,7 +105,6 @@ def test_rabbitmq_select(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_select_empty(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -120,7 +118,6 @@ def test_rabbitmq_select_empty(rabbitmq_cluster):
assert int(instance.query('SELECT count() FROM test.rabbitmq')) == 0
@pytest.mark.timeout(240)
def test_rabbitmq_json_without_delimiter(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -162,7 +159,6 @@ def test_rabbitmq_json_without_delimiter(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_csv_with_delimiter(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -197,7 +193,6 @@ def test_rabbitmq_csv_with_delimiter(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_tsv_with_delimiter(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -238,7 +233,6 @@ def test_rabbitmq_tsv_with_delimiter(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_materialized_view(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -281,7 +275,6 @@ def test_rabbitmq_materialized_view(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_materialized_view_with_subquery(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -324,7 +317,6 @@ def test_rabbitmq_materialized_view_with_subquery(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_many_materialized_views(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view1;
@ -379,7 +371,6 @@ def test_rabbitmq_many_materialized_views(rabbitmq_cluster):
@pytest.mark.skip(reason="clichouse_path with rabbitmq.proto fails to be exported")
@pytest.mark.timeout(240)
def test_rabbitmq_protobuf(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -443,7 +434,6 @@ def test_rabbitmq_protobuf(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_big_message(rabbitmq_cluster):
# Create batchs of messages of size ~100Kb
rabbitmq_messages = 1000
@ -487,7 +477,6 @@ def test_rabbitmq_big_message(rabbitmq_cluster):
assert int(result) == rabbitmq_messages * batch_messages, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_sharding_between_queues_publish(rabbitmq_cluster):
NUM_CONSUMERS = 10
NUM_QUEUES = 10
@ -558,7 +547,6 @@ def test_rabbitmq_sharding_between_queues_publish(rabbitmq_cluster):
assert int(result2) == 10
@pytest.mark.timeout(420)
def test_rabbitmq_mv_combo(rabbitmq_cluster):
NUM_MV = 5
NUM_CONSUMERS = 4
@ -636,7 +624,6 @@ def test_rabbitmq_mv_combo(rabbitmq_cluster):
assert int(result) == messages_num * threads_num * NUM_MV, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(240)
def test_rabbitmq_insert(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -689,7 +676,6 @@ def test_rabbitmq_insert(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_insert_headers_exchange(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq (key UInt64, value UInt64)
@ -743,7 +729,6 @@ def test_rabbitmq_insert_headers_exchange(rabbitmq_cluster):
rabbitmq_check_result(result, True)
@pytest.mark.timeout(240)
def test_rabbitmq_many_inserts(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.rabbitmq_many;
@ -819,7 +804,6 @@ def test_rabbitmq_many_inserts(rabbitmq_cluster):
assert int(result) == messages_num * threads_num, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_overloaded_insert(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view_overload;
@ -898,7 +882,6 @@ def test_rabbitmq_overloaded_insert(rabbitmq_cluster):
assert int(result) == messages_num * threads_num, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_direct_exchange(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -972,7 +955,6 @@ def test_rabbitmq_direct_exchange(rabbitmq_cluster):
assert int(result) == messages_num * num_tables, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_fanout_exchange(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1039,7 +1021,6 @@ def test_rabbitmq_fanout_exchange(rabbitmq_cluster):
assert int(result) == messages_num * num_tables, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_topic_exchange(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1136,7 +1117,6 @@ def test_rabbitmq_topic_exchange(rabbitmq_cluster):
result)
@pytest.mark.timeout(420)
def test_rabbitmq_hash_exchange(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1220,7 +1200,6 @@ def test_rabbitmq_hash_exchange(rabbitmq_cluster):
assert int(result2) == 4 * num_tables
@pytest.mark.timeout(420)
def test_rabbitmq_multiple_bindings(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1295,7 +1274,6 @@ def test_rabbitmq_multiple_bindings(rabbitmq_cluster):
assert int(result) == messages_num * threads_num * 5, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_headers_exchange(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1385,7 +1363,6 @@ def test_rabbitmq_headers_exchange(rabbitmq_cluster):
assert int(result) == messages_num * num_tables_to_receive, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_virtual_columns(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -1447,7 +1424,6 @@ def test_rabbitmq_virtual_columns(rabbitmq_cluster):
assert TSV(result) == TSV(expected)
@pytest.mark.timeout(420)
def test_rabbitmq_virtual_columns_with_materialized_view(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -1512,7 +1488,6 @@ def test_rabbitmq_virtual_columns_with_materialized_view(rabbitmq_cluster):
assert TSV(result) == TSV(expected)
@pytest.mark.timeout(420)
def test_rabbitmq_many_consumers_to_each_queue(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.destination;
@ -1595,7 +1570,6 @@ def test_rabbitmq_many_consumers_to_each_queue(rabbitmq_cluster):
assert int(result2) == 8
@pytest.mark.timeout(420)
def test_rabbitmq_restore_failed_connection_without_losses_1(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.consume;
@ -1667,7 +1641,6 @@ def test_rabbitmq_restore_failed_connection_without_losses_1(rabbitmq_cluster):
assert int(result) == messages_num, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(420)
def test_rabbitmq_restore_failed_connection_without_losses_2(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.consumer_reconnect (key UInt64, value UInt64)
@ -1735,7 +1708,6 @@ def test_rabbitmq_restore_failed_connection_without_losses_2(rabbitmq_cluster):
assert int(result) == messages_num, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(300)
def test_rabbitmq_commit_on_block_write(rabbitmq_cluster):
instance.query('''
DROP TABLE IF EXISTS test.view;
@ -1804,7 +1776,6 @@ def test_rabbitmq_commit_on_block_write(rabbitmq_cluster):
assert result == 1, 'Messages from RabbitMQ get duplicated!'
@pytest.mark.timeout(420)
def test_rabbitmq_no_connection_at_startup(rabbitmq_cluster):
# no connection when table is initialized
rabbitmq_cluster.pause_container('rabbitmq1')
@ -1854,7 +1825,6 @@ def test_rabbitmq_no_connection_at_startup(rabbitmq_cluster):
assert int(result) == messages_num, 'ClickHouse lost some messages: {}'.format(result)
@pytest.mark.timeout(120)
def test_rabbitmq_format_factory_settings(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.format_settings (
@ -1907,7 +1877,6 @@ def test_rabbitmq_format_factory_settings(rabbitmq_cluster):
assert(result == expected)
@pytest.mark.timeout(120)
def test_rabbitmq_vhost(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq_vhost (key UInt64, value UInt64)
@ -1930,7 +1899,6 @@ def test_rabbitmq_vhost(rabbitmq_cluster):
break
@pytest.mark.timeout(120)
def test_rabbitmq_drop_table_properly(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq_drop (key UInt64, value UInt64)
@ -1966,7 +1934,6 @@ def test_rabbitmq_drop_table_properly(rabbitmq_cluster):
assert(not exists)
@pytest.mark.timeout(120)
def test_rabbitmq_queue_settings(rabbitmq_cluster):
instance.query('''
CREATE TABLE test.rabbitmq_settings (key UInt64, value UInt64)
@ -2009,7 +1976,6 @@ def test_rabbitmq_queue_settings(rabbitmq_cluster):
assert(int(result) == 10)
@pytest.mark.timeout(120)
def test_rabbitmq_queue_consume(rabbitmq_cluster):
credentials = pika.PlainCredentials('root', 'clickhouse')
parameters = pika.ConnectionParameters(rabbitmq_cluster.rabbitmq_ip, rabbitmq_cluster.rabbitmq_port, '/', credentials)

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
send -- "DROP TABLE IF EXISTS test_01179\r"

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
# Make a query with syntax error

View File

@ -12,7 +12,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
send -- "SELECT 1\r"

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
send -- "SELECT 1\r"

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
# Make a query

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
# Make a query

View File

@ -15,7 +15,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT -mn"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion -mn"
expect "\n:) "
send -- "DROP TABLE IF EXISTS t01565;\n"

View File

@ -0,0 +1,35 @@
<?xml version="1.0"?>
<yandex>
<logger>
<level>trace</level>
<console>true</console>
</logger>
<tcp_port>9000</tcp_port>
<path>./</path>
<mark_cache_size>0</mark_cache_size>
<users>
<default>
<password></password>
<networks>
<ip>::/0</ip>
</networks>
<profile>default</profile>
<quota>default</quota>
<access_management>1</access_management>
</default>
</users>
<profiles>
<default/>
</profiles>
<quotas>
<default />
</quotas>
</yandex>

View File

@ -1,16 +1,103 @@
#!/usr/bin/env bash
#
# Regression test for INSERT SELECT that abnormally terminated the server
# when the memory limits were too small.
#
# NOTE: After #24483 had been merged, the only place where the allocation may
# fail is the insert into PODArray in DB::OwnSplitChannel::log, but after
# #24069 those errors are ignored, so a separate server is required to check
# the new behaviour.
#
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
server_opts=(
"--config-file=$CURDIR/$(basename "${BASH_SOURCE[0]}" .sh).config.xml"
"--"
# to avoid multiple listen sockets (complexity for port discovery)
"--listen_host=127.1"
# we will discover the real port later.
"--tcp_port=0"
"--shutdown_wait_unfinished=0"
)
CLICKHOUSE_WATCHDOG_ENABLE=0 $CLICKHOUSE_SERVER_BINARY "${server_opts[@]}" >clickhouse-server.log 2>clickhouse-server.stderr &
server_pid=$!
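# CLICKHOUSE_WATCHDOG_ENABLE=0 presumably prevents the server from spawning a
# watchdog process, so that $server_pid refers to the server itself.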
trap cleanup EXIT
function cleanup()
{
kill -9 $server_pid
echo "Test failed. Server log:"
cat clickhouse-server.log
cat clickhouse-server.stderr
rm -f clickhouse-server.log
rm -f clickhouse-server.stderr
exit 1
}
server_port=
i=0 retries=300
# wait until the server starts listening (max 30 seconds)
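# (lsof lists the LISTEN tcp sockets of the server pid; awk splits on
# spaces/colons and prints the field before the last one, i.e. the port)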
while [[ -z $server_port ]] && [[ $i -lt $retries ]]; do
server_port=$(lsof -n -a -P -i tcp -s tcp:LISTEN -p $server_pid 2>/dev/null | awk -F'[ :]' '/LISTEN/ { print $(NF-1) }')
((++i))
sleep 0.1
if ! kill -0 $server_pid >& /dev/null; then
echo "No server (pid $server_pid)"
break
fi
done
if [[ -z $server_port ]]; then
echo "Cannot wait for LISTEN socket" >&2
exit 1
fi
# wait for the server to start accepting tcp connections (max 30 seconds)
i=0 retries=300
while ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1' 2>/dev/null && [[ $i -lt $retries ]]; do
sleep 0.1
if ! kill -0 $server_pid >& /dev/null; then
echo "No server (pid $server_pid)"
break
fi
done
if ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1'; then
echo "Cannot wait until server will start accepting connections on <tcp_port>" >&2
exit 1
fi
# it is not mandatory to use an existing table since the query fails earlier, hence just a placeholder.
# this is the INSERT SELECT form that passes these settings to the INSERT query, not to the SELECT
${CLICKHOUSE_CLIENT} --format Null -q 'insert into placeholder_table_name select * from numbers_mt(65535) format Null settings max_memory_usage=1, max_untracked_memory=1' >& /dev/null
exit_code=$?
if $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null --send_logs_level=warning --max_memory_usage=1 --max_untracked_memory=1 -q 'insert into placeholder_table_name select * from numbers_mt(65535)' >& /dev/null; then
echo "INSERT SELECT should fail" >&2
exit 1
fi
# expecting ATTEMPT_TO_READ_AFTER_EOF, 32
test $exit_code -eq 32 || exit 1
# no sleep, since writes to stderr should not be buffered.
if ! grep -E -q 'Cannot add message to the log: Code: 60.*placeholder_table_name' clickhouse-server.stderr; then
echo "Adding message to the log should fail" >&2
exit 1
fi
# check that server is still alive
${CLICKHOUSE_CLIENT} --format Null -q 'SELECT 1'
$CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'SELECT 1'
# send TERM and save the error code to ensure that it is 0 (EXIT_SUCCESS)
kill $server_pid
wait $server_pid
return_code=$?
trap '' EXIT
if [ $return_code != 0 ]; then
cat clickhouse-server.log
cat clickhouse-server.stderr
fi
rm -f clickhouse-server.log
rm -f clickhouse-server.stderr
exit $return_code

View File

@ -6,7 +6,7 @@ match_max 100000
if ![info exists env(CLICKHOUSE_PORT_TCP)] {set env(CLICKHOUSE_PORT_TCP) 9000}
spawn clickhouse-client --multiline --port "$env(CLICKHOUSE_PORT_TCP)"
spawn clickhouse-client --multiline --disable_suggestion --port "$env(CLICKHOUSE_PORT_TCP)"
expect ":) "
# Make a query

View File

@ -8,7 +8,7 @@ if ![info exists env(CLICKHOUSE_PORT_TCP)] {set env(CLICKHOUSE_PORT_TCP) 9000}
set env(EDITOR) [file dirname [file normalize [info script]]]"/01610_client_spawn_editor_open.editor"
spawn clickhouse-client
spawn clickhouse-client --disable_suggestion
expect ":) "
# Open EDITOR

View File

@ -19,3 +19,5 @@
[1002]
{'aa':4,'bb':5} ['aa','bb'] [4,5]
{'aa':4,'bb':5} 1 0
{0:0} 1
{0:0} 0

View File

@ -24,4 +24,6 @@ drop table if exists table_map;
-- Const column
select map( 'aa', 4, 'bb' , 5) as m, mapKeys(m), mapValues(m);
select map( 'aa', 4, 'bb' , 5) as m, mapContains(m, 'aa'), mapContains(m, 'k');
select map( 'aa', 4, 'bb' , 5) as m, mapContains(m, 'aa'), mapContains(m, 'k');
select map(0, 0) as m, mapContains(m, number % 2) from numbers(2);

View File

@ -0,0 +1,2 @@
499500
499500

View File

@ -0,0 +1,8 @@
drop table if exists projection_without_key;
create table projection_without_key (key UInt32, PROJECTION x (SELECT sum(key) group by key % 3)) engine MergeTree order by key;
insert into projection_without_key select number from numbers(1000);
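-- Both SELECTs below should return the same total, 499500 (the sum of 0..999),
-- whether or not the experimental projection optimization is enabled.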
select sum(key) from projection_without_key settings allow_experimental_projection_optimization = 1;
select sum(key) from projection_without_key settings allow_experimental_projection_optimization = 0;
drop table projection_without_key;

View File

@ -11,7 +11,7 @@ expect_after {
}
set basedir [file dirname $argv0]
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT"
spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion"
expect ":) "
# regression for heap-buffer-overflow issue (under ASAN)

View File

@ -0,0 +1,47 @@
0 0 1
0 1 0
SELECT
isNull(id),
`n.null`,
NOT `n.null`
FROM t_func_to_subcolumns
3 0 1 0
0 1 0 \N
SELECT
`arr.size0`,
`arr.size0` = 0,
`arr.size0` != 0,
empty(n)
FROM t_func_to_subcolumns
['foo','bar'] [1,2]
[] []
SELECT
`m.keys`,
`m.values`
FROM t_func_to_subcolumns
1
SELECT sum(NOT `n.null`)
FROM t_func_to_subcolumns
2
SELECT count(id)
FROM t_func_to_subcolumns
1 0 0
2 1 0
3 0 0
SELECT
id,
`n.null`,
isNull(right.n)
FROM t_func_to_subcolumns AS left
ALL FULL OUTER JOIN
(
SELECT
1 AS id,
\'qqq\' AS n
UNION ALL
SELECT
3 AS id,
\'www\'
) AS right USING (id)
0 10
0 20

View File

@ -0,0 +1,42 @@
DROP TABLE IF EXISTS t_func_to_subcolumns;
SET allow_experimental_map_type = 1;
SET optimize_functions_to_subcolumns = 1;
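-- With this setting enabled, functions such as isNull(n), length(arr), empty(arr),
-- notEmpty(arr), mapKeys(m) and mapValues(m) can be rewritten to read the
-- corresponding subcolumns (n.null, arr.size0, m.keys, m.values), which is what
-- the EXPLAIN SYNTAX statements below are expected to show.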
CREATE TABLE t_func_to_subcolumns (id UInt64, arr Array(UInt64), n Nullable(String), m Map(String, UInt64))
ENGINE = MergeTree ORDER BY tuple();
INSERT INTO t_func_to_subcolumns VALUES (1, [1, 2, 3], 'abc', map('foo', 1, 'bar', 2)) (2, [], NULL, map());
SELECT id IS NULL, n IS NULL, n IS NOT NULL FROM t_func_to_subcolumns;
EXPLAIN SYNTAX SELECT id IS NULL, n IS NULL, n IS NOT NULL FROM t_func_to_subcolumns;
SELECT length(arr), empty(arr), notEmpty(arr), empty(n) FROM t_func_to_subcolumns;
EXPLAIN SYNTAX SELECT length(arr), empty(arr), notEmpty(arr), empty(n) FROM t_func_to_subcolumns;
SELECT mapKeys(m), mapValues(m) FROM t_func_to_subcolumns;
EXPLAIN SYNTAX SELECT mapKeys(m), mapValues(m) FROM t_func_to_subcolumns;
SELECT count(n) FROM t_func_to_subcolumns;
EXPLAIN SYNTAX SELECT count(n) FROM t_func_to_subcolumns;
SELECT count(id) FROM t_func_to_subcolumns;
EXPLAIN SYNTAX SELECT count(id) FROM t_func_to_subcolumns;
SELECT id, left.n IS NULL, right.n IS NULL FROM t_func_to_subcolumns AS left
FULL JOIN (SELECT 1 AS id, 'qqq' AS n UNION ALL SELECT 3 AS id, 'www') AS right USING(id);
EXPLAIN SYNTAX SELECT id, left.n IS NULL, right.n IS NULL FROM t_func_to_subcolumns AS left
FULL JOIN (SELECT 1 AS id, 'qqq' AS n UNION ALL SELECT 3 AS id, 'www') AS right USING(id);
DROP TABLE t_func_to_subcolumns;
DROP TABLE IF EXISTS t_tuple_null;
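-- The tuple element below is intentionally named `null`, apparently to check
-- that `t.null` keeps resolving to the tuple element and is not confused with
-- a null-mask subcolumn by the rewrite.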
CREATE TABLE t_tuple_null (t Tuple(null UInt32)) ENGINE = MergeTree ORDER BY tuple();
INSERT INTO t_tuple_null VALUES ((10)), ((20));
SELECT t IS NULL, t.null FROM t_tuple_null;
DROP TABLE t_tuple_null;

View File

@ -0,0 +1,2 @@
1 bar_100
2 bar_100

View File

@ -0,0 +1,40 @@
#!/usr/bin/env bash
#
# Check that KILL MUTATION can be executed in parallel for different tables.
# For this, two identical tables will be created:
# - on one table ALTER + KILL MUTATION will be executed
# - on the other table only ALTER, which should succeed
#
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -nm -q "
drop table if exists data_01900_1;
drop table if exists data_01900_2;
create table data_01900_1 (k UInt64, s String) engine=MergeTree() order by k;
create table data_01900_2 (k UInt64, s String) engine=MergeTree() order by k;
insert into data_01900_1 values (1, 'hello'), (2, 'world');
insert into data_01900_2 values (1, 'hello'), (2, 'world');
"
# default finished_mutations_to_keep is 100
# so 100 mutations will be scheduled and killed later.
for i in {1..100}; do
echo "alter table data_01900_1 update s = 'foo_$i' where 1;"
done | $CLICKHOUSE_CLIENT -nm
# but these mutations should not be killed.
(
for i in {1..100}; do
echo "alter table data_01900_2 update s = 'bar_$i' where 1;"
done | $CLICKHOUSE_CLIENT -nm --mutations_sync=1
) &
$CLICKHOUSE_CLIENT --format Null -nm -q "kill mutation where table = 'data_01900_1' and database = '$CLICKHOUSE_DATABASE';"
wait
$CLICKHOUSE_CLIENT -nm -q "select * from data_01900_2"

View File

@ -0,0 +1,4 @@
1
1
2
2

View File

@ -0,0 +1,23 @@
DROP TABLE IF EXISTS test_alter_attach_01901S;
DROP TABLE IF EXISTS test_alter_attach_01901D;
CREATE TABLE test_alter_attach_01901S (A Int64, D date) ENGINE = MergeTree PARTITION BY D ORDER BY A;
INSERT INTO test_alter_attach_01901S VALUES (1, '2020-01-01');
CREATE TABLE test_alter_attach_01901D (A Int64, D date)
Engine=ReplicatedMergeTree('/clickhouse/tables/test_alter_attach_01901D', 'r1')
PARTITION BY D ORDER BY A;
ALTER TABLE test_alter_attach_01901D ATTACH PARTITION '2020-01-01' FROM test_alter_attach_01901S;
SELECT count() FROM test_alter_attach_01901D;
SELECT count() FROM test_alter_attach_01901S;
INSERT INTO test_alter_attach_01901S VALUES (1, '2020-01-01');
ALTER TABLE test_alter_attach_01901D REPLACE PARTITION '2020-01-01' FROM test_alter_attach_01901S;
SELECT count() FROM test_alter_attach_01901D;
SELECT count() FROM test_alter_attach_01901S;
DROP TABLE test_alter_attach_01901S;
DROP TABLE test_alter_attach_01901D;

View File

@ -0,0 +1,14 @@
CREATE TABLE a
(
`number` UInt64,
`x` MATERIALIZED x
)
ENGINE = MergeTree
ORDER BY number; --{ serverError 174}
CREATE TABLE foo
(
i Int32,
j ALIAS j + 1
)
ENGINE = MergeTree() ORDER BY i; --{ serverError 174}
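-- For contrast (illustrative only, not part of the test): a default expression
-- may reference other columns, e.g.
--   CREATE TABLE a_ok (number UInt64, x UInt64 MATERIALIZED number * 2)
--   ENGINE = MergeTree ORDER BY number;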

View File

@ -13,6 +13,7 @@
"01193_metadata_loading",
"01473_event_time_microseconds",
"01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers
"01594_too_low_memory_limits", /// requires jemalloc to track small allocations
"01474_executable_dictionary", /// informational stderr from sanitizer at start
"functions_bad_arguments", /// Too long for TSan
"01603_read_with_backoff_bug", /// Too long for TSan
@ -28,6 +29,7 @@
"01103_check_cpu_instructions_at_startup",
"01473_event_time_microseconds",
"01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers
"01594_too_low_memory_limits", /// requires jemalloc to track small allocations
"01193_metadata_loading",
"01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage
],
@ -39,6 +41,7 @@
"00900_orc_load",
"01473_event_time_microseconds",
"01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers
"01594_too_low_memory_limits", /// requires jemalloc to track small allocations
"01193_metadata_loading"
],
"memory-sanitizer": [
@ -51,6 +54,7 @@
"00877_memory_limit_for_new_delete", /// memory limits don't work correctly under msan because it replaces malloc/free
"01473_event_time_microseconds",
"01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers
"01594_too_low_memory_limits", /// requires jemalloc to track small allocations
"01193_metadata_loading",
"01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage
],
@ -718,6 +722,7 @@
"01684_ssd_cache_dictionary_simple_key",
"01685_ssd_cache_dictionary_complex_key",
"01737_clickhouse_server_wait_server_pool_long", // This test is fully compatible to run in parallel, however under ASAN processes are pretty heavy and may fail under flaky adress check.
"01594_too_low_memory_limits", // This test is fully compatible to run in parallel, however under ASAN processes are pretty heavy and may fail under flaky adress check.
"01760_system_dictionaries",
"01760_polygon_dictionaries",
"01778_hierarchical_dictionaries",