Merge branch 'master' into fix-01111_create_drop_replicated_db_stress

This commit is contained in:
alesapin 2023-07-20 12:03:07 +02:00 committed by GitHub
commit 8fce71838c
136 changed files with 3040 additions and 1015 deletions

View File

@ -4,6 +4,8 @@ services:
kafka_zookeeper:
image: zookeeper:3.4.9
hostname: kafka_zookeeper
ports:
- 2181:2181
environment:
ZOO_MY_ID: 1
ZOO_PORT: 2181
@ -15,15 +17,14 @@ services:
image: confluentinc/cp-kafka:5.2.0
hostname: kafka1
ports:
- ${KAFKA_EXTERNAL_PORT:-8081}:${KAFKA_EXTERNAL_PORT:-8081}
- ${KAFKA_EXTERNAL_PORT}:${KAFKA_EXTERNAL_PORT}
environment:
KAFKA_ADVERTISED_LISTENERS: INSIDE://localhost:${KAFKA_EXTERNAL_PORT},OUTSIDE://kafka1:19092
KAFKA_ADVERTISED_HOST_NAME: kafka1
KAFKA_LISTENERS: INSIDE://0.0.0.0:${KAFKA_EXTERNAL_PORT},OUTSIDE://0.0.0.0:19092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:PLAINTEXT,OUTSIDE:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: "kafka_zookeeper:2181"
KAFKA_ZOOKEEPER_CONNECT: kafka_zookeeper:2181
KAFKA_LOG4J_LOGGERS: "kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO"
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
depends_on:
@ -35,13 +36,38 @@ services:
image: confluentinc/cp-schema-registry:5.2.0
hostname: schema-registry
ports:
- ${SCHEMA_REGISTRY_EXTERNAL_PORT:-12313}:${SCHEMA_REGISTRY_INTERNAL_PORT:-12313}
- ${SCHEMA_REGISTRY_EXTERNAL_PORT}:${SCHEMA_REGISTRY_EXTERNAL_PORT}
environment:
SCHEMA_REGISTRY_HOST_NAME: schema-registry
SCHEMA_REGISTRY_KAFKASTORE_SECURITY_PROTOCOL: PLAINTEXT
SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: PLAINTEXT://kafka1:19092
SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:${SCHEMA_REGISTRY_EXTERNAL_PORT}
SCHEMA_REGISTRY_SCHEMA_REGISTRY_GROUP_ID: noauth
depends_on:
- kafka_zookeeper
- kafka1
restart: always
security_opt:
- label:disable
schema-registry-auth:
image: confluentinc/cp-schema-registry:5.2.0
hostname: schema-registry-auth
ports:
- ${SCHEMA_REGISTRY_AUTH_EXTERNAL_PORT}:${SCHEMA_REGISTRY_AUTH_EXTERNAL_PORT}
environment:
SCHEMA_REGISTRY_HOST_NAME: schema-registry-auth
SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:${SCHEMA_REGISTRY_AUTH_EXTERNAL_PORT}
SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: PLAINTEXT://kafka1:19092
SCHEMA_REGISTRY_AUTHENTICATION_METHOD: BASIC
SCHEMA_REGISTRY_AUTHENTICATION_ROLES: user
SCHEMA_REGISTRY_AUTHENTICATION_REALM: RealmFooBar
SCHEMA_REGISTRY_OPTS: "-Djava.security.auth.login.config=/etc/schema-registry/secrets/schema_registry_jaas.conf"
SCHEMA_REGISTRY_SCHEMA_REGISTRY_GROUP_ID: auth
volumes:
- ${SCHEMA_REGISTRY_DIR:-}/secrets:/etc/schema-registry/secrets
depends_on:
- kafka_zookeeper
- kafka1
restart: always
security_opt:
- label:disable

View File

@ -76,6 +76,7 @@ The supported formats are:
| [RowBinary](#rowbinary) | ✔ | ✔ |
| [RowBinaryWithNames](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
| [RowBinaryWithDefaults](#rowbinarywithdefaults) | ✔ | ✔ |
| [Native](#native) | ✔ | ✔ |
| [Null](#null) | ✗ | ✔ |
| [XML](#xml) | ✗ | ✔ |
@ -1515,6 +1516,23 @@ If setting [input_format_with_types_use_header](/docs/en/operations/settings/set
the types from input data will be compared with the types of the corresponding columns from the table. Otherwise, the second row will be skipped.
:::
## RowBinaryWithDefaults {#rowbinarywithdefaults}
Similar to [RowBinary](#rowbinary), but with an extra byte before each column that indicates whether the default value should be used.
Examples:
```sql
:) select * from format('RowBinaryWithDefaults', 'x UInt32 default 42, y UInt32', x'010001000000')
┌──x─┬─y─┐
│ 42 │ 1 │
└────┴───┘
```
For column `x` there is only a single byte `01`, indicating that the default value should be used; no other data follows for this column.
For column `y` the data starts with the byte `00`, indicating that the column has an actual value, which is read from the subsequent data `01000000`.
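As an illustration of this byte layout, the following standalone sketch (plain C++, not part of ClickHouse) rebuilds the payload from the example above by hand: a `01` prefix byte means "use the column default" and nothing else is written for that column, while a `00` prefix byte is followed by the value in the usual little-endian RowBinary encoding.
```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Rebuild x'010001000000' for `x UInt32 DEFAULT 42, y UInt32` with one row (x = default, y = 1).
    std::vector<uint8_t> row;

    row.push_back(0x01);            // x: use the default value, no payload follows
    row.push_back(0x00);            // y: an actual value follows
    uint32_t y = 1;
    for (int shift = 0; shift < 32; shift += 8)
        row.push_back(static_cast<uint8_t>(y >> shift));   // UInt32 little-endian: 01 00 00 00

    for (uint8_t byte : row)
        std::printf("%02x", byte);  // prints 010001000000
    std::printf("\n");
}
```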
## RowBinary format settings {#row-binary-format-settings}
- [format_binary_max_string_size](/docs/en/operations/settings/settings-formats.md/#format_binary_max_string_size) - The maximum allowed size for String in RowBinary format. Default value - `1GiB`.

View File

@ -30,7 +30,7 @@ description: In order to effectively mitigate possible human errors, you should
```
:::note ALL
`ALL` is only applicable to the `RESTORE` command prior to version 23.4 of Clickhouse.
Prior to version 23.4 of ClickHouse, `ALL` was only applicable to the `RESTORE` command.
:::
## Background

View File

@ -1325,6 +1325,17 @@ Default value: 0.
Sets [Confluent Schema Registry](https://docs.confluent.io/current/schema-registry/index.html) URL to use with [AvroConfluent](../../interfaces/formats.md/#data-format-avro-confluent) format.
Format:
``` text
http://[user:password@]machine[:port]
```
Examples:
``` text
http://registry.example.com:8081
http://admin:secret@registry.example.com:8081
```
Default value: `Empty`.
### output_format_avro_codec {#output_format_avro_codec}

View File

@ -722,7 +722,7 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d
## age
Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 second.
Returns the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated using a precision of 1 microsecond.
E.g. the difference between `2021-12-29` and `2022-01-01` is 3 days for `day` unit, 0 months for `month` unit, 0 years for `year` unit.
For an alternative to `age`, see function `date\_diff`.
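As a standalone illustration only (plain C++20 `<chrono>`, unrelated to ClickHouse internals), the 3-day example above corresponds to an ordinary calendar-date difference, which can also be expressed down to microseconds:
```cpp
#include <chrono>
#include <iostream>

int main()
{
    using namespace std::chrono;

    // The example from the text: the difference between 2021-12-29 and 2022-01-01.
    const sys_days start = 2021y / 12 / 29;
    const sys_days end   = 2022y / 1 / 1;

    std::cout << duration_cast<days>(end - start).count() << " days\n";        // 3 days
    std::cout << duration_cast<microseconds>(end - start).count() << " us\n";  // 259200000000 us
}
```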
@ -738,6 +738,8 @@ age('unit', startdate, enddate, [timezone])
- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md).
Possible values:
- `microsecond` (possible abbreviations: `us`, `u`)
- `millisecond` (possible abbreviations: `ms`)
- `second` (possible abbreviations: `ss`, `s`)
- `minute` (possible abbreviations: `mi`, `n`)
- `hour` (possible abbreviations: `hh`, `h`)
@ -813,6 +815,8 @@ Aliases: `dateDiff`, `DATE_DIFF`, `timestampDiff`, `timestamp_diff`, `TIMESTAMP_
- `unit` — The type of interval for result. [String](../../sql-reference/data-types/string.md).
Possible values:
- `microsecond` (possible abbreviations: `us`, `u`)
- `millisecond` (possible abbreviations: `ms`)
- `second` (possible abbreviations: `ss`, `s`)
- `minute` (possible abbreviations: `mi`, `n`)
- `hour` (possible abbreviations: `hh`, `h`)

View File

@ -51,7 +51,7 @@ Calculates the MD5 from a string and returns the resulting set of bytes as Fixed
If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the sipHash128 function instead.
If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))).
## sipHash64 (#hash_functions-siphash64)
## sipHash64 {#hash_functions-siphash64}
Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value.
@ -63,9 +63,9 @@ This is a cryptographic hash function. It works at least three times faster than
The function [interprets](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. It then combines the hashes by the following algorithm:
1. The first and the second hash value are concatenated to an array which is hashed.
2. The previously calculated hash value and the hash of the third input parameter are hashed in a similar way.
3. This calculation is repeated for all remaining hash values of the original input.
1. The first and the second hash value are concatenated to an array which is hashed.
2. The previously calculated hash value and the hash of the third input parameter are hashed in a similar way.
3. This calculation is repeated for all remaining hash values of the original input.
**Arguments**

View File

@ -1267,3 +1267,36 @@ Like [initcap](#initcap), assuming that the string contains valid UTF-8 encoded
Does not detect the language, e.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
## firstLine
Returns the first line from a multi-line string.
**Syntax**
```sql
firstLine(val)
```
**Arguments**
- `val` - Input value. [String](../data-types/string.md)
**Returned value**
- The first line of the input value, or the whole value if there are no line separators. [String](../data-types/string.md)
**Example**
```sql
select firstLine('foo\nbar\nbaz');
```
Result:
```result
┌─firstLine('foo\nbar\nbaz')─┐
│ foo │
└────────────────────────────┘
```

View File

@ -97,7 +97,7 @@ This is an experimental feature that may change in backwards-incompatible ways i
:::
```sql
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ...
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH REFRESH [value_in_sec]] AS SELECT ...
```
Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query.

View File

@ -134,7 +134,7 @@ Multiple path components can have globs. For being processed file must exist and
- `*` — Substitutes any number of any characters except `/` including empty string.
- `?` — Substitutes any single character.
- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`.
- `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`; the strings may contain `/`.
- `{N..M}` — Substitutes any number in range from N to M including both borders.
- `**` - Fetches all files inside the folder recursively.

View File

@ -625,7 +625,7 @@ SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(d
## age
Calculates the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated with a precision of 1 second.
Calculates the `unit` component of the difference between `startdate` and `enddate`. The difference is calculated with a precision of 1 microsecond.
For example, the difference between `2021-12-29` and `2022-01-01` is 3 days for the `day` unit, 0 months for the `month` unit, and 0 years for the `year` unit.
**Syntax**
@ -639,6 +639,8 @@ age('unit', startdate, enddate, [timezone])
- `unit` — The unit of time in which the function's return value is expressed. [String](../../sql-reference/data-types/string.md).
Possible values:
- `microsecond` (possible abbreviations: `us`, `u`)
- `millisecond` (possible abbreviations: `ms`)
- `second` (possible abbreviations: `ss`, `s`)
- `minute` (possible abbreviations: `mi`, `n`)
- `hour` (possible abbreviations: `hh`, `h`)
@ -712,6 +714,8 @@ date_diff('unit', startdate, enddate, [timezone])
- `unit` — The unit of time in which the function's return value is expressed. [String](../../sql-reference/data-types/string.md).
Possible values:
- `microsecond` (possible abbreviations: `us`, `u`)
- `millisecond` (possible abbreviations: `ms`)
- `second` (possible abbreviations: `ss`, `s`)
- `minute` (possible abbreviations: `mi`, `n`)
- `hour` (possible abbreviations: `hh`, `h`)

View File

@ -1124,3 +1124,39 @@ Do Nothing for 2 Minutes 2:00  
Does not take the language into account; e.g. for Turkish the result may not be entirely correct.
If the length of the UTF-8 byte sequence differs between the upper and lower case of a code point, the result may be incorrect for that code point.
If the string contains a byte sequence that is not valid UTF-8, the behavior is undefined.
## firstLine
Returns the first line of a multi-line string.
**Syntax**
```sql
firstLine(val)
```
**Arguments**
- `val` - The string to process. [String](../data-types/string.md)
**Returned value**
- The first line of the text, or the whole text if there are no line breaks.
Type: [String](../data-types/string.md)
**Example**
Query:
```sql
select firstLine('foo\nbar\nbaz');
```
Result:
```result
┌─firstLine('foo\nbar\nbaz')─┐
│ foo │
└────────────────────────────┘
```

View File

@ -73,7 +73,7 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na
To use `LIVE VIEW` and `WATCH` queries, enable the [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting.
:::
```sql
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ...
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH REFRESH [value_in_sec]] AS SELECT ...
```
A `LIVE VIEW` stores the result of the [SELECT](../../../sql-reference/statements/select/index.md) query specified at creation and is updated as soon as that result changes. The final query result and the intermediate data from which it is built are kept in memory, which gives repeated queries high processing speed. Live views can send push notifications when the result of the underlying `SELECT` query changes; use the [WATCH](../../../sql-reference/statements/watch.md) query for that.

View File

@ -79,7 +79,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U
- `*` — Substitutes any number of any characters except `/`, including the empty string.
- `?` — Substitutes exactly one arbitrary character.
- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`.
- `{some_string,another_string,yet_another_one}` — Substitutes any of the strings `'some_string', 'another_string', 'yet_another_one'`; the strings may contain `/`.
- `{N..M}` — Substitutes any number in the range from `N` to `M` inclusive (may contain leading zeros).
The `{}` construction is similar to the [remote](remote.md) table function.

View File

@ -643,6 +643,8 @@ date_diff('unit', startdate, enddate, [timezone])
- `unit` — The unit of time for `value`. Type: [String](../../sql-reference/data-types/string.md).
Possible values:
- `microsecond`
- `millisecond`
- `second`
- `minute`
- `hour`

View File

@ -72,7 +72,7 @@ Materialized views in ClickHouse are more like insert triggers. If the view query contains
Use the [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting to enable live views and `WATCH` queries. Enter the command `set allow_experimental_live_view = 1`.
```sql
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ...
CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH REFRESH [value_in_sec]] AS SELECT ...
```
Live views store the result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated whenever that result changes. The query result, as well as the partial results needed to combine it with new data, are stored in memory, giving repeated queries better performance. Live views can send push notifications when the query result changes, via the [WATCH](../../../sql-reference/statements/watch.md) query.

View File

@ -887,6 +887,7 @@ try
#endif
global_context->setRemoteHostFilter(config());
global_context->setHTTPHeaderFilter(config());
std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH));
fs::path path = path_str;
@ -1200,6 +1201,7 @@ try
}
global_context->setRemoteHostFilter(*config);
global_context->setHTTPHeaderFilter(*config);
global_context->setMaxTableSizeToDrop(server_settings_.max_table_size_to_drop);
global_context->setMaxPartitionSizeToDrop(server_settings_.max_partition_size_to_drop);

View File

@ -866,6 +866,14 @@
-->
<!--</remote_url_allow_hosts>-->
<!-- The list of HTTP headers forbidden to use in HTTP-related storage engines and table functions.
If this section is not present in configuration, all headers are allowed.
-->
<!-- <http_forbid_headers>
<header>exact_header</header>
<header_regexp>(?i)(case_insensitive_header)</header_regexp>
</http_forbid_headers> -->
<!-- If an element has an 'incl' attribute, then the corresponding substitution from another file will be used as its value.
By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.
Values for substitutions are specified in /clickhouse/name_of_substitution elements in that file.

rust/skim/Cargo.lock generated
View File

@ -42,17 +42,6 @@ version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi 0.1.19",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@ -104,31 +93,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "clap"
version = "3.2.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123"
dependencies = [
"atty",
"bitflags",
"clap_lex",
"indexmap",
"once_cell",
"strsim",
"termcolor",
"textwrap",
]
[[package]]
name = "clap_lex"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
dependencies = [
"os_str_bytes",
]
[[package]]
name = "codespan-reporting"
version = "0.11.1"
@ -214,9 +178,9 @@ dependencies = [
[[package]]
name = "cxx"
version = "1.0.97"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e88abab2f5abbe4c56e8f1fb431b784d710b709888f35755a160e62e33fe38e8"
checksum = "5032837c1384de3708043de9d4e97bb91290faca6c16529a28aa340592a78166"
dependencies = [
"cc",
"cxxbridge-flags",
@ -226,9 +190,9 @@ dependencies = [
[[package]]
name = "cxx-build"
version = "1.0.97"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c0c11acd0e63bae27dcd2afced407063312771212b7a823b4fd72d633be30fb"
checksum = "51368b3d0dbf356e10fcbfd455a038503a105ee556f7ee79b6bb8c53a7247456"
dependencies = [
"cc",
"codespan-reporting",
@ -236,24 +200,24 @@ dependencies = [
"proc-macro2",
"quote",
"scratch",
"syn 2.0.23",
"syn 2.0.26",
]
[[package]]
name = "cxxbridge-flags"
version = "1.0.97"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d3816ed957c008ccd4728485511e3d9aaf7db419aa321e3d2c5a2f3411e36c8"
checksum = "0d9062157072e4aafc8e56ceaf8325ce850c5ae37578c852a0d4de2cecdded13"
[[package]]
name = "cxxbridge-macro"
version = "1.0.97"
version = "1.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a26acccf6f445af85ea056362561a24ef56cdc15fcc685f03aec50b9c702cb6d"
checksum = "cf01e8a540f5a4e0f284595834f81cf88572f244b768f051724537afa99a2545"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.23",
"syn 2.0.26",
]
[[package]]
@ -359,19 +323,6 @@ version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
[[package]]
name = "env_logger"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
dependencies = [
"atty",
"humantime",
"log",
"regex",
"termcolor",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -398,32 +349,11 @@ dependencies = [
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hermit-abi"
version = "0.1.19"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b"
[[package]]
name = "iana-time-zone"
@ -454,16 +384,6 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown",
]
[[package]]
name = "js-sys"
version = "0.3.64"
@ -487,9 +407,9 @@ checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
[[package]]
name = "link-cplusplus"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
checksum = "9d240c6f7e1ba3a28b0249f774e6a9dd0175054b52dfbb61b16eb8505c3785c9"
dependencies = [
"cc",
]
@ -564,7 +484,7 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi 0.3.1",
"hermit-abi",
"libc",
]
@ -574,12 +494,6 @@ version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
[[package]]
name = "os_str_bytes"
version = "6.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac"
[[package]]
name = "pin-utils"
version = "0.1.0"
@ -588,18 +502,18 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "proc-macro2"
version = "1.0.63"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b368fba921b0dce7e60f5e04ec15e565b3303972b42bcfde1d0713b881959eb"
checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.29"
version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "573015e8ab27661678357f27dc26460738fd2b6c86e46f386fde94cb5d913105"
checksum = "5fe8a65d69dd0808184ebb5f836ab526bb259db23c657efa38711b1072ee47f0"
dependencies = [
"proc-macro2",
]
@ -648,9 +562,21 @@ dependencies = [
[[package]]
name = "regex"
version = "1.8.4"
version = "1.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f"
checksum = "b2eae68fc220f7cf2532e4494aded17545fce192d59cd996e0fe7887f4ceb575"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
dependencies = [
"aho-corasick",
"memchr",
@ -659,39 +585,33 @@ dependencies = [
[[package]]
name = "regex-syntax"
version = "0.7.2"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
[[package]]
name = "rustversion"
version = "1.0.12"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4f3208ce4d8448b3f3e7d168a73f5e0c43a61e32930de3bceeccedb388b6bf06"
checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4"
[[package]]
name = "scopeguard"
version = "1.1.0"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scratch"
version = "1.0.5"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1"
checksum = "a3cf7c11c38cb994f3d40e8a8cde3bbd1f72a435e4c49e85d6553d8312306152"
[[package]]
name = "serde"
version = "1.0.164"
version = "1.0.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d"
[[package]]
name = "shlex"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
checksum = "30e27d1e4fd7659406c492fd6cfaf2066ba8773de45ca75e855590f856dc34a9"
[[package]]
name = "skim"
@ -699,23 +619,19 @@ version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d28de0a6cb2cdd83a076f1de9d965b973ae08b244df1aa70b432946dda0f32"
dependencies = [
"atty",
"beef",
"bitflags",
"chrono",
"clap",
"crossbeam",
"defer-drop",
"derive_builder",
"env_logger",
"fuzzy-matcher",
"lazy_static",
"log",
"nix 0.25.1",
"rayon",
"regex",
"shlex",
"time 0.3.22",
"time 0.3.23",
"timer",
"tuikit",
"unicode-width",
@ -741,9 +657,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.23"
version = "2.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fb7d6d8281a51045d62b8eb3a7d1ce347b76f312af50cd3dc0af39c87c1737"
checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970"
dependencies = [
"proc-macro2",
"quote",
@ -770,30 +686,24 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "textwrap"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "thiserror"
version = "1.0.40"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
checksum = "a35fc5b8971143ca348fa6df4f024d4d55264f3468c71ad1c2f365b0a4d58c42"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.40"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.23",
"syn 2.0.26",
]
[[package]]
@ -819,9 +729,9 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.22"
version = "0.3.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ea9e1b3cf1243ae005d9e74085d4d542f3125458f3a81af210d901dcd7411efd"
checksum = "59e399c068f43a5d116fedaf73b203fa4f9c519f17e2b34f63221d3792f81446"
dependencies = [
"serde",
"time-core",
@ -858,9 +768,9 @@ dependencies = [
[[package]]
name = "unicode-ident"
version = "1.0.9"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0"
checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
[[package]]
name = "unicode-width"
@ -928,7 +838,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.23",
"syn 2.0.26",
"wasm-bindgen-shared",
]
@ -950,7 +860,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.23",
"syn 2.0.26",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]

View File

@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
skim = "0.10.2"
skim = { version = "0.10.2", default-features = false }
cxx = "1.0.83"
term = "0.7.0"

View File

@ -319,24 +319,21 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac
throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "Logical error: no available replica");
Packet packet;
try
{
AsyncCallbackSetter async_setter(current_connection, std::move(async_callback));
try
packet = current_connection->receivePacket();
}
catch (Exception & e)
{
if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_SERVER)
{
packet = current_connection->receivePacket();
}
catch (Exception & e)
{
if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_SERVER)
{
/// Exception may happen when packet is received, e.g. when got unknown packet.
/// In this case, invalidate replica, so that we would not read from it anymore.
current_connection->disconnect();
invalidateReplica(state);
}
throw;
/// Exception may happen when packet is received, e.g. when got unknown packet.
/// In this case, invalidate replica, so that we would not read from it anymore.
current_connection->disconnect();
invalidateReplica(state);
}
throw;
}
switch (packet.type)

View File

@ -5,7 +5,6 @@ namespace DB
AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr<AsyncTask> task_) : task(std::move(task_))
{
createFiber();
}
void AsyncTaskExecutor::resume()
@ -13,6 +12,10 @@ void AsyncTaskExecutor::resume()
if (routine_is_finished)
return;
/// Create fiber lazily on first resume() call.
if (!fiber)
createFiber();
if (!checkBeforeTaskResume())
return;
@ -22,6 +25,11 @@ void AsyncTaskExecutor::resume()
return;
resumeUnlocked();
/// Destroy fiber when it's finished.
if (routine_is_finished)
destroyFiber();
if (exception)
processException(exception);
}
@ -46,9 +54,8 @@ void AsyncTaskExecutor::cancel()
void AsyncTaskExecutor::restart()
{
std::lock_guard guard(fiber_lock);
if (fiber)
if (!routine_is_finished)
destroyFiber();
createFiber();
routine_is_finished = false;
}

View File

@ -0,0 +1,56 @@
#include <Common/HTTPHeaderFilter.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/Exception.h>
#include <re2/re2.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
void HTTPHeaderFilter::checkHeaders(const HTTPHeaderEntries & entries) const
{
std::lock_guard guard(mutex);
for (const auto & entry : entries)
{
if (forbidden_headers.contains(entry.name))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "HTTP header \"{}\" is forbidden in configuration file, "
"see <http_forbid_headers>", entry.name);
for (const auto & header_regex : forbidden_headers_regexp)
if (re2::RE2::FullMatch(entry.name, header_regex))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "HTTP header \"{}\" is forbidden in configuration file, "
"see <http_forbid_headers>", entry.name);
}
}
void HTTPHeaderFilter::setValuesFromConfig(const Poco::Util::AbstractConfiguration & config)
{
std::lock_guard guard(mutex);
if (config.has("http_forbid_headers"))
{
std::vector<std::string> keys;
config.keys("http_forbid_headers", keys);
for (const auto & key : keys)
{
if (startsWith(key, "header_regexp"))
forbidden_headers_regexp.push_back(config.getString("http_forbid_headers." + key));
else if (startsWith(key, "header"))
forbidden_headers.insert(config.getString("http_forbid_headers." + key));
}
}
else
{
forbidden_headers.clear();
forbidden_headers_regexp.clear();
}
}
}
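A hypothetical caller-side sketch of the filter above (the header names are made up): it mirrors how the HTTP dictionary source later in this commit invokes the filter, and shows that a match against either an exact `<header>` entry or a `<header_regexp>` pattern from `<http_forbid_headers>` raises `BAD_ARGUMENTS`.
```cpp
#include <Common/HTTPHeaderFilter.h>
#include <IO/HTTPHeaderEntries.h>
#include <Poco/Util/AbstractConfiguration.h>

// Assumes a server configuration that may contain an <http_forbid_headers> section.
void checkUserSuppliedHeaders(const Poco::Util::AbstractConfiguration & server_config)
{
    DB::HTTPHeaderFilter filter;
    filter.setValuesFromConfig(server_config);

    DB::HTTPHeaderEntries headers;
    headers.emplace_back("X-My-Header", "value");    // hypothetical header names
    headers.emplace_back("exact_header", "value");

    // Throws BAD_ARGUMENTS if any header name matches an exact <header> entry
    // or a <header_regexp> pattern from <http_forbid_headers>.
    filter.checkHeaders(headers);
}
```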

View File

@ -0,0 +1,27 @@
#pragma once
#include <IO/HTTPHeaderEntries.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <vector>
#include <unordered_set>
#include <mutex>
namespace DB
{
class HTTPHeaderFilter
{
public:
void setValuesFromConfig(const Poco::Util::AbstractConfiguration & config);
void checkHeaders(const HTTPHeaderEntries & entries) const;
private:
std::unordered_set<std::string> forbidden_headers;
std::vector<std::string> forbidden_headers_regexp;
mutable std::mutex mutex;
};
}

View File

@ -41,9 +41,38 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config)
initializeDisks(config);
}
namespace
{
bool diskValidator(const Poco::Util::AbstractConfiguration & config, const std::string & disk_config_prefix)
{
const auto disk_type = config.getString(disk_config_prefix + ".type", "local");
using namespace std::literals;
static constexpr std::array supported_disk_types
{
"s3"sv,
"s3_plain"sv,
"local"sv
};
if (std::all_of(
supported_disk_types.begin(),
supported_disk_types.end(),
[&](const auto supported_type) { return disk_type != supported_type; }))
{
LOG_INFO(&Poco::Logger::get("KeeperContext"), "Disk type '{}' is not supported for Keeper", disk_type);
return false;
}
return true;
}
}
void KeeperContext::initializeDisks(const Poco::Util::AbstractConfiguration & config)
{
disk_selector->initialize(config, "storage_configuration.disks", Context::getGlobalContextInstance());
disk_selector->initialize(config, "storage_configuration.disks", Context::getGlobalContextInstance(), diskValidator);
log_storage = getLogsPathFromConfig(config);

View File

@ -48,7 +48,11 @@ inline auto scaleMultiplier(UInt32 scale)
/** Components of DecimalX value:
* whole - represents whole part of decimal, can be negative or positive.
* fractional - for fractional part of decimal, always positive.
* fractional - for fractional part of decimal.
*
* 0.123 represents 0 / 0.123
* -0.123 represents 0 / -0.123
* -1.123 represents -1 / 0.123
*/
template <typename DecimalType>
struct DecimalComponents

View File

@ -3,6 +3,7 @@
#if USE_MYSQL
#include <Databases/MySQL/MaterializedMySQLSyncThread.h>
#include <Databases/MySQL/tryParseTableIDFromDDL.h>
#include <cstdlib>
#include <random>
#include <string_view>
@ -151,61 +152,6 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection, const S
}
}
static std::tuple<String, String> tryExtractTableNameFromDDL(const String & ddl)
{
String table_name;
String database_name;
if (ddl.empty()) return std::make_tuple(database_name, table_name);
bool parse_failed = false;
Tokens tokens(ddl.data(), ddl.data() + ddl.size());
IParser::Pos pos(tokens, 0);
Expected expected;
ASTPtr res;
ASTPtr table;
if (ParserKeyword("CREATE TEMPORARY TABLE").ignore(pos, expected) || ParserKeyword("CREATE TABLE").ignore(pos, expected))
{
ParserKeyword("IF NOT EXISTS").ignore(pos, expected);
if (!ParserCompoundIdentifier(true).parse(pos, table, expected))
parse_failed = true;
}
else if (ParserKeyword("ALTER TABLE").ignore(pos, expected))
{
if (!ParserCompoundIdentifier(true).parse(pos, table, expected))
parse_failed = true;
}
else if (ParserKeyword("DROP TABLE").ignore(pos, expected) || ParserKeyword("DROP TEMPORARY TABLE").ignore(pos, expected))
{
ParserKeyword("IF EXISTS").ignore(pos, expected);
if (!ParserCompoundIdentifier(true).parse(pos, table, expected))
parse_failed = true;
}
else if (ParserKeyword("TRUNCATE").ignore(pos, expected))
{
ParserKeyword("TABLE").ignore(pos, expected);
if (!ParserCompoundIdentifier(true).parse(pos, table, expected))
parse_failed = true;
}
else if (ParserKeyword("RENAME TABLE").ignore(pos, expected))
{
if (!ParserCompoundIdentifier(true).parse(pos, table, expected))
parse_failed = true;
}
else
{
parse_failed = true;
}
if (!parse_failed)
{
if (auto table_id = table->as<ASTTableIdentifier>()->getTableId())
{
database_name = table_id.database_name;
table_name = table_id.table_name;
}
}
return std::make_tuple(database_name, table_name);
}
MaterializedMySQLSyncThread::MaterializedMySQLSyncThread(
ContextPtr context_,
const String & database_name_,
@ -868,14 +814,12 @@ void MaterializedMySQLSyncThread::executeDDLAtomic(const QueryEvent & query_even
String query = query_event.query;
if (!materialized_tables_list.empty())
{
auto [ddl_database_name, ddl_table_name] = tryExtractTableNameFromDDL(query_event.query);
if (!ddl_table_name.empty())
auto table_id = tryParseTableIDFromDDL(query, query_event.schema);
if (!table_id.table_name.empty())
{
ddl_database_name = ddl_database_name.empty() ? query_event.schema: ddl_database_name;
if (ddl_database_name != mysql_database_name || !materialized_tables_list.contains(ddl_table_name))
if (table_id.database_name != mysql_database_name || !materialized_tables_list.contains(table_id.table_name))
{
LOG_DEBUG(log, "Skip MySQL DDL: \n {}", query_event.query);
LOG_DEBUG(log, "Skip MySQL DDL for {}.{}:\n{}", table_id.database_name, table_id.table_name, query);
return;
}
}

View File

@ -0,0 +1,185 @@
#include "config.h"
#include <gtest/gtest.h>
#include <Databases/MySQL/tryParseTableIDFromDDL.h>
using namespace DB;
struct ParseTableIDFromDDLTestCase
{
String query;
String database_name;
String table_name;
ParseTableIDFromDDLTestCase(
const String & query_,
const String & database_name_,
const String & table_name_)
: query(query_)
, database_name(database_name_)
, table_name(table_name_)
{
}
};
std::ostream & operator<<(std::ostream & ostr, const ParseTableIDFromDDLTestCase & test_case)
{
return ostr << '"' << test_case.query << "\" extracts `" << test_case.database_name << "`.`" << test_case.table_name << "`";
}
class ParseTableIDFromDDLTest : public ::testing::TestWithParam<ParseTableIDFromDDLTestCase>
{
};
TEST_P(ParseTableIDFromDDLTest, parse)
{
const auto & [query, expected_database_name, expected_table_name] = GetParam();
auto table_id = tryParseTableIDFromDDL(query, "default");
EXPECT_EQ(expected_database_name, table_id.database_name);
EXPECT_EQ(expected_table_name, table_id.table_name);
}
INSTANTIATE_TEST_SUITE_P(MaterializedMySQL, ParseTableIDFromDDLTest, ::testing::ValuesIn(std::initializer_list<ParseTableIDFromDDLTestCase>{
{
"SELECT * FROM db.table",
"",
""
},
{
"CREATE TEMPORARY TABLE db.table",
"db",
"table"
},
{
"CREATE TEMPORARY TABLE IF NOT EXISTS db.table",
"db",
"table"
},
{
"CREATE TEMPORARY TABLE table",
"default",
"table"
},
{
"CREATE TEMPORARY TABLE IF NOT EXISTS table",
"default",
"table"
},
{
"CREATE TABLE db.table",
"db",
"table"
},
{
"CREATE TABLE IF NOT EXISTS db.table",
"db",
"table"
},
{
"CREATE TABLE table",
"default",
"table"
},
{
"CREATE TABLE IF NOT EXISTS table",
"default",
"table"
},
{
"ALTER TABLE db.table",
"db",
"table"
},
{
"ALTER TABLE table",
"default",
"table"
},
{
"DROP TABLE db.table",
"db",
"table"
},
{
"DROP TABLE IF EXISTS db.table",
"db",
"table"
},
{
"DROP TABLE table",
"default",
"table"
},
{
"DROP TABLE IF EXISTS table",
"default",
"table"
},
{
"DROP TEMPORARY TABLE db.table",
"db",
"table"
},
{
"DROP TEMPORARY TABLE IF EXISTS db.table",
"db",
"table"
},
{
"DROP TEMPORARY TABLE table",
"default",
"table"
},
{
"DROP TEMPORARY TABLE IF EXISTS table",
"default",
"table"
},
{
"TRUNCATE db.table",
"db",
"table"
},
{
"TRUNCATE TABLE db.table",
"db",
"table"
},
{
"TRUNCATE table1",
"default",
"table1"
},
{
"TRUNCATE TABLE table",
"default",
"table"
},
{
"RENAME TABLE db.table",
"db",
"table"
},
{
"RENAME TABLE table",
"default",
"table"
},
{
"DROP DATABASE db",
"",
""
},
{
"DROP DATA`BASE db",
"",
""
},
{
"NOT A SQL",
"",
""
},
}));

View File

@ -0,0 +1,44 @@
#include <Databases/MySQL/tryParseTableIDFromDDL.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/CommonParsers.h>
#include <Parsers/ExpressionElementParsers.h>
namespace DB
{
StorageID tryParseTableIDFromDDL(const String & query, const String & default_database_name)
{
bool is_ddl = false;
Tokens tokens(query.data(), query.data() + query.size());
IParser::Pos pos(tokens, 0);
Expected expected;
if (ParserKeyword("CREATE TEMPORARY TABLE").ignore(pos, expected) || ParserKeyword("CREATE TABLE").ignore(pos, expected))
{
ParserKeyword("IF NOT EXISTS").ignore(pos, expected);
is_ddl = true;
}
else if (ParserKeyword("ALTER TABLE").ignore(pos, expected) || ParserKeyword("RENAME TABLE").ignore(pos, expected))
{
is_ddl = true;
}
else if (ParserKeyword("DROP TABLE").ignore(pos, expected) || ParserKeyword("DROP TEMPORARY TABLE").ignore(pos, expected))
{
ParserKeyword("IF EXISTS").ignore(pos, expected);
is_ddl = true;
}
else if (ParserKeyword("TRUNCATE").ignore(pos, expected))
{
ParserKeyword("TABLE").ignore(pos, expected);
is_ddl = true;
}
ASTPtr table;
if (!is_ddl || !ParserCompoundIdentifier(true).parse(pos, table, expected))
return StorageID::createEmpty();
auto table_id = table->as<ASTTableIdentifier>()->getTableId();
if (table_id.database_name.empty())
table_id.database_name = default_database_name;
return table_id;
}
}

View File

@ -0,0 +1,11 @@
#pragma once
#include <base/types.h>
#include <Storages/IStorage.h>
namespace DB
{
StorageID tryParseTableIDFromDDL(const String & query, const String & default_database_name);
}

View File

@ -257,7 +257,6 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory)
const auto & headers_prefix = settings_config_prefix + ".headers";
if (config.has(headers_prefix))
{
Poco::Util::AbstractConfiguration::Keys config_keys;
@ -297,7 +296,10 @@ void registerDictionarySourceHTTP(DictionarySourceFactory & factory)
auto context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix);
if (created_from_ddl)
{
context->getRemoteHostFilter().checkURL(Poco::URI(configuration.url));
context->getHTTPHeaderFilter().checkHeaders(configuration.header_entries);
}
return std::make_unique<HTTPDictionarySource>(dict_struct, configuration, credentials, sample_block, context);
};

View File

@ -27,7 +27,7 @@ void DiskSelector::assertInitialized() const
}
void DiskSelector::initialize(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context)
void DiskSelector::initialize(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, DiskValidator disk_validator)
{
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(config_prefix, keys);
@ -46,6 +46,9 @@ void DiskSelector::initialize(const Poco::Util::AbstractConfiguration & config,
auto disk_config_prefix = config_prefix + "." + disk_name;
if (disk_validator && !disk_validator(config, disk_config_prefix))
continue;
disks.emplace(disk_name, factory.create(disk_name, config, disk_config_prefix, context, disks));
}
if (!has_default_disk)

View File

@ -23,7 +23,8 @@ public:
DiskSelector() = default;
DiskSelector(const DiskSelector & from) = default;
void initialize(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context);
using DiskValidator = std::function<bool(const Poco::Util::AbstractConfiguration & config, const String & disk_config_prefix)>;
void initialize(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, DiskValidator disk_validator = {});
DiskSelectorPtr updateFromConfig(
const Poco::Util::AbstractConfiguration & config,
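A minimal sketch of the new validator hook (a hypothetical policy, assuming a Poco configuration object and a global context are already at hand): the validator is just a callable that receives the configuration and the disk's config prefix and decides whether the disk should be instantiated, as KeeperContext's `diskValidator` above does for Keeper.
```cpp
#include <memory>
#include <Disks/DiskSelector.h>
#include <Interpreters/Context_fwd.h>
#include <Poco/Util/AbstractConfiguration.h>

// Hypothetical: instantiate only disks whose <type> is "local", skip everything else.
std::shared_ptr<DB::DiskSelector> makeLocalOnlyDiskSelector(
    const Poco::Util::AbstractConfiguration & config, DB::ContextPtr context)
{
    auto selector = std::make_shared<DB::DiskSelector>();
    selector->initialize(
        config, "storage_configuration.disks", context,
        [](const Poco::Util::AbstractConfiguration & cfg, const std::string & disk_prefix)
        {
            return cfg.getString(disk_prefix + ".type", "local") == "local";
        });
    return selector;
}
```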

View File

@ -23,10 +23,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
TemporaryFileOnDisk::TemporaryFileOnDisk(const DiskPtr & disk_)
: TemporaryFileOnDisk(disk_, "")
{}
TemporaryFileOnDisk::TemporaryFileOnDisk(const DiskPtr & disk_, CurrentMetrics::Metric metric_scope)
: TemporaryFileOnDisk(disk_)
{

View File

@ -16,9 +16,8 @@ using DiskPtr = std::shared_ptr<IDisk>;
class TemporaryFileOnDisk
{
public:
explicit TemporaryFileOnDisk(const DiskPtr & disk_);
explicit TemporaryFileOnDisk(const DiskPtr & disk_, CurrentMetrics::Metric metric_scope);
explicit TemporaryFileOnDisk(const DiskPtr & disk_, const String & prefix);
explicit TemporaryFileOnDisk(const DiskPtr & disk_, const String & prefix = "tmp");
~TemporaryFileOnDisk();

View File

@ -19,6 +19,9 @@
namespace DB
{
static constexpr auto microsecond_multiplier = 1000000;
static constexpr auto millisecond_multiplier = 1000;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
@ -1377,6 +1380,36 @@ struct ToRelativeSecondNumImpl
using FactorTransform = ZeroTransform;
};
template <Int64 scale_multiplier>
struct ToRelativeSubsecondNumImpl
{
static constexpr auto name = "toRelativeSubsecondNumImpl";
static inline Int64 execute(const DateTime64 & t, DateTime64::NativeType scale, const DateLUTImpl &)
{
static_assert(scale_multiplier == 1000 || scale_multiplier == 1000000);
if (scale == scale_multiplier)
return t.value;
if (scale > scale_multiplier)
return t.value / (scale / scale_multiplier);
return t.value * (scale_multiplier / scale);
}
static inline Int64 execute(UInt32 t, const DateLUTImpl &)
{
return t * scale_multiplier;
}
static inline Int64 execute(Int32 d, const DateLUTImpl & time_zone)
{
return static_cast<Int64>(time_zone.fromDayNum(ExtendedDayNum(d))) * scale_multiplier;
}
static inline Int64 execute(UInt16 d, const DateLUTImpl & time_zone)
{
return static_cast<Int64>(time_zone.fromDayNum(DayNum(d)) * scale_multiplier);
}
using FactorTransform = ZeroTransform;
};
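The scale handling above is plain integer arithmetic. A standalone sketch (not ClickHouse code) of the same normalisation, with microseconds as the target scale:
```cpp
#include <cstdint>
#include <cstdio>

// Bring a DateTime64 raw value stored at `scale_multiplier` (10^scale)
// to a fixed target multiplier, the way ToRelativeSubsecondNumImpl does.
static int64_t normalizeToScale(int64_t value, int64_t scale_multiplier, int64_t target_multiplier)
{
    if (scale_multiplier == target_multiplier)
        return value;
    if (scale_multiplier > target_multiplier)
        return value / (scale_multiplier / target_multiplier);   // drop excess precision
    return value * (target_multiplier / scale_multiplier);       // pad missing precision
}

int main()
{
    // DateTime64(3) raw value 1234567 (1234.567 s) expressed in microseconds.
    std::printf("%lld\n", static_cast<long long>(normalizeToScale(1234567, 1000, 1000000)));           // 1234567000
    // DateTime64(9) raw value 1234567890 (1.234567890 s) expressed in microseconds.
    std::printf("%lld\n", static_cast<long long>(normalizeToScale(1234567890, 1000000000, 1000000)));  // 1234567
}
```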
struct ToYYYYMMImpl
{
static constexpr auto name = "toYYYYMM";
@ -1476,25 +1509,47 @@ struct ToYYYYMMDDhhmmssImpl
using FactorTransform = ZeroTransform;
};
struct DateTimeComponentsWithFractionalPart : public DateLUTImpl::DateTimeComponents
{
UInt16 millisecond;
UInt16 microsecond;
};
struct ToDateTimeComponentsImpl
{
static constexpr auto name = "toDateTimeComponents";
static inline DateLUTImpl::DateTimeComponents execute(Int64 t, const DateLUTImpl & time_zone)
static inline DateTimeComponentsWithFractionalPart execute(const DateTime64 & t, DateTime64::NativeType scale_multiplier, const DateLUTImpl & time_zone)
{
return time_zone.toDateTimeComponents(t);
auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier);
if (t.value < 0 && components.fractional)
{
components.fractional = scale_multiplier + (components.whole ? Int64(-1) : Int64(1)) * components.fractional;
--components.whole;
}
Int64 fractional = components.fractional;
if (scale_multiplier > microsecond_multiplier)
fractional = fractional / (scale_multiplier / microsecond_multiplier);
else if (scale_multiplier < microsecond_multiplier)
fractional = fractional * (microsecond_multiplier / scale_multiplier);
constexpr Int64 divider = microsecond_multiplier/ millisecond_multiplier;
UInt16 millisecond = static_cast<UInt16>(fractional / divider);
UInt16 microsecond = static_cast<UInt16>(fractional % divider);
return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(components.whole), millisecond, microsecond};
}
static inline DateLUTImpl::DateTimeComponents execute(UInt32 t, const DateLUTImpl & time_zone)
static inline DateTimeComponentsWithFractionalPart execute(UInt32 t, const DateLUTImpl & time_zone)
{
return time_zone.toDateTimeComponents(static_cast<DateLUTImpl::Time>(t));
return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(static_cast<DateLUTImpl::Time>(t)), 0, 0};
}
static inline DateLUTImpl::DateTimeComponents execute(Int32 d, const DateLUTImpl & time_zone)
static inline DateTimeComponentsWithFractionalPart execute(Int32 d, const DateLUTImpl & time_zone)
{
return time_zone.toDateTimeComponents(ExtendedDayNum(d));
return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(ExtendedDayNum(d)), 0, 0};
}
static inline DateLUTImpl::DateTimeComponents execute(UInt16 d, const DateLUTImpl & time_zone)
static inline DateTimeComponentsWithFractionalPart execute(UInt16 d, const DateLUTImpl & time_zone)
{
return time_zone.toDateTimeComponents(DayNum(d));
return DateTimeComponentsWithFractionalPart{time_zone.toDateTimeComponents(DayNum(d)), 0, 0};
}
using FactorTransform = ZeroTransform;
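The negative-value adjustment above amounts to a floor split of the raw value into whole seconds and a non-negative sub-second part. A standalone sketch of the same idea (plain C++, not the actual `DecimalUtils` helper):
```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // A DateTime64(3) raw value of -1123 means 1.123 s before the epoch, i.e. 1969-12-31 23:59:58.877 UTC.
    const int64_t value = -1123;
    const int64_t scale = 1000;                 // 10^3 for DateTime64(3)

    // Floor-divide so the fractional part is always non-negative.
    int64_t whole = value / scale;              // truncates toward zero: -1
    int64_t fractional = value % scale;         // -123
    if (fractional < 0)
    {
        fractional += scale;                    // -123 -> 877
        --whole;                                // -1   -> -2
    }

    // Normalise the sub-second part to microseconds and split it into ms / us, as the code above does.
    const int64_t micros = fractional * (1000000 / scale);                   // 877000
    std::printf("seconds=%lld millisecond=%lld microsecond=%lld\n",
                static_cast<long long>(whole),
                static_cast<long long>(micros / 1000),
                static_cast<long long>(micros % 1000));                      // seconds=-2 millisecond=877 microsecond=0
}
```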

View File

@ -79,28 +79,51 @@ namespace impl
UInt64 key1 = 0;
};
static SipHashKey parseSipHashKey(const ColumnWithTypeAndName & key)
struct SipHashKeyColumns
{
SipHashKey ret{};
ColumnPtr key0;
ColumnPtr key1;
bool is_const;
const auto * tuple = checkAndGetColumn<ColumnTuple>(key.column.get());
size_t size() const
{
assert(key0 && key1);
assert(key0->size() == key1->size());
return key0->size();
}
SipHashKey getKey(size_t i) const
{
if (is_const)
i = 0;
const auto & key0data = assert_cast<const ColumnUInt64 &>(*key0).getData();
const auto & key1data = assert_cast<const ColumnUInt64 &>(*key1).getData();
return {key0data[i], key1data[i]};
}
};
static SipHashKeyColumns parseSipHashKeyColumns(const ColumnWithTypeAndName & key)
{
const ColumnTuple * tuple = nullptr;
const auto * column = key.column.get();
bool is_const = false;
if (isColumnConst(*column))
{
is_const = true;
tuple = checkAndGetColumnConstData<ColumnTuple>(column);
}
else
tuple = checkAndGetColumn<ColumnTuple>(column);
if (!tuple)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "key must be a tuple");
if (tuple->tupleSize() != 2)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "wrong tuple size: key must be a tuple of 2 UInt64");
if (tuple->empty())
return ret;
if (const auto * key0col = checkAndGetColumn<ColumnUInt64>(&(tuple->getColumn(0))))
ret.key0 = key0col->get64(0);
else
SipHashKeyColumns ret{tuple->getColumnPtr(0), tuple->getColumnPtr(1), is_const};
assert(ret.key0);
if (!checkColumn<ColumnUInt64>(*ret.key0))
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "first element of the key tuple is not UInt64");
if (const auto * key1col = checkAndGetColumn<ColumnUInt64>(&(tuple->getColumn(1))))
ret.key1 = key1col->get64(0);
else
assert(ret.key1);
if (!checkColumn<ColumnUInt64>(*ret.key1))
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "second element of the key tuple is not UInt64");
return ret;
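The point of `SipHashKeyColumns` is that the `(k0, k1)` key no longer has to be constant: a non-const key tuple supplies a separate key per row, while a constant one keeps the old single-key behaviour. A standalone sketch of the lookup rule that `getKey` implements (not the actual class):
```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

struct KeyColumnsSketch
{
    std::vector<uint64_t> key0, key1;   // stand-ins for the two ColumnUInt64 of the key tuple
    bool is_const;

    std::pair<uint64_t, uint64_t> getKey(size_t row) const
    {
        if (is_const)
            row = 0;                    // a constant key column holds a single (k0, k1) pair
        return {key0[row], key1[row]};
    }
};

int main()
{
    const KeyColumnsSketch per_row{{1, 2, 3}, {10, 20, 30}, /*is_const=*/ false};
    assert(per_row.getKey(2) == std::make_pair(uint64_t{3}, uint64_t{30}));

    const KeyColumnsSketch constant{{7}, {8}, /*is_const=*/ true};
    assert(constant.getKey(2) == std::make_pair(uint64_t{7}, uint64_t{8}));
    return 0;
}
```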
@ -329,8 +352,10 @@ struct SipHash64KeyedImpl
static constexpr auto name = "sipHash64Keyed";
using ReturnType = UInt64;
using Key = impl::SipHashKey;
using KeyColumns = impl::SipHashKeyColumns;
static Key parseKey(const ColumnWithTypeAndName & key) { return impl::parseSipHashKey(key); }
static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); }
static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); }
static UInt64 applyKeyed(const Key & key, const char * begin, size_t size) { return sipHash64Keyed(key.key0, key.key1, begin, size); }
@ -371,8 +396,10 @@ struct SipHash128KeyedImpl
static constexpr auto name = "sipHash128Keyed";
using ReturnType = UInt128;
using Key = impl::SipHashKey;
using KeyColumns = impl::SipHashKeyColumns;
static Key parseKey(const ColumnWithTypeAndName & key) { return impl::parseSipHashKey(key); }
static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); }
static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); }
static UInt128 applyKeyed(const Key & key, const char * begin, size_t size) { return sipHash128Keyed(key.key0, key.key1, begin, size); }
@ -398,13 +425,43 @@ struct SipHash128ReferenceImpl
using ReturnType = UInt128;
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2); }
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, SipHash128ReferenceImpl>(h1, h2); }
static UInt128 apply(const char * data, const size_t size) { return sipHash128Reference(data, size); }
static constexpr bool use_int_hash_for_pods = false;
};
struct SipHash128ReferenceKeyedImpl
{
static constexpr auto name = "sipHash128ReferenceKeyed";
using ReturnType = UInt128;
using Key = impl::SipHashKey;
using KeyColumns = impl::SipHashKeyColumns;
static KeyColumns parseKeyColumns(const ColumnWithTypeAndName & key) { return impl::parseSipHashKeyColumns(key); }
static Key getKey(const KeyColumns & key, size_t i) { return key.getKey(i); }
static UInt128 applyKeyed(const Key & key, const char * begin, size_t size)
{
return sipHash128ReferenceKeyed(key.key0, key.key1, begin, size);
}
static UInt128 combineHashesKeyed(const Key & key, UInt128 h1, UInt128 h2)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
UInt128 tmp;
reverseMemcpy(&tmp, &h1, sizeof(UInt128));
h1 = tmp;
reverseMemcpy(&tmp, &h2, sizeof(UInt128));
h2 = tmp;
#endif
UInt128 hashes[] = {h1, h2};
return applyKeyed(key, reinterpret_cast<const char *>(hashes), 2 * sizeof(UInt128));
}
static constexpr bool use_int_hash_for_pods = false;
};
/** Why we need MurmurHash2?
* MurmurHash2 is an outdated hash function, superseded by MurmurHash3 and subsequently by CityHash, xxHash, HighwayHash.
@ -1023,7 +1080,7 @@ private:
DECLARE_MULTITARGET_CODE(
template <typename Impl, bool Keyed, typename KeyType>
template <typename Impl, bool Keyed, typename KeyType, typename KeyColumnsType>
class FunctionAnyHash : public IFunction
{
public:
@ -1033,9 +1090,12 @@ private:
using ToType = typename Impl::ReturnType;
template <typename FromType, bool first>
void executeIntType(const KeyType & key, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
void executeIntType(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
{
using ColVecType = ColumnVectorOrDecimal<FromType>;
KeyType key{};
if constexpr (Keyed)
key = Impl::getKey(key_cols, 0);
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
{
@ -1044,6 +1104,9 @@ private:
for (size_t i = 0; i < size; ++i)
{
ToType hash;
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
if constexpr (Impl::use_int_hash_for_pods)
{
@ -1077,6 +1140,14 @@ private:
}
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
{
if constexpr (Keyed)
{
if (!key_cols.is_const)
{
ColumnPtr full_column = col_from_const->convertToFullColumn();
return executeIntType<FromType, first>(key_cols, full_column.get(), vec_to);
}
}
auto value = col_from_const->template getValue<FromType>();
ToType hash;
@ -1107,8 +1178,15 @@ private:
if constexpr (first)
vec_to.assign(size, hash);
else
{
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
vec_to[i] = combineHashes(key, vec_to[i], hash);
}
}
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
@ -1116,9 +1194,12 @@ private:
}
template <typename FromType, bool first>
void executeBigIntType(const KeyType & key, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
void executeBigIntType(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
{
using ColVecType = ColumnVectorOrDecimal<FromType>;
KeyType key{};
if constexpr (Keyed)
key = Impl::getKey(key_cols, 0);
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
{
@ -1127,6 +1208,9 @@ private:
for (size_t i = 0; i < size; ++i)
{
ToType hash;
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
if constexpr (std::endian::native == std::endian::little)
hash = apply(key, reinterpret_cast<const char *>(&vec_from[i]), sizeof(vec_from[i]));
else
@ -1143,6 +1227,14 @@ private:
}
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
{
if constexpr (Keyed)
{
if (!key_cols.is_const)
{
ColumnPtr full_column = col_from_const->convertToFullColumn();
return executeBigIntType<FromType, first>(key_cols, full_column.get(), vec_to);
}
}
auto value = col_from_const->template getValue<FromType>();
ToType hash;
@ -1158,8 +1250,15 @@ private:
if constexpr (first)
vec_to.assign(size, hash);
else
{
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
vec_to[i] = combineHashes(key, vec_to[i], hash);
}
}
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
@ -1167,10 +1266,16 @@ private:
}
template <bool first>
void executeGeneric(const KeyType & key, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
void executeGeneric(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
{
KeyType key{};
if constexpr (Keyed)
key = Impl::getKey(key_cols, 0);
for (size_t i = 0, size = column->size(); i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
StringRef bytes = column->getDataAt(i);
const ToType hash = apply(key, bytes.data, bytes.size);
if constexpr (first)
@ -1181,8 +1286,11 @@ private:
}
template <bool first>
void executeString(const KeyType & key, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
void executeString(const KeyColumnsType & key_cols, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
{
KeyType key{};
if constexpr (Keyed)
key = Impl::getKey(key_cols, 0);
if (const ColumnString * col_from = checkAndGetColumn<ColumnString>(column))
{
const typename ColumnString::Chars & data = col_from->getChars();
@ -1192,6 +1300,9 @@ private:
ColumnString::Offset current_offset = 0;
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
const ToType hash = apply(key,
reinterpret_cast<const char *>(&data[current_offset]),
offsets[i] - current_offset - 1);
@ -1212,6 +1323,9 @@ private:
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
const ToType hash = apply(key, reinterpret_cast<const char *>(&data[i * n]), n);
if constexpr (first)
vec_to[i] = hash;
@ -1221,6 +1335,14 @@ private:
}
else if (const ColumnConst * col_from_const = checkAndGetColumnConstStringOrFixedString(column))
{
if constexpr (Keyed)
{
if (!key_cols.is_const)
{
ColumnPtr full_column = col_from_const->convertToFullColumn();
return executeString<first>(key_cols, full_column.get(), vec_to);
}
}
String value = col_from_const->getValue<String>();
const ToType hash = apply(key, value.data(), value.size());
const size_t size = vec_to.size();
@ -1228,8 +1350,15 @@ private:
if constexpr (first)
vec_to.assign(size, hash);
else
{
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
vec_to[i] = combineHashes(key, vec_to[i], hash);
}
}
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
@ -1237,7 +1366,7 @@ private:
}
template <bool first>
void executeArray(const KeyType & key, const IDataType * type, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
void executeArray(const KeyColumnsType & key_cols, const IDataType * type, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to) const
{
const IDataType * nested_type = typeid_cast<const DataTypeArray &>(*type).getNestedType().get();
@ -1249,13 +1378,19 @@ private:
typename ColumnVector<ToType>::Container vec_temp(nested_size);
bool nested_is_first = true;
executeForArgument(key, nested_type, nested_column, vec_temp, nested_is_first);
executeForArgument(key_cols, nested_type, nested_column, vec_temp, nested_is_first);
const size_t size = offsets.size();
ColumnArray::Offset current_offset = 0;
KeyType key{};
if constexpr (Keyed)
key = Impl::getKey(key_cols, 0);
for (size_t i = 0; i < size; ++i)
{
if constexpr (Keyed)
if (!key_cols.is_const && i != 0)
key = Impl::getKey(key_cols, i);
ColumnArray::Offset next_offset = offsets[i];
ToType hash;
@ -1279,7 +1414,7 @@ private:
{
/// NOTE: here, of course, you can do without the materialization of the column.
ColumnPtr full_column = col_from_const->convertToFullColumn();
executeArray<first>(key, type, full_column.get(), vec_to);
executeArray<first>(key_cols, type, full_column.get(), vec_to);
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
@ -1287,7 +1422,7 @@ private:
}
template <bool first>
void executeAny(const KeyType & key, const IDataType * from_type, const IColumn * icolumn, typename ColumnVector<ToType>::Container & vec_to) const
void executeAny(const KeyColumnsType & key_cols, const IDataType * from_type, const IColumn * icolumn, typename ColumnVector<ToType>::Container & vec_to) const
{
WhichDataType which(from_type);
@ -1295,40 +1430,45 @@ private:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Argument column '{}' size {} doesn't match result column size {} of function {}",
icolumn->getName(), icolumn->size(), vec_to.size(), getName());
if (which.isUInt8()) executeIntType<UInt8, first>(key, icolumn, vec_to);
else if (which.isUInt16()) executeIntType<UInt16, first>(key, icolumn, vec_to);
else if (which.isUInt32()) executeIntType<UInt32, first>(key, icolumn, vec_to);
else if (which.isUInt64()) executeIntType<UInt64, first>(key, icolumn, vec_to);
else if (which.isUInt128()) executeBigIntType<UInt128, first>(key, icolumn, vec_to);
else if (which.isUInt256()) executeBigIntType<UInt256, first>(key, icolumn, vec_to);
else if (which.isInt8()) executeIntType<Int8, first>(key, icolumn, vec_to);
else if (which.isInt16()) executeIntType<Int16, first>(key, icolumn, vec_to);
else if (which.isInt32()) executeIntType<Int32, first>(key, icolumn, vec_to);
else if (which.isInt64()) executeIntType<Int64, first>(key, icolumn, vec_to);
else if (which.isInt128()) executeBigIntType<Int128, first>(key, icolumn, vec_to);
else if (which.isInt256()) executeBigIntType<Int256, first>(key, icolumn, vec_to);
else if (which.isUUID()) executeBigIntType<UUID, first>(key, icolumn, vec_to);
else if (which.isIPv4()) executeIntType<IPv4, first>(key, icolumn, vec_to);
else if (which.isIPv6()) executeBigIntType<IPv6, first>(key, icolumn, vec_to);
else if (which.isEnum8()) executeIntType<Int8, first>(key, icolumn, vec_to);
else if (which.isEnum16()) executeIntType<Int16, first>(key, icolumn, vec_to);
else if (which.isDate()) executeIntType<UInt16, first>(key, icolumn, vec_to);
else if (which.isDate32()) executeIntType<Int32, first>(key, icolumn, vec_to);
else if (which.isDateTime()) executeIntType<UInt32, first>(key, icolumn, vec_to);
if constexpr (Keyed)
if ((!key_cols.is_const && key_cols.size() != vec_to.size())
|| (key_cols.is_const && key_cols.size() != 1))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Key column size {} doesn't match result column size {} of function {}", key_cols.size(), vec_to.size(), getName());
if (which.isUInt8()) executeIntType<UInt8, first>(key_cols, icolumn, vec_to);
else if (which.isUInt16()) executeIntType<UInt16, first>(key_cols, icolumn, vec_to);
else if (which.isUInt32()) executeIntType<UInt32, first>(key_cols, icolumn, vec_to);
else if (which.isUInt64()) executeIntType<UInt64, first>(key_cols, icolumn, vec_to);
else if (which.isUInt128()) executeBigIntType<UInt128, first>(key_cols, icolumn, vec_to);
else if (which.isUInt256()) executeBigIntType<UInt256, first>(key_cols, icolumn, vec_to);
else if (which.isInt8()) executeIntType<Int8, first>(key_cols, icolumn, vec_to);
else if (which.isInt16()) executeIntType<Int16, first>(key_cols, icolumn, vec_to);
else if (which.isInt32()) executeIntType<Int32, first>(key_cols, icolumn, vec_to);
else if (which.isInt64()) executeIntType<Int64, first>(key_cols, icolumn, vec_to);
else if (which.isInt128()) executeBigIntType<Int128, first>(key_cols, icolumn, vec_to);
else if (which.isInt256()) executeBigIntType<Int256, first>(key_cols, icolumn, vec_to);
else if (which.isUUID()) executeBigIntType<UUID, first>(key_cols, icolumn, vec_to);
else if (which.isIPv4()) executeIntType<IPv4, first>(key_cols, icolumn, vec_to);
else if (which.isIPv6()) executeBigIntType<IPv6, first>(key_cols, icolumn, vec_to);
else if (which.isEnum8()) executeIntType<Int8, first>(key_cols, icolumn, vec_to);
else if (which.isEnum16()) executeIntType<Int16, first>(key_cols, icolumn, vec_to);
else if (which.isDate()) executeIntType<UInt16, first>(key_cols, icolumn, vec_to);
else if (which.isDate32()) executeIntType<Int32, first>(key_cols, icolumn, vec_to);
else if (which.isDateTime()) executeIntType<UInt32, first>(key_cols, icolumn, vec_to);
/// TODO: executeIntType() for Decimal32/64 leads to incompatible result
else if (which.isDecimal32()) executeBigIntType<Decimal32, first>(key, icolumn, vec_to);
else if (which.isDecimal64()) executeBigIntType<Decimal64, first>(key, icolumn, vec_to);
else if (which.isDecimal128()) executeBigIntType<Decimal128, first>(key, icolumn, vec_to);
else if (which.isDecimal256()) executeBigIntType<Decimal256, first>(key, icolumn, vec_to);
else if (which.isFloat32()) executeIntType<Float32, first>(key, icolumn, vec_to);
else if (which.isFloat64()) executeIntType<Float64, first>(key, icolumn, vec_to);
else if (which.isString()) executeString<first>(key, icolumn, vec_to);
else if (which.isFixedString()) executeString<first>(key, icolumn, vec_to);
else if (which.isArray()) executeArray<first>(key, from_type, icolumn, vec_to);
else executeGeneric<first>(key, icolumn, vec_to);
else if (which.isDecimal32()) executeBigIntType<Decimal32, first>(key_cols, icolumn, vec_to);
else if (which.isDecimal64()) executeBigIntType<Decimal64, first>(key_cols, icolumn, vec_to);
else if (which.isDecimal128()) executeBigIntType<Decimal128, first>(key_cols, icolumn, vec_to);
else if (which.isDecimal256()) executeBigIntType<Decimal256, first>(key_cols, icolumn, vec_to);
else if (which.isFloat32()) executeIntType<Float32, first>(key_cols, icolumn, vec_to);
else if (which.isFloat64()) executeIntType<Float64, first>(key_cols, icolumn, vec_to);
else if (which.isString()) executeString<first>(key_cols, icolumn, vec_to);
else if (which.isFixedString()) executeString<first>(key_cols, icolumn, vec_to);
else if (which.isArray()) executeArray<first>(key_cols, from_type, icolumn, vec_to);
else executeGeneric<first>(key_cols, icolumn, vec_to);
}
void executeForArgument(const KeyType & key, const IDataType * type, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to, bool & is_first) const
void executeForArgument(const KeyColumnsType & key_cols, const IDataType * type, const IColumn * column, typename ColumnVector<ToType>::Container & vec_to, bool & is_first) const
{
/// Flattening of tuples.
if (const ColumnTuple * tuple = typeid_cast<const ColumnTuple *>(column))
@ -1337,7 +1477,7 @@ private:
const DataTypes & tuple_types = typeid_cast<const DataTypeTuple &>(*type).getElements();
size_t tuple_size = tuple_columns.size();
for (size_t i = 0; i < tuple_size; ++i)
executeForArgument(key, tuple_types[i].get(), tuple_columns[i].get(), vec_to, is_first);
executeForArgument(key_cols, tuple_types[i].get(), tuple_columns[i].get(), vec_to, is_first);
}
else if (const ColumnTuple * tuple_const = checkAndGetColumnConstData<ColumnTuple>(column))
{
@ -1347,24 +1487,24 @@ private:
for (size_t i = 0; i < tuple_size; ++i)
{
auto tmp = ColumnConst::create(tuple_columns[i], column->size());
executeForArgument(key, tuple_types[i].get(), tmp.get(), vec_to, is_first);
executeForArgument(key_cols, tuple_types[i].get(), tmp.get(), vec_to, is_first);
}
}
else if (const auto * map = checkAndGetColumn<ColumnMap>(column))
{
const auto & type_map = assert_cast<const DataTypeMap &>(*type);
executeForArgument(key, type_map.getNestedType().get(), map->getNestedColumnPtr().get(), vec_to, is_first);
executeForArgument(key_cols, type_map.getNestedType().get(), map->getNestedColumnPtr().get(), vec_to, is_first);
}
else if (const auto * const_map = checkAndGetColumnConst<ColumnMap>(column))
{
executeForArgument(key, type, const_map->convertToFullColumnIfConst().get(), vec_to, is_first);
executeForArgument(key_cols, type, const_map->convertToFullColumnIfConst().get(), vec_to, is_first);
}
else
{
if (is_first)
executeAny<true>(key, type, column, vec_to);
executeAny<true>(key_cols, type, column, vec_to);
else
executeAny<false>(key, type, column, vec_to);
executeAny<false>(key_cols, type, column, vec_to);
}
is_first = false;
@ -1395,30 +1535,33 @@ public:
{
auto col_to = ColumnVector<ToType>::create(input_rows_count);
typename ColumnVector<ToType>::Container & vec_to = col_to->getData();
/// If using a "keyed" algorithm, the first argument is the key and
/// the data starts from the second argument.
/// Otherwise there is no key and all arguments are interpreted as data.
constexpr size_t first_data_argument = Keyed;
if (arguments.size() <= first_data_argument)
if (input_rows_count != 0)
{
/// Return a fixed random-looking magic number when input is empty
vec_to.assign(input_rows_count, static_cast<ToType>(0xe28dbde7fe22e41c));
}
typename ColumnVector<ToType>::Container & vec_to = col_to->getData();
KeyType key{};
if constexpr (Keyed)
if (!arguments.empty())
key = Impl::parseKey(arguments[0]);
/// If using a "keyed" algorithm, the first argument is the key and
/// the data starts from the second argument.
/// Otherwise there is no key and all arguments are interpreted as data.
constexpr size_t first_data_argument = Keyed;
/// The function supports arbitrary number of arguments of arbitrary types.
bool is_first_argument = true;
for (size_t i = first_data_argument; i < arguments.size(); ++i)
{
const auto & col = arguments[i];
executeForArgument(key, col.type.get(), col.column.get(), vec_to, is_first_argument);
if (arguments.size() <= first_data_argument)
{
/// Return a fixed random-looking magic number when input is empty
vec_to.assign(input_rows_count, static_cast<ToType>(0xe28dbde7fe22e41c));
}
KeyColumnsType key_cols{};
if constexpr (Keyed)
if (!arguments.empty())
key_cols = Impl::parseKeyColumns(arguments[0]);
/// The function supports arbitrary number of arguments of arbitrary types.
bool is_first_argument = true;
for (size_t i = first_data_argument; i < arguments.size(); ++i)
{
const auto & col = arguments[i];
executeForArgument(key_cols, col.type.get(), col.column.get(), vec_to, is_first_argument);
}
}
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
@ -1450,17 +1593,19 @@ public:
) // DECLARE_MULTITARGET_CODE
template <typename Impl, bool Keyed = false, typename KeyType = char>
class FunctionAnyHash : public TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType>
template <typename Impl, bool Keyed = false, typename KeyType = char, typename KeyColumnsType = char>
class FunctionAnyHash : public TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>
{
public:
explicit FunctionAnyHash(ContextPtr context) : selector(context)
{
selector.registerImplementation<TargetArch::Default, TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType>>();
selector
.registerImplementation<TargetArch::Default, TargetSpecific::Default::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
#if USE_MULTITARGET_CODE
selector.registerImplementation<TargetArch::AVX2, TargetSpecific::AVX2::FunctionAnyHash<Impl, Keyed, KeyType>>();
selector.registerImplementation<TargetArch::AVX512F, TargetSpecific::AVX512F::FunctionAnyHash<Impl, Keyed, KeyType>>();
selector.registerImplementation<TargetArch::AVX2, TargetSpecific::AVX2::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
selector
.registerImplementation<TargetArch::AVX512F, TargetSpecific::AVX512F::FunctionAnyHash<Impl, Keyed, KeyType, KeyColumnsType>>();
#endif
}
@ -1696,7 +1841,7 @@ struct NameIntHash32 { static constexpr auto name = "intHash32"; };
struct NameIntHash64 { static constexpr auto name = "intHash64"; };
using FunctionSipHash64 = FunctionAnyHash<SipHash64Impl>;
using FunctionSipHash64Keyed = FunctionAnyHash<SipHash64KeyedImpl, true, SipHash64KeyedImpl::Key>;
using FunctionSipHash64Keyed = FunctionAnyHash<SipHash64KeyedImpl, true, SipHash64KeyedImpl::Key, SipHash64KeyedImpl::KeyColumns>;
using FunctionIntHash32 = FunctionIntHash<IntHash32Impl, NameIntHash32>;
using FunctionIntHash64 = FunctionIntHash<IntHash64Impl, NameIntHash64>;
#if USE_SSL
@ -1710,8 +1855,10 @@ using FunctionSHA384 = FunctionStringHashFixedString<SHA384Impl>;
using FunctionSHA512 = FunctionStringHashFixedString<SHA512Impl>;
#endif
using FunctionSipHash128 = FunctionAnyHash<SipHash128Impl>;
using FunctionSipHash128Keyed = FunctionAnyHash<SipHash128KeyedImpl, true, SipHash128KeyedImpl::Key>;
using FunctionSipHash128Keyed = FunctionAnyHash<SipHash128KeyedImpl, true, SipHash128KeyedImpl::Key, SipHash128KeyedImpl::KeyColumns>;
using FunctionSipHash128Reference = FunctionAnyHash<SipHash128ReferenceImpl>;
using FunctionSipHash128ReferenceKeyed
= FunctionAnyHash<SipHash128ReferenceKeyedImpl, true, SipHash128ReferenceKeyedImpl::Key, SipHash128ReferenceKeyedImpl::KeyColumns>;
using FunctionCityHash64 = FunctionAnyHash<ImplCityHash64>;
using FunctionFarmFingerprint64 = FunctionAnyHash<ImplFarmFingerprint64>;
using FunctionFarmHash64 = FunctionAnyHash<ImplFarmHash64>;

View File

@ -20,6 +20,11 @@ REGISTER_FUNCTION(Hashing)
.examples{{"hash", "SELECT hex(sipHash128Reference('foo', '\\x01', 3))", ""}},
.categories{"Hash"}
});
factory.registerFunction<FunctionSipHash128ReferenceKeyed>(FunctionDocumentation{
.description = "Same as [sipHash128Reference](#hash_functions-siphash128reference) but additionally takes an explicit key argument "
"instead of using a fixed key.",
.examples{{"hash", "SELECT hex(sipHash128ReferenceKeyed((506097522914230528, 1084818905618843912),'foo', '\\x01', 3));", ""}},
.categories{"Hash"}});
factory.registerFunction<FunctionCityHash64>();
factory.registerFunction<FunctionFarmFingerprint64>();
factory.registerFunction<FunctionFarmHash64>();
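The KeyColumnsType plumbing above lets the keyed SipHash variants take their (k0, k1) key from ordinary columns instead of requiring a constant. A hedged SQL sketch of both call styles; the table keys(k0 UInt64, k1 UInt64, s String) is hypothetical:

```sql
-- Constant key, as in the registered documentation example above:
SELECT hex(sipHash128ReferenceKeyed((506097522914230528, 1084818905618843912), 'foo'));

-- Per-row key read from columns, the case the !key_cols.is_const branches now handle
-- (keys(k0 UInt64, k1 UInt64, s String) is a hypothetical table):
SELECT sipHash64Keyed((k0, k1), s) FROM keys;
```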

View File

@ -7,8 +7,8 @@
namespace DB
{
/** URL processing functions. See implementation in separate .cpp files.
* All functions are not strictly follow RFC, instead they are maximally simplified for performance reasons.
/** These helpers are used by URL processing functions. See implementation in separate .cpp files.
* All functions do not strictly follow RFC, instead they are maximally simplified for performance reasons.
*
* Functions for extracting parts of a URL.
* If the URL has no such part, an empty string is returned.
@ -101,7 +101,7 @@ struct ExtractSubstringImpl
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by URL functions");
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function");
}
};
@ -156,7 +156,7 @@ struct CutSubstringImpl
static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
{
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by URL functions");
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Column of type FixedString is not supported by this function");
}
};

View File

@ -5,7 +5,7 @@
namespace DB
{
/** Tansform-type wrapper for DateTime64, simplifies DateTime64 support for given Transform.
/** Transform-type wrapper for DateTime64, simplifies DateTime64 support for given Transform.
*
* Depending on what overloads of Transform::execute() are available, when called with DateTime64 value,
* invokes Transform::execute() with either:
@ -80,7 +80,10 @@ public:
}
else
{
const auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier);
auto components = DecimalUtils::splitWithScaleMultiplier(t, scale_multiplier);
if (t.value < 0 && components.fractional)
--components.whole;
return wrapped_transform.execute(static_cast<Int64>(components.whole), std::forward<Args>(args)...);
}
}

View File

@ -1,8 +1,8 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <Functions/URL/FunctionsURL.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/StringHelpers.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>

View File

@ -1,7 +1,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/StringHelpers.h>
#include <base/find_symbols.h>
#include "FunctionsURL.h"
namespace DB
{

View File

@ -1,7 +1,7 @@
#pragma once
#include "FunctionsURL.h"
#include <base/find_symbols.h>
#include <Functions/StringHelpers.h>
namespace DB
{

View File

@ -1,7 +1,7 @@
#include <Common/StringUtils/StringUtils.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/URL/FunctionsURL.h>
#include <Functions/StringHelpers.h>
namespace DB
@ -154,4 +154,3 @@ REGISTER_FUNCTION(Netloc)
}
}

View File

@ -1,6 +1,6 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include "FunctionsURL.h"
#include <Functions/StringHelpers.h>
#include "path.h"
#include <base/find_symbols.h>

View File

@ -1,7 +1,7 @@
#pragma once
#include <base/find_symbols.h>
#include <Functions/URL/FunctionsURL.h>
#include <Functions/StringHelpers.h>
namespace DB

View File

@ -1,6 +1,6 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include "FunctionsURL.h"
#include <Functions/StringHelpers.h>
#include "path.h"
#include <base/find_symbols.h>

View File

@ -1,7 +1,7 @@
#pragma once
#include "FunctionsURL.h"
#include <Common/StringUtils/StringUtils.h>
#include <Functions/StringHelpers.h>
namespace DB
@ -54,4 +54,3 @@ struct ExtractProtocol
};
}

View File

@ -1,7 +1,7 @@
#pragma once
#include "FunctionsURL.h"
#include <base/find_symbols.h>
#include <Functions/StringHelpers.h>
namespace DB

View File

@ -1,7 +1,7 @@
#pragma once
#include "FunctionsURL.h"
#include <base/find_symbols.h>
#include <Functions/StringHelpers.h>
namespace DB
@ -34,4 +34,3 @@ struct ExtractQueryStringAndFragment
};
}

View File

@ -174,12 +174,13 @@ public:
{
auto res = static_cast<Int64>(transform_y.execute(y, timezone_y))
- static_cast<Int64>(transform_x.execute(x, timezone_x));
DateLUTImpl::DateTimeComponents a_comp;
DateLUTImpl::DateTimeComponents b_comp;
DateTimeComponentsWithFractionalPart a_comp;
DateTimeComponentsWithFractionalPart b_comp;
Int64 adjust_value;
auto x_seconds = TransformDateTime64<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(transform_x.getScaleMultiplier()).execute(x, timezone_x);
auto y_seconds = TransformDateTime64<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(transform_y.getScaleMultiplier()).execute(y, timezone_y);
if (x_seconds <= y_seconds)
auto x_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(transform_x.getScaleMultiplier()).execute(x, timezone_x);
auto y_microseconds = TransformDateTime64<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(transform_y.getScaleMultiplier()).execute(y, timezone_y);
if (x_microseconds <= y_microseconds)
{
a_comp = TransformDateTime64<ToDateTimeComponentsImpl>(transform_x.getScaleMultiplier()).execute(x, timezone_x);
b_comp = TransformDateTime64<ToDateTimeComponentsImpl>(transform_y.getScaleMultiplier()).execute(y, timezone_y);
@ -192,14 +193,16 @@ public:
adjust_value = 1;
}
if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeYearNumImpl<ResultPrecision::Extended>>>)
{
if ((a_comp.date.month > b_comp.date.month)
|| ((a_comp.date.month == b_comp.date.month) && ((a_comp.date.day > b_comp.date.day)
|| ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour)
|| ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))))
)))))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeQuarterNumImpl<ResultPrecision::Extended>>>)
@ -210,8 +213,9 @@ public:
|| ((x_month_in_quarter == y_month_in_quarter) && ((a_comp.date.day > b_comp.date.day)
|| ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour)
|| ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))))
)))))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeMonthNumImpl<ResultPrecision::Extended>>>)
@ -219,8 +223,9 @@ public:
if ((a_comp.date.day > b_comp.date.day)
|| ((a_comp.date.day == b_comp.date.day) && ((a_comp.time.hour > b_comp.time.hour)
|| ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second))))
)))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeWeekNumImpl<ResultPrecision::Extended>>>)
@ -230,25 +235,44 @@ public:
if ((x_day_of_week > y_day_of_week)
|| ((x_day_of_week == y_day_of_week) && (a_comp.time.hour > b_comp.time.hour))
|| ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeDayNumImpl<ResultPrecision::Extended>>>)
{
if ((a_comp.time.hour > b_comp.time.hour)
|| ((a_comp.time.hour == b_comp.time.hour) && ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeHourNumImpl<ResultPrecision::Extended>>>)
{
if ((a_comp.time.minute > b_comp.time.minute)
|| ((a_comp.time.minute == b_comp.time.minute) && (a_comp.time.second > b_comp.time.second)))
|| ((a_comp.time.minute == b_comp.time.minute) && ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeMinuteNumImpl<ResultPrecision::Extended>>>)
{
if (a_comp.time.second > b_comp.time.second)
if ((a_comp.time.second > b_comp.time.second)
|| ((a_comp.time.second == b_comp.time.second) && ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeSecondNumImpl<ResultPrecision::Extended>>>)
{
if ((a_comp.millisecond > b_comp.millisecond)
|| ((a_comp.millisecond == b_comp.millisecond) && (a_comp.microsecond > b_comp.microsecond)))
res += adjust_value;
}
else if constexpr (std::is_same_v<TransformX, TransformDateTime64<ToRelativeSubsecondNumImpl<1000>>>)
{
if (a_comp.microsecond > b_comp.microsecond)
res += adjust_value;
}
return res;
@ -373,6 +397,10 @@ public:
impl.template dispatchForColumns<ToRelativeMinuteNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
else if (unit == "second" || unit == "ss" || unit == "s")
impl.template dispatchForColumns<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
else if (unit == "millisecond" || unit == "ms")
impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<millisecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
else if (unit == "microsecond" || unit == "us" || unit == "u")
impl.template dispatchForColumns<ToRelativeSubsecondNumImpl<microsecond_multiplier>>(x, y, timezone_x, timezone_y, res->getData());
else
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} does not support '{}' unit", getName(), unit);

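The two new dispatch branches above accept 'millisecond'/'ms' and 'microsecond'/'us'/'u' units, and the comparison chains now break ties down to the microsecond component. A hedged sketch of the resulting behaviour:

```sql
-- Sub-second units added by the branches above; 250 is the expected straightforward difference.
SELECT dateDiff('millisecond',
                toDateTime64('2023-01-01 00:00:00.000', 3),
                toDateTime64('2023-01-01 00:00:00.250', 3));    -- expected: 250

SELECT dateDiff('microsecond',
                toDateTime64('2023-01-01 00:00:00.000000', 6),
                toDateTime64('2023-01-01 00:00:00.000250', 6)); -- expected: 250
```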
View File

@ -0,0 +1,42 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringToString.h>
#include <Functions/StringHelpers.h>
#include <base/find_symbols.h>
namespace DB
{
struct FirstLine
{
static size_t getReserveLengthForElement() { return 16; }
static void execute(Pos data, size_t size, Pos & res_data, size_t & res_size)
{
res_data = data;
const Pos end = data + size;
const Pos pos = find_first_symbols<'\r', '\n'>(data, end);
res_size = pos - data;
}
};
struct NameFirstLine
{
static constexpr auto name = "firstLine";
};
using FunctionFirstLine = FunctionStringToString<ExtractSubstringImpl<FirstLine>, NameFirstLine>;
REGISTER_FUNCTION(FirstLine)
{
factory.registerFunction<FunctionFirstLine>(FunctionDocumentation{
.description = "Returns the first line of a multi-line string.",
.syntax = "firstLine(string)",
.arguments = {{.name = "string", .description = "The string to process."}},
.returned_value = {"The first line of the string, or the whole string if there are no line separators."},
.examples = {
{.name = "Return first line", .query = "firstLine('Hello\\nWorld')", .result = "'Hello'"},
{.name = "Return whole string", .query = "firstLine('Hello World')", .result = "'Hello World'"},
}});
}
}
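For reference, the documented examples above in full-query form (behaviour exactly as described in the registration):

```sql
SELECT firstLine('Hello\nWorld');  -- 'Hello'
SELECT firstLine('Hello World');   -- 'Hello World', no line separator so the whole string is returned
```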

View File

@ -119,7 +119,7 @@ public:
if (!lhs_array->hasEqualOffsets(*rhs_array))
throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
"The argument 1 and argument {} of function {} have different array offsets",
"The argument 2 and argument {} of function {} have different array offsets",
i + 1,
getName());

View File

@ -10,7 +10,6 @@
#include <Functions/DateTimeTransforms.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <Functions/TransformDateTime64.h>
#include <IO/WriteHelpers.h>

View File

@ -97,6 +97,10 @@ UInt128 AsynchronousInsertQueue::InsertQuery::calculateHash() const
for (const auto & setting : settings.allChanged())
{
/// We don't consider this setting because it is only for deduplication,
/// which means we can put two inserts with different tokens in the same block safely.
if (setting.getName() == "insert_deduplication_token")
continue;
siphash.update(setting.getName());
applyVisitor(FieldVisitorHash(siphash), setting.getValue());
}
@ -111,9 +115,10 @@ bool AsynchronousInsertQueue::InsertQuery::operator==(const InsertQuery & other)
return query_str == other.query_str && settings == other.settings;
}
AsynchronousInsertQueue::InsertData::Entry::Entry(String && bytes_, String && query_id_, MemoryTracker * user_memory_tracker_)
AsynchronousInsertQueue::InsertData::Entry::Entry(String && bytes_, String && query_id_, const String & async_dedup_token_, MemoryTracker * user_memory_tracker_)
: bytes(std::move(bytes_))
, query_id(std::move(query_id_))
, async_dedup_token(async_dedup_token_)
, user_memory_tracker(user_memory_tracker_)
, create_time(std::chrono::system_clock::now())
{
@ -227,7 +232,7 @@ AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
/// to avoid buffering of huge amount of data in memory.
auto read_buf = getReadBufferFromASTInsertQuery(query);
LimitReadBuffer limit_buf(*read_buf, settings.async_insert_max_data_size, /* trow_exception */ false, /* exact_limit */ {});
LimitReadBuffer limit_buf(*read_buf, settings.async_insert_max_data_size, /* throw_exception */ false, /* exact_limit */ {});
WriteBufferFromString write_buf(bytes);
copyData(limit_buf, write_buf);
@ -253,7 +258,7 @@ AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
if (auto quota = query_context->getQuota())
quota->used(QuotaType::WRITTEN_BYTES, bytes.size());
auto entry = std::make_shared<InsertData::Entry>(std::move(bytes), query_context->getCurrentQueryId(), CurrentThread::getUserMemoryTracker());
auto entry = std::make_shared<InsertData::Entry>(std::move(bytes), query_context->getCurrentQueryId(), settings.insert_deduplication_token, CurrentThread::getUserMemoryTracker());
InsertQuery key{query, settings};
InsertDataPtr data_to_process;
@ -517,7 +522,7 @@ try
StreamingFormatExecutor executor(header, format, std::move(on_error), std::move(adding_defaults_transform));
std::unique_ptr<ReadBuffer> last_buffer;
auto chunk_info = std::make_shared<ChunkOffsets>();
auto chunk_info = std::make_shared<AsyncInsertInfo>();
for (const auto & entry : data->entries)
{
auto buffer = std::make_unique<ReadBufferFromString>(entry->bytes);
@ -526,6 +531,7 @@ try
size_t num_rows = executor.execute(*buffer);
total_rows += num_rows;
chunk_info->offsets.push_back(total_rows);
chunk_info->tokens.push_back(entry->async_dedup_token);
/// Keep buffer, because it still can be used
/// in destructor, while resetting buffer at next iteration.
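With the async_dedup_token recorded per entry above (and turned into a block ID in the replicated sink changes further below), insert_deduplication_token is now honoured for asynchronous inserts as well. A hedged usage sketch against a hypothetical ReplicatedMergeTree table t:

```sql
SET async_insert = 1, wait_for_async_insert = 1;

-- Both statements carry the same token, so the second should be deduplicated
-- even though both entries land in the same asynchronous insert block.
INSERT INTO t SETTINGS insert_deduplication_token = 'batch-42' VALUES (1);
INSERT INTO t SETTINGS insert_deduplication_token = 'batch-42' VALUES (1);
```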

View File

@ -69,10 +69,11 @@ private:
public:
String bytes;
const String query_id;
const String async_dedup_token;
MemoryTracker * const user_memory_tracker;
const std::chrono::time_point<std::chrono::system_clock> create_time;
Entry(String && bytes_, String && query_id_, MemoryTracker * user_memory_tracker_);
Entry(String && bytes_, String && query_id_, const String & async_dedup_token, MemoryTracker * user_memory_tracker_);
void finish(std::exception_ptr exception_ = nullptr);
std::future<void> getFuture() { return promise.get_future(); }

View File

@ -98,6 +98,7 @@
#include <Common/logger_useful.h>
#include <base/EnumReflection.h>
#include <Common/RemoteHostFilter.h>
#include <Common/HTTPHeaderFilter.h>
#include <Interpreters/AsynchronousInsertQueue.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Interpreters/JIT/CompiledExpressionCache.h>
@ -327,9 +328,10 @@ struct ContextSharedPart : boost::noncopyable
OrdinaryBackgroundExecutorPtr fetch_executor;
OrdinaryBackgroundExecutorPtr common_executor;
RemoteHostFilter remote_host_filter; /// Allowed URL from config.xml
RemoteHostFilter remote_host_filter; /// Allowed URL from config.xml
HTTPHeaderFilter http_header_filter; /// Forbidden HTTP headers from config.xml
std::optional<TraceCollector> trace_collector; /// Thread collecting traces from threads executing queries
std::optional<TraceCollector> trace_collector; /// Thread collecting traces from threads executing queries
/// Clusters for distributed tables
/// Initialized on demand (on distributed storages initialization) since Settings should be initialized
@ -2963,6 +2965,16 @@ const RemoteHostFilter & Context::getRemoteHostFilter() const
return shared->remote_host_filter;
}
void Context::setHTTPHeaderFilter(const Poco::Util::AbstractConfiguration & config)
{
shared->http_header_filter.setValuesFromConfig(config);
}
const HTTPHeaderFilter & Context::getHTTPHeaderFilter() const
{
return shared->http_header_filter;
}
UInt16 Context::getTCPPort() const
{
auto lock = getLock();

View File

@ -6,6 +6,7 @@
#include <Common/isLocalAddress.h>
#include <Common/MultiVersion.h>
#include <Common/RemoteHostFilter.h>
#include <Common/HTTPHeaderFilter.h>
#include <Common/ThreadPool_fwd.h>
#include <Common/Throttler_fwd.h>
#include <Core/NamesAndTypes.h>
@ -766,6 +767,10 @@ public:
void setRemoteHostFilter(const Poco::Util::AbstractConfiguration & config);
const RemoteHostFilter & getRemoteHostFilter() const;
/// Storage of forbidden HTTP headers from config.xml
void setHTTPHeaderFilter(const Poco::Util::AbstractConfiguration & config);
const HTTPHeaderFilter & getHTTPHeaderFilter() const;
/// The port that the server listens for executing SQL queries.
UInt16 getTCPPort() const;

View File

@ -118,7 +118,10 @@ bool PredicateExpressionsOptimizer::tryRewritePredicatesToTables(ASTs & tables_e
if (table_element->table_join && isLeft(table_element->table_join->as<ASTTableJoin>()->kind))
continue; /// Skip right table optimization
if (table_element->table_join && isFull(table_element->table_join->as<ASTTableJoin>()->kind))
if (table_element->table_join && (
isFull(table_element->table_join->as<ASTTableJoin>()->kind)
|| table_element->table_join->as<ASTTableJoin>()->strictness == JoinStrictness::Asof
|| table_element->table_join->as<ASTTableJoin>()->strictness == JoinStrictness::Anti))
break; /// Skip left and right table optimization
is_rewrite_tables |= tryRewritePredicatesToTable(tables_element[table_pos], tables_predicates[table_pos],
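The extended condition above also disables predicate pushdown for ASOF and ANTI joins, since rewriting the WHERE clause into the joined table can change which row the join selects. A hedged illustration with hypothetical tables trades and quotes:

```sql
-- After this change the q.price predicate stays above the join rather than being pushed
-- into quotes, where it could alter which quote is the closest ASOF match.
SELECT t.symbol, t.ts, q.price
FROM trades AS t
ASOF JOIN quotes AS q ON t.symbol = q.symbol AND t.ts >= q.ts
WHERE q.price > 100;
```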

View File

@ -114,16 +114,20 @@ private:
using Chunks = std::vector<Chunk>;
/// ChunkOffsets marks offsets of different sub-chunks, which will be used by async inserts.
class ChunkOffsets : public ChunkInfo
/// AsyncInsert needs two kinds of information:
/// - offsets of different sub-chunks
/// - tokens of different sub-chunks, which are assigned by setting `insert_deduplication_token`.
class AsyncInsertInfo : public ChunkInfo
{
public:
ChunkOffsets() = default;
explicit ChunkOffsets(const std::vector<size_t> & offsets_) : offsets(offsets_) {}
AsyncInsertInfo() = default;
explicit AsyncInsertInfo(const std::vector<size_t> & offsets_, const std::vector<String> & tokens_) : offsets(offsets_), tokens(tokens_) {}
std::vector<size_t> offsets;
std::vector<String> tokens;
};
using ChunkOffsetsPtr = std::shared_ptr<ChunkOffsets>;
using AsyncInsertInfoPtr = std::shared_ptr<AsyncInsertInfo>;
/// Extension to support delayed defaults. AddingDefaultsProcessor uses it to replace missing values with column defaults.
class ChunkMissingValues : public ChunkInfo

View File

@ -75,7 +75,7 @@ public:
{
if (!allow_missing_columns)
throw Exception(
ErrorCodes::THERE_IS_NO_COLUMN, "Not found field({}) in arrow schema:{}.", named_col.name, schema.ToString());
ErrorCodes::THERE_IS_NO_COLUMN, "Not found field ({}) in the following Arrow schema:\n{}\n", named_col.name, schema.ToString());
else
continue;
}
@ -168,4 +168,3 @@ private:
};
}
#endif

View File

@ -52,6 +52,8 @@
#include <Poco/Buffer.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Parser.h>
#include <Poco/Net/HTTPBasicCredentials.h>
#include <Poco/Net/HTTPCredentials.h>
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Poco/URI.h>
@ -934,6 +936,27 @@ private:
Poco::Net::HTTPRequest request(Poco::Net::HTTPRequest::HTTP_GET, url.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1);
request.setHost(url.getHost());
if (!url.getUserInfo().empty())
{
Poco::Net::HTTPCredentials http_credentials;
Poco::Net::HTTPBasicCredentials http_basic_credentials;
http_credentials.fromUserInfo(url.getUserInfo());
std::string decoded_username;
Poco::URI::decode(http_credentials.getUsername(), decoded_username);
http_basic_credentials.setUsername(decoded_username);
if (!http_credentials.getPassword().empty())
{
std::string decoded_password;
Poco::URI::decode(http_credentials.getPassword(), decoded_password);
http_basic_credentials.setPassword(decoded_password);
}
http_basic_credentials.authenticate(request);
}
auto session = makePooledHTTPSession(url, timeouts, 1);
session->sendRequest(request);
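The block above extracts credentials from the registry URL's userinfo and attaches them as HTTP Basic auth when fetching Avro schemas. A hedged sketch; host, port, credentials and the table name are placeholders:

```sql
-- Credentials embedded in the schema registry URL are now sent with each schema request.
SET format_avro_schema_registry_url = 'http://reader:secret@schema-registry.example:8081';
SELECT * FROM kafka_avro_confluent_stream LIMIT 10;  -- hypothetical table consuming AvroConfluent messages
```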

View File

@ -13,7 +13,8 @@ namespace ErrorCodes
extern const int CANNOT_SKIP_UNKNOWN_FIELD;
}
BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
template <bool with_defaults>
BinaryRowInputFormat<with_defaults>::BinaryRowInputFormat(ReadBuffer & in_, const Block & header, Params params_, bool with_names_, bool with_types_, const FormatSettings & format_settings_)
: RowInputFormatWithNamesAndTypes(
header,
in_,
@ -22,16 +23,17 @@ BinaryRowInputFormat::BinaryRowInputFormat(ReadBuffer & in_, const Block & heade
with_names_,
with_types_,
format_settings_,
std::make_unique<BinaryFormatReader>(in_, format_settings_))
std::make_unique<BinaryFormatReader<with_defaults>>(in_, format_settings_))
{
}
BinaryFormatReader::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
template <bool with_defaults>
BinaryFormatReader<with_defaults>::BinaryFormatReader(ReadBuffer & in_, const FormatSettings & format_settings_) : FormatWithNamesAndTypesReader(in_, format_settings_)
{
}
std::vector<String> BinaryFormatReader::readHeaderRow()
template <bool with_defaults>
std::vector<String> BinaryFormatReader<with_defaults>::readHeaderRow()
{
std::vector<String> fields;
String field;
@ -43,13 +45,15 @@ std::vector<String> BinaryFormatReader::readHeaderRow()
return fields;
}
std::vector<String> BinaryFormatReader::readNames()
template <bool with_defaults>
std::vector<String> BinaryFormatReader<with_defaults>::readNames()
{
readVarUInt(read_columns, *in);
return readHeaderRow();
}
std::vector<String> BinaryFormatReader::readTypes()
template <bool with_defaults>
std::vector<String> BinaryFormatReader<with_defaults>::readTypes()
{
auto types = readHeaderRow();
for (const auto & type_name : types)
@ -57,26 +61,40 @@ std::vector<String> BinaryFormatReader::readTypes()
return types;
}
bool BinaryFormatReader::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
template <bool with_defaults>
bool BinaryFormatReader<with_defaults>::readField(IColumn & column, const DataTypePtr & /*type*/, const SerializationPtr & serialization, bool /*is_last_file_column*/, const String & /*column_name*/)
{
if constexpr (with_defaults)
{
UInt8 is_default;
readBinary(is_default, *in);
if (is_default)
{
column.insertDefault();
return false;
}
}
serialization->deserializeBinary(column, *in, format_settings);
return true;
}
void BinaryFormatReader::skipHeaderRow()
template <bool with_defaults>
void BinaryFormatReader<with_defaults>::skipHeaderRow()
{
String tmp;
for (size_t i = 0; i < read_columns; ++i)
readStringBinary(tmp, *in);
}
void BinaryFormatReader::skipNames()
template <bool with_defaults>
void BinaryFormatReader<with_defaults>::skipNames()
{
readVarUInt(read_columns, *in);
skipHeaderRow();
}
void BinaryFormatReader::skipTypes()
template <bool with_defaults>
void BinaryFormatReader<with_defaults>::skipTypes()
{
if (read_columns == 0)
{
@ -87,7 +105,8 @@ void BinaryFormatReader::skipTypes()
skipHeaderRow();
}
void BinaryFormatReader::skipField(size_t file_column)
template <bool with_defaults>
void BinaryFormatReader<with_defaults>::skipField(size_t file_column)
{
if (file_column >= read_data_types.size())
throw Exception(ErrorCodes::CANNOT_SKIP_UNKNOWN_FIELD,
@ -111,12 +130,21 @@ void registerInputFormatRowBinary(FormatFactory & factory)
const IRowInputFormat::Params & params,
const FormatSettings & settings)
{
return std::make_shared<BinaryRowInputFormat>(buf, sample, params, with_names, with_types, settings);
return std::make_shared<BinaryRowInputFormat<false>>(buf, sample, params, with_names, with_types, settings);
});
};
registerWithNamesAndTypes("RowBinary", register_func);
factory.registerFileExtension("bin", "RowBinary");
factory.registerInputFormat("RowBinaryWithDefaults", [](
ReadBuffer & buf,
const Block & sample,
const IRowInputFormat::Params & params,
const FormatSettings & settings)
{
return std::make_shared<BinaryRowInputFormat<true>>(buf, sample, params, false, false, settings);
});
}
void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
@ -125,6 +153,8 @@ void registerRowBinaryWithNamesAndTypesSchemaReader(FormatFactory & factory)
{
return std::make_shared<BinaryWithNamesAndTypesSchemaReader>(buf, settings);
});
}
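The registration above wires up the RowBinaryWithDefaults input format: each value is preceded by one byte, and a non-zero byte means the column default is used instead of reading a value. A hedged sketch with the format table function; the hex payload and the DEFAULT clause in the structure string are assumptions for illustration:

```sql
-- x: prefix 0x01 -> take DEFAULT 42; y: prefix 0x00 followed by UInt32 little-endian 1.
SELECT * FROM format('RowBinaryWithDefaults', 'x UInt32 DEFAULT 42, y UInt32', x'010001000000');
-- expected: x = 42, y = 1
```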

View File

@ -12,6 +12,7 @@ class ReadBuffer;
/** A stream for inputting data in a binary line-by-line format.
*/
template <bool with_defaults = false>
class BinaryRowInputFormat final : public RowInputFormatWithNamesAndTypes
{
public:
@ -25,6 +26,7 @@ public:
std::string getDiagnosticInfo() override { return {}; }
};
template <bool with_defaults = false>
class BinaryFormatReader final : public FormatWithNamesAndTypesReader
{
public:
@ -54,7 +56,7 @@ public:
BinaryWithNamesAndTypesSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);
private:
BinaryFormatReader reader;
BinaryFormatReader<false> reader;
};
}

View File

@ -64,23 +64,131 @@ namespace ErrorCodes
}
namespace
{
/// Forward-declared to use in LSWithFoldedRegexpMatching w/o circular dependency.
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match);
/*
 * When `{...}` contains any `/`s, it must be processed in a different way:
 * normally a path with globs is handled by LSWithRegexpMatching, but once it detects a multi-dir glob {.../..., .../...},
 * LSWithFoldedRegexpMatching takes over.
 * It works a bit differently: it still recursively walks subdirectories, but does not match every directory against the glob.
 * Instead, it descends several levels (until the approximate max_depth is reached) and compares the accumulated multi-dir path to the glob.
 * StorageFile.cpp has the same logic.
 */
std::vector<StorageHDFS::PathWithInfo> LSWithFoldedRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & processed_suffix,
const String & suffix_with_globs,
re2::RE2 & matcher,
const size_t max_depth,
const size_t next_slash_after_glob_pos)
{
/// We don't need to descend further into a directory once max_depth is reached;
/// it is an upper bound on the depth, obtained by simply counting the `/`s inside the curly braces.
if (!max_depth)
return {};
HDFSFileInfo ls;
ls.file_info = hdfsListDirectory(fs.get(), path_for_ls.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
{
// Ignore the 'file not found' error but rethrow any other exception; libhdfs3 has no function to get the exception type, so check errno.
throw Exception(
ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", path_for_ls, String(hdfsGetLastError()));
}
std::vector<StorageHDFS::PathWithInfo> result;
if (!ls.file_info && ls.length > 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null");
for (int i = 0; i < ls.length; ++i)
{
const String full_path = String(ls.file_info[i].mName);
const size_t last_slash = full_path.rfind('/');
const String dir_or_file_name = full_path.substr(last_slash);
const bool is_directory = ls.file_info[i].mKind == 'D';
if (re2::RE2::FullMatch(processed_suffix + dir_or_file_name, matcher))
{
if (next_slash_after_glob_pos == std::string::npos)
{
result.emplace_back(
String(ls.file_info[i].mName),
StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast<size_t>(ls.file_info[i].mSize)});
}
else
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(
fs::path(full_path) / "" , fs, suffix_with_globs.substr(next_slash_after_glob_pos));
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
}
else if (is_directory)
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithFoldedRegexpMatching(
fs::path(full_path), fs, processed_suffix + dir_or_file_name,
suffix_with_globs, matcher, max_depth - 1, next_slash_after_glob_pos);
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
}
return result;
}
/* Recursive directory listing with matched paths as a result.
* StorageFile has the same method.
*/
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls, const HDFSFSPtr & fs, const String & for_match)
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(
const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match)
{
const size_t first_glob = for_match.find_first_of("*?{");
const size_t first_glob_pos = for_match.find_first_of("*?{");
const bool has_glob = first_glob_pos != std::string::npos;
const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/');
const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/');
const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
const size_t next_slash = suffix_with_globs.find('/', 1);
re2::RE2 matcher(makeRegexpPatternFromGlobs(suffix_with_globs.substr(0, next_slash)));
size_t slashes_in_glob = 0;
const size_t next_slash_after_glob_pos = [&]()
{
if (!has_glob)
return suffix_with_globs.find('/', 1);
size_t in_curly = 0;
for (std::string::const_iterator it = ++suffix_with_globs.begin(); it != suffix_with_globs.end(); it++)
{
if (*it == '{')
++in_curly;
else if (*it == '/')
{
if (in_curly)
++slashes_in_glob;
else
return size_t(std::distance(suffix_with_globs.begin(), it));
}
else if (*it == '}')
--in_curly;
}
return std::string::npos;
}();
const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos);
re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob));
if (!matcher.ok())
throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
"Cannot compile regex from glob ({}): {}", for_match, matcher.error());
if (slashes_in_glob)
{
return LSWithFoldedRegexpMatching(fs::path(prefix_without_globs), fs, "", suffix_with_globs,
matcher, slashes_in_glob, next_slash_after_glob_pos);
}
HDFSFileInfo ls;
ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
@ -97,7 +205,7 @@ namespace
const String full_path = String(ls.file_info[i].mName);
const size_t last_slash = full_path.rfind('/');
const String file_name = full_path.substr(last_slash);
const bool looking_for_directory = next_slash != std::string::npos;
const bool looking_for_directory = next_slash_after_glob_pos != std::string::npos;
const bool is_directory = ls.file_info[i].mKind == 'D';
/// Condition with type of current file_info means what kind of path is it in current iteration of ls
if (!is_directory && !looking_for_directory)
@ -111,7 +219,7 @@ namespace
{
if (re2::RE2::FullMatch(file_name, matcher))
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash));
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, suffix_with_globs.substr(next_slash_after_glob_pos));
/// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check.
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
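The listing helpers above add support for globs whose {...} alternatives contain '/', i.e. patterns that fan out across whole subdirectory trees. A hedged usage sketch with the hdfs table function; the URI is a placeholder and the structure argument is omitted on the assumption that schema inference is enabled:

```sql
-- {2023/01,2023/02} spans directories, which LSWithFoldedRegexpMatching now expands.
SELECT count()
FROM hdfs('hdfs://namenode:9000/data/{2023/01,2023/02}/*.parquet', 'Parquet');
```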

View File

@ -171,23 +171,23 @@ void MergeTreeDataWriter::TemporaryPart::finalize()
projection->getDataPartStorage().precommitTransaction();
}
std::vector<ChunkOffsetsPtr> scatterOffsetsBySelector(ChunkOffsetsPtr chunk_offsets, const IColumn::Selector & selector, size_t partition_num)
std::vector<AsyncInsertInfoPtr> scatterAsyncInsertInfoBySelector(AsyncInsertInfoPtr async_insert_info, const IColumn::Selector & selector, size_t partition_num)
{
if (nullptr == chunk_offsets)
if (nullptr == async_insert_info)
{
return {};
}
if (selector.empty())
{
return {chunk_offsets};
return {async_insert_info};
}
std::vector<ChunkOffsetsPtr> result(partition_num);
std::vector<AsyncInsertInfoPtr> result(partition_num);
std::vector<Int64> last_row_for_partition(partition_num, -1);
size_t offset_idx = 0;
for (size_t i = 0; i < selector.size(); ++i)
{
++last_row_for_partition[selector[i]];
if (i + 1 == chunk_offsets->offsets[offset_idx])
if (i + 1 == async_insert_info->offsets[offset_idx])
{
for (size_t part_id = 0; part_id < last_row_for_partition.size(); ++part_id)
{
@ -196,9 +196,12 @@ std::vector<ChunkOffsetsPtr> scatterOffsetsBySelector(ChunkOffsetsPtr chunk_offs
continue;
size_t offset = static_cast<size_t>(last_row + 1);
if (result[part_id] == nullptr)
result[part_id] = std::make_shared<ChunkOffsets>();
result[part_id] = std::make_shared<AsyncInsertInfo>();
if (result[part_id]->offsets.empty() || offset > *result[part_id]->offsets.rbegin())
{
result[part_id]->offsets.push_back(offset);
result[part_id]->tokens.push_back(async_insert_info->tokens[offset_idx]);
}
}
++offset_idx;
}
@ -207,7 +210,7 @@ std::vector<ChunkOffsetsPtr> scatterOffsetsBySelector(ChunkOffsetsPtr chunk_offs
}
BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, ChunkOffsetsPtr chunk_offsets)
const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, AsyncInsertInfoPtr async_insert_info)
{
BlocksWithPartition result;
if (!block || !block.rows())
@ -218,8 +221,11 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
if (!metadata_snapshot->hasPartitionKey()) /// Table is not partitioned.
{
result.emplace_back(Block(block), Row{});
if (chunk_offsets != nullptr)
result[0].offsets = std::move(chunk_offsets->offsets);
if (async_insert_info != nullptr)
{
result[0].offsets = std::move(async_insert_info->offsets);
result[0].tokens = std::move(async_insert_info->tokens);
}
return result;
}
@ -236,7 +242,7 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
IColumn::Selector selector;
buildScatterSelector(partition_columns, partition_num_to_first_row, selector, max_parts);
auto chunk_offsets_with_partition = scatterOffsetsBySelector(chunk_offsets, selector, partition_num_to_first_row.size());
auto async_insert_info_with_partition = scatterAsyncInsertInfoBySelector(async_insert_info, selector, partition_num_to_first_row.size());
size_t partitions_count = partition_num_to_first_row.size();
result.reserve(partitions_count);
@ -255,8 +261,11 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
/// NOTE: returning a copy of the original block so that calculated partition key columns
/// do not interfere with possible calculated primary key columns of the same name.
result.emplace_back(Block(block), get_partition(0));
if (!chunk_offsets_with_partition.empty())
result[0].offsets = std::move(chunk_offsets_with_partition[0]->offsets);
if (!async_insert_info_with_partition.empty())
{
result[0].offsets = std::move(async_insert_info_with_partition[0]->offsets);
result[0].tokens = std::move(async_insert_info_with_partition[0]->tokens);
}
return result;
}
@ -270,8 +279,11 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
result[i].block.getByPosition(col).column = std::move(scattered[i]);
}
for (size_t i = 0; i < chunk_offsets_with_partition.size(); ++i)
result[i].offsets = std::move(chunk_offsets_with_partition[i]->offsets);
for (size_t i = 0; i < async_insert_info_with_partition.size(); ++i)
{
result[i].offsets = std::move(async_insert_info_with_partition[i]->offsets);
result[i].tokens = std::move(async_insert_info_with_partition[i]->tokens);
}
return result;
}

View File

@ -23,14 +23,15 @@ struct BlockWithPartition
Block block;
Row partition;
std::vector<size_t> offsets;
std::vector<String> tokens;
BlockWithPartition(Block && block_, Row && partition_)
: block(block_), partition(std::move(partition_))
{
}
BlockWithPartition(Block && block_, Row && partition_, std::vector<size_t> && offsets_)
: block(block_), partition(std::move(partition_)), offsets(std::move(offsets_))
BlockWithPartition(Block && block_, Row && partition_, std::vector<size_t> && offsets_, std::vector<String> && tokens_)
: block(block_), partition(std::move(partition_)), offsets(std::move(offsets_)), tokens(std::move(tokens_))
{
}
};
@ -51,7 +52,7 @@ public:
* (split rows by partition)
* Works deterministically: if same block was passed, function will return same result in same order.
*/
static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, ChunkOffsetsPtr chunk_offsets = nullptr);
static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, AsyncInsertInfoPtr async_insert_info = nullptr);
/// This structure contains not completely written temporary part.
/// Some writes may happen asynchronously, e.g. for blob storages.

View File

@ -78,7 +78,7 @@ struct ReplicatedMergeTreeSinkImpl<async_insert>::DelayedChunk
unmerged_block_with_partition(std::move(unmerged_block_with_partition_)),
part_counters(std::move(part_counters_))
{
initBlockIDMap();
initBlockIDMap();
}
void initBlockIDMap()
@ -209,8 +209,8 @@ std::vector<Int64> testSelfDeduplicate(std::vector<Int64> data, std::vector<size
column->insert(datum);
}
Block block({ColumnWithTypeAndName(std::move(column), DataTypePtr(new DataTypeInt64()), "a")});
BlockWithPartition block1(std::move(block), Row(), std::move(offsets));
std::vector<String> tokens(offsets.size());
BlockWithPartition block1(std::move(block), Row(), std::move(offsets), std::move(tokens));
ProfileEvents::Counters profile_counters;
ReplicatedMergeTreeSinkImpl<true>::DelayedChunk::Partition part(
&Poco::Logger::get("testSelfDeduplicate"), MergeTreeDataWriter::TemporaryPart(), 0, std::move(hashes), std::move(block1), std::nullopt, std::move(profile_counters));
@ -242,22 +242,29 @@ namespace
size_t start = 0;
auto cols = block.block.getColumns();
std::vector<String> block_id_vec;
for (auto offset : block.offsets)
for (size_t i = 0; i < block.offsets.size(); ++i)
{
SipHash hash;
for (size_t i = start; i < offset; ++i)
size_t offset = block.offsets[i];
std::string_view token = block.tokens[i];
if (token.empty())
{
for (const auto & col : cols)
col->updateHashWithValue(i, hash);
}
union
{
char bytes[16];
UInt64 words[2];
} hash_value;
hash.get128(hash_value.bytes);
SipHash hash;
for (size_t j = start; j < offset; ++j)
{
for (const auto & col : cols)
col->updateHashWithValue(j, hash);
}
union
{
char bytes[16];
UInt64 words[2];
} hash_value;
hash.get128(hash_value.bytes);
block_id_vec.push_back(partition_id + "_" + DB::toString(hash_value.words[0]) + "_" + DB::toString(hash_value.words[1]));
}
else
block_id_vec.push_back(partition_id + "_" + std::string(token));
start = offset;
}
@ -418,18 +425,18 @@ void ReplicatedMergeTreeSinkImpl<async_insert>::consume(Chunk chunk)
convertDynamicColumnsToTuples(block, storage_snapshot);
ChunkOffsetsPtr chunk_offsets;
AsyncInsertInfoPtr async_insert_info;
if constexpr (async_insert)
{
const auto & chunk_info = chunk.getChunkInfo();
if (const auto * chunk_offsets_ptr = typeid_cast<const ChunkOffsets *>(chunk_info.get()))
chunk_offsets = std::make_shared<ChunkOffsets>(chunk_offsets_ptr->offsets);
if (const auto * async_insert_info_ptr = typeid_cast<const AsyncInsertInfo *>(chunk_info.get()))
async_insert_info = std::make_shared<AsyncInsertInfo>(async_insert_info_ptr->offsets, async_insert_info_ptr->tokens);
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "No chunk info for async inserts");
}
auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context, chunk_offsets);
auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context, async_insert_info);
using DelayedPartition = typename ReplicatedMergeTreeSinkImpl<async_insert>::DelayedChunk::Partition;
using DelayedPartitions = std::vector<DelayedPartition>;
@ -453,7 +460,7 @@ void ReplicatedMergeTreeSinkImpl<async_insert>::consume(Chunk chunk)
{
/// we copy everything but offsets and tokens, which we move because they are only used by async inserts
if (settings.optimize_on_insert && storage.writer.getMergingMode() != MergeTreeData::MergingParams::Mode::Ordinary)
unmerged_block.emplace(Block(current_block.block), Row(current_block.partition), std::move(current_block.offsets));
unmerged_block.emplace(Block(current_block.block), Row(current_block.partition), std::move(current_block.offsets), std::move(current_block.tokens));
}
/// Write part to the filesystem under temporary name. Calculate a checksum.
@ -468,7 +475,6 @@ void ReplicatedMergeTreeSinkImpl<async_insert>::consume(Chunk chunk)
if constexpr (async_insert)
{
/// TODO consider insert_deduplication_token
block_id = getHashesForBlocks(unmerged_block.has_value() ? *unmerged_block : current_block, temp_part.part->info.partition_id);
LOG_TRACE(log, "async insert part, part id {}, block id {}, offsets {}, size {}", temp_part.part->info.partition_id, toString(block_id), toString(current_block.offsets), current_block.offsets.size());
}
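
A minimal Python sketch (illustration only, not the commit's C++) of the block-ID rule the hunks above implement for async inserts: a sub-block that carries a deduplication token gets the ID partition_id + "_" + token, otherwise the ID is derived from a hash of its rows; the stand-in hash below replaces SipHash128 purely for brevity.

import hashlib
from typing import List, Sequence

def block_ids(partition_id: str,
              rows: Sequence[Sequence[object]],  # one inner sequence per row
              offsets: List[int],                # cumulative end offset of each sub-block
              tokens: List[str]) -> List[str]:   # "" means "no token, hash the rows"
    ids, start = [], 0
    for offset, token in zip(offsets, tokens):
        if token:
            # a user-supplied insert_deduplication_token wins
            ids.append(f"{partition_id}_{token}")
        else:
            # otherwise hash the rows of this sub-block (SipHash128 in ClickHouse)
            h = hashlib.sha256()
            for row in rows[start:offset]:
                h.update(repr(row).encode())
            ids.append(f"{partition_id}_{h.hexdigest()[:16]}")
        start = offset
    return ids

# Two sub-blocks over three rows: the first carries a token, the second does not.
print(block_ids("202010", [(1,), (2,), (3,)], [2, 3], ["tok-a", ""]))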

View File

@ -8,7 +8,7 @@
namespace DB {
std::vector<ChunkOffsetsPtr> scatterOffsetsBySelector(ChunkOffsetsPtr chunk_offsets, const IColumn::Selector & selector, size_t partition_num);
std::vector<AsyncInsertInfoPtr> scatterAsyncInsertInfoBySelector(AsyncInsertInfoPtr chunk_offsets, const IColumn::Selector & selector, size_t partition_num);
class AsyncInsertsTest : public ::testing::TestPartResult
{};
@ -16,31 +16,36 @@ class AsyncInsertsTest : public ::testing::TestPartResult
TEST(AsyncInsertsTest, testScatterOffsetsBySelector)
{
auto test_impl = [](std::vector<size_t> offsets, std::vector<size_t> selector_data, size_t part_num, std::vector<std::vector<size_t>> expected)
auto test_impl = [](std::vector<size_t> offsets, std::vector<size_t> selector_data, std::vector<String> tokens, size_t part_num, std::vector<std::vector<std::tuple<size_t, String>>> expected)
{
auto offset_ptr = std::make_shared<ChunkOffsets>(offsets);
auto offset_ptr = std::make_shared<AsyncInsertInfo>(offsets, tokens);
IColumn::Selector selector(selector_data.size());
size_t num_rows = selector_data.size();
for (size_t i = 0; i < num_rows; i++)
selector[i] = selector_data[i];
auto results = scatterOffsetsBySelector(offset_ptr, selector, part_num);
auto results = scatterAsyncInsertInfoBySelector(offset_ptr, selector, part_num);
ASSERT_EQ(results.size(), expected.size());
for (size_t i = 0; i < results.size(); i++)
{
auto result = results[i]->offsets;
auto result = results[i];
auto expect = expected[i];
ASSERT_EQ(result.size(), expect.size());
for (size_t j = 0; j < result.size(); j++)
ASSERT_EQ(result[j], expect[j]);
ASSERT_EQ(result->offsets.size(), expect.size());
ASSERT_EQ(result->tokens.size(), expect.size());
for (size_t j = 0; j < expect.size(); j++)
{
ASSERT_EQ(result->offsets[j], std::get<0>(expect[j]));
ASSERT_EQ(result->tokens[j], std::get<1>(expect[j]));
}
}
};
test_impl({5}, {0,1,0,1,0}, 2, {{3},{2}});
test_impl({5,10}, {0,1,0,1,0,1,0,1,0,1}, 2, {{3,5},{2,5}});
test_impl({4,8,12}, {0,1,0,1,0,2,0,2,1,2,1,2}, 3, {{2,4},{2,4},{2,4}});
test_impl({1,2,3,4,5}, {0,1,2,3,4}, 5, {{1},{1},{1},{1},{1}});
test_impl({3,6,10}, {1,1,1,2,2,2,0,0,0,0}, 3, {{4},{3},{3}});
test_impl({1}, {0}, {"a"}, 1, {{{1,"a"}}});
test_impl({5}, {0,1,0,1,0}, {"a"}, 2, {{{3,"a"}},{{2,"a"}}});
test_impl({5,10}, {0,1,0,1,0,1,0,1,0,1}, {"a", "b"}, 2, {{{3,"a"},{5,"b"}},{{2,"a"},{5,"b"}}});
test_impl({4,8,12}, {0,1,0,1,0,2,0,2,1,2,1,2}, {"a", "b", "c"}, 3, {{{2, "a"},{4, "b"}},{{2,"a"},{4,"c"}},{{2,"b"},{4,"c"}}});
test_impl({1,2,3,4,5}, {0,1,2,3,4}, {"a", "b", "c", "d", "e"}, 5, {{{1,"a"}},{{1,"b"}},{{1, "c"}},{{1, "d"}},{{1, "e"}}});
test_impl({3,6,10}, {1,1,1,2,2,2,0,0,0,0}, {"a", "b", "c"}, 3, {{{4, "c"}},{{3, "a"}},{{3, "b"}}});
}
std::vector<Int64> testSelfDeduplicate(std::vector<Int64> data, std::vector<size_t> offsets, std::vector<String> hashes);
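
For reference, a pure-Python model of what the updated unit test above expects from scatterAsyncInsertInfoBySelector (an illustration, not the production code): recompute per-partition cumulative offsets and keep the token of every sub-block that contributed at least one row to that partition.

def scatter_async_insert_info(offsets, tokens, selector, partition_num):
    # rows per (partition, sub-block)
    counts = [[0] * len(offsets) for _ in range(partition_num)]
    start = 0
    for block_idx, end in enumerate(offsets):
        for row in range(start, end):
            counts[selector[row]][block_idx] += 1
        start = end
    result = []
    for per_block in counts:
        part_offsets, part_tokens, running = [], [], 0
        for block_idx, n in enumerate(per_block):
            if n:
                running += n
                part_offsets.append(running)
                part_tokens.append(tokens[block_idx])
        result.append((part_offsets, part_tokens))
    return result

# Mirrors one of the cases above: offsets {4,8,12}, tokens {"a","b","c"}, 3 partitions.
assert scatter_async_insert_info([4, 8, 12], ["a", "b", "c"],
                                 [0, 1, 0, 1, 0, 2, 0, 2, 1, 2, 1, 2], 3) == [
    ([2, 4], ["a", "b"]), ([2, 4], ["a", "c"]), ([2, 4], ["b", "c"])]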

View File

@ -93,6 +93,65 @@ namespace ErrorCodes
namespace
{
/// Forward-declare to use in listFilesWithFoldedRegexpMatchingImpl()
void listFilesWithRegexpMatchingImpl(
const std::string & path_for_ls,
const std::string & for_match,
size_t & total_bytes_to_read,
std::vector<std::string> & result,
bool recursive = false);
/*
* When `{...}` has any `/`s, it must be processed in a different way:
* Basically, a path with globs is processed by listFilesWithRegexpMatchingImpl. If it detects a multi-dir glob {.../..., .../...},
* listFilesWithFoldedRegexpMatchingImpl takes over from that point.
* It works a bit differently: it still recursively goes through subdirectories, but does not match every directory against the glob.
* Instead, it goes many levels down (until the approximate max_depth is reached) and compares this multi-dir path to a glob.
* StorageHDFS.cpp has the same logic.
*/
void listFilesWithFoldedRegexpMatchingImpl(const std::string & path_for_ls,
const std::string & processed_suffix,
const std::string & suffix_with_globs,
re2::RE2 & matcher,
size_t & total_bytes_to_read,
const size_t max_depth,
const size_t next_slash_after_glob_pos,
std::vector<std::string> & result)
{
if (!max_depth)
return;
const fs::directory_iterator end;
for (fs::directory_iterator it(path_for_ls); it != end; ++it)
{
const std::string full_path = it->path().string();
const size_t last_slash = full_path.rfind('/');
const String dir_or_file_name = full_path.substr(last_slash);
if (re2::RE2::FullMatch(processed_suffix + dir_or_file_name, matcher))
{
if (next_slash_after_glob_pos == std::string::npos)
{
total_bytes_to_read += it->file_size();
result.push_back(it->path().string());
}
else
{
listFilesWithRegexpMatchingImpl(fs::path(full_path) / "" ,
suffix_with_globs.substr(next_slash_after_glob_pos),
total_bytes_to_read, result);
}
}
else if (it->is_directory())
{
listFilesWithFoldedRegexpMatchingImpl(fs::path(full_path), processed_suffix + dir_or_file_name,
suffix_with_globs, matcher, total_bytes_to_read,
max_depth - 1, next_slash_after_glob_pos, result);
}
}
}
/* Recursive directory listing with matched paths as a result.
* StorageHDFS has the same method.
*/
@ -101,15 +160,42 @@ void listFilesWithRegexpMatchingImpl(
const std::string & for_match,
size_t & total_bytes_to_read,
std::vector<std::string> & result,
bool recursive = false)
bool recursive)
{
const size_t first_glob = for_match.find_first_of("*?{");
const size_t first_glob_pos = for_match.find_first_of("*?{");
const bool has_glob = first_glob_pos != std::string::npos;
const size_t end_of_path_without_globs = for_match.substr(0, first_glob).rfind('/');
const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/');
const std::string suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
const size_t next_slash = suffix_with_globs.find('/', 1);
const std::string current_glob = suffix_with_globs.substr(0, next_slash);
/// slashes_in_glob counter is an upper-bound estimate of the recursion depth
/// needed to process complex cases when `/` is included in the glob, e.g. /pa{th1/a,th2/b}.csv
size_t slashes_in_glob = 0;
const size_t next_slash_after_glob_pos = [&]()
{
if (!has_glob)
return suffix_with_globs.find('/', 1);
size_t in_curly = 0;
for (std::string::const_iterator it = ++suffix_with_globs.begin(); it != suffix_with_globs.end(); it++)
{
if (*it == '{')
++in_curly;
else if (*it == '/')
{
if (in_curly)
++slashes_in_glob;
else
return size_t(std::distance(suffix_with_globs.begin(), it));
}
else if (*it == '}')
--in_curly;
}
return std::string::npos;
}();
const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos);
auto regexp = makeRegexpPatternFromGlobs(current_glob);
re2::RE2 matcher(regexp);
@ -126,13 +212,22 @@ void listFilesWithRegexpMatchingImpl(
if (!fs::exists(prefix_without_globs))
return;
const bool looking_for_directory = next_slash_after_glob_pos != std::string::npos;
if (slashes_in_glob)
{
listFilesWithFoldedRegexpMatchingImpl(fs::path(prefix_without_globs), "", suffix_with_globs,
matcher, total_bytes_to_read, slashes_in_glob,
next_slash_after_glob_pos, result);
return;
}
const fs::directory_iterator end;
for (fs::directory_iterator it(prefix_without_globs); it != end; ++it)
{
const std::string full_path = it->path().string();
const size_t last_slash = full_path.rfind('/');
const String file_name = full_path.substr(last_slash);
const bool looking_for_directory = next_slash != std::string::npos;
/// The is_directory condition tells what kind of path we have in the current iteration of ls
if (!it->is_directory() && !looking_for_directory)
@ -148,14 +243,12 @@ void listFilesWithRegexpMatchingImpl(
if (recursive)
{
listFilesWithRegexpMatchingImpl(fs::path(full_path).append(it->path().string()) / "" ,
looking_for_directory ? suffix_with_globs.substr(next_slash) : current_glob ,
looking_for_directory ? suffix_with_globs.substr(next_slash_after_glob_pos) : current_glob ,
total_bytes_to_read, result, recursive);
}
else if (looking_for_directory && re2::RE2::FullMatch(file_name, matcher))
{
/// Recursion depth is limited by the pattern. '*' works only for depth = 1, for depth = 2 the pattern path is '*/*'. So we do not need an additional check.
listFilesWithRegexpMatchingImpl(fs::path(full_path) / "", suffix_with_globs.substr(next_slash), total_bytes_to_read, result);
}
listFilesWithRegexpMatchingImpl(fs::path(full_path) / "", suffix_with_globs.substr(next_slash_after_glob_pos), total_bytes_to_read, result);
}
}
}
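
An independent Python sketch of the folding idea described in the comments above (a simplified glob-to-regexp conversion and a plain directory walk are assumptions here, not a faithful port of the C++): when the {...} alternatives contain '/', the matcher is built once for the whole multi-directory glob and the walk descends several levels, comparing the accumulated suffix against it.

import os
import re

def glob_to_regexp(glob: str) -> str:
    # Tiny stand-in for makeRegexpPatternFromGlobs: handles '*', '?' and '{a,b,...}' only.
    out, i = "", 0
    while i < len(glob):
        c = glob[i]
        if c == "*":
            out += "[^/]*"
        elif c == "?":
            out += "[^/]"
        elif c == "{":
            j = glob.index("}", i)
            out += "(" + "|".join(re.escape(alt).replace("\\/", "/")
                                  for alt in glob[i + 1:j].split(",")) + ")"
            i = j
        else:
            out += re.escape(c)
        i += 1
    return out

def folded_match(base_dir: str, suffix_glob: str, max_depth: int):
    # Walk up to max_depth levels below base_dir and yield files whose path
    # relative to base_dir fully matches the folded multi-directory glob.
    matcher = re.compile(glob_to_regexp(suffix_glob.lstrip("/")))
    def walk(cur, rel, depth):
        if depth > max_depth or not os.path.isdir(cur):
            return
        for name in sorted(os.listdir(cur)):
            full = os.path.join(cur, name)
            new_rel = (rel + "/" + name).lstrip("/")
            if matcher.fullmatch(new_rel):
                yield full
            elif os.path.isdir(full):
                yield from walk(full, new_rel, depth + 1)
    yield from walk(base_dir, "", 1)

# Hypothetical layout: /data/p1/path1/postfix/data1 and /data/p2/path2/postfix/data2
# list(folded_match("/data", "{p1/path1,p2/path2}/postfix/data{1,2}", 4))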

View File

@ -1385,7 +1385,7 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
}
const UInt64 parts_to_fetch_blocks = std::accumulate(parts_to_fetch.cbegin(), parts_to_fetch.cend(), 0,
[&](UInt64 acc, const String& part_name)
[&](UInt64 acc, const String & part_name)
{
if (const auto part_info = MergeTreePartInfo::tryParsePartName(part_name, format_version))
return acc + part_info->getBlocksCount();

View File

@ -974,6 +974,7 @@ StorageS3::StorageS3(
FormatFactory::instance().checkFormatName(configuration.format);
context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri);
context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration.headers_from_ast);
StorageInMemoryMetadata storage_metadata;
if (columns_.empty())

View File

@ -44,6 +44,8 @@ StorageS3Cluster::StorageS3Cluster(
, s3_configuration{configuration_}
{
context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri);
context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast);
StorageInMemoryMetadata storage_metadata;
updateConfigurationIfChanged(context_);

View File

@ -1019,6 +1019,7 @@ StorageURL::StorageURL(
distributed_processing_)
{
context_->getRemoteHostFilter().checkURL(Poco::URI(uri));
context_->getHTTPHeaderFilter().checkHeaders(headers);
}

View File

@ -48,6 +48,7 @@ StorageURLCluster::StorageURLCluster(
, uri(uri_)
{
context_->getRemoteHostFilter().checkURL(Poco::URI(uri));
context_->getHTTPHeaderFilter().checkHeaders(configuration_.headers);
StorageInMemoryMetadata storage_metadata;

View File

@ -10,6 +10,7 @@
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Formats/IInputFormat.h>
#include <Processors/Transforms/AddingDefaultsTransform.h>
#include <QueryPipeline/Pipe.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
@ -72,7 +73,17 @@ Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr cont
auto read_buf = std::make_unique<ReadBufferFromString>(data);
auto input_format = context->getInputFormat(format, *read_buf, block, context->getSettingsRef().max_block_size);
auto pipeline = std::make_unique<QueryPipeline>(input_format);
QueryPipelineBuilder builder;
builder.init(Pipe(input_format));
if (columns.hasDefaults())
{
builder.addSimpleTransform([&](const Block & header)
{
return std::make_shared<AddingDefaultsTransform>(header, columns, *input_format, context);
});
}
auto pipeline = std::make_unique<QueryPipeline>(QueryPipelineBuilder::getPipeline(std::move(builder)));
auto reader = std::make_unique<PullingPipelineExecutor>(*pipeline);
std::vector<Block> blocks;
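
A rough Python model (not ClickHouse internals) of what wiring AddingDefaultsTransform into this pipeline buys the format() table function: rows whose input omitted a column get that column filled from its DEFAULT expression rather than a plain type default. The column names and the "x * 2" default below are hypothetical.

from typing import Callable, Dict, List, Optional

def add_defaults(rows: List[Dict[str, Optional[int]]],
                 defaults: Dict[str, Callable[[Dict[str, Optional[int]]], int]]):
    # Fill missing (None) values using the column's DEFAULT expression, row by row.
    for row in rows:
        for col, expr in defaults.items():
            if row.get(col) is None:
                row[col] = expr(row)
    return rows

# Hypothetical structure "x UInt32, y UInt32 DEFAULT x * 2":
print(add_defaults([{"x": 1, "y": None}, {"x": 5, "y": 7}],
                   {"y": lambda r: r["x"] * 2}))
# -> [{'x': 1, 'y': 2}, {'x': 5, 'y': 7}]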

View File

@ -18,6 +18,8 @@
#include <Storages/NamedCollectionsHelpers.h>
#include <Formats/FormatFactory.h>
#include "registerTableFunctions.h"
#include <Analyzer/FunctionNode.h>
#include <Analyzer/TableFunctionNode.h>
#include <boost/algorithm/string.hpp>
@ -32,6 +34,24 @@ namespace ErrorCodes
}
std::vector<size_t> TableFunctionS3::skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr) const
{
auto & table_function_node = query_node_table_function->as<TableFunctionNode &>();
auto & table_function_arguments_nodes = table_function_node.getArguments().getNodes();
size_t table_function_arguments_size = table_function_arguments_nodes.size();
std::vector<size_t> result;
for (size_t i = 0; i < table_function_arguments_size; ++i)
{
auto * function_node = table_function_arguments_nodes[i]->as<FunctionNode>();
if (function_node && function_node->getFunctionName() == "headers")
result.push_back(i);
}
return result;
}
/// This is needed to avoid copy-paste, because s3Cluster arguments differ only in one additional (first) argument - the cluster name
void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context)
{
@ -41,13 +61,14 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context
}
else
{
if (args.empty() || args.size() > 6)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The signature of table function {} shall be the following:\n{}", getName(), getSignature());
auto * header_it = StorageURL::collectHeaders(args, configuration.headers_from_ast, context);
if (header_it != args.end())
args.erase(header_it);
if (args.empty() || args.size() > 6)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "The signature of table function {} shall be the following:\n{}", getName(), getSignature());
for (auto & arg : args)
arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context);

View File

@ -73,6 +73,10 @@ protected:
mutable StorageS3::Configuration configuration;
ColumnsDescription structure_hint;
private:
std::vector<size_t> skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override;
};
}

View File

@ -246,6 +246,12 @@ def main():
if args.check_running_workflows:
workflows = get_workflows_for_head(repo, pr.head.sha)
logging.info(
"The PR #%s has following workflows:\n%s",
pr.number,
"\n".join(f"{wf.html_url}: status is {wf.status}" for wf in workflows),
)
workflows_in_progress = [wf for wf in workflows if wf.status != "completed"]
# At most one workflow in progress is fine. We check that there are no
# cases like, e.g., PullRequestCI and DocsCheck in progress at once

View File

@ -0,0 +1,6 @@
<clickhouse>
<http_forbid_headers>
<header>exact_header</header>
<header_regexp>(?i)(case_insensitive_header)</header_regexp>
</http_forbid_headers>
</clickhouse>
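
A rough Python model of the semantics these entries suggest (a sketch under assumptions, not ClickHouse's HTTPHeaderFilter; full-string matching is assumed for the regexp entries): a header is rejected if it equals an exact <header> entry or matches a <header_regexp> entry.

import re

FORBIDDEN_EXACT = {"exact_header"}
FORBIDDEN_REGEXP = [re.compile(r"(?i)(case_insensitive_header)")]

def check_headers(headers: dict) -> None:
    # Raise if any header name is forbidden by the configuration above.
    for name in headers:
        if name in FORBIDDEN_EXACT or any(r.fullmatch(name) for r in FORBIDDEN_REGEXP):
            raise ValueError(f'HTTP header "{name}" is forbidden by configuration')

# check_headers({"X-Api-Key": "1", "exact_header": "nope"})  # -> ValueError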

View File

@ -1,64 +1,14 @@
<clickhouse>
<storage_configuration>
<disks>
<!-- s3 disks -->
<s3_common_disk>
<s3_disk>
<type>s3</type>
<path>s3_common_disk/</path>
<path>s3_disk/</path>
<endpoint>http://localhost:11111/test/common/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_common_disk>
<s3_disk>
<type>s3</type>
<path>s3_disk/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk>
<s3_disk_2>
<type>s3</type>
<path>s3_disk_2/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk_2>
<s3_disk_3>
<type>s3</type>
<path>s3_disk_3/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk_3>
<s3_disk_4>
<type>s3</type>
<path>s3_disk_4/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk_4>
<s3_disk_5>
<type>s3</type>
<path>s3_disk_5/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk_5>
<s3_disk_6>
<type>s3</type>
<path>s3_disk_6/</path>
<endpoint>http://localhost:11111/test/00170_test/</endpoint>
<access_key_id>clickhouse</access_key_id>
<secret_access_key>clickhouse</secret_access_key>
<request_timeout_ms>20000</request_timeout_ms>
</s3_disk_6>
<!-- cache for s3 disks -->
<s3_cache>
<type>cache</type>
<disk>s3_disk</disk>
@ -67,65 +17,6 @@
<cache_on_write_operations>1</cache_on_write_operations>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache>
<s3_cache_2>
<type>cache</type>
<disk>s3_disk_2</disk>
<path>s3_cache_2/</path>
<max_size>128Mi</max_size>
<max_file_segment_size>100Mi</max_file_segment_size>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_2>
<s3_cache_3>
<type>cache</type>
<disk>s3_disk_3</disk>
<path>s3_disk_3_cache/</path>
<max_size>128Mi</max_size>
<data_cache_max_size>22548578304</data_cache_max_size>
<cache_on_write_operations>1</cache_on_write_operations>
<enable_cache_hits_threshold>1</enable_cache_hits_threshold>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_3>
<s3_cache_4>
<type>cache</type>
<disk>s3_disk_4</disk>
<path>s3_cache_4/</path>
<max_size>128Mi</max_size>
<cache_on_write_operations>1</cache_on_write_operations>
<enable_filesystem_query_cache_limit>1</enable_filesystem_query_cache_limit>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_4>
<s3_cache_5>
<type>cache</type>
<disk>s3_disk_5</disk>
<path>s3_cache_5/</path>
<max_size>128Mi</max_size>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_5>
<s3_cache_6>
<type>cache</type>
<disk>s3_disk_6</disk>
<path>s3_cache_6/</path>
<max_size>128Mi</max_size>
<enable_bypass_cache_with_threashold>1</enable_bypass_cache_with_threashold>
<bypass_cache_threashold>100</bypass_cache_threashold>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_6>
<s3_cache_small>
<type>cache</type>
<disk>s3_disk_6</disk>
<path>s3_cache_small/</path>
<max_size>1000</max_size>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_small>
<s3_cache_small_segment_size>
<type>cache</type>
<disk>s3_disk_6</disk>
<path>s3_cache_small_segment_size/</path>
<max_size>128Mi</max_size>
<max_file_segment_size>10Ki</max_file_segment_size>
<cache_on_write_operations>1</cache_on_write_operations>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
</s3_cache_small_segment_size>
<!-- local disks -->
<local_disk>
<type>local_blob_storage</type>
@ -167,7 +58,7 @@
<!-- multi layer cache -->
<s3_cache_multi>
<type>cache</type>
<disk>s3_cache_5</disk>
<disk>s3_cache</disk>
<path>s3_cache_multi/</path>
<max_size>22548578304</max_size>
<delayed_cleanup_interval_ms>100</delayed_cleanup_interval_ms>
@ -188,34 +79,6 @@
</main>
</volumes>
</s3_cache>
<s3_cache_2>
<volumes>
<main>
<disk>s3_cache_2</disk>
</main>
</volumes>
</s3_cache_2>
<s3_cache_3>
<volumes>
<main>
<disk>s3_cache_3</disk>
</main>
</volumes>
</s3_cache_3>
<s3_cache_4>
<volumes>
<main>
<disk>s3_cache_4</disk>
</main>
</volumes>
</s3_cache_4>
<s3_cache_6>
<volumes>
<main>
<disk>s3_cache_6</disk>
</main>
</volumes>
</s3_cache_6>
<s3_cache_multi>
<volumes>
<main>
@ -223,13 +86,6 @@
</main>
</volumes>
</s3_cache_multi>
<s3_cache_small>
<volumes>
<main>
<disk>s3_cache_small</disk>
</main>
</volumes>
</s3_cache_small>
<local_cache>
<volumes>
<main>
@ -251,13 +107,6 @@
</main>
</volumes>
</local_cache_3>
<s3_cache_small_segment_size>
<volumes>
<main>
<disk>s3_cache_small_segment_size</disk>
</main>
</volumes>
</s3_cache_small_segment_size>
</policies>
</storage_configuration>
</clickhouse>

View File

@ -51,6 +51,7 @@ ln -sf $SRC_PATH/config.d/session_log.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/system_unfreeze.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/enable_zero_copy_replication.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/nlp.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/forbidden_headers.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/enable_keeper_map.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/custom_disks_base_path.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/display_name.xml $DEST_SERVER_PATH/config.d/

View File

@ -486,6 +486,8 @@ class ClickHouseCluster:
self.kafka_docker_id = None
self.schema_registry_host = "schema-registry"
self._schema_registry_port = 0
self.schema_registry_auth_host = "schema-registry-auth"
self._schema_registry_auth_port = 0
self.kafka_docker_id = self.get_instance_docker_id(self.kafka_host)
self.coredns_host = "coredns"
@ -657,6 +659,13 @@ class ClickHouseCluster:
self._schema_registry_port = get_free_port()
return self._schema_registry_port
@property
def schema_registry_auth_port(self):
if self._schema_registry_auth_port:
return self._schema_registry_auth_port
self._schema_registry_auth_port = get_free_port()
return self._schema_registry_auth_port
@property
def kerberized_kafka_port(self):
if self._kerberized_kafka_port:
@ -1163,8 +1172,11 @@ class ClickHouseCluster:
self.with_kafka = True
env_variables["KAFKA_HOST"] = self.kafka_host
env_variables["KAFKA_EXTERNAL_PORT"] = str(self.kafka_port)
env_variables["SCHEMA_REGISTRY_DIR"] = instance.path + "/"
env_variables["SCHEMA_REGISTRY_EXTERNAL_PORT"] = str(self.schema_registry_port)
env_variables["SCHEMA_REGISTRY_INTERNAL_PORT"] = "8081"
env_variables["SCHEMA_REGISTRY_AUTH_EXTERNAL_PORT"] = str(
self.schema_registry_auth_port
)
self.base_cmd.extend(
["--file", p.join(docker_compose_yml_dir, "docker_compose_kafka.yml")]
)
@ -1498,6 +1510,7 @@ class ClickHouseCluster:
with_kafka=False,
with_kerberized_kafka=False,
with_kerberos_kdc=False,
with_secrets=False,
with_rabbitmq=False,
with_nats=False,
clickhouse_path_dir=None,
@ -1604,6 +1617,10 @@ class ClickHouseCluster:
with_nats=with_nats,
with_nginx=with_nginx,
with_kerberized_hdfs=with_kerberized_hdfs,
with_secrets=with_secrets
or with_kerberized_hdfs
or with_kerberos_kdc
or with_kerberized_kafka,
with_mongo=with_mongo or with_mongo_secure,
with_meili=with_meili,
with_redis=with_redis,
@ -2493,20 +2510,27 @@ class ClickHouseCluster:
raise Exception("Can't wait Azurite to start")
def wait_schema_registry_to_start(self, timeout=180):
sr_client = CachedSchemaRegistryClient(
{"url": "http://localhost:{}".format(self.schema_registry_port)}
)
start = time.time()
while time.time() - start < timeout:
try:
sr_client._send_request(sr_client.url)
logging.debug("Connected to SchemaRegistry")
return sr_client
except Exception as ex:
logging.debug(("Can't connect to SchemaRegistry: %s", str(ex)))
time.sleep(1)
for port in self.schema_registry_port, self.schema_registry_auth_port:
reg_url = "http://localhost:{}".format(port)
arg = {"url": reg_url}
sr_client = CachedSchemaRegistryClient(arg)
raise Exception("Can't wait Schema Registry to start")
start = time.time()
sr_started = False
sr_auth_started = False
while time.time() - start < timeout:
try:
sr_client._send_request(sr_client.url)
logging.debug("Connected to SchemaRegistry")
# don't care about possible auth errors
sr_started = True
break
except Exception as ex:
logging.debug(("Can't connect to SchemaRegistry: %s", str(ex)))
time.sleep(1)
if not sr_started:
raise Exception("Can't wait Schema Registry to start")
def wait_cassandra_to_start(self, timeout=180):
self.cassandra_ip = self.get_instance_ip(self.cassandra_host)
@ -3135,6 +3159,7 @@ class ClickHouseInstance:
with_nats,
with_nginx,
with_kerberized_hdfs,
with_secrets,
with_mongo,
with_meili,
with_redis,
@ -3197,7 +3222,7 @@ class ClickHouseInstance:
if clickhouse_path_dir
else None
)
self.kerberos_secrets_dir = p.abspath(p.join(base_path, "secrets"))
self.secrets_dir = p.abspath(p.join(base_path, "secrets"))
self.macros = macros if macros is not None else {}
self.with_zookeeper = with_zookeeper
self.zookeeper_config_path = zookeeper_config_path
@ -3220,6 +3245,7 @@ class ClickHouseInstance:
self.with_nats = with_nats
self.with_nginx = with_nginx
self.with_kerberized_hdfs = with_kerberized_hdfs
self.with_secrets = with_secrets
self.with_mongo = with_mongo
self.with_meili = with_meili
self.with_redis = with_redis
@ -4217,17 +4243,16 @@ class ClickHouseInstance:
if self.with_zookeeper:
shutil.copy(self.zookeeper_config_path, conf_d_dir)
if (
self.with_kerberized_kafka
or self.with_kerberized_hdfs
or self.with_kerberos_kdc
):
if self.with_secrets:
if self.with_kerberos_kdc:
base_secrets_dir = self.cluster.instances_dir
else:
base_secrets_dir = self.path
from_dir = self.secrets_dir
to_dir = p.abspath(p.join(base_secrets_dir, "secrets"))
logging.debug(f"Copy secret from {from_dir} to {to_dir}")
shutil.copytree(
self.kerberos_secrets_dir,
self.secrets_dir,
p.abspath(p.join(base_secrets_dir, "secrets")),
dirs_exist_ok=True,
)

View File

@ -0,0 +1,3 @@
schemauser: MD5:0d107d09f5bbe40cade3de5c71e9e9b7,user
schemauser/slash: MD5:0d107d09f5bbe40cade3de5c71e9e9b7,user
complexschemauser: MD5:fcaeda86837fcd37755044e7258edc5d,user
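
The entries follow the Jetty PropertyFileLoginModule format referenced by the JAAS config below - `user: MD5:<md5 of password>,role`. As a quick illustrative check (not part of the commit), the schemauser entry corresponds to the "letmein" password used by the integration tests further down:

import hashlib

assert hashlib.md5(b"letmein").hexdigest() == "0d107d09f5bbe40cade3de5c71e9e9b7"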

View File

@ -0,0 +1,5 @@
RealmFooBar {
org.eclipse.jetty.jaas.spi.PropertyFileLoginModule required
file="/etc/schema-registry/secrets/password"
debug="true";
};

View File

@ -1,5 +1,6 @@
import io
import logging
import time
import avro.schema
import pytest
@ -8,13 +9,14 @@ from confluent_kafka.avro.cached_schema_registry_client import (
)
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
from urllib import parse
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster = ClickHouseCluster(__file__)
cluster.add_instance("dummy", with_kafka=True)
cluster.add_instance("dummy", with_kafka=True, with_secrets=True)
logging.info("Starting cluster...")
cluster.start()
logging.info("Cluster started")
@ -40,14 +42,15 @@ def run_query(instance, query, data=None, settings=None):
def test_select(started_cluster):
# type: (ClickHouseCluster) -> None
schema_registry_client = CachedSchemaRegistryClient(
"http://localhost:{}".format(started_cluster.schema_registry_port)
)
reg_url = "http://localhost:{}".format(started_cluster.schema_registry_port)
arg = {"url": reg_url}
schema_registry_client = CachedSchemaRegistryClient(arg)
serializer = MessageSerializer(schema_registry_client)
schema = avro.schema.make_avsc_object(
{
"name": "test_record",
"name": "test_record1",
"type": "record",
"fields": [{"name": "value", "type": "long"}],
}
@ -56,14 +59,14 @@ def test_select(started_cluster):
buf = io.BytesIO()
for x in range(0, 3):
message = serializer.encode_record_with_schema(
"test_subject", schema, {"value": x}
"test_subject1", schema, {"value": x}
)
buf.write(message)
data = buf.getvalue()
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
schema_registry_url = "http://{}:{}".format(
started_cluster.schema_registry_host, 8081
started_cluster.schema_registry_host, started_cluster.schema_registry_port
)
run_query(instance, "create table avro_data(value Int64) engine = Memory()")
@ -75,3 +78,164 @@ def test_select(started_cluster):
["1"],
["2"],
]
def test_select_auth(started_cluster):
# type: (ClickHouseCluster) -> None
reg_url = "http://localhost:{}".format(started_cluster.schema_registry_auth_port)
arg = {
"url": reg_url,
"basic.auth.credentials.source": "USER_INFO",
"basic.auth.user.info": "schemauser:letmein",
}
schema_registry_client = CachedSchemaRegistryClient(arg)
serializer = MessageSerializer(schema_registry_client)
schema = avro.schema.make_avsc_object(
{
"name": "test_record_auth",
"type": "record",
"fields": [{"name": "value", "type": "long"}],
}
)
buf = io.BytesIO()
for x in range(0, 3):
message = serializer.encode_record_with_schema(
"test_subject_auth", schema, {"value": x}
)
buf.write(message)
data = buf.getvalue()
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
schema_registry_url = "http://{}:{}@{}:{}".format(
"schemauser",
"letmein",
started_cluster.schema_registry_auth_host,
started_cluster.schema_registry_auth_port,
)
run_query(instance, "create table avro_data_auth(value Int64) engine = Memory()")
settings = {"format_avro_schema_registry_url": schema_registry_url}
run_query(
instance, "insert into avro_data_auth format AvroConfluent", data, settings
)
stdout = run_query(instance, "select * from avro_data_auth")
assert list(map(str.split, stdout.splitlines())) == [
["0"],
["1"],
["2"],
]
def test_select_auth_encoded(started_cluster):
# type: (ClickHouseCluster) -> None
reg_url = "http://localhost:{}".format(started_cluster.schema_registry_auth_port)
arg = {
"url": reg_url,
"basic.auth.credentials.source": "USER_INFO",
"basic.auth.user.info": "schemauser:letmein",
}
schema_registry_client = CachedSchemaRegistryClient(arg)
serializer = MessageSerializer(schema_registry_client)
schema = avro.schema.make_avsc_object(
{
"name": "test_record_auth_encoded",
"type": "record",
"fields": [{"name": "value", "type": "long"}],
}
)
buf = io.BytesIO()
for x in range(0, 3):
message = serializer.encode_record_with_schema(
"test_subject_auth_encoded", schema, {"value": x}
)
buf.write(message)
data = buf.getvalue()
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
schema_registry_url = "http://{}:{}@{}:{}".format(
parse.quote_plus("schemauser/slash"),
parse.quote_plus("letmein"),
started_cluster.schema_registry_auth_host,
started_cluster.schema_registry_auth_port,
)
run_query(
instance, "create table avro_data_auth_encoded(value Int64) engine = Memory()"
)
settings = {"format_avro_schema_registry_url": schema_registry_url}
run_query(
instance,
"insert into avro_data_auth_encoded format AvroConfluent",
data,
settings,
)
stdout = run_query(instance, "select * from avro_data_auth_encoded")
assert list(map(str.split, stdout.splitlines())) == [
["0"],
["1"],
["2"],
]
def test_select_auth_encoded_complex(started_cluster):
# type: (ClickHouseCluster) -> None
reg_url = "http://localhost:{}".format(started_cluster.schema_registry_auth_port)
arg = {
"url": reg_url,
"basic.auth.credentials.source": "USER_INFO",
"basic.auth.user.info": "schemauser:letmein",
}
schema_registry_client = CachedSchemaRegistryClient(arg)
serializer = MessageSerializer(schema_registry_client)
schema = avro.schema.make_avsc_object(
{
"name": "test_record_auth_encoded_complex",
"type": "record",
"fields": [{"name": "value", "type": "long"}],
}
)
buf = io.BytesIO()
for x in range(0, 3):
message = serializer.encode_record_with_schema(
"test_subject_auth_encoded_complex", schema, {"value": x}
)
buf.write(message)
data = buf.getvalue()
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
schema_registry_url = "http://{}:{}@{}:{}".format(
parse.quote_plus("complexschemauser"),
parse.quote_plus("letmein%@:/"),
started_cluster.schema_registry_auth_host,
started_cluster.schema_registry_auth_port,
)
run_query(
instance,
"create table avro_data_auth_encoded_complex(value Int64) engine = Memory()",
)
settings = {"format_avro_schema_registry_url": schema_registry_url}
run_query(
instance,
"insert into avro_data_auth_encoded_complex format AvroConfluent",
data,
settings,
)
stdout = run_query(instance, "select * from avro_data_auth_encoded_complex")
assert list(map(str.split, stdout.splitlines())) == [
["0"],
["1"],
["2"],
]

View File

@ -1,6 +1,10 @@
<clickhouse>
<storage_configuration>
<disks>
<disk_hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/</endpoint>
</disk_hdfs>
<log_local>
<type>local</type>
<path>/var/lib/clickhouse/coordination/logs/</path>

View File

@ -9,7 +9,11 @@ import os
CURRENT_TEST_DIR = os.path.dirname(os.path.abspath(__file__))
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance(
"node", main_configs=["configs/enable_keeper.xml"], stay_alive=True, with_minio=True
"node",
main_configs=["configs/enable_keeper.xml"],
stay_alive=True,
with_minio=True,
with_hdfs=True,
)
from kazoo.client import KazooClient, KazooState
@ -117,6 +121,12 @@ def get_local_snapshots():
return get_local_files("/var/lib/clickhouse/coordination/snapshots")
def test_supported_disk_types(started_cluster):
node.stop_clickhouse()
node.start_clickhouse()
node.contains_in_log("Disk type 'hdfs' is not supported for Keeper")
def test_logs_with_disks(started_cluster):
setup_local_storage(started_cluster)

View File

@ -39,257 +39,261 @@ def test_lost_part_same_replica(start_cluster):
node1.query("DROP TABLE IF EXISTS mt0 SYNC")
node2.query("DROP TABLE IF EXISTS mt0 SYNC")
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt0 (id UInt64, date Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/t', '{node.name}') ORDER BY tuple() PARTITION BY date "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
)
try:
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt0 (id UInt64, date Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/t', '{node.name}') ORDER BY tuple() PARTITION BY date "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
)
node1.query("SYSTEM STOP MERGES mt0")
node2.query("SYSTEM STOP REPLICATION QUEUES")
node1.query("SYSTEM STOP MERGES mt0")
node2.query("SYSTEM STOP REPLICATION QUEUES")
for i in range(5):
node1.query(f"INSERT INTO mt0 VALUES ({i}, toDate('2020-10-01'))")
for i in range(5):
node1.query(f"INSERT INTO mt0 VALUES ({i}, toDate('2020-10-01'))")
for i in range(20):
parts_to_merge = node1.query(
"SELECT parts_to_merge FROM system.replication_queue WHERE table='mt0' AND length(parts_to_merge) > 0"
)
if parts_to_merge:
parts_list = list(sorted(ast.literal_eval(parts_to_merge)))
print("Got parts list", parts_list)
if len(parts_list) < 3:
raise Exception(f"Got too small parts list {parts_list}")
break
time.sleep(1)
for i in range(20):
parts_to_merge = node1.query(
"SELECT parts_to_merge FROM system.replication_queue WHERE table='mt0' AND length(parts_to_merge) > 0"
)
if parts_to_merge:
parts_list = list(sorted(ast.literal_eval(parts_to_merge)))
print("Got parts list", parts_list)
if len(parts_list) < 3:
raise Exception(f"Got too small parts list {parts_list}")
break
time.sleep(1)
victim_part_from_the_middle = random.choice(parts_list[1:-1])
print("Will corrupt part", victim_part_from_the_middle)
victim_part_from_the_middle = random.choice(parts_list[1:-1])
print("Will corrupt part", victim_part_from_the_middle)
remove_part_from_disk(node1, "mt0", victim_part_from_the_middle)
remove_part_from_disk(node1, "mt0", victim_part_from_the_middle)
node1.query("DETACH TABLE mt0")
node1.query("DETACH TABLE mt0")
node1.query("ATTACH TABLE mt0")
node1.query("ATTACH TABLE mt0")
node1.query("SYSTEM START MERGES mt0")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt0")
print("result: ", res)
print("error: ", res)
node1.query("SYSTEM START MERGES mt0")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt0")
print("result: ", res)
print("error: ", res)
for i in range(10):
result = node1.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node1.query(
"SELECT count() FROM system.replication_queue FORMAT Vertical"
)
for i in range(10):
result = node1.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node1.query(
"SELECT count() FROM system.replication_queue FORMAT Vertical"
)
assert node1.contains_in_log(
"Created empty part"
), f"Seems like empty part {victim_part_from_the_middle} is not created or log message changed"
assert node1.contains_in_log(
"Created empty part"
), f"Seems like empty part {victim_part_from_the_middle} is not created or log message changed"
assert node1.query("SELECT COUNT() FROM mt0") == "4\n"
assert node1.query("SELECT COUNT() FROM mt0") == "4\n"
node2.query("SYSTEM START REPLICATION QUEUES")
node2.query("SYSTEM START REPLICATION QUEUES")
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt0", "4")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("DROP TABLE IF EXISTS mt0 SYNC")
node2.query("DROP TABLE IF EXISTS mt0 SYNC")
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt0", "4")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
finally:
node1.query("DROP TABLE IF EXISTS mt0 SYNC")
node2.query("DROP TABLE IF EXISTS mt0 SYNC")
def test_lost_part_other_replica(start_cluster):
node1.query("DROP TABLE IF EXISTS mt1 SYNC")
node2.query("DROP TABLE IF EXISTS mt1 SYNC")
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt1 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', '{node.name}') ORDER BY tuple() "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
try:
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt1 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', '{node.name}') ORDER BY tuple() "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
)
node1.query("SYSTEM STOP MERGES mt1")
node2.query("SYSTEM STOP REPLICATION QUEUES")
for i in range(5):
node1.query(f"INSERT INTO mt1 VALUES ({i})")
for i in range(20):
parts_to_merge = node1.query(
"SELECT parts_to_merge FROM system.replication_queue WHERE table='mt1' AND length(parts_to_merge) > 0"
)
if parts_to_merge:
parts_list = list(sorted(ast.literal_eval(parts_to_merge)))
print("Got parts list", parts_list)
if len(parts_list) < 3:
raise Exception("Got too small parts list {}".format(parts_list))
break
time.sleep(1)
victim_part_from_the_middle = random.choice(parts_list[1:-1])
print("Will corrupt part", victim_part_from_the_middle)
remove_part_from_disk(node1, "mt1", victim_part_from_the_middle)
# another way to detect broken parts
node1.query("CHECK TABLE mt1")
node2.query("SYSTEM START REPLICATION QUEUES")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt1")
print("result: ", res)
print("error: ", res)
for i in range(10):
result = node2.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node2.query(
"SELECT * FROM system.replication_queue FORMAT Vertical"
)
assert node1.contains_in_log(
"Created empty part"
), "Seems like empty part {} is not created or log message changed".format(
victim_part_from_the_middle
)
node1.query("SYSTEM STOP MERGES mt1")
node2.query("SYSTEM STOP REPLICATION QUEUES")
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt1", "4")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
for i in range(5):
node1.query(f"INSERT INTO mt1 VALUES ({i})")
node1.query("SYSTEM START MERGES mt1")
for i in range(20):
parts_to_merge = node1.query(
"SELECT parts_to_merge FROM system.replication_queue WHERE table='mt1' AND length(parts_to_merge) > 0"
)
if parts_to_merge:
parts_list = list(sorted(ast.literal_eval(parts_to_merge)))
print("Got parts list", parts_list)
if len(parts_list) < 3:
raise Exception("Got too small parts list {}".format(parts_list))
break
time.sleep(1)
victim_part_from_the_middle = random.choice(parts_list[1:-1])
print("Will corrupt part", victim_part_from_the_middle)
remove_part_from_disk(node1, "mt1", victim_part_from_the_middle)
# another way to detect broken parts
node1.query("CHECK TABLE mt1")
node2.query("SYSTEM START REPLICATION QUEUES")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt1")
print("result: ", res)
print("error: ", res)
for i in range(10):
result = node2.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node2.query(
"SELECT * FROM system.replication_queue FORMAT Vertical"
)
assert node1.contains_in_log(
"Created empty part"
), "Seems like empty part {} is not created or log message changed".format(
victim_part_from_the_middle
)
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt1", "4")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("SYSTEM START MERGES mt1")
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt1", "4")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("DROP TABLE IF EXISTS mt1 SYNC")
node2.query("DROP TABLE IF EXISTS mt1 SYNC")
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt1", "4")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
finally:
node1.query("DROP TABLE IF EXISTS mt1 SYNC")
node2.query("DROP TABLE IF EXISTS mt1 SYNC")
def test_lost_part_mutation(start_cluster):
node1.query("DROP TABLE IF EXISTS mt2 SYNC")
node2.query("DROP TABLE IF EXISTS mt2 SYNC")
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt2 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t2', '{node.name}') ORDER BY tuple() "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
try:
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt2 (id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t2', '{node.name}') ORDER BY tuple() "
"SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
)
node1.query("SYSTEM STOP MERGES mt2")
node2.query("SYSTEM STOP REPLICATION QUEUES")
for i in range(2):
node1.query(f"INSERT INTO mt2 VALUES ({i})")
node1.query(
"ALTER TABLE mt2 UPDATE id = 777 WHERE 1", settings={"mutations_sync": "0"}
)
node1.query("SYSTEM STOP MERGES mt2")
node2.query("SYSTEM STOP REPLICATION QUEUES")
for i in range(20):
parts_to_mutate = node1.query(
"SELECT count() FROM system.replication_queue WHERE table='mt2'"
)
# two mutations for both replicas
if int(parts_to_mutate) == 4:
break
time.sleep(1)
for i in range(2):
node1.query(f"INSERT INTO mt2 VALUES ({i})")
remove_part_from_disk(node1, "mt2", "all_1_1_0")
node1.query(
"ALTER TABLE mt2 UPDATE id = 777 WHERE 1", settings={"mutations_sync": "0"}
)
# another way to detect broken parts
node1.query("CHECK TABLE mt2")
for i in range(20):
parts_to_mutate = node1.query(
"SELECT count() FROM system.replication_queue WHERE table='mt2'"
)
# two mutations for both replicas
if int(parts_to_mutate) == 4:
break
time.sleep(1)
node1.query("SYSTEM START MERGES mt2")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt2")
print("result: ", res)
print("error: ", res)
remove_part_from_disk(node1, "mt2", "all_1_1_0")
for i in range(10):
result = node1.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node1.query(
"SELECT * FROM system.replication_queue FORMAT Vertical"
)
# another way to detect broken parts
node1.query("CHECK TABLE mt2")
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt2", "1")
assert_eq_with_retry(node1, "SELECT SUM(id) FROM mt2", "777")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("SYSTEM START MERGES mt2")
res, err = node1.query_and_get_answer_with_error("SYSTEM SYNC REPLICA mt2")
print("result: ", res)
print("error: ", res)
node2.query("SYSTEM START REPLICATION QUEUES")
for i in range(10):
result = node1.query("SELECT count() FROM system.replication_queue")
if int(result) == 0:
break
time.sleep(1)
else:
assert False, "Still have something in replication queue:\n" + node1.query(
"SELECT * FROM system.replication_queue FORMAT Vertical"
)
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt2", "1")
assert_eq_with_retry(node1, "SELECT SUM(id) FROM mt2", "777")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
node2.query("SYSTEM START REPLICATION QUEUES")
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt2", "1")
assert_eq_with_retry(node2, "SELECT SUM(id) FROM mt2", "777")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("DROP TABLE IF EXISTS mt2 SYNC")
node2.query("DROP TABLE IF EXISTS mt2 SYNC")
assert_eq_with_retry(node2, "SELECT COUNT() FROM mt2", "1")
assert_eq_with_retry(node2, "SELECT SUM(id) FROM mt2", "777")
assert_eq_with_retry(node2, "SELECT COUNT() FROM system.replication_queue", "0")
finally:
node1.query("DROP TABLE IF EXISTS mt2 SYNC")
node2.query("DROP TABLE IF EXISTS mt2 SYNC")
def test_lost_last_part(start_cluster):
node1.query("DROP TABLE IF EXISTS mt3 SYNC")
node2.query("DROP TABLE IF EXISTS mt3 SYNC")
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt3 (id UInt64, p String) ENGINE ReplicatedMergeTree('/clickhouse/tables/t3', '{node.name}') "
"ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
try:
for node in [node1, node2]:
node.query(
f"CREATE TABLE mt3 (id UInt64, p String) ENGINE ReplicatedMergeTree('/clickhouse/tables/t3', '{node.name}') "
"ORDER BY tuple() PARTITION BY p SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, cleanup_thread_preferred_points_per_iteration=0,"
"merge_selecting_sleep_ms=100, max_merge_selecting_sleep_ms=1000"
)
node1.query("SYSTEM STOP MERGES mt3")
node2.query("SYSTEM STOP REPLICATION QUEUES")
for i in range(1):
node1.query(f"INSERT INTO mt3 VALUES ({i}, 'x')")
# actually not important
node1.query(
"ALTER TABLE mt3 UPDATE id = 777 WHERE 1", settings={"mutations_sync": "0"}
)
node1.query("SYSTEM STOP MERGES mt3")
node2.query("SYSTEM STOP REPLICATION QUEUES")
partition_id = node1.query("select partitionId('x')").strip()
remove_part_from_disk(node1, "mt3", f"{partition_id}_0_0_0")
for i in range(1):
node1.query(f"INSERT INTO mt3 VALUES ({i}, 'x')")
# another way to detect broken parts
node1.query("CHECK TABLE mt3")
# actually not important
node1.query(
"ALTER TABLE mt3 UPDATE id = 777 WHERE 1", settings={"mutations_sync": "0"}
)
node1.query("SYSTEM START MERGES mt3")
partition_id = node1.query("select partitionId('x')").strip()
remove_part_from_disk(node1, "mt3", f"{partition_id}_0_0_0")
for i in range(100):
result = node1.query(
"SELECT count() FROM system.replication_queue WHERE table='mt3'"
)
assert int(result) <= 2, "Have a lot of entries in queue {}".format(
node1.query("SELECT * FROM system.replication_queue FORMAT Vertical")
)
if node1.contains_in_log(
"Cannot create empty part"
) and node1.contains_in_log("DROP/DETACH PARTITION"):
break
if node1.contains_in_log(
"Created empty part 8b8f0fede53df97513a9fb4cb19dc1e4_0_0_0 "
):
break
time.sleep(0.5)
else:
assert False, "Don't have required messages in node1 log"
# another way to detect broken parts
node1.query("CHECK TABLE mt3")
node1.query(f"ALTER TABLE mt3 DROP PARTITION ID '{partition_id}'")
node1.query("SYSTEM START MERGES mt3")
for i in range(10):
result = node1.query(
"SELECT count() FROM system.replication_queue WHERE table='mt3'"
)
assert int(result) <= 2, "Have a lot of entries in queue {}".format(
node1.query("SELECT * FROM system.replication_queue FORMAT Vertical")
)
if node1.contains_in_log("Cannot create empty part") and node1.contains_in_log(
"DROP/DETACH PARTITION"
):
break
if node1.contains_in_log(
"Created empty part 8b8f0fede53df97513a9fb4cb19dc1e4_0_0_0 "
):
break
time.sleep(1)
else:
assert False, "Don't have required messages in node1 log"
node1.query(f"ALTER TABLE mt3 DROP PARTITION ID '{partition_id}'")
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt3", "0")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
node1.query("DROP TABLE IF EXISTS mt3 SYNC")
node2.query("DROP TABLE IF EXISTS mt3 SYNC")
assert_eq_with_retry(node1, "SELECT COUNT() FROM mt3", "0")
assert_eq_with_retry(node1, "SELECT COUNT() FROM system.replication_queue", "0")
finally:
node1.query("DROP TABLE IF EXISTS mt3 SYNC")
node2.query("DROP TABLE IF EXISTS mt3 SYNC")

View File

@ -85,6 +85,32 @@ def test_read_write_storage_with_globs(started_cluster):
assert "in readonly mode" in str(ex)
def test_storage_with_multidirectory_glob(started_cluster):
hdfs_api = started_cluster.hdfs_api
for i in ["1", "2"]:
hdfs_api.write_data(
f"/multiglob/p{i}/path{i}/postfix/data{i}", f"File{i}\t{i}{i}\n"
)
assert (
hdfs_api.read_data(f"/multiglob/p{i}/path{i}/postfix/data{i}")
== f"File{i}\t{i}{i}\n"
)
r = node1.query(
"SELECT * FROM hdfs('hdfs://hdfs1:9000/multiglob/{p1/path1,p2/path2}/postfix/data{1,2}', TSV)"
)
assert (r == f"File1\t11\nFile2\t22\n") or (r == f"File2\t22\nFile1\t11\n")
try:
node1.query(
"SELECT * FROM hdfs('hdfs://hdfs1:9000/multiglob/{p4/path1,p2/path3}/postfix/data{1,2}.nonexist', TSV)"
)
assert False, "Exception have to be thrown"
except Exception as ex:
print(ex)
assert "no files" in str(ex)
def test_read_write_table(started_cluster):
hdfs_api = started_cluster.hdfs_api

View File

@ -762,7 +762,7 @@ def test_kafka_formats(kafka_cluster):
),
],
"extra_settings": ", format_avro_schema_registry_url='http://{}:{}'".format(
kafka_cluster.schema_registry_host, 8081
kafka_cluster.schema_registry_host, kafka_cluster.schema_registry_port
),
"supports_empty_value": True,
},
@ -4339,7 +4339,7 @@ def test_row_based_formats(kafka_cluster):
f"""
DROP TABLE IF EXISTS test.view;
DROP TABLE IF EXISTS test.kafka;
CREATE TABLE test.kafka (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
@ -4347,10 +4347,10 @@ def test_row_based_formats(kafka_cluster):
kafka_group_name = '{format_name}',
kafka_format = '{format_name}',
kafka_max_rows_per_message = 5;
CREATE MATERIALIZED VIEW test.view Engine=Log AS
SELECT key, value FROM test.kafka;
INSERT INTO test.kafka SELECT number * 10 as key, number * 100 as value FROM numbers({num_rows});
"""
)
@ -4459,17 +4459,17 @@ def test_block_based_formats_2(kafka_cluster):
f"""
DROP TABLE IF EXISTS test.view;
DROP TABLE IF EXISTS test.kafka;
CREATE TABLE test.kafka (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{format_name}',
kafka_group_name = '{format_name}',
kafka_format = '{format_name}';
CREATE MATERIALIZED VIEW test.view Engine=Log AS
SELECT key, value FROM test.kafka;
INSERT INTO test.kafka SELECT number * 10 as key, number * 100 as value FROM numbers({num_rows}) settings max_block_size=12, optimize_trivial_insert_select=0;
"""
)

View File

@ -0,0 +1,57 @@
# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
import pytest
import time
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance(
"node",
stay_alive=True,
)
@pytest.fixture(scope="module")
def start_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def test_tmp_data_no_leftovers(start_cluster):
q = node.get_query_request
settings = {
"max_bytes_before_external_group_by": "10K",
"max_bytes_before_external_sort": "10K",
"join_algorithm": "grace_hash",
"max_bytes_in_join": "10K",
"grace_hash_join_initial_buckets": "16",
}
# Run some queries in the background to generate temporary data
q(
"SELECT ignore(*) FROM numbers(10 * 1024 * 1024) ORDER BY sipHash64(number)",
settings=settings,
)
q("SELECT * FROM system.numbers GROUP BY ALL", settings=settings)
q(
"SELECT * FROM system.numbers as t1 JOIN system.numbers as t2 USING (number)",
settings=settings,
)
# Wait a bit to make sure the temporary data is written to disk
time.sleep(5)
# Hard restart the node
node.restart_clickhouse(kill=True)
path_to_data = "/var/lib/clickhouse/"
# Check that there are no temporary files left
result = node.exec_in_container(["ls", path_to_data + "tmp/"])
assert result == ""

View File

@ -36,7 +36,7 @@ ${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA kill_mutation_r1"
${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA kill_mutation_r2"
# Should be empty, but in case of problems we will see some diagnostics
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.replication_queue WHERE table like 'kill_mutation_r%'"
${CLICKHOUSE_CLIENT} --query="SELECT * FROM system.replication_queue WHERE database = '$CLICKHOUSE_DATABASE' AND table like 'kill_mutation_r%'"
${CLICKHOUSE_CLIENT} --query="ALTER TABLE kill_mutation_r1 DELETE WHERE toUInt32(s) = 1"
@ -57,6 +57,14 @@ $CLICKHOUSE_CLIENT --query="SELECT count() FROM system.mutations WHERE database
${CLICKHOUSE_CLIENT} --query="KILL MUTATION WHERE database = '$CLICKHOUSE_DATABASE' AND table = 'kill_mutation_r1' AND mutation_id = '0000000001'"
# Wait for the 1st mutation to be actually killed and the 2nd to finish
query_result=$($CLICKHOUSE_CLIENT --query="$check_query1" 2>&1)
while [ "$query_result" != "0" ]
do
query_result=$($CLICKHOUSE_CLIENT --query="$check_query1" 2>&1)
sleep 0.5
done
${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA kill_mutation_r1"
${CLICKHOUSE_CLIENT} --query="SYSTEM SYNC REPLICA kill_mutation_r2"

View File

@ -33,3 +33,4 @@
1 3 1 4
2 1 2 3
2 2 2 3
1 2 1 2

View File

@ -23,5 +23,10 @@ SELECT count() FROM A ASOF JOIN B ON A.a == B.b AND A.t != B.t; -- { serverError
SELECT A.a, A.t, B.b, B.t FROM A ASOF JOIN B ON A.a == B.b AND A.t < B.t OR A.a == B.b + 1 ORDER BY (A.a, A.t); -- { serverError 48 }
SELECT A.a, A.t, B.b, B.t FROM A
ASOF INNER JOIN (SELECT * FROM B UNION ALL SELECT 1, 3) AS B ON B.t <= A.t AND A.a == B.b
WHERE B.t != 3 ORDER BY (A.a, A.t)
;
DROP TABLE A;
DROP TABLE B;

View File

@ -138,8 +138,13 @@ while true ; do
done
for i in $(seq $REPLICAS); do
$CLICKHOUSE_CLIENT --query "SYSTEM SYNC REPLICA concurrent_mutate_mt_$i"
$CLICKHOUSE_CLIENT --query "CHECK TABLE concurrent_mutate_mt_$i" &> /dev/null # if we will remove something the output of select will be wrong
$CLICKHOUSE_CLIENT --query "SELECT SUM(toUInt64(value1)) > $INITIAL_SUM FROM concurrent_mutate_mt_$i"
$CLICKHOUSE_CLIENT --query "SELECT COUNT() FROM system.mutations WHERE table='concurrent_mutate_mt_$i' and is_done=0" # all mutations have to be done
$CLICKHOUSE_CLIENT --query "SELECT * FROM system.mutations WHERE table='concurrent_mutate_mt_$i' and is_done=0" # for verbose output
done
for i in $(seq $REPLICAS); do
$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS concurrent_mutate_mt_$i"
done

View File

@ -0,0 +1,12 @@
select 1
\N 1232 Johny
select 2
\N 1232 Johny
select 3
\N 1232 Johny
select 4
\N 1232 Johny
select 5
\N 1232 Johny
select 6
\N 1232 Johny

View File

@ -0,0 +1,58 @@
drop table if EXISTS l;
drop table if EXISTS r;
CREATE TABLE l (luid Nullable(Int16), name String)
ENGINE=MergeTree order by luid settings allow_nullable_key=1 as
select * from VALUES ((1231, 'John'),(6666, 'Ksenia'),(Null, '---'));
CREATE TABLE r (ruid Nullable(Int16), name String)
ENGINE=MergeTree order by ruid settings allow_nullable_key=1 as
select * from VALUES ((1231, 'John'),(1232, 'Johny'));
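-- Each SELECT below is expected to return exactly one row: the unmatched
-- right-side row (1232, 'Johny') with the left columns filled with defaults
-- (NULL luid, empty name); wrapping the join in a subquery or adding LIMIT
-- must not change that (see the .reference output above).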
select 'select 1';
SELECT * FROM l full outer join r on l.luid = r.ruid
where luid is null
and ruid is not null;
select 'select 2';
select * from (
SELECT * FROM l full outer join r on l.luid = r.ruid)
where luid is null
and ruid is not null;
select 'select 3';
select * from (
SELECT * FROM l full outer join r on l.luid = r.ruid
limit 100000000)
where luid is null
and ruid is not null;
drop table l;
drop table r;
CREATE TABLE l (luid Nullable(Int16), name String) ENGINE=MergeTree order by tuple() as
select * from VALUES ((1231, 'John'),(6666, 'Ksenia'),(Null, '---'));
CREATE TABLE r (ruid Nullable(Int16), name String) ENGINE=MergeTree order by tuple() as
select * from VALUES ((1231, 'John'),(1232, 'Johny'));
select 'select 4';
SELECT * FROM l full outer join r on l.luid = r.ruid
where luid is null
and ruid is not null;
select 'select 5';
select * from (
SELECT * FROM l full outer join r on l.luid = r.ruid)
where luid is null
and ruid is not null;
select 'select 6';
select * from (
SELECT * FROM l full outer join r on l.luid = r.ruid
limit 100000000)
where luid is null
and ruid is not null;
drop table l;
drop table r;

View File

@ -6,7 +6,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CUR_DIR"/../shell_config.sh
${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata"
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata (data JSON) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_object_type 1
cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata FORMAT JSONAsObject"

View File

@ -9,9 +9,9 @@ ${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata_2"
${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata_2_string"
${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS ghdata_2_from_string"
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2 (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2_string (data String) ENGINE = MergeTree ORDER BY tuple()"
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2_from_string (data JSON) ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2 (data JSON) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_object_type 1
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2_string (data String) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'"
${CLICKHOUSE_CLIENT} -q "CREATE TABLE ghdata_2_from_string (data JSON) ENGINE = MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'" --allow_experimental_object_type 1
cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata_2 FORMAT JSONAsObject"
cat $CUR_DIR/data_json/ghdata_sample.json | ${CLICKHOUSE_CLIENT} -q "INSERT INTO ghdata_2_string FORMAT JSONAsString"

Some files were not shown because too many files have changed in this diff.