Merge remote-tracking branch 'origin' into integration-2

Yatsishin Ilya 2021-03-09 10:33:15 +03:00
commit 7bfa4bb64b
125 changed files with 3990 additions and 585 deletions

View File

@ -1,9 +1,9 @@
# This strings autochanged from release_lib.sh:
SET(VERSION_REVISION 54448)
SET(VERSION_REVISION 54449)
SET(VERSION_MAJOR 21)
SET(VERSION_MINOR 3)
SET(VERSION_MINOR 4)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH ef72ba7349f230321750c13ee63b49a11a7c0adc)
SET(VERSION_DESCRIBE v21.3.1.1-prestable)
SET(VERSION_STRING 21.3.1.1)
SET(VERSION_GITHASH af2135ef9dc72f16fa4f229b731262c3f0a8bbdc)
SET(VERSION_DESCRIBE v21.4.1.1-prestable)
SET(VERSION_STRING 21.4.1.1)
# end of autochange

debian/changelog vendored
View File

@ -1,5 +1,5 @@
clickhouse (21.3.1.1) unstable; urgency=low
clickhouse (21.4.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Mon, 01 Feb 2021 12:50:53 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Sat, 06 Mar 2021 14:43:27 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
ARG gosu_ver=1.10
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -4,4 +4,7 @@
<!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
<users_config>users.xml</users_config>
</yandex>
<!-- Path to directory where users created by SQL commands are stored. -->
<access_control_path>access/</access_control_path>
</yandex>

View File

@ -31,6 +31,12 @@ Also it accept the following settings:
- `fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to asynchronous inserts on Distributed table (after insert, after sending the data to shard, etc).
- `bytes_to_throw_insert` - if more than this number of compressed bytes is pending for asynchronous INSERT, an exception is thrown. 0 - do not throw. Default 0.
- `bytes_to_delay_insert` - if more than this number of compressed bytes is pending for asynchronous INSERT, the query is delayed. 0 - do not delay. Default 0.
- `max_delay_to_insert` - maximum delay, in seconds, of inserting data into a Distributed table when there are many pending bytes for asynchronous sending. Default 60.
!!! note "Note"
**Durability settings** (`fsync_...`):
@ -39,6 +45,12 @@ Also it accept the following settings:
- May significantly decrease the inserts' performance
- Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings`
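Those MergeTree-side durability settings can be inspected directly; a minimal sketch:
``` sql
SELECT name, value
FROM system.merge_tree_settings
WHERE name LIKE '%fsync%';
```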
For **Insert limit settings** (`..._insert`) see also:
- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting
- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting
- `bytes_to_throw_insert` is handled before `bytes_to_delay_insert`, so you should not set it to a value less than `bytes_to_delay_insert`
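A minimal sketch of how these limits might be attached to a Distributed table (the cluster, database, and table names here are hypothetical):
``` sql
CREATE TABLE dist_hits AS hits
ENGINE = Distributed(my_cluster, default, hits, rand())
SETTINGS
    bytes_to_throw_insert = 10000000000,
    bytes_to_delay_insert = 1000000000,
    max_delay_to_insert = 60;
```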
Example:
``` sql

View File

@ -7,9 +7,9 @@ toc_title: Distinctive Features
## True Column-Oriented Database Management System {#true-column-oriented-dbms}
In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. As an example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, otherwise this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you would get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second.
It's also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server.

View File

@ -5,74 +5,40 @@ toc_title: OpenTelemetry Support
# [experimental] OpenTelemetry Support
[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting
traces and metrics from distributed application. ClickHouse has some support
for OpenTelemetry.
[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from a distributed application. ClickHouse has some support for OpenTelemetry.
!!! warning "Warning"
This is an experimental feature that will change in backwards-incompatible ways in the future releases.
This is an experimental feature that will change in backwards-incompatible ways in future releases.
## Supplying Trace Context to ClickHouse
ClickHouse accepts trace context HTTP headers, as described by
the [W3C recommendation](https://www.w3.org/TR/trace-context/).
It also accepts trace context over native protocol that is used for
communication between ClickHouse servers or between the client and server.
For manual testing, trace context headers conforming to the Trace Context
recommendation can be supplied to `clickhouse-client` using
`--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no parent trace context is supplied, ClickHouse can start a new trace, with
probability controlled by the `opentelemetry_start_trace_probability` setting.
ClickHouse accepts trace context HTTP headers, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts trace context over a native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, trace context headers conforming to the Trace Context recommendation can be supplied to `clickhouse-client` using `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no parent trace context is supplied, ClickHouse can start a new trace, with probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting.
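For a quick test, a hedged sketch of forcing tracing for the current session via that setting (assuming it can be changed at the session level):
``` sql
SET opentelemetry_start_trace_probability = 1;
SELECT 1; -- this query should now produce spans in system.opentelemetry_span_log
```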
## Propagating the Trace Context
The trace context is propagated to downstream services in the following cases:
* Queries to remote ClickHouse servers, such as when using `Distributed` table
engine.
* `URL` table function. Trace context information is sent in HTTP headers.
* Queries to remote ClickHouse servers, such as when using [Distributed](../engines/table-engines/special/distributed.md) table engine.
* [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers.
## Tracing the ClickHouse Itself
ClickHouse creates _trace spans_ for each query and some of the query execution
stages, such as query planning or distributed queries.
ClickHouse creates `trace spans` for each query and some of the query execution stages, such as query planning or distributed queries.
To be useful, the tracing information has to be exported to a monitoring system
that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids
a dependency on a particular monitoring system, instead only providing the
tracing data through a system table. OpenTelemetry trace span information
[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span)
is stored in the `system.opentelemetry_span_log` table.
To be useful, the tracing information has to be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse avoids a dependency on a particular monitoring system, instead only providing the tracing data through a system table. OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table.
The table must be enabled in the server configuration, see the `opentelemetry_span_log`
element in the default config file `config.xml`. It is enabled by default.
The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default.
The table has the following columns:
- `trace_id`
- `span_id`
- `parent_span_id`
- `operation_name`
- `start_time`
- `finish_time`
- `finish_date`
- `attribute.name`
- `attribute.values`
The tags or attributes are saved as two parallel arrays, containing the keys
and values. Use `ARRAY JOIN` to work with them.
The tags or attributes are saved as two parallel arrays, containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them.
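A hedged sketch of that pattern, using the column names listed above (exact names may differ between versions):
``` sql
SELECT
    operation_name,
    attr_key,
    attr_value
FROM system.opentelemetry_span_log
ARRAY JOIN
    `attribute.name` AS attr_key,
    `attribute.values` AS attr_value
LIMIT 10;
```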
## Integration with monitoring systems
At the moment, there is no ready tool that can export the tracing data from
ClickHouse to a monitoring system.
At the moment, there is no ready tool that can export the tracing data from ClickHouse to a monitoring system.
For testing, it is possible to setup the export using a materialized view with the URL engine over the `system.opentelemetry_span_log` table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format:
For testing, it is possible to set up the export using a materialized view with the [URL](../engines/table-engines/special/url.md) engine over the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format:
```sql
CREATE MATERIALIZED VIEW default.zipkin_spans
@ -94,3 +60,5 @@ FROM system.opentelemetry_span_log
```
In case of any errors, the part of the log data for which the error has occurred will be silently lost. Check the server log for error messages if the data does not arrive.
[Original article](https://clickhouse.tech/docs/en/operations/opentelemetry/) <!--hide-->

View File

@ -17,7 +17,6 @@ The `system.part_log` table contains the following columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.

View File

@ -19,15 +19,17 @@ Resolution: 1 second.
## Usage Remarks {#usage-remarks}
The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. Additionally, the `DateTime` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (2020-01-01 05:00:01). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata.
A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones).
The `tzdata` package, containing [IANA Time Zone Database](https://www.iana.org/time-zones), should be installed in the system. Use the `timedatectl list-timezones` command to list timezones known by a local system.
The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. The time zone affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (2020-01-01 05:00:01).
You can explicitly set a time zone for `DateTime`-type columns when creating a table. If the time zone isn't set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start.
A timezone-agnostic Unix timestamp is stored in tables, and the time zone is used to transform it to text format and back during data import/export, or to make calendar calculations on the values (for example, the `toDate` and `toHour` functions). The time zone is not stored in the rows of the table (or in the result set), but in the column metadata.
A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones) and also can be queried by `SELECT * FROM system.time_zones`.
You can explicitly set a time zone for `DateTime`-type columns when creating a table. Example: `DateTime('UTC')`. If the time zone isn't set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start.
The [clickhouse-client](../../interfaces/cli.md) applies the server time zone by default if a time zone isn't explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter.
ClickHouse outputs values depending on the value of the [date\_time\_output\_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
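A small sketch of overriding the default output format with `formatDateTime` (the format string here is only an illustration):
``` sql
SELECT
    now() AS server_time,
    formatDateTime(now(), '%d.%m.%Y %H:%M:%S') AS formatted;
```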
When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) setting.
@ -114,6 +116,24 @@ FROM dt
└─────────────────────┴─────────────────────┘
```
As timezone conversion only changes the metadata, the operation has no computation cost.
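As a quick illustration of that point (a sketch; the timestamp is arbitrary), converting the time zone does not change the stored Unix timestamp:
``` sql
SELECT
    toDateTime('2021-03-09 10:33:15', 'UTC') AS time_utc,
    toTimeZone(time_utc, 'Europe/Moscow') AS time_msk,
    toUnixTimestamp(time_utc) = toUnixTimestamp(time_msk) AS same_timestamp;
```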
## Limitations on timezones support
Some timezones may not be supported completely. There are a few cases:
If the offset from UTC is not a multiple of 15 minutes, the calculation of hours and minutes can be incorrect. For example, the time zone in Monrovia, Liberia had an offset of UTC -0:44:30 before 7 Jan 1972. If you do calculations on historical times in the Monrovia time zone, the time processing functions may give incorrect results. The results after 7 Jan 1972 will nevertheless be correct.
If a time transition (due to daylight saving time or for other reasons) was performed at a point in time that is not a multiple of 15 minutes, you can also get incorrect results on that specific day.
Non-monotonic calendar dates. For example, in Happy Valley - Goose Bay, the time was moved one hour backwards at 00:01:00 on 7 Nov 2010 (one minute after midnight). So after 6 Nov ended, people observed a whole minute of 7 Nov, then the time was changed back to 23:01 on 6 Nov, and after another 59 minutes 7 Nov started again. ClickHouse does not (yet) support this kind of fun. During these days the results of time processing functions may be slightly incorrect.
A similar issue exists for the Casey Antarctic station in 2010. They moved the time three hours back at 02:00 on 5 Mar. If you are working at an Antarctic station, don't be afraid to use ClickHouse. Just make sure you set the time zone to UTC or be aware of the inaccuracies.
Time shifts of multiple days. Some Pacific islands changed their time zone offset from UTC+14 to UTC-12. That's alright, but some inaccuracies may be present if you do calculations with their time zone for historical points in time around the days of the conversion.
## See Also {#see-also}
- [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md)

View File

@ -538,7 +538,7 @@ Result:
## date\_sub {#date_sub}
Subtracts a time/date interval from the provided date.
Subtracts the time interval or date interval from the provided date or date with time.
**Syntax**
@ -546,19 +546,30 @@ Subtracts a time/date interval from the provided date.
date_sub(unit, value, date)
```
Aliases: `dateSub`, `DATE_SUB`.
Aliases: `dateSub`, `DATE_SUB`.
**Arguments**
- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md).
Possible values:
Supported values: second, minute, hour, day, week, month, quarter, year.
- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md)
- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md) to subtract value from.
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — The date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
Returns Date or DateTime with `value` expressed in `unit` subtracted from `date`.
Returns the date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`.
Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Example**

View File

@ -121,24 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
**Syntax**
``` sql
SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0);
IPv6StringToNum(string)
```
``` text
┌─cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0)─┐
│ ::ffff:127.0.0.1 │
└─────────────────────────────────────────────┘
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- IPv6 address in binary format.
Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Example**
Query:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Result:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**See also**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
@ -232,43 +261,56 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../sql-reference/data-types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned.
**Syntax**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md)
**Returned value**
- IP address.
Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Examples**
Query:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Result:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Query:
``` sql
SELECT toIPv6('127.0.0.1')
SELECT toIPv6('127.0.0.1');
```
Result:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │

View File

@ -728,7 +728,7 @@ The result of the function depends on the affected data blocks and the order of
It can reach the neighbor rows only inside the currently processed data block.
The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user.
To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery.
To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
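A minimal sketch of that pattern (`numbers` is used here only for illustration):
``` sql
SELECT
    number,
    neighbor(number, 1) AS next_value
FROM
(
    SELECT number
    FROM numbers(10)
    ORDER BY number
);
```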
**Arguments**
@ -834,12 +834,12 @@ Calculates the difference between successive row values in the data block.
Returns 0 for the first row and the difference from the previous row for each subsequent row.
!!! warning "Warning"
It can reach the previos row only inside the currently processed data block.
It can reach the previous row only inside the currently processed data block.
The result of the function depends on the affected data blocks and the order of data in the block.
The rows order used during the calculation of `runningDifference` can differ from the order of rows returned to the user.
To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery.
To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
Example:

View File

@ -17,7 +17,7 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin
- If `OPTIMIZE` doesn't perform a merge for any reason, it doesn't notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](../../sql-reference/statements/alter/index.md#alter-how-to-specify-part-expr).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. The merge is also forced even if concurrent merges are in progress (see the sketch after this list).
- If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine.
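A sketch of the combinations described above (the table name and partition expression are hypothetical):
``` sql
-- Force a merge even if the data is already in one part, and deduplicate identical rows.
OPTIMIZE TABLE hits FINAL DEDUPLICATE;

-- Optimize only one partition of a table partitioned by month.
OPTIMIZE TABLE hits PARTITION 202103 FINAL;
```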
!!! warning "Warning"

View File

@ -0,0 +1,37 @@
---
toc_priority: 62
toc_title: OpenTelemetry Support
---
# [experimental] OpenTelemetry Support
ClickHouse supports [OpenTelemetry](https://opentelemetry.io/), an open standard for collecting traces and metrics from a distributed application.
!!! warning "Warning"
    Support for the standard is experimental and will change over time.
## Supplying Trace Context to ClickHouse
ClickHouse accepts trace context information via the `tracecontext` HTTP header, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts context information over the native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, a standard `tracecontext` header containing the trace context can be passed to `clickhouse-client` via the `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no incoming trace context is supplied, ClickHouse can start a trace with a probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting.
## Propagating the Trace Context
The trace context is propagated to downstream services in the following cases:
* Queries to remote ClickHouse servers, for example when using the [Distributed](../engines/table-engines/special/distributed.md) table engine.
* The [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers.
## How ClickHouse Performs Tracing
ClickHouse creates `trace spans` for each query and for some query execution stages, such as query planning or distributed queries.
To analyze the tracing information, it should be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse does not depend on a particular monitoring system; instead, it provides the tracing data only through a system table. The OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) system table.
The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the `config.xml` configuration file. It is enabled by default.
The tags or attributes are saved as two parallel arrays containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them.
[Original article](https://clickhouse.tech/docs/ru/operations/opentelemetry/) <!--hide-->

View File

@ -16,6 +16,7 @@
- `MOVE_PART` — Moving a data part between disks.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.
@ -47,6 +48,7 @@ query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31
event_type: NewPart
event_date: 2021-02-02
event_time: 2021-02-02 11:14:28
event_time_microseconds: 2021-02-02 11:14:28.861919
duration_ms: 35
database: default
table: log_mt_2

View File

@ -23,8 +23,6 @@ SELECT
└─────────────────────┴────────────┴────────────┴─────────────────────┘
```
Only time zones that differ from UTC by an integer number of hours are supported.
## toTimeZone {#totimezone}
Converts a date or date with time to the specified time zone. The time zone is an attribute of the Date/DateTime types; the internal value (number of seconds) of the table field or result column does not change, only the type of the field changes, and its text representation changes accordingly.
@ -446,9 +444,9 @@ date_trunc(unit, value[, timezone])
Alias: `dateTrunc`.
**Parameters**
**Arguments**
- `unit` — The name of the part of the date or time. [String](../syntax.md#syntax-string-literal).
- `unit` — the name of the part of the date or time. [String Literal](../syntax.md#syntax-string-literal).
Possible values:
- `second`
@ -460,8 +458,8 @@ date_trunc(unit, value[, timezone])
- `quarter`
- `year`
- `value`Дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [Часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
- `value`дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
**Возвращаемое значение**
@ -618,6 +616,57 @@ SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-0
└────────────────────────────────────────────────────────────────────────────────────────┘
```
## date\_sub {#date_sub}
Subtracts a time interval or date interval from the specified date or date with time.
**Syntax**
``` sql
date_sub(unit, value, date)
```
Aliases: `dateSub`, `DATE_SUB`.
**Arguments**
- `unit` — the unit of time in which the interval to subtract is specified. [String](../../sql-reference/data-types/string.md).
Possible values:
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — the value of the interval to subtract. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — the date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
Returns the date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`.
Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Example**
Query:
``` sql
SELECT date_sub(YEAR, 3, toDate('2018-01-01'));
```
Result:
``` text
┌─minus(toDate('2018-01-01'), toIntervalYear(3))─┐
│ 2015-01-01 │
└────────────────────────────────────────────────┘
```
## timeSlot {#timeslot}
Rounds the time down to the half hour.

View File

@ -121,12 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address is passed in an invalid format, it returns a string of null bytes.
If the IP address is a valid IPv4 address, the function returns its IPv6 equivalent.
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
Alias: `INET6_ATON`.
**Syntax**
``` sql
IPv6StringToNum(string)
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- IPv6 address in binary format.
Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Example**
Query:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Result:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**See also**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
@ -221,38 +262,62 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
An alias of the `IPv6StringToNum()` function, which takes a string form of an IPv6 address and returns a value of the [IPv6](../../sql-reference/functions/ip-address-functions.md) type, binary equal to the value returned by `IPv6StringToNum()`.
Converts a string form of an IPv6 address to the [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. Returns an empty value if the input string is not a valid IP address.
Similar to the [IPv6StringToNum](#ipv6stringtonums) function, which converts an IPv6 address to binary format.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
If the input string contains a valid IPv4 address, the function returns its IPv6 equivalent.
**Syntax**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md)
**Returned value**
- IP address.
Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Examples**
Query:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Result:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Query:
``` sql
SELECT toIPv6('127.0.0.1');
```
Result:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │
└─────────────────────┘
```
## isIPv4String {#isipv4string}
Determines whether a string is an IPv4 address or not. Also returns `0` if `string` is an IPv6 address.
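A tiny sketch of the behavior described above:
``` sql
SELECT
    isIPv4String('127.0.0.1') AS is_v4,
    isIPv4String('::ffff:127.0.0.1') AS not_v4;
```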

View File

@ -659,7 +659,7 @@ SELECT
## neighbor {#neighbor}
The function provides access to the value in the column `column` at offset `offset` relative to the current row. It is a partial implementation of the [window functions](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` and `LAG()`.
The function provides access to the value in the column `column` at offset `offset` relative to the current row. It is a partial implementation of the [window functions](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` and `LAG()`.
**Syntax**
@ -667,7 +667,13 @@ SELECT
neighbor(column, offset[, default_value])
```
The result of the function depends on the affected data blocks and the order of data in the block. If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
The result of the function depends on the affected data blocks and the order of data in the block.
!!! warning "Warning"
The function can access the value of a column in a neighboring row only inside the currently processed data block.
The order of rows used during the calculation of `neighbor` can differ from the order of rows returned to the user.
To prevent that, you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
**Parameters**
@ -772,8 +778,13 @@ FROM numbers(16)
Calculates the difference between successive row values in the data block.
Returns 0 for the first row and the difference from the previous row for each subsequent row.
!!! warning "Warning"
The function can access the value of the previous row only inside the currently processed data block.
The result of the function depends on the affected data blocks and the order of data in the block.
If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
The order of rows used during the calculation of `runningDifference` can differ from the order of rows returned to the user.
To prevent that, you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
Example:

View File

@ -15,11 +15,10 @@ OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION I
- If `OPTIMIZE` does not perform a merge for any reason, ClickHouse does not notify the client about it. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify `PARTITION`, only the specified partition is optimized. [How to set the partition expression](alter/index.md#alter-how-to-specify-part-expr).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. In addition, the merge is forced even if concurrent merges are being performed.
- If you specify `DEDUPLICATE`, completely identical rows are deduplicated (values in all columns are compared); this only makes sense for the MergeTree engine.
!!! warning "Warning"
The `OPTIMIZE` query cannot fix the cause of the "Too many parts" error.
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) <!--hide-->
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) <!--hide-->

View File

@ -102,7 +102,7 @@ USING (equi_column1, ... equi_columnN, asof_column)
- When using a regular `JOIN`, the query is sent to remote servers. Subqueries are run on each of them in order to form the "right" table, and the join is performed with this table. In other words, the "right" table is formed on each server separately.
- When using `GLOBAL ... JOIN`, the initiating server first runs a subquery to calculate the right table. This temporary table is passed to each remote server, and queries are run on them using the transmitted temporary data.
Be careful when using `GLOBAL`. For more information, see the [Distributed subqueries](#select-distributed-subqueries) section.
Be careful when using `GLOBAL`. For more information, see the [Distributed subqueries](../../../sql-reference/operators/in.md#select-distributed-subqueries) section.
## Usage Recommendations {#usage-recommendations}

View File

@ -47,7 +47,7 @@
If ZooKeeper is not set in the config file, you cannot create replicated tables, and any existing replicated tables become read-only.
`SELECT` queries do not require ZooKeeper, and replication does not affect the performance of `SELECT`: queries over replicated tables run as fast as over non-replicated tables. When querying distributed tables, ClickHouse's behavior can be changed with the [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) settings.
`SELECT` queries do not require ZooKeeper, and replication does not affect the performance of `SELECT`: queries over replicated tables run as fast as over non-replicated tables. When querying distributed tables, ClickHouse's behavior can be changed with the [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) settings.
Each `INSERT` statement adds about a dozen entries to ZooKeeper through several transactions. (To be precise, this is per inserted block of data; an INSERT statement contains one block for every `max_insert_block_size = 1048576` rows plus one block for the remainder.) Compared with non-replicated tables, writing to ZooKeeper makes `INSERT` latency slightly longer. But as long as you follow the recommendation of inserting data in batches of no more than one `INSERT` per second, there are no problems. One ZooKeeper cluster can coordinate several hundred `INSERT`s per second for the whole ClickHouse cluster. Data insertion throughput (rows per second) can be as high as for non-replicated tables.

View File

@ -18,7 +18,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...
If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index to process the query.
Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery.
Don't list too many specific constants explicitly (say, millions). If the data set is very large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery, as sketched below.
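A hedged sketch of that approach (table and column names are hypothetical):
``` sql
-- Load the large value set into a temporary table first (for example via external data),
-- then filter with a subquery instead of listing millions of constants inline.
CREATE TEMPORARY TABLE allowed_users (UserID UInt64);
SELECT count() FROM hits WHERE UserID IN (SELECT UserID FROM allowed_users);
```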
The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (as shown in the examples above), or the name of a database table or a SELECT subquery in parentheses.

View File

@ -7,10 +7,12 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromOStream.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/formatAST.h>
#include <Parsers/obfuscateQueries.h>
#include <Parsers/parseQuery.h>
#include <Common/ErrorCodes.h>
#include <Common/TerminalSize.h>
#include <Interpreters/Context.h>
@ -28,6 +30,14 @@
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
namespace DB
{
namespace ErrorCodes
{
extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA;
}
}
int mainEntryClickHouseFormat(int argc, char ** argv)
{
using namespace DB;
@ -128,6 +138,14 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
do
{
ASTPtr res = parseQueryAndMovePosition(parser, pos, end, "query", multiple, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
/// An INSERT query with inline data (INSERT INTO ... VALUES ...) cannot be formatted without losing that data,
/// so throw an exception early and make the error message more readable.
if (const auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
{
throw Exception(
"Can't format ASTInsertQuery with data, since data will be lost",
DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA);
}
if (!quiet)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
@ -137,6 +155,26 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
std::cout << "\n;\n";
std::cout << std::endl;
}
do
{
/// skip whitespace to avoid throwing an exception after the last query
while (pos != end && std::isspace(*pos))
++pos;
/// skip a comment after the last query so that no exception is thrown
if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-')
{
pos += 2;
/// skip until the end of the line
while (pos != end && *pos != '\n')
++pos;
}
/// otherwise the remaining input is the next query, so stop skipping and parse it
else
break;
} while (pos != end);
} while (multiple && pos != end);
}
}

View File

@ -0,0 +1,157 @@
#pragma once
#include <cstdint>
#include <vector>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <common/defines.h>
#include <Common/MoveOrCopyIfThrow.h>
/** Pool of limited size for objects that cannot be used from different threads simultaneously.
* The main use case is to have a fixed number of objects that can be reused in different threads during their lifetime
* and have to be initialized on demand.
* The two main properties of the pool are the allocated objects size and the borrowed objects size.
* The allocated objects size is the number of objects currently allocated by the pool.
* The borrowed objects size is the number of objects currently borrowed by clients.
* If max_size == 0 then the pool has unlimited size and objects will be allocated without limit.
*
* The pool provides the following strategy for borrowing an object:
* 1. If the pool has objects that can be borrowed, increase the borrowed objects size and return one of them.
* 2. If allocatedObjectsSize is lower than the max objects size, or the pool has unlimited size,
* allocate a new object, increase the borrowed objects size and return it.
* 3. If the pool is full, wait on a condition variable (with or without a timeout) until some object
* is returned to the pool.
*/
template <typename T>
class BorrowedObjectPool final
{
public:
explicit BorrowedObjectPool(size_t max_size_) : max_size(max_size_) {}
/// Borrow an object from the pool. If the pool is full and all objects were borrowed,
/// the calling thread will wait until some object is returned to the pool.
template <typename FactoryFunc>
void borrowObject(T & dest, FactoryFunc && func)
{
std::unique_lock<std::mutex> lock(objects_mutex);
if (!objects.empty())
{
dest = borrowFromObjects(lock);
return;
}
bool has_unlimited_size = (max_size == 0);
if (unlikely(has_unlimited_size) || allocated_objects_size < max_size)
{
dest = allocateObjectForBorrowing(lock, std::forward<FactoryFunc>(func));
return;
}
condition_variable.wait(lock, [this] { return !objects.empty(); });
dest = borrowFromObjects(lock);
}
/// Same as borrowObject function, but wait with timeout.
/// Returns true if object was borrowed during timeout.
template <typename FactoryFunc>
bool tryBorrowObject(T & dest, FactoryFunc && func, size_t timeout_in_milliseconds = 0)
{
std::unique_lock<std::mutex> lock(objects_mutex);
if (!objects.empty())
{
dest = borrowFromObjects(lock);
return true;
}
bool has_unlimited_size = (max_size == 0);
if (unlikely(has_unlimited_size) || allocated_objects_size < max_size)
{
dest = allocateObjectForBorrowing(lock, std::forward<FactoryFunc>(func));
return true;
}
bool wait_result = condition_variable.wait_for(lock, std::chrono::milliseconds(timeout_in_milliseconds), [this] { return !objects.empty(); });
if (wait_result)
dest = borrowFromObjects(lock);
return wait_result;
}
/// Return object into pool. Client must return same object that was borrowed.
inline void returnObject(T && object_to_return)
{
std::unique_lock<std::mutex> lck(objects_mutex);
objects.emplace_back(std::move(object_to_return));
--borrowed_objects_size;
condition_variable.notify_one();
}
/// Max pool size
inline size_t maxSize() const
{
return max_size;
}
/// Allocated objects size by the pool. If allocatedObjectsSize == maxSize then pool is full.
inline size_t allocatedObjectsSize() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return allocated_objects_size;
}
/// Returns allocatedObjectsSize == maxSize
inline bool isFull() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return allocated_objects_size == max_size;
}
/// Borrowed objects size. If borrowedObjectsSize == allocatedObjectsSize and the pool is full,
/// then clients will wait in borrowObject until an object is returned.
inline size_t borrowedObjectsSize() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return borrowed_objects_size;
}
private:
template <typename FactoryFunc>
inline T allocateObjectForBorrowing(const std::unique_lock<std::mutex> &, FactoryFunc && func)
{
++allocated_objects_size;
++borrowed_objects_size;
return std::forward<FactoryFunc>(func)();
}
inline T borrowFromObjects(const std::unique_lock<std::mutex> &)
{
T dst;
detail::moveOrCopyIfThrow(std::move(objects.back()), dst);
objects.pop_back();
++borrowed_objects_size;
return dst;
}
size_t max_size;
mutable std::mutex objects_mutex;
std::condition_variable condition_variable;
size_t allocated_objects_size = 0;
size_t borrowed_objects_size = 0;
std::vector<T> objects;
};

View File

@ -6,38 +6,7 @@
#include <Poco/Mutex.h>
#include <Poco/Semaphore.h>
#include <common/types.h>
namespace detail
{
template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
struct MoveOrCopyIfThrow;
template <typename T>
struct MoveOrCopyIfThrow<T, true>
{
void operator()(T && src, T & dst) const
{
dst = std::forward<T>(src);
}
};
template <typename T>
struct MoveOrCopyIfThrow<T, false>
{
void operator()(T && src, T & dst) const
{
dst = src;
}
};
template <typename T>
void moveOrCopyIfThrow(T && src, T & dst)
{
MoveOrCopyIfThrow<T>()(std::forward<T>(src), dst);
}
}
#include <Common/MoveOrCopyIfThrow.h>
/** A very simple thread-safe queue of limited size.
* If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty.
@ -53,8 +22,10 @@ private:
Poco::Semaphore empty_count;
public:
ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill), empty_count(max_fill, max_fill) {}
explicit ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill)
, empty_count(max_fill, max_fill)
{}
void push(const T & x)
{

View File

@ -100,6 +100,12 @@ namespace CurrentMetrics
amount -= value;
}
void add(Value value = 1)
{
what->fetch_add(value, std::memory_order_relaxed);
amount += value;
}
/// Subtract value before destructor.
void destroy()
{

View File

@ -540,12 +540,14 @@
M(571, DATABASE_REPLICATION_FAILED) \
M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \
M(573, EPOLL_ERROR) \
M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
M(1001, STD_EXCEPTION) \
M(1002, UNKNOWN_EXCEPTION) \
M(1003, INVALID_SHARD_ID) \
M(1004, INVALID_FORMAT_INSERT_QUERY_WITH_DATA)
/* See END */

View File

@ -458,33 +458,25 @@ ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_
return ExecutionStatus(getCurrentExceptionCode(), msg);
}
ParsingException::ParsingException()
{
Exception::message(Exception::message() + "{}");
}
ParsingException::ParsingException() = default;
ParsingException::ParsingException(const std::string & msg, int code)
: Exception(msg, code)
{
Exception::message(Exception::message() + "{}");
}
ParsingException::ParsingException(int code, const std::string & message)
: Exception(message, code)
{
Exception::message(Exception::message() + "{}");
}
/// We use additional field formatted_message_ to make this method const.
std::string ParsingException::displayText() const
{
try
{
if (line_number_ == -1)
formatted_message_ = fmt::format(message(), "");
formatted_message_ = message();
else
formatted_message_ = fmt::format(message(), fmt::format(": (at row {})\n", line_number_));
formatted_message_ = message() + fmt::format(": (at row {})\n", line_number_);
}
catch (...)
{}

View File

@ -115,9 +115,7 @@ public:
template <typename ...Args>
ParsingException(int code, const std::string & fmt, Args&&... args)
: Exception(fmt::format(fmt, std::forward<Args>(args)...), code)
{
Exception::message(Exception::message() + "{}");
}
{}
std::string displayText() const

View File

@ -0,0 +1,33 @@
#pragma once
#include <common/types.h>
namespace detail
{
template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
struct MoveOrCopyIfThrow;
template <typename T>
struct MoveOrCopyIfThrow<T, true>
{
void operator()(T && src, T & dst) const
{
dst = std::forward<T>(src);
}
};
template <typename T>
struct MoveOrCopyIfThrow<T, false>
{
void operator()(T && src, T & dst) const
{
dst = src;
}
};
template <typename T>
void moveOrCopyIfThrow(T && src, T & dst)
{
MoveOrCopyIfThrow<T>()(std::forward<T>(src), dst);
}
}

View File

@ -68,6 +68,9 @@
M(DelayedInserts, "Number of times the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \
M(RejectedInserts, "Number of times the INSERT of a block to a MergeTree table was rejected with 'Too many parts' exception due to high number of active data parts for partition.") \
M(DelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \
M(DistributedDelayedInserts, "Number of times the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \
M(DistributedRejectedInserts, "Number of times the INSERT of a block to a Distributed table was rejected with 'Too many bytes' exception due to high number of pending bytes.") \
M(DistributedDelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \
M(DuplicatedInsertedBlocks, "Number of times the INSERTed block to a ReplicatedMergeTree table was deduplicated.") \
\
M(ZooKeeperInit, "") \

View File

@ -2,15 +2,18 @@
#include <sys/wait.h>
#include <fcntl.h>
#include <dlfcn.h>
#include <unistd.h>
#include <time.h>
#include <csignal>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <Common/Exception.h>
#include <Common/ShellCommand.h>
#include <Common/PipeFDs.h>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <unistd.h>
#include <csignal>
namespace
{
@ -36,9 +39,9 @@ namespace ErrorCodes
extern const int CANNOT_CREATE_CHILD_PROCESS;
}
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_)
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_)
: pid(pid_)
, terminate_in_destructor(terminate_in_destructor_)
, destructor_strategy(destructor_strategy_)
, in(in_fd_)
, out(out_fd_)
, err(err_fd_)
@ -52,14 +55,24 @@ Poco::Logger * ShellCommand::getLogger()
ShellCommand::~ShellCommand()
{
if (terminate_in_destructor)
if (wait_called)
return;
if (destructor_strategy.terminate_in_destructor)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
size_t try_wait_timeout = destructor_strategy.wait_for_normal_exit_before_termination_seconds;
bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout);
if (!process_terminated_normally)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
}
}
else if (!wait_called)
else
{
try
{
@ -72,6 +85,54 @@ ShellCommand::~ShellCommand()
}
}
bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds)
{
int status = 0;
LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds);
wait_called = true;
struct timespec interval {.tv_sec = 1, .tv_nsec = 0};
in.close();
out.close();
err.close();
if (timeout_in_seconds == 0)
{
/// If there is no timeout before sending the signal, try waitpid once without blocking so we can avoid
/// sending a signal if the process has already terminated normally.
int waitpid_res = waitpid(pid, &status, WNOHANG);
bool process_terminated_normally = (waitpid_res == pid);
return process_terminated_normally;
}
/// If the timeout is positive, call waitpid without blocking in a loop until
/// the process terminates normally or waitpid returns an error.
while (timeout_in_seconds != 0)
{
int waitpid_res = waitpid(pid, &status, WNOHANG);
bool process_terminated_normally = (waitpid_res == pid);
if (process_terminated_normally)
return true;
else if (waitpid_res == 0)
{
--timeout_in_seconds;
nanosleep(&interval, nullptr);
continue;
}
else if (waitpid_res == -1 && errno != EINTR)
return false;
}
return false;
}
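The destructor strategy above reduces to a small pattern: poll waitpid() with WNOHANG once per second for a bounded number of seconds, and only fall back to SIGTERM if the child has not exited on its own. The following standalone sketch (not the ClickHouse class; the sleep command, the 3-second budget and the helper name tryWaitWithTimeout are invented for illustration) shows the same loop in isolation and compiles as-is on a POSIX system.
#include <cerrno>
#include <csignal>
#include <cstdio>
#include <ctime>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/// Returns true if the child exited by itself within timeout_in_seconds.
static bool tryWaitWithTimeout(pid_t pid, size_t timeout_in_seconds)
{
    int status = 0;
    struct timespec interval {.tv_sec = 1, .tv_nsec = 0};   /// poll once per second
    while (timeout_in_seconds != 0)
    {
        pid_t res = waitpid(pid, &status, WNOHANG);
        if (res == pid)
            return true;                    /// reaped: terminated normally
        if (res == -1 && errno != EINTR)
            return false;                   /// waitpid failed
        --timeout_in_seconds;
        nanosleep(&interval, nullptr);
    }
    return false;
}
int main()
{
    pid_t pid = fork();
    if (pid == 0)
    {
        execlp("sleep", "sleep", "10", (char *) nullptr);
        _exit(127);                         /// exec failed
    }
    if (!tryWaitWithTimeout(pid, 3))
    {
        std::printf("child %d still alive, sending SIGTERM\n", (int) pid);
        kill(pid, SIGTERM);
        waitpid(pid, nullptr, 0);           /// reap after termination
    }
    return 0;
}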
void ShellCommand::logCommand(const char * filename, char * const argv[])
{
WriteBufferFromOwnString args;
@ -87,7 +148,10 @@ void ShellCommand::logCommand(const char * filename, char * const argv[])
}
std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor)
const char * filename,
char * const argv[],
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
logCommand(filename, argv);
@ -144,7 +208,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
}
std::unique_ptr<ShellCommand> res(new ShellCommand(
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor));
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor_strategy));
LOG_TRACE(getLogger(), "Started shell command '{}' with pid {}", filename, pid);
return res;
@ -152,7 +216,9 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
std::unique_ptr<ShellCommand> ShellCommand::execute(
const std::string & command, bool pipe_stdin_only, bool terminate_in_destructor)
const std::string & command,
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
/// Arguments in non-constant chunks of memory (as required for `execv`).
/// Moreover, their copying must be done before calling `vfork`, so after `vfork` do a minimum of things.
@ -162,12 +228,14 @@ std::unique_ptr<ShellCommand> ShellCommand::execute(
char * const argv[] = { argv0.data(), argv1.data(), argv2.data(), nullptr };
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor);
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor_strategy);
}
std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor)
const std::string & path,
const std::vector<std::string> & arguments,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
size_t argv_sum_size = path.size() + 1;
for (const auto & arg : arguments)
@ -188,7 +256,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
argv[arguments.size() + 1] = nullptr;
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor);
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor_strategy);
}

View File

@ -23,21 +23,38 @@ namespace DB
* The second difference: it allows working simultaneously with the stdin, stdout and stderr of the running process,
* and also obtaining the return code and completion status.
*/
class ShellCommand
struct ShellCommandDestructorStrategy final
{
explicit ShellCommandDestructorStrategy(bool terminate_in_destructor_, size_t wait_for_normal_exit_before_termination_seconds_ = 0)
: terminate_in_destructor(terminate_in_destructor_)
, wait_for_normal_exit_before_termination_seconds(wait_for_normal_exit_before_termination_seconds_)
{
}
bool terminate_in_destructor;
/// If terminate_in_destructor is true, the command will wait this many seconds for a normal exit before sending SIGTERM to the created process
size_t wait_for_normal_exit_before_termination_seconds = 0;
};
class ShellCommand final
{
private:
pid_t pid;
bool wait_called = false;
bool terminate_in_destructor;
ShellCommandDestructorStrategy destructor_strategy;
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_);
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_);
bool tryWaitProcessWithTimeout(size_t timeout_in_seconds);
static Poco::Logger * getLogger();
/// Print command name and the list of arguments to log. NOTE: No escaping of arguments is performed.
static void logCommand(const char * filename, char * const argv[]);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, ShellCommandDestructorStrategy terminate_in_destructor_strategy);
public:
WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there.
@ -48,11 +65,11 @@ public:
/// Run the command using /bin/sh -c.
/// If terminate_in_destructor is true, send the terminate signal in the destructor and don't wait for the process.
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Run the executable with the specified arguments. `arguments` - without argv[0].
/// If terminate_in_destructor is true, send the terminate signal in the destructor and don't wait for the process.
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself.
void wait();

View File

@ -345,7 +345,7 @@ struct ODBCBridgeMixin
LOG_TRACE(log, "Starting {}", serviceAlias());
return ShellCommand::executeDirect(path.toString(), cmd_args, true);
return ShellCommand::executeDirect(path.toString(), cmd_args, ShellCommandDestructorStrategy(true));
}
};
}

View File

@ -26,7 +26,7 @@ NuKeeperStateManager::NuKeeperStateManager(
const CoordinationSettingsPtr & coordination_settings)
: my_server_id(my_server_id_)
, log_store(nuraft::cs_new<NuKeeperLogStore>(
config.getString(config_prefix + ".log_storage_path"),
config.getString(config_prefix + ".log_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/logs"),
coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync))
, cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{

View File

@ -13,6 +13,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
void formatBlock(BlockOutputStreamPtr & out, const Block & block)
{
out->writePrefix();
@ -88,4 +93,63 @@ Context copyContextAndApplySettings(
return local_context;
}
BlockInputStreamWithAdditionalColumns::BlockInputStreamWithAdditionalColumns(
Block block_to_add_, std::unique_ptr<IBlockInputStream> && stream_)
: block_to_add(std::move(block_to_add_))
, stream(std::move(stream_))
{
}
Block BlockInputStreamWithAdditionalColumns::getHeader() const
{
auto header = stream->getHeader();
if (header)
{
for (Int64 i = static_cast<Int64>(block_to_add.columns() - 1); i >= 0; --i)
header.insert(0, block_to_add.getByPosition(i).cloneEmpty());
}
return header;
}
Block BlockInputStreamWithAdditionalColumns::readImpl()
{
auto block = stream->read();
if (block)
{
auto block_rows = block.rows();
auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows);
if (cut_block.rows() != block_rows)
throw Exception(
"Number of rows in block to add after cut must equal to number of rows in block from inner stream",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
for (Int64 i = static_cast<Int64>(cut_block.columns() - 1); i >= 0; --i)
block.insert(0, cut_block.getByPosition(i));
current_range_index += block_rows;
}
return block;
}
void BlockInputStreamWithAdditionalColumns::readPrefix()
{
stream->readPrefix();
}
void BlockInputStreamWithAdditionalColumns::readSuffix()
{
stream->readSuffix();
}
String BlockInputStreamWithAdditionalColumns::getName() const
{
return "BlockInputStreamWithAdditionalColumns";
}
}
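A rough illustration of what BlockInputStreamWithAdditionalColumns does with its current_range_index bookkeeping, assuming toy int "columns" instead of ClickHouse Blocks (the class and chunk names below are invented for the sketch): each chunk read from the inner source gets the matching slice of the pre-built key column prepended, and the offset advances by the number of rows read.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>
/// Very small stand-in for a "block": one key column + one value column.
struct Chunk
{
    std::vector<int> keys;
    std::vector<int> values;
};
/// Wraps a source of value-only chunks and prepends the corresponding slice
/// of a pre-built key column, tracking the current row offset.
class WithAdditionalKeyColumn
{
public:
    WithAdditionalKeyColumn(std::vector<int> keys_, std::vector<std::vector<int>> value_chunks_)
        : keys(std::move(keys_)), value_chunks(std::move(value_chunks_)) {}
    bool read(Chunk & out)
    {
        if (next_chunk == value_chunks.size())
            return false;
        const auto & values = value_chunks[next_chunk++];
        if (current_row + values.size() > keys.size())
            throw std::runtime_error("inner stream returned more rows than keys");
        out.keys.assign(keys.begin() + current_row, keys.begin() + current_row + values.size());
        out.values = values;
        current_row += values.size();   /// same bookkeeping as current_range_index
        return true;
    }
private:
    std::vector<int> keys;
    std::vector<std::vector<int>> value_chunks;
    size_t next_chunk = 0;
    size_t current_row = 0;
};
int main()
{
    WithAdditionalKeyColumn stream({1, 2, 3, 4, 5}, {{10, 20}, {30, 40, 50}});
    Chunk chunk;
    while (stream.read(chunk))
        for (size_t i = 0; i < chunk.keys.size(); ++i)
            std::cout << chunk.keys[i] << " -> " << chunk.values[i] << '\n';
}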

View File

@ -6,12 +6,13 @@
#include <Poco/File.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <DataStreams/IBlockInputStream.h>
#include <Columns/IColumn.h>
#include <Core/Block.h>
namespace DB
{
class IBlockOutputStream;
using BlockOutputStreamPtr = std::shared_ptr<IBlockOutputStream>;
@ -46,4 +47,29 @@ void applySettingsToContext(
Context & context,
const Poco::Util::AbstractConfiguration & config);
/** A stream that adds additional columns to each block it reads from the inner stream.
*
* The number of rows in block_to_add must be equal to the total number of rows read from the inner stream.
*/
class BlockInputStreamWithAdditionalColumns final : public IBlockInputStream
{
public:
BlockInputStreamWithAdditionalColumns(Block block_to_add_, std::unique_ptr<IBlockInputStream> && stream_);
Block getHeader() const override;
Block readImpl() override;
void readPrefix() override;
void readSuffix() override;
String getName() const override;
private:
Block block_to_add;
std::unique_ptr<IBlockInputStream> stream;
size_t current_range_index = 0;
};
}

View File

@ -27,7 +27,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
extern const int DICTIONARY_ACCESS_DENIED;
extern const int UNSUPPORTED_METHOD;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
namespace
@ -58,7 +57,6 @@ namespace
}
ExecutableDictionarySource::ExecutableDictionarySource(
const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config,
@ -200,101 +198,14 @@ namespace
std::function<void(WriteBufferFromFile &)> send_data;
ThreadFromGlobalPool thread;
};
/** A stream that adds additional columns to each block it reads from the inner stream.
*
* The number of rows in block_to_add must be equal to the total number of rows read from the inner stream.
*/
class BlockInputStreamWithAdditionalColumns final: public IBlockInputStream
{
public:
BlockInputStreamWithAdditionalColumns(
Block block_to_add_,
std::unique_ptr<IBlockInputStream>&& stream_)
: block_to_add(std::move(block_to_add_))
, stream(std::move(stream_))
{
}
Block getHeader() const override
{
auto header = stream->getHeader();
if (header)
{
for (Int64 i = static_cast<Int64>(block_to_add.columns() - 1); i >= 0; --i)
header.insert(0, block_to_add.getByPosition(i).cloneEmpty());
}
return header;
}
Block readImpl() override
{
auto block = stream->read();
if (block)
{
auto block_rows = block.rows();
auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows);
if (cut_block.rows() != block_rows)
throw Exception(
"Number of rows in block to add after cut must equal to number of rows in block from inner stream",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
for (Int64 i = static_cast<Int64>(cut_block.columns() - 1); i >= 0; --i)
block.insert(0, cut_block.getByPosition(i));
current_range_index += block_rows;
}
return block;
}
void readPrefix() override
{
stream->readPrefix();
}
void readSuffix() override
{
stream->readSuffix();
}
String getName() const override { return "BlockInputStreamWithAdditionalColumns"; }
private:
Block block_to_add;
std::unique_ptr<IBlockInputStream> stream;
size_t current_range_index = 0;
};
}
BlockInputStreamPtr ExecutableDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size());
auto block = blockForIds(dict_struct, ids);
auto stream = std::make_unique<BlockInputStreamWithBackgroundThread>(
context, format, sample_block, command, log,
[block, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputStream(format, out, block.cloneEmpty());
formatBlock(output_stream, block);
out.close();
});
if (implicit_key)
{
return std::make_shared<BlockInputStreamWithAdditionalColumns>(block, std::move(stream));
}
else
return std::shared_ptr<BlockInputStreamWithBackgroundThread>(stream.release());
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
@ -302,7 +213,11 @@ BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_col
LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size());
auto block = blockForKeys(dict_struct, key_columns, requested_rows);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutableDictionarySource::getStreamForBlock(const Block & block)
{
auto stream = std::make_unique<BlockInputStreamWithBackgroundThread>(
context, format, sample_block, command, log,
[block, this](WriteBufferFromFile & out) mutable
@ -354,13 +269,13 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory)
bool check_config) -> DictionarySourcePtr
{
if (dict_struct.has_expressions)
throw Exception{"Dictionary source of type `executable` does not support attribute expressions", ErrorCodes::LOGICAL_ERROR};
throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable` does not support attribute expressions");
/// Executable dictionaries may execute arbitrary commands.
/// It's OK for dictionaries created by an administrator from an xml file, but
/// may be dangerous for dictionaries created from DDL queries.
if (check_config)
throw Exception("Dictionaries with Executable dictionary source is not allowed", ErrorCodes::DICTIONARY_ACCESS_DENIED);
throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable dictionary source are not allowed to be created from DDL query");
Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config);

View File

@ -47,6 +47,8 @@ public:
std::string toString() const override;
BlockInputStreamPtr getStreamForBlock(const Block & block);
private:
Poco::Logger * log;
time_t update_time = 0;

View File

@ -0,0 +1,321 @@
#include "ExecutablePoolDictionarySource.h"
#include <functional>
#include <ext/scope_guard.h>
#include <DataStreams/IBlockOutputStream.h>
#include <Interpreters/Context.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/copyData.h>
#include <Common/ShellCommand.h>
#include <Common/ThreadPool.h>
#include <common/logger_useful.h>
#include <common/LocalDateTime.h>
#include "DictionarySourceFactory.h"
#include "DictionarySourceHelpers.h"
#include "DictionaryStructure.h"
#include "registerDictionaries.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int DICTIONARY_ACCESS_DENIED;
extern const int UNSUPPORTED_METHOD;
extern const int TIMEOUT_EXCEEDED;
}
ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(
const DictionaryStructure & dict_struct_,
const Configuration & configuration_,
Block & sample_block_,
const Context & context_)
: log(&Poco::Logger::get("ExecutablePoolDictionarySource"))
, dict_struct{dict_struct_}
, configuration{configuration_}
, sample_block{sample_block_}
, context{context_}
/// If pool size == 0 then there is no size restriction. The maximum size of a Poco semaphore is an int, hence std::numeric_limits<int>::max().
, process_pool{std::make_shared<ProcessPool>(configuration.pool_size == 0 ? std::numeric_limits<int>::max() : configuration.pool_size)}
{
/// Remove keys from sample_block for an implicit_key dictionary because
/// these columns will not be returned from the source.
/// Implicit key means that the source script returns only values,
/// and the correspondence to the requested keys is determined implicitly, by the order of rows in the result.
if (configuration.implicit_key)
{
auto keys_names = dict_struct.getKeysNames();
for (auto & key_name : keys_names)
{
size_t key_column_position_in_block = sample_block.getPositionByName(key_name);
sample_block.erase(key_column_position_in_block);
}
}
}
ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other)
: log(&Poco::Logger::get("ExecutablePoolDictionarySource"))
, update_time{other.update_time}
, dict_struct{other.dict_struct}
, configuration{other.configuration}
, sample_block{other.sample_block}
, context{other.context}
, process_pool{std::make_shared<ProcessPool>(configuration.pool_size)}
{
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadAll()
{
throw Exception("ExecutablePoolDictionarySource with implicit_key does not support loadAll method", ErrorCodes::UNSUPPORTED_METHOD);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadUpdatedAll()
{
throw Exception("ExecutablePoolDictionarySource with implicit_key does not support loadAll method", ErrorCodes::UNSUPPORTED_METHOD);
}
namespace
{
/** A stream that runs a child process, sends data to its stdin in a background thread,
* and receives data from its stdout.
*/
class PoolBlockInputStreamWithBackgroundThread final : public IBlockInputStream
{
public:
PoolBlockInputStreamWithBackgroundThread(
std::shared_ptr<ProcessPool> process_pool_,
std::unique_ptr<ShellCommand> && command_,
BlockInputStreamPtr && stream_,
size_t read_rows_,
Poco::Logger * log_,
std::function<void(WriteBufferFromFile &)> && send_data_)
: process_pool(process_pool_)
, command(std::move(command_))
, stream(std::move(stream_))
, rows_to_read(read_rows_)
, log(log_)
, send_data(std::move(send_data_))
, thread([this]
{
try
{
send_data(command->in);
}
catch (...)
{
std::lock_guard<std::mutex> lck(exception_during_read_lock);
exception_during_read = std::current_exception();
}
})
{}
~PoolBlockInputStreamWithBackgroundThread() override
{
if (thread.joinable())
thread.join();
if (command)
process_pool->returnObject(std::move(command));
}
Block getHeader() const override
{
return stream->getHeader();
}
private:
Block readImpl() override
{
rethrowExceptionDuringReadIfNeeded();
if (current_read_rows == rows_to_read)
return Block();
Block block;
try
{
block = stream->read();
current_read_rows += block.rows();
}
catch (...)
{
tryLogCurrentException(log);
command = nullptr;
throw;
}
return block;
}
void readPrefix() override
{
rethrowExceptionDuringReadIfNeeded();
stream->readPrefix();
}
void readSuffix() override
{
if (thread.joinable())
thread.join();
rethrowExceptionDuringReadIfNeeded();
stream->readSuffix();
}
void rethrowExceptionDuringReadIfNeeded()
{
std::lock_guard<std::mutex> lck(exception_during_read_lock);
if (exception_during_read)
{
command = nullptr;
std::rethrow_exception(exception_during_read);
}
}
String getName() const override { return "PoolWithBackgroundThread"; }
std::shared_ptr<ProcessPool> process_pool;
std::unique_ptr<ShellCommand> command;
BlockInputStreamPtr stream;
size_t rows_to_read;
Poco::Logger * log;
std::function<void(WriteBufferFromFile &)> send_data;
ThreadFromGlobalPool thread;
size_t current_read_rows = 0;
std::mutex exception_during_read_lock;
std::exception_ptr exception_during_read;
};
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size());
auto block = blockForIds(dict_struct, ids);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
{
LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size());
auto block = blockForKeys(dict_struct, key_columns, requested_rows);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::getStreamForBlock(const Block & block)
{
std::unique_ptr<ShellCommand> process;
bool result = process_pool->tryBorrowObject(process, [this]()
{
bool terminate_in_destructor = true;
ShellCommandDestructorStrategy strategy { terminate_in_destructor, configuration.command_termination_timeout };
auto shell_command = ShellCommand::execute(configuration.command, false, strategy);
return shell_command;
}, configuration.max_command_execution_time * 10000);
if (!result)
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Could not get process from pool, max command execution timeout exceeded");
size_t rows_to_read = block.rows();
auto read_stream = context.getInputFormat(configuration.format, process->out, sample_block, rows_to_read);
auto stream = std::make_unique<PoolBlockInputStreamWithBackgroundThread>(
process_pool, std::move(process), std::move(read_stream), rows_to_read, log,
[block, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputStream(configuration.format, out, block.cloneEmpty());
formatBlock(output_stream, block);
});
if (configuration.implicit_key)
return std::make_shared<BlockInputStreamWithAdditionalColumns>(block, std::move(stream));
else
return std::shared_ptr<PoolBlockInputStreamWithBackgroundThread>(stream.release());
}
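The borrow/return cycle around ProcessPool is the interesting part here: a process is taken from the pool with a timeout, wrapped into the stream, and given back when the stream finishes (or dropped on error so a fresh one is spawned later). A minimal standalone sketch of such a pool, assuming a simple mutex/condition-variable implementation rather than the real BorrowedObjectPool (SimplePool and its methods are invented names), might look like this.
#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
/// Minimal stand-in for a pool of reusable worker processes: objects are
/// created lazily up to a fixed capacity, borrowed with a timeout and
/// returned when the caller is done with them.
template <typename T>
class SimplePool
{
public:
    explicit SimplePool(size_t capacity_) : capacity(capacity_) {}
    bool tryBorrow(T & out, const std::function<T()> & factory, size_t timeout_ms)
    {
        std::unique_lock lock(mutex);
        if (free_objects.empty() && created < capacity)
        {
            ++created;
            out = factory();            /// create a new object lazily
            return true;
        }
        bool got = condition.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                                      [this] { return !free_objects.empty(); });
        if (!got)
            return false;               /// timeout: the caller decides what to do
        out = std::move(free_objects.front());
        free_objects.pop_front();
        return true;
    }
    void returnObject(T && object)
    {
        {
            std::lock_guard lock(mutex);
            free_objects.push_back(std::move(object));
        }
        condition.notify_one();
    }
private:
    std::mutex mutex;
    std::condition_variable condition;
    std::deque<T> free_objects;
    size_t capacity;
    size_t created = 0;
};
int main()
{
    SimplePool<std::unique_ptr<int>> pool(1);
    std::unique_ptr<int> proc;
    if (pool.tryBorrow(proc, [] { return std::make_unique<int>(42); }, 100))
    {
        std::cout << "borrowed process handle " << *proc << '\n';
        pool.returnObject(std::move(proc));
    }
}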
bool ExecutablePoolDictionarySource::isModified() const
{
return true;
}
bool ExecutablePoolDictionarySource::supportsSelectiveLoad() const
{
return true;
}
bool ExecutablePoolDictionarySource::hasUpdateField() const
{
return !configuration.update_field.empty();
}
DictionarySourcePtr ExecutablePoolDictionarySource::clone() const
{
return std::make_unique<ExecutablePoolDictionarySource>(*this);
}
std::string ExecutablePoolDictionarySource::toString() const
{
return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command;
}
void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory)
{
auto create_table_source = [=](const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
Block & sample_block,
const Context & context,
const std::string & /* default_database */,
bool check_config) -> DictionarySourcePtr
{
if (dict_struct.has_expressions)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable_pool` does not support attribute expressions");
/// Executable dictionaries may execute arbitrary commands.
/// It's OK for dictionaries created by an administrator from an xml file, but
/// may be dangerous for dictionaries created from DDL queries.
if (check_config)
throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable pool dictionary source are not allowed to be created from DDL query");
Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config);
/** Currently the parallel parsing input format cannot read exactly max_block_size rows from the input,
* so it would block on ReadBufferFromFileDescriptor, because this file descriptor represents a pipe that does not have an EOF.
*/
auto settings_no_parallel_parsing = context_local_copy.getSettings();
settings_no_parallel_parsing.input_format_parallel_parsing = false;
context_local_copy.setSettings(settings_no_parallel_parsing);
String configuration_config_prefix = config_prefix + ".executable_pool";
size_t max_command_execution_time = config.getUInt64(configuration_config_prefix + ".max_command_execution_time", 10);
size_t max_execution_time_seconds = static_cast<size_t>(context.getSettings().max_execution_time.totalSeconds());
if (max_command_execution_time > max_execution_time_seconds)
max_command_execution_time = max_execution_time_seconds;
ExecutablePoolDictionarySource::Configuration configuration
{
.command = config.getString(configuration_config_prefix + ".command"),
.format = config.getString(configuration_config_prefix + ".format"),
.pool_size = config.getUInt64(configuration_config_prefix + ".size"),
.update_field = config.getString(configuration_config_prefix + ".update_field", ""),
.implicit_key = config.getBool(configuration_config_prefix + ".implicit_key", false),
.command_termination_timeout = config.getUInt64(configuration_config_prefix + ".command_termination_timeout", 10),
.max_command_execution_time = max_command_execution_time
};
return std::make_unique<ExecutablePoolDictionarySource>(dict_struct, configuration, sample_block, context_local_copy);
};
factory.registerSource("executable_pool", create_table_source);
}
}
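Reading the executable_pool configuration mostly amounts to pulling keys with defaults and clamping max_command_execution_time by the query-level max_execution_time. A hedged sketch of that clamping, using a hypothetical flat std::map in place of Poco::Util::AbstractConfiguration (FlatConfig and getUInt64 are made up for the example):
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
/// Hypothetical flat view of the <executable_pool> subtree; the real code
/// reads the same keys through Poco::Util::AbstractConfiguration.
using FlatConfig = std::map<std::string, std::string>;
static size_t getUInt64(const FlatConfig & config, const std::string & key, size_t default_value)
{
    auto it = config.find(key);
    return it == config.end() ? default_value : std::stoull(it->second);
}
int main()
{
    FlatConfig config = {{"size", "4"}, {"max_command_execution_time", "60"}};
    size_t max_execution_time_seconds = 30;   /// comes from query settings in the server
    size_t max_command_execution_time = getUInt64(config, "max_command_execution_time", 10);
    /// The command execution budget must not exceed the query-level limit.
    max_command_execution_time = std::min(max_command_execution_time, max_execution_time_seconds);
    std::cout << "pool size: " << getUInt64(config, "size", 16) << '\n'
              << "max_command_execution_time: " << max_command_execution_time << '\n';
}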

View File

@ -0,0 +1,84 @@
#pragma once
#include <Core/Block.h>
#include <Common/BorrowedObjectPool.h>
#include <Interpreters/Context.h>
#include "IDictionarySource.h"
#include "DictionaryStructure.h"
namespace Poco { class Logger; }
namespace DB
{
using ProcessPool = BorrowedObjectPool<std::unique_ptr<ShellCommand>>;
/** ExecutablePoolDictionarySource allows loading data from a pool of processes.
* When a client requests ids or keys, the source takes a process from the ProcessPool
* and creates a stream based on the source format from the process stdout.
* It is important that the stream format expects only the rows that were requested.
* When the stream is finished, the process is returned back to the ProcessPool.
* If there are no processes in the pool at the moment of the request, the client is blocked
* until some process is returned to the pool.
*/
class ExecutablePoolDictionarySource final : public IDictionarySource
{
public:
struct Configuration
{
const String command;
const String format;
const size_t pool_size;
const String update_field;
const bool implicit_key;
const size_t command_termination_timeout;
const size_t max_command_execution_time;
};
ExecutablePoolDictionarySource(
const DictionaryStructure & dict_struct_,
const Configuration & configuration_,
Block & sample_block_,
const Context & context_);
ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other);
ExecutablePoolDictionarySource & operator=(const ExecutablePoolDictionarySource &) = delete;
BlockInputStreamPtr loadAll() override;
/** The logic of this method is flawed, absolutely incorrect and ignorant.
* It may lead to skipping some values due to clock sync or timezone changes.
* The intended usage of "update_field" is totally different.
*/
BlockInputStreamPtr loadUpdatedAll() override;
BlockInputStreamPtr loadIds(const std::vector<UInt64> & ids) override;
BlockInputStreamPtr loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows) override;
bool isModified() const override;
bool supportsSelectiveLoad() const override;
bool hasUpdateField() const override;
DictionarySourcePtr clone() const override;
std::string toString() const override;
BlockInputStreamPtr getStreamForBlock(const Block & block);
private:
Poco::Logger * log;
time_t update_time = 0;
const DictionaryStructure dict_struct;
const Configuration configuration;
Block sample_block;
Context context;
std::shared_ptr<ProcessPool> process_pool;
};
}

View File

@ -18,6 +18,7 @@ void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory);
void registerDictionarySourcePostgreSQL(DictionarySourceFactory & source_factory);
#endif
void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory);
void registerDictionarySourceExecutablePool(DictionarySourceFactory & source_factory);
void registerDictionarySourceHTTP(DictionarySourceFactory & source_factory);
void registerDictionarySourceLibrary(DictionarySourceFactory & source_factory);
@ -54,6 +55,7 @@ void registerDictionaries()
registerDictionarySourcePostgreSQL(source_factory);
#endif
registerDictionarySourceExecutable(source_factory);
registerDictionarySourceExecutablePool(source_factory);
registerDictionarySourceHTTP(source_factory);
registerDictionarySourceLibrary(source_factory);
}

View File

@ -42,6 +42,7 @@ SRCS(
Embedded/RegionsHierarchy.cpp
Embedded/RegionsNames.cpp
ExecutableDictionarySource.cpp
ExecutablePoolDictionarySource.cpp
ExternalQueryBuilder.cpp
FileDictionarySource.cpp
FlatDictionary.cpp

View File

@ -1100,14 +1100,14 @@ public:
return executeBitmapData<UInt32>(arguments, input_rows_count);
else if (which.isUInt64())
return executeBitmapData<UInt64>(arguments, input_rows_count);
else if (which.isUInt8())
return executeBitmapData<UInt8>(arguments, input_rows_count);
else if (which.isUInt16())
return executeBitmapData<UInt16>(arguments, input_rows_count);
else if (which.isUInt32())
return executeBitmapData<UInt32>(arguments, input_rows_count);
else if (which.isUInt64())
return executeBitmapData<UInt64>(arguments, input_rows_count);
else if (which.isInt8())
return executeBitmapData<Int8>(arguments, input_rows_count);
else if (which.isInt16())
return executeBitmapData<Int16>(arguments, input_rows_count);
else if (which.isInt32())
return executeBitmapData<Int32>(arguments, input_rows_count);
else if (which.isInt64())
return executeBitmapData<Int64>(arguments, input_rows_count);
else
throw Exception(
"Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

View File

@ -358,9 +358,23 @@ public:
}
else if (dictionary_key_type == DictionaryKeyType::complex)
{
if (!isTuple(key_col_with_type.type))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument of function ({}) must be tuple when dictionary is complex. Actual type ({}).",
getName(),
key_col_with_type.type->getName());
/// Functions in external dictionaries_loader only support full-value (not constant) columns with keys.
ColumnPtr key_column_full = key_col_with_type.column->convertToFullColumnIfConst();
if (!isTuple(key_col_with_type.type))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument of function ({}) must be tuple when dictionary is complex. Actual type ({}).",
getName(),
key_col_with_type.type->getName());
const auto & key_columns = typeid_cast<const ColumnTuple &>(*key_column_full).getColumnsCopy();
const auto & key_types = static_cast<const DataTypeTuple &>(*key_col_with_type.type).getElements();

View File

@ -15,6 +15,8 @@
# include <Common/config.h>
#endif
#include <boost/algorithm/string/case_conv.hpp>
namespace DB
{
@ -55,23 +57,24 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s
file_extension = path.substr(pos + 1, std::string::npos);
}
const std::string * method_str = file_extension.empty() ? &hint : &file_extension;
std::string method_str = file_extension.empty() ? hint : std::move(file_extension);
boost::algorithm::to_lower(method_str);
if (*method_str == "gzip" || *method_str == "gz")
if (method_str == "gzip" || method_str == "gz")
return CompressionMethod::Gzip;
if (*method_str == "deflate")
if (method_str == "deflate")
return CompressionMethod::Zlib;
if (*method_str == "brotli" || *method_str == "br")
if (method_str == "brotli" || method_str == "br")
return CompressionMethod::Brotli;
if (*method_str == "LZMA" || *method_str == "xz")
if (method_str == "lzma" || method_str == "xz")
return CompressionMethod::Xz;
if (*method_str == "zstd" || *method_str == "zst")
if (method_str == "zstd" || method_str == "zst")
return CompressionMethod::Zstd;
if (hint.empty() || hint == "auto" || hint == "none")
return CompressionMethod::None;
throw Exception(
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'br', 'xz', 'zstd' are supported as compression methods",
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'deflate', 'br', 'xz', 'zstd' are supported as compression methods",
ErrorCodes::NOT_IMPLEMENTED);
}
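The fix above is that the method string is now lower-cased before comparison (previously the 'LZMA' literal could never match a lower-case extension). A standalone sketch of the same extension-to-method mapping, using std::transform instead of boost and a reduced enum (chooseMethod and Method are invented names for the example):
#include <algorithm>
#include <cctype>
#include <iostream>
#include <stdexcept>
#include <string>
enum class Method { None, Gzip, Zlib, Brotli, Xz, Zstd };
/// Map a file extension (or an explicit hint) to a compression method,
/// lower-casing it first so that "XZ", "xz" and "Xz" behave the same way.
static Method chooseMethod(std::string extension, const std::string & hint)
{
    std::string method_str = extension.empty() ? hint : std::move(extension);
    std::transform(method_str.begin(), method_str.end(), method_str.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    if (method_str == "gzip" || method_str == "gz")       return Method::Gzip;
    if (method_str == "deflate")                          return Method::Zlib;
    if (method_str == "brotli" || method_str == "br")     return Method::Brotli;
    if (method_str == "lzma" || method_str == "xz")       return Method::Xz;
    if (method_str == "zstd" || method_str == "zst")      return Method::Zstd;
    if (hint.empty() || hint == "auto" || hint == "none") return Method::None;
    throw std::invalid_argument("Unknown compression method " + hint);
}
int main()
{
    std::cout << (chooseMethod("XZ", "auto") == Method::Xz) << '\n';     /// 1: the extension wins and is lower-cased
    std::cout << (chooseMethod("", "none") == Method::None) << '\n';     /// 1: explicit "none"
}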

View File

@ -688,7 +688,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
const ColumnsWithTypeAndName & source,
const ColumnsWithTypeAndName & result,
MatchColumnsMode mode,
bool ignore_constant_values)
bool ignore_constant_values,
bool add_casted_columns,
NameToNameMap * new_names)
{
size_t num_input_columns = source.size();
size_t num_result_columns = result.size();
@ -696,6 +698,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
if (mode == MatchColumnsMode::Position && num_input_columns != num_result_columns)
throw Exception("Number of columns doesn't match", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);
if (add_casted_columns && mode != MatchColumnsMode::Name)
throw Exception("Converting with add_casted_columns supported only for MatchColumnsMode::Name", ErrorCodes::LOGICAL_ERROR);
auto actions_dag = std::make_shared<ActionsDAG>(source);
std::vector<Node *> projection(num_result_columns);
@ -715,12 +720,13 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
{
const auto & res_elem = result[result_col_num];
Node * src_node = nullptr;
Node * dst_node = nullptr;
switch (mode)
{
case MatchColumnsMode::Position:
{
src_node = actions_dag->inputs[result_col_num];
src_node = dst_node = actions_dag->inputs[result_col_num];
break;
}
@ -731,7 +737,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream",
ErrorCodes::THERE_IS_NO_COLUMN);
src_node = actions_dag->inputs[input.front()];
src_node = dst_node = actions_dag->inputs[input.front()];
input.pop_front();
break;
}
@ -740,10 +746,10 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
/// Check constants.
if (const auto * res_const = typeid_cast<const ColumnConst *>(res_elem.column.get()))
{
if (const auto * src_const = typeid_cast<const ColumnConst *>(src_node->column.get()))
if (const auto * src_const = typeid_cast<const ColumnConst *>(dst_node->column.get()))
{
if (ignore_constant_values)
src_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
dst_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
else if (res_const->getField() != src_const->getField())
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is constant but values of constants are different in source and result",
@ -756,7 +762,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
}
/// Add CAST function to convert into result type if needed.
if (!res_elem.type->equals(*src_node->result_type))
if (!res_elem.type->equals(*dst_node->result_type))
{
ColumnWithTypeAndName column;
column.name = res_elem.type->getName();
@ -764,27 +770,49 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
column.type = std::make_shared<DataTypeString>();
auto * right_arg = const_cast<Node *>(&actions_dag->addColumn(std::move(column), true));
auto * left_arg = src_node;
auto * left_arg = dst_node;
FunctionCast::Diagnostic diagnostic = {src_node->result_name, res_elem.name};
FunctionCast::Diagnostic diagnostic = {dst_node->result_name, res_elem.name};
FunctionOverloadResolverPtr func_builder_cast =
std::make_shared<FunctionOverloadResolverAdaptor>(
CastOverloadResolver<CastType::nonAccurate>::createImpl(false, std::move(diagnostic)));
Inputs children = { left_arg, right_arg };
src_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true);
dst_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true);
}
if (src_node->column && isColumnConst(*src_node->column) && !(res_elem.column && isColumnConst(*res_elem.column)))
if (dst_node->column && isColumnConst(*dst_node->column) && !(res_elem.column && isColumnConst(*res_elem.column)))
{
Inputs children = {src_node};
src_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true);
Inputs children = {dst_node};
dst_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true);
}
if (src_node->result_name != res_elem.name)
src_node = &actions_dag->addAlias(*src_node, res_elem.name, true);
if (dst_node->result_name != res_elem.name)
{
if (add_casted_columns)
{
if (inputs.contains(dst_node->result_name))
throw Exception("Cannot convert column " + backQuote(res_elem.name) +
" to "+ backQuote(dst_node->result_name) +
" because other column have same name",
ErrorCodes::ILLEGAL_COLUMN);
if (new_names)
new_names->emplace(res_elem.name, dst_node->result_name);
projection[result_col_num] = src_node;
/// Leave current column on same place, add converted to back
projection[result_col_num] = src_node;
projection.push_back(dst_node);
}
else
{
dst_node = &actions_dag->addAlias(*dst_node, res_elem.name, true);
projection[result_col_num] = dst_node;
}
}
else
{
projection[result_col_num] = dst_node;
}
}
actions_dag->removeUnusedActions(projection);
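The add_casted_columns branch keeps the original column in its position and appends the converted copy at the back, recording the generated name in new_names so callers can find it. A simplified standalone sketch of that idea, matching columns by position instead of by name and using plain structs instead of ActionsDAG nodes (all names here are invented for the example):
#include <iostream>
#include <map>
#include <string>
#include <vector>
struct Column { std::string name; std::string type; };
/// Build the output column list for one source/result pair.
/// With add_casted_columns == false the source column is replaced by the cast
/// (and renamed to the result name); with true the original stays in place and
/// the converted copy is appended under a generated name, recorded in new_names.
static std::vector<Column> convert(const std::vector<Column> & source,
                                   const std::vector<Column> & result,
                                   bool add_casted_columns,
                                   std::map<std::string, std::string> & new_names)
{
    std::vector<Column> projection;
    std::vector<Column> appended;
    for (size_t i = 0; i < source.size(); ++i)
    {
        Column dst = source[i];
        if (dst.type != result[i].type)
        {
            dst.type = result[i].type;
            dst.name = "CAST(" + source[i].name + ", '" + result[i].type + "')";
        }
        if (dst.name != result[i].name && add_casted_columns)
        {
            new_names.emplace(result[i].name, dst.name);
            projection.push_back(source[i]);   /// keep the original column in place
            appended.push_back(dst);           /// converted copy goes to the back
        }
        else
        {
            dst.name = result[i].name;         /// plain rename / replacement
            projection.push_back(dst);
        }
    }
    projection.insert(projection.end(), appended.begin(), appended.end());
    return projection;
}
int main()
{
    std::map<std::string, std::string> new_names;
    auto out = convert({{"k", "UInt32"}}, {{"k", "UInt64"}}, /*add_casted_columns=*/ true, new_names);
    for (const auto & column : out)
        std::cout << column.name << " : " << column.type << '\n';
    std::cout << "k is available as " << new_names["k"] << '\n';
}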

View File

@ -253,11 +253,15 @@ public:
/// Create ActionsDAG which converts block structure from source to result.
/// It is needed to convert result from different sources to the same structure, e.g. for UNION query.
/// Conversion should be possible with only usage of CAST function and renames.
/// @param ignore_constant_values - Do not check that constants are the same. Use the value from result_header.
/// @param add_casted_columns - Create new columns with converted values instead of replacing the originals.
static ActionsDAGPtr makeConvertingActions(
const ColumnsWithTypeAndName & source,
const ColumnsWithTypeAndName & result,
MatchColumnsMode mode,
bool ignore_constant_values = false); /// Do not check that constants are same. Use value from result_header.
bool ignore_constant_values = false,
bool add_casted_columns = false,
NameToNameMap * new_names = nullptr);
/// Create expression which add const column and then materialize it.
static ActionsDAGPtr makeAddingColumnActions(ColumnWithTypeAndName column);

View File

@ -674,7 +674,7 @@ void ActionsMatcher::visit(const ASTIdentifier & identifier, const ASTPtr & ast,
if (column_name_type.name == column_name)
{
throw Exception("Column " + backQuote(column_name) + " is not under aggregate function and not in GROUP BY",
ErrorCodes::NOT_AN_AGGREGATE);
ErrorCodes::NOT_AN_AGGREGATE);
}
}

View File

@ -320,6 +320,8 @@ std::unique_ptr<Context> DatabaseReplicatedTask::makeQueryContext(Context & from
String DDLTaskBase::getLogEntryName(UInt32 log_entry_number)
{
/// Sequential counter in ZooKeeper is Int32.
assert(log_entry_number < std::numeric_limits<Int32>::max());
constexpr size_t seq_node_digits = 10;
String number = toString(log_entry_number);
String name = "query-" + String(seq_node_digits - number.size(), '0') + number;
@ -330,7 +332,9 @@ UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name)
{
constexpr const char * name = "query-";
assert(startsWith(log_entry_name, name));
return parse<UInt32>(log_entry_name.substr(strlen(name)));
UInt32 num = parse<UInt32>(log_entry_name.substr(strlen(name)));
assert(num < std::numeric_limits<Int32>::max());
return num;
}
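These two helpers rely on ZooKeeper sequential node names being zero-padded to 10 digits, so entry names can be compared as plain strings while still sorting numerically. A standalone re-implementation of the same formatting and parsing, assuming std::to_string/std::stoul in place of the ClickHouse helpers:
#include <cassert>
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
/// ZooKeeper sequential nodes append a 10-digit, zero-padded Int32 counter.
static std::string getLogEntryName(uint32_t log_entry_number)
{
    assert(log_entry_number < static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    constexpr size_t seq_node_digits = 10;
    std::string number = std::to_string(log_entry_number);
    return "query-" + std::string(seq_node_digits - number.size(), '0') + number;
}
static uint32_t getLogEntryNumber(const std::string & log_entry_name)
{
    const std::string prefix = "query-";
    assert(log_entry_name.rfind(prefix, 0) == 0);
    uint32_t num = static_cast<uint32_t>(std::stoul(log_entry_name.substr(prefix.size())));
    assert(num < static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    return num;
}
int main()
{
    std::string name = getLogEntryName(42);
    std::cout << name << '\n';                                          /// query-0000000042
    std::cout << getLogEntryNumber(name) << '\n';                       /// 42
    std::cout << (getLogEntryName(9) < getLogEntryName(10)) << '\n';    /// 1: string order == numeric order
}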
void ZooKeeperMetadataTransaction::commit()

View File

@ -48,6 +48,7 @@ namespace ErrorCodes
extern const int MEMORY_LIMIT_EXCEEDED;
}
constexpr const char * TASK_PROCESSED_OUT_REASON = "Task has been already processed";
namespace
{
@ -290,7 +291,7 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r
if (zookeeper->exists(task->getFinishedNodePath()))
{
out_reason = "Task has been already processed";
out_reason = TASK_PROCESSED_OUT_REASON;
return {};
}
@ -311,51 +312,120 @@ void DDLWorker::scheduleTasks(bool reinitialized)
auto zookeeper = tryGetZooKeeper();
/// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper.
/// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status.
/// We have some unfinished tasks.
/// To avoid duplication of some queries, we should try to write the execution status again.
/// To avoid skipping entries that were not executed, we should be careful when choosing the begin_node to start from.
/// NOTE: It does not protect from all cases of query duplication, see also comments in processTask(...)
if (reinitialized)
{
for (auto & task : current_tasks)
if (current_tasks.empty())
LOG_TRACE(log, "Don't have unfinished tasks after restarting");
else
LOG_INFO(log, "Have {} unfinished tasks, will check them", current_tasks.size());
assert(current_tasks.size() <= pool_size + (worker_pool != nullptr));
auto task_it = current_tasks.begin();
while (task_it != current_tasks.end())
{
if (task->was_executed)
auto & task = *task_it;
if (task->completely_processed)
{
bool task_still_exists = zookeeper->exists(task->entry_path);
assert(task->was_executed);
/// Status must be written (but finished/ node may not exist if entry was deleted).
/// If someone is deleting entry concurrently, then /active status dir must not exist.
assert(zookeeper->exists(task->getFinishedNodePath()) || !zookeeper->exists(fs::path(task->entry_path) / "active"));
++task_it;
}
else if (task->was_executed)
{
/// Connection was lost on attempt to write status. Will retry.
bool status_written = zookeeper->exists(task->getFinishedNodePath());
if (!status_written && task_still_exists)
{
/// You might think that the following condition is redundant, because status_written implies completely_processed.
/// But it's wrong. It's possible that (!task->completely_processed && status_written)
/// if ZooKeeper successfully received and processed our request
/// but we lost connection while waiting for the response.
/// Yeah, distributed systems is a zoo.
if (status_written)
task->completely_processed = true;
else
processTask(*task, zookeeper);
}
++task_it;
}
else
{
/// We didn't even execute the query, so let's just remove it.
/// We will try to read the task again and execute it from the beginning.
/// NOTE: We can safely compare entry names as Strings, because they are padded.
/// Entry names always start with "query-" and contain exactly 10 decimal digits
/// of log entry number (with leading zeros).
if (!first_failed_task_name || task->entry_name < *first_failed_task_name)
first_failed_task_name = task->entry_name;
task_it = current_tasks.erase(task_it);
}
}
}
Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event);
filterAndSortQueueNodes(queue_nodes);
if (queue_nodes.empty())
{
LOG_TRACE(log, "No tasks to schedule");
return;
}
else if (max_tasks_in_queue < queue_nodes.size())
if (max_tasks_in_queue < queue_nodes.size())
cleanup_event->set();
/// Detect queue start, using:
/// - skipped tasks
/// - in memory tasks (that are currently active)
/// - in memory tasks (that are currently active or were finished recently)
/// - failed tasks (that should be processed again)
auto begin_node = queue_nodes.begin();
UInt64 last_task_id = 0;
if (!current_tasks.empty())
if (first_failed_task_name)
{
auto & last_task = current_tasks.back();
last_task_id = DDLTaskBase::getLogEntryNumber(last_task->entry_name);
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task->entry_name);
/// If we had failed tasks, then we should start from the first failed task.
assert(reinitialized);
begin_node = std::lower_bound(queue_nodes.begin(), queue_nodes.end(), first_failed_task_name);
}
if (last_skipped_entry_name)
else
{
UInt64 last_skipped_entry_id = DDLTaskBase::getLogEntryNumber(*last_skipped_entry_name);
if (last_skipped_entry_id > last_task_id)
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_skipped_entry_name);
/// We had no failed tasks. Let's just choose the maximum entry we have previously seen.
String last_task_name;
if (!current_tasks.empty())
last_task_name = current_tasks.back()->entry_name;
if (last_skipped_entry_name && last_task_name < *last_skipped_entry_name)
last_task_name = *last_skipped_entry_name;
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task_name);
}
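The begin_node selection above is just a binary search over the sorted, padded entry names: lower_bound when the first failed task has to be re-processed, upper_bound over the maximum already-seen entry otherwise. A toy standalone version with hard-coded entry names (the data is invented for illustration):
#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <vector>
int main()
{
    /// Sorted, zero-padded queue entries as returned by ZooKeeper (toy data).
    std::vector<std::string> queue_nodes = {
        "query-0000000001", "query-0000000002", "query-0000000003", "query-0000000004"};
    std::optional<std::string> first_failed_task_name = "query-0000000002";
    std::optional<std::string> last_skipped_entry_name;
    std::string last_task_name = "query-0000000003";
    auto begin_node = queue_nodes.begin();
    if (first_failed_task_name)
    {
        /// Re-process the failed entry itself, so take lower_bound (>=).
        begin_node = std::lower_bound(queue_nodes.begin(), queue_nodes.end(), *first_failed_task_name);
    }
    else
    {
        /// Everything up to the maximum seen entry is done or skipped, so take upper_bound (>).
        if (last_skipped_entry_name && last_task_name < *last_skipped_entry_name)
            last_task_name = *last_skipped_entry_name;
        begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task_name);
    }
    for (auto it = begin_node; it != queue_nodes.end(); ++it)
        std::cout << "will schedule " << *it << '\n';   /// query-0000000002 .. query-0000000004
}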
if (begin_node == queue_nodes.end())
LOG_DEBUG(log, "No tasks to schedule");
else
LOG_DEBUG(log, "Will schedule {} tasks starting from {}", std::distance(begin_node, queue_nodes.end()), *begin_node);
/// Let's ensure that it's exactly the first task we should process.
/// Maybe such asserts are too paranoid and excessive,
/// but it's easy enough to break DDLWorker in a very unobvious way by making some minor change in code.
[[maybe_unused]] bool have_no_tasks_info = !first_failed_task_name && current_tasks.empty() && !last_skipped_entry_name;
assert(have_no_tasks_info || queue_nodes.end() == std::find_if(queue_nodes.begin(), queue_nodes.end(), [&](const String & entry_name)
{
/// We should return true if some invariants are violated.
String reason;
auto task = initAndCheckTask(entry_name, reason, zookeeper);
bool maybe_currently_processing = current_tasks.end() != std::find_if(current_tasks.begin(), current_tasks.end(), [&](const auto & t)
{
return t->entry_name == entry_name;
});
/// begin_node is something like a log pointer
if (begin_node == queue_nodes.end() || entry_name < *begin_node)
{
/// Return true if entry should be scheduled.
/// There is a minor race condition: initAndCheckTask(...) may return non-null
/// if someone is deleting an outdated entry right now (including finished/ nodes), so we also check the active/ status dir.
bool maybe_concurrently_deleting = task && !zookeeper->exists(fs::path(task->entry_path) / "active");
return task && !maybe_concurrently_deleting && !maybe_currently_processing;
}
else
{
/// Return true if entry should not be scheduled.
bool processed = !task && reason == TASK_PROCESSED_OUT_REASON;
return processed || maybe_currently_processing;
}
}));
for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it)
{
String entry_name = *it;
@ -391,8 +461,18 @@ void DDLWorker::scheduleTasks(bool reinitialized)
DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task)
{
current_tasks.remove_if([](const DDLTaskPtr & t) { return t->completely_processed.load(); });
assert(current_tasks.size() <= pool_size);
/// Tasks are scheduled and executed in main thread <==> Parallel execution is disabled
assert((worker_pool != nullptr) == (1 < pool_size));
/// Parallel execution is disabled ==> All previous tasks are failed to start or finished,
/// so current tasks list must be empty when we are ready to process new one.
assert(worker_pool || current_tasks.empty());
/// Parallel execution is enabled ==> Not more than pool_size tasks are currently executing.
/// Note: If current_tasks.size() == pool_size, then all worker threads are busy,
/// so we will wait on worker_pool->scheduleOrThrowOnError(...)
assert(!worker_pool || current_tasks.size() <= pool_size);
current_tasks.emplace_back(std::move(task));
if (first_failed_task_name && *first_failed_task_name == current_tasks.back()->entry_name)
first_failed_task_name.reset();
return *current_tasks.back();
}
@ -479,10 +559,15 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name)
void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
{
LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query);
assert(!task.completely_processed);
String active_node_path = task.getActiveNodePath();
String finished_node_path = task.getFinishedNodePath();
/// Step 1: Create ephemeral node in active/ status dir.
/// It allows other hosts to understand that the task is currently being executed (useful for system.distributed_ddl_queue)
/// and protects the task from concurrent deletion.
/// It will tryRemove(...) on exception
auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper);
@ -498,7 +583,21 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
/// Status dirs were not created in enqueueQuery(...) or someone is removing entry
if (create_active_res == Coordination::Error::ZNONODE)
{
assert(dynamic_cast<DatabaseReplicatedTask *>(&task) == nullptr);
if (task.was_executed)
{
/// Special case:
/// Task was executed (and we are trying to write status after connection loss) ==> Status dirs were previously created.
/// (Status dirs were previously created AND active/ does not exist) ==> Task was removed.
/// We cannot write status, but it's not required anymore, because no one will try to execute it again.
/// So we consider task as completely processed.
LOG_WARNING(log, "Task {} is executed, but looks like entry {} was deleted, cannot write status", task.entry_name, task.entry_path);
task.completely_processed = true;
return;
}
createStatusDirs(task.entry_path, zookeeper);
}
if (create_active_res == Coordination::Error::ZNODEEXISTS)
{
@ -508,7 +607,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
String dummy;
if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared))
{
constexpr int timeout_ms = 5000;
constexpr int timeout_ms = 30 * 1000;
if (!eph_node_disappeared->tryWait(timeout_ms))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists, "
"probably it's owned by someone else", active_node_path);
@ -518,6 +617,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral);
}
/// Step 2: Execute query from the task.
if (!task.was_executed)
{
/// If table and database engine supports it, they will execute task.ops by their own in a single transaction
@ -588,6 +688,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
updateMaxDDLEntryID(task.entry_name);
/// Step 3: Create node in finished/ status dir and write execution status.
/// FIXME: if server fails right here, the task will be executed twice. We need WAL here.
/// NOTE: If ZooKeeper connection is lost here, we will try again to write query status.
/// NOTE: If both table and database are replicated, task is executed in single ZK transaction.
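Step 1 hinges on RAII: the active/ node is created when processing starts and removed on every exit path, including exceptions, so other hosts can tell whether a task is still being executed. The sketch below is not zkutil::EphemeralNodeHolder (which talks to ZooKeeper); it only mirrors the lifetime with an in-memory set of paths, and all names in it are invented.
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
/// Stand-in for the coordination service: just a set of existing paths.
std::set<std::string> zk_nodes;
/// RAII holder mirroring the idea of the ephemeral active/ node: it is created
/// when task processing starts and removed on every exit path, so other hosts
/// can tell whether the task is currently being executed.
class ActiveNodeHolder
{
public:
    explicit ActiveNodeHolder(std::string path_) : path(std::move(path_)) { zk_nodes.insert(path); }
    ~ActiveNodeHolder() { zk_nodes.erase(path); }   /// best-effort cleanup, even on exceptions
private:
    std::string path;
};
static void processTask(bool fail)
{
    ActiveNodeHolder active("/ddl/query-0000000001/active/host1");
    std::cout << "active nodes while processing: " << zk_nodes.size() << '\n';   /// 1
    if (fail)
        throw std::runtime_error("query failed");
    zk_nodes.insert("/ddl/query-0000000001/finished/host1");   /// step 3: write status
}
int main()
{
    try { processTask(/*fail=*/ true); } catch (const std::exception & e) { std::cout << e.what() << '\n'; }
    std::cout << "active nodes after failure: " << zk_nodes.size() << '\n';   /// 0: holder cleaned up
}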

View File

@ -123,6 +123,7 @@ protected:
/// Save state of executed task to avoid duplicate execution on ZK error
std::optional<String> last_skipped_entry_name;
std::optional<String> first_failed_task_name;
std::list<DDLTaskPtr> current_tasks;
std::shared_ptr<Poco::Event> queue_updated_event = std::make_shared<Poco::Event>();

View File

@ -726,7 +726,7 @@ ExpressionActionsChain::JoinStep::JoinStep(
for (const auto & column : result_columns)
required_columns.emplace_back(column.name, column.type);
analyzed_join->addJoinedColumnsAndCorrectNullability(result_columns);
analyzed_join->addJoinedColumnsAndCorrectTypes(result_columns);
}
void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_)
@ -747,8 +747,8 @@ void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_)
}
/// Result will also contain joined columns.
for (const auto & column : analyzed_join->columnsAddedByJoin())
required_names.emplace(column.name);
for (const auto & column_name : analyzed_join->columnsAddedByJoin())
required_names.emplace(column_name);
for (const auto & column : result_columns)
{

View File

@ -42,14 +42,13 @@
#include <DataTypes/DataTypeFactory.h>
#include <Parsers/parseQuery.h>
#include <Interpreters/interpretSubquery.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Interpreters/misc.h>
#include <Interpreters/ActionsVisitor.h>
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/GetAggregatesVisitor.h>
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/interpretSubquery.h>
#include <Interpreters/join_common.h>
#include <Interpreters/misc.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
@ -208,13 +207,12 @@ void ExpressionAnalyzer::analyzeAggregation()
{
getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, temp_actions, false);
auto sample_columns = temp_actions->getResultColumns();
analyzedJoin().addJoinedColumnsAndCorrectNullability(sample_columns);
analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns);
temp_actions = std::make_shared<ActionsDAG>(sample_columns);
}
columns_after_join = columns_after_array_join;
const auto & added_by_join = analyzedJoin().columnsAddedByJoin();
columns_after_join.insert(columns_after_join.end(), added_by_join.begin(), added_by_join.end());
analyzedJoin().addJoinedColumnsAndCorrectTypes(columns_after_join, false);
}
has_aggregation = makeAggregateDescriptions(temp_actions);
@ -727,13 +725,17 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain &
JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain)
{
JoinPtr table_join = makeTableJoin(*syntax->ast_join);
const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns();
JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns);
if (syntax->analyzed_join->needConvert())
{
chain.steps.push_back(std::make_unique<ExpressionActionsChain::ExpressionActionsStep>(syntax->analyzed_join->leftConvertingActions()));
chain.addStep();
}
ExpressionActionsChain::Step & step = chain.lastStep(columns_after_array_join);
chain.steps.push_back(std::make_unique<ExpressionActionsChain::JoinStep>(
syntax->analyzed_join, table_join, step.getResultColumns()));
chain.steps.push_back(std::make_unique<ExpressionActionsChain::JoinStep>(syntax->analyzed_join, table_join, step.getResultColumns()));
chain.addStep();
return table_join;
}
@ -798,7 +800,8 @@ static std::shared_ptr<IJoin> makeJoin(std::shared_ptr<TableJoin> analyzed_join,
return std::make_shared<JoinSwitcher>(analyzed_join, sample_block);
}
JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element)
JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(
const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns)
{
/// Two JOINs are not supported with the same subquery, but different USINGs.
auto join_hash = join_element.getTreeHash();
@ -834,7 +837,13 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer
}
/// TODO You do not need to set this up when JOIN is only needed on remote servers.
subquery_for_join.setJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside
subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside
const ColumnsWithTypeAndName & right_sample_columns = subquery_for_join.sample_block.getColumnsWithTypeAndName();
bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns);
if (need_convert)
subquery_for_join.addJoinActions(std::make_shared<ExpressionActions>(syntax->analyzed_join->rightConvertingActions()));
subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context);
/// Do not make subquery for join over dictionary.
@ -1434,9 +1443,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
if (query_analyzer.hasTableJoin())
{
query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage);
before_join = chain.getLastActions();
join = query_analyzer.appendJoin(chain);
converting_join_columns = query_analyzer.analyzedJoin().leftConvertingActions();
chain.addStep();
}

View File

@ -1,15 +1,16 @@
#pragma once
#include <DataStreams/IBlockStream_fwd.h>
#include <Columns/FilterDescription.h>
#include <DataStreams/IBlockStream_fwd.h>
#include <Interpreters/AggregateDescription.h>
#include <Interpreters/WindowDescription.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Interpreters/SubqueryForSet.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/WindowDescription.h>
#include <Interpreters/join_common.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>
#include <Storages/SelectQueryInfo.h>
#include <Interpreters/DatabaseCatalog.h>
namespace DB
{
@ -199,6 +200,7 @@ struct ExpressionAnalysisResult
ActionsDAGPtr before_array_join;
ArrayJoinActionPtr array_join;
ActionsDAGPtr before_join;
ActionsDAGPtr converting_join_columns;
JoinPtr join;
ActionsDAGPtr before_where;
ActionsDAGPtr before_aggregation;
@ -313,7 +315,9 @@ private:
/// Create Set-s that we make from IN section to use index on them.
void makeSetsForIndex(const ASTPtr & node);
JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element);
JoinPtr makeTableJoin(
const ASTTablesInSelectQueryElement & join_element,
const ColumnsWithTypeAndName & left_sample_columns);
const ASTSelectQuery * getAggregatingQuery() const;

View File

@ -208,7 +208,7 @@ BlockIO InterpreterInsertQuery::execute()
auto storage_dst = std::dynamic_pointer_cast<StorageDistributed>(table);
if (storage_src && storage_dst && storage_src->cluster_name == storage_dst->cluster_name)
if (storage_src && storage_dst && storage_src->getClusterName() == storage_dst->getClusterName())
{
is_distributed_insert_select = true;

View File

@ -502,7 +502,10 @@ InterpreterSelectQuery::InterpreterSelectQuery(
if (need_analyze_again)
{
subquery_for_sets = std::move(query_analyzer->getSubqueriesForSets());
LOG_TRACE(log, "Running 'analyze' second time");
query_analyzer->getSubqueriesForSets().clear();
subquery_for_sets = SubqueriesForSets();
/// Do not try move conditions to PREWHERE for the second time.
/// Otherwise, we won't be able to fallback from inefficient PREWHERE to WHERE later.
analyze(/* try_move_to_prewhere = */ false);
@ -1059,14 +1062,22 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
query_plan.addStep(std::move(before_join_step));
}
/// Optional step to convert key columns to common supertype.
/// Columns with changed types will be returned to the user,
/// so it's only suitable for `USING` joins.
if (expressions.converting_join_columns)
{
QueryPlanStepPtr convert_join_step = std::make_unique<ExpressionStep>(
query_plan.getCurrentDataStream(),
expressions.converting_join_columns);
convert_join_step->setStepDescription("Convert JOIN columns");
query_plan.addStep(std::move(convert_join_step));
}
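The purpose of the converting step is that both join sides hash keys of the same supertype, e.g. a UInt32 key on the left against a UInt64 key on the right. A minimal standalone sketch of that cast-then-join idea on plain vectors (the data and types are arbitrary):
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>
int main()
{
    /// Left table key is UInt32, right table key is UInt64; before hashing
    /// both sides for the join, the narrower key is cast to the common
    /// supertype so that equal values actually compare equal.
    std::vector<uint32_t> left_keys = {1, 2, 3};
    std::vector<uint64_t> right_keys = {2, 3, 4};
    std::unordered_map<uint64_t, size_t> right_index;
    for (size_t i = 0; i < right_keys.size(); ++i)
        right_index.emplace(right_keys[i], i);
    for (uint32_t key : left_keys)
    {
        uint64_t common_key = static_cast<uint64_t>(key);   /// the "CAST to supertype" step
        auto it = right_index.find(common_key);
        if (it != right_index.end())
            std::cout << "joined on key " << common_key << '\n';
    }
}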
if (expressions.hasJoin())
{
Block join_result_sample;
JoinPtr join = expressions.join;
join_result_sample = JoiningTransform::transformHeader(
query_plan.getCurrentDataStream().header, expressions.join);
QueryPlanStepPtr join_step = std::make_unique<JoinStep>(
query_plan.getCurrentDataStream(),
expressions.join);
@ -1076,6 +1087,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
if (expressions.join_has_delayed_stream)
{
const Block & join_result_sample = query_plan.getCurrentDataStream().header;
auto stream = std::make_shared<LazyNonJoinedBlockInputStream>(*join, join_result_sample, settings.max_block_size);
auto source = std::make_shared<SourceFromInputStream>(std::move(stream));
auto add_non_joined_rows_step = std::make_unique<AddingDelayedSourceStep>(

View File

@ -273,10 +273,9 @@ MutationsInterpreter::MutationsInterpreter(
, commands(std::move(commands_))
, context(context_)
, can_execute(can_execute_)
, select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits())
{
mutation_ast = prepare(!can_execute);
SelectQueryOptions limits = SelectQueryOptions().analyze(!can_execute).ignoreLimits();
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot_, limits);
}
static NameSet getKeyColumns(const StoragePtr & storage, const StorageMetadataPtr & metadata_snapshot)
@ -767,6 +766,9 @@ QueryPipelinePtr MutationsInterpreter::addStreamsForLaterStages(const std::vecto
void MutationsInterpreter::validate()
{
if (!select_interpreter)
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot, select_limits);
const Settings & settings = context.getSettingsRef();
/// For Replicated* storages mutations cannot employ non-deterministic functions
@ -794,6 +796,9 @@ BlockInputStreamPtr MutationsInterpreter::execute()
if (!can_execute)
throw Exception("Cannot execute mutations interpreter because can_execute flag set to false", ErrorCodes::LOGICAL_ERROR);
if (!select_interpreter)
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot, select_limits);
QueryPlan plan;
select_interpreter->buildQueryPlan(plan);

View File

@ -76,6 +76,7 @@ private:
MutationCommands commands;
Context context;
bool can_execute;
SelectQueryOptions select_limits;
ASTPtr mutation_ast;

View File

@ -39,10 +39,20 @@ void SubqueryForSet::renameColumns(Block & block)
}
}
void SubqueryForSet::setJoinActions(ExpressionActionsPtr actions)
void SubqueryForSet::addJoinActions(ExpressionActionsPtr actions)
{
actions->execute(sample_block);
joined_block_actions = actions;
if (joined_block_actions == nullptr)
{
joined_block_actions = actions;
}
else
{
auto new_dag = ActionsDAG::merge(
std::move(*joined_block_actions->getActionsDAG().clone()),
std::move(*actions->getActionsDAG().clone()));
joined_block_actions = std::make_shared<ExpressionActions>(new_dag);
}
}
bool SubqueryForSet::insertJoinedBlock(Block & block)

View File

@ -40,7 +40,7 @@ struct SubqueryForSet
void makeSource(std::shared_ptr<InterpreterSelectWithUnionQuery> & interpreter,
NamesWithAliases && joined_block_aliases_);
void setJoinActions(ExpressionActionsPtr actions);
void addJoinActions(ExpressionActionsPtr actions);
bool insertJoinedBlock(Block & block);
void setTotals(Block totals);

View File

@ -1,5 +1,7 @@
#include <Interpreters/TableJoin.h>
#include <common/logger_useful.h>
#include <Parsers/ASTExpressionList.h>
#include <Core/Settings.h>
@ -15,6 +17,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
}
TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_)
: size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode})
, default_max_bytes(settings.default_max_bytes_in_join)
@ -40,6 +47,10 @@ void TableJoin::resetCollected()
columns_added_by_join.clear();
original_names.clear();
renames.clear();
left_type_map.clear();
right_type_map.clear();
left_converting_actions = nullptr;
right_converting_actions = nullptr;
}
void TableJoin::addUsingKey(const ASTPtr & ast)
@ -105,14 +116,6 @@ void TableJoin::deduplicateAndQualifyColumnNames(const NameSet & left_table_colu
columns_from_joined_table.swap(dedup_columns);
}
NameSet TableJoin::getQualifiedColumnsSet() const
{
NameSet out;
for (const auto & names : original_names)
out.insert(names.first);
return out;
}
NamesWithAliases TableJoin::getNamesWithAliases(const NameSet & required_columns) const
{
NamesWithAliases out;
@ -221,49 +224,54 @@ bool TableJoin::rightBecomeNullable(const DataTypePtr & column_type) const
void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column)
{
if (rightBecomeNullable(joined_column.type))
columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, makeNullable(joined_column.type)));
else
columns_added_by_join.push_back(joined_column);
DataTypePtr type = joined_column.type;
if (hasUsing())
{
if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end())
type = it->second;
}
if (rightBecomeNullable(type))
type = makeNullable(type);
columns_added_by_join.emplace_back(joined_column.name, type);
}
void TableJoin::addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const
void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const
{
ColumnsWithTypeAndName columns;
for (auto & pair : names_and_types)
columns.emplace_back(nullptr, std::move(pair.type), std::move(pair.name));
names_and_types.clear();
addJoinedColumnsAndCorrectTypes(columns, correct_nullability);
for (auto & col : columns)
names_and_types.emplace_back(std::move(col.name), std::move(col.type));
}
void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability) const
{
for (auto & col : columns)
{
if (leftBecomeNullable(col.type))
if (hasUsing())
{
if (auto it = left_type_map.find(col.name); it != left_type_map.end())
col.type = it->second;
}
if (correct_nullability && leftBecomeNullable(col.type))
{
/// No need to nullify constants
if (!(col.column && isColumnConst(*col.column)))
{
bool is_column_const = col.column && isColumnConst(*col.column);
if (!is_column_const)
col.type = makeNullable(col.type);
}
}
}
/// Types in columns_added_by_join are already converted and made Nullable if needed
for (const auto & col : columns_added_by_join)
{
auto res_type = col.type;
if (rightBecomeNullable(res_type))
res_type = makeNullable(res_type);
columns.emplace_back(nullptr, res_type, col.name);
}
}
bool TableJoin::sameJoin(const TableJoin * x, const TableJoin * y)
{
if (!x && !y)
return true;
if (!x || !y)
return false;
return x->table_join.kind == y->table_join.kind
&& x->table_join.strictness == y->table_join.strictness
&& x->key_names_left == y->key_names_left
&& x->key_names_right == y->key_names_right
&& x->columns_added_by_join == y->columns_added_by_join;
columns.emplace_back(nullptr, col.type, col.name);
}
bool TableJoin::sameStrictnessAndKind(ASTTableJoin::Strictness strictness_, ASTTableJoin::Kind kind_) const
@ -336,4 +344,124 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc
return true;
}
bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns)
{
bool need_convert = needConvert();
if (!need_convert && !hasUsing())
{
/// For `USING` the common type was already inferred at the syntax analyzer stage
NamesAndTypesList left_list;
NamesAndTypesList right_list;
for (const auto & col : left_sample_columns)
left_list.emplace_back(col.name, col.type);
for (const auto & col : right_sample_columns)
right_list.emplace_back(col.name, col.type);
need_convert = inferJoinKeyCommonType(left_list, right_list);
}
if (need_convert)
{
left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left);
right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right);
}
return need_convert;
}
bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right)
{
std::unordered_map<String, DataTypePtr> left_types;
for (const auto & col : left)
{
left_types[col.name] = col.type;
}
std::unordered_map<String, DataTypePtr> right_types;
for (const auto & col : right)
{
if (auto it = renames.find(col.name); it != renames.end())
right_types[it->second] = col.type;
else
right_types[col.name] = col.type;
}
for (size_t i = 0; i < key_names_left.size(); ++i)
{
auto ltype = left_types.find(key_names_left[i]);
auto rtype = right_types.find(key_names_right[i]);
if (ltype == left_types.end() || rtype == right_types.end())
{
/// Name mismatch, give up
left_type_map.clear();
right_type_map.clear();
return false;
}
if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second))
continue;
DataTypePtr supertype;
try
{
supertype = DB::getLeastSupertype({ltype->second, rtype->second});
}
catch (DB::Exception & ex)
{
throw Exception(
"Type mismatch of columns to JOIN by: " +
key_names_left[i] + ": " + ltype->second->getName() + " at left, " +
key_names_right[i] + ": " + rtype->second->getName() + " at right. " +
"Can't get supertype: " + ex.message(),
ErrorCodes::TYPE_MISMATCH);
}
left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype;
}
if (!left_type_map.empty() || !right_type_map.empty())
{
auto format_type_map = [](NameToTypeMap mapping) -> std::string
{
std::vector<std::string> text;
for (const auto & [k, v] : mapping)
text.push_back(k + ": " + v->getName());
return fmt::format("{}", fmt::join(text, ", "));
};
LOG_TRACE(
&Poco::Logger::get("TableJoin"),
"Infer supertype for joined columns. Left: [{}], Right: [{}]",
format_type_map(left_type_map),
format_type_map(right_type_map));
}
return !left_type_map.empty();
}
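For reference, a hedged sketch of a query that is expected to hit the TYPE_MISMATCH branch above, since String and UInt8 keys have no common supertype (the inline subqueries are illustrative):

SELECT *
FROM (SELECT 'abc' AS k) AS l
JOIN (SELECT 1 AS k) AS r USING (k); -- expected to fail with "Type mismatch of columns to JOIN by: ..."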
ActionsDAGPtr TableJoin::applyKeyConvertToTable(
const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const
{
ColumnsWithTypeAndName cols_dst = cols_src;
for (auto & col : cols_dst)
{
if (auto it = type_mapping.find(col.name); it != type_mapping.end())
{
col.type = it->second;
col.column = nullptr;
}
}
NameToNameMap key_column_rename;
/// Returns converting actions that need to be applied to the table before the join
auto dag = ActionsDAG::makeConvertingActions(
cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !hasUsing(), &key_column_rename);
for (auto & name : names_to_rename)
{
const auto it = key_column_rename.find(name);
if (it != key_column_rename.end())
name = it->second;
}
return dag;
}
}

View File

@ -5,9 +5,11 @@
#include <Core/SettingsEnums.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Interpreters/IJoin.h>
#include <Interpreters/join_common.h>
#include <Interpreters/asof.h>
#include <DataStreams/IBlockStream_fwd.h>
#include <DataStreams/SizeLimits.h>
#include <DataTypes/getLeastSupertype.h>
#include <Storages/IStorage_fwd.h>
#include <utility>
@ -32,6 +34,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
class TableJoin
{
public:
using NameToTypeMap = std::unordered_map<String, DataTypePtr>;
private:
/** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k`
* The join is made by column k.
* During the JOIN,
@ -59,23 +66,39 @@ class TableJoin
Names key_names_left;
Names key_names_right; /// Duplicating names are qualified.
ASTs key_asts_left;
ASTs key_asts_right;
ASTTableJoin table_join;
ASOF::Inequality asof_inequality = ASOF::Inequality::GreaterOrEquals;
/// All columns which can be read from joined table. Duplicating names are qualified.
NamesAndTypesList columns_from_joined_table;
/// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability
/// Columns will be added to block by JOIN.
/// It's a subset of columns_from_joined_table with corrected Nullability and type (if in-place type conversion is required)
NamesAndTypesList columns_added_by_join;
/// Target type to convert key columns before join
NameToTypeMap left_type_map;
NameToTypeMap right_type_map;
ActionsDAGPtr left_converting_actions;
ActionsDAGPtr right_converting_actions;
/// Name -> original name. Names are the same as in columns_from_joined_table list.
std::unordered_map<String, String> original_names;
/// Original name -> name. Only ranamed columns.
/// Original name -> name. Only renamed columns.
std::unordered_map<String, String> renames;
VolumePtr tmp_volume;
Names requiredJoinedNames() const;
/// Create converting actions and change key column names if required
ActionsDAGPtr applyKeyConvertToTable(
const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const;
public:
TableJoin() = default;
TableJoin(const Settings &, VolumePtr tmp_volume);
@ -125,7 +148,6 @@ public:
bool hasUsing() const { return table_join.using_expression_list != nullptr; }
bool hasOn() const { return table_join.on_expression != nullptr; }
NameSet getQualifiedColumnsSet() const;
NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const;
NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_required_columns) const;
@ -136,7 +158,23 @@ public:
bool leftBecomeNullable(const DataTypePtr & column_type) const;
bool rightBecomeNullable(const DataTypePtr & column_type) const;
void addJoinedColumn(const NameAndTypePair & joined_column);
void addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const;
void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const;
void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability = true) const;
/// Calculates common supertypes for corresponding join key columns.
bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right);
/// Calculate converting actions, rename key columns if required.
/// For a `USING` join we convert key columns in place, which affects the column types in the result table.
/// For `JOIN ON` we create new columns with converted keys to join by.
bool applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns);
bool needConvert() const { return !left_type_map.empty(); }
/// Key columns should be converted before join.
ActionsDAGPtr leftConvertingActions() const { return left_converting_actions; }
ActionsDAGPtr rightConvertingActions() const { return right_converting_actions; }
void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; }
ASOF::Inequality getAsofInequality() { return asof_inequality; }
@ -144,11 +182,16 @@ public:
ASTPtr leftKeysList() const;
ASTPtr rightKeysList() const; /// For ON syntax only
Names requiredJoinedNames() const;
const Names & keyNamesLeft() const { return key_names_left; }
const Names & keyNamesRight() const { return key_names_right; }
const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; }
const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; }
Names columnsAddedByJoin() const
{
Names res;
for (const auto & col : columns_added_by_join)
res.push_back(col.name);
return res;
}
/// StorageJoin overrides key names (cause of different names qualification)
void setRightKeys(const Names & keys) { key_names_right = keys; }
@ -156,8 +199,6 @@ public:
/// Split key and other columns by keys name list
void splitAdditionalColumns(const Block & sample_block, Block & block_keys, Block & block_others) const;
Block getRequiredRightKeys(const Block & right_table_keys, std::vector<String> & keys_sources) const;
static bool sameJoin(const TableJoin * x, const TableJoin * y);
};
}
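To illustrate the USING/ON distinction described in the comments above, a minimal hedged sketch (t1/t2 are hypothetical tables with Int32 and UInt32 keys respectively):

-- `JOIN ON` joins by freshly created converted key columns, so the original types are expected to be preserved in the result:
SELECT toTypeName(t1.k), toTypeName(t2.k) FROM t1 JOIN t2 ON t1.k = t2.k;
-- `USING` converts the key columns in place, so the result is expected to expose the common supertype instead:
SELECT toTypeName(k) FROM t1 JOIN t2 USING (k);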

View File

@ -418,6 +418,13 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele
const auto & keys = table_join.using_expression_list->as<ASTExpressionList &>();
for (const auto & key : keys.children)
analyzed_join.addUsingKey(key);
/// `USING` semantics allow columns with changed types in the result table,
/// while `JOIN ON` should preserve the types from the original tables.
/// For `USING` we can infer the common type at the syntax stage because the join is performed only by columns (not expressions).
/// We need to know the changed types in the result table because some analysis (e.g. analyzeAggregation) is performed before the join.
/// For `JOIN ON expr1 == expr2` the common type is inferred later in ExpressionAnalyzer, when the types of the expressions are known.
analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns);
}
else if (table_join.on_expression)
{
@ -571,7 +578,6 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
source_column_names.insert(column.name);
NameSet required = columns_context.requiredColumns();
if (columns_context.has_table_join)
{
NameSet available_columns;

View File

@ -46,7 +46,7 @@ std::string getClusterName(const IAST & node)
}
String getClusterNameAndMakeLiteral(ASTPtr & node)
std::string getClusterNameAndMakeLiteral(ASTPtr & node)
{
String cluster_name = getClusterName(*node);
node = std::make_shared<ASTLiteral>(cluster_name);

View File

@ -14,8 +14,8 @@ namespace DB
* This name will be parsed as an expression with an operator minus - not at all what you need.
* Therefore, consider this case separately.
*/
String getClusterName(const IAST & node);
std::string getClusterName(const IAST & node);
String getClusterNameAndMakeLiteral(ASTPtr & node);
std::string getClusterNameAndMakeLiteral(ASTPtr & node);
}

View File

@ -1,9 +1,11 @@
#include <Interpreters/join_common.h>
#include <Interpreters/TableJoin.h>
#include <Interpreters/ActionsDAG.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataStreams/materializeBlock.h>
#include <IO/WriteHelpers.h>
@ -293,6 +295,13 @@ void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count)
type->insertDefaultInto(column);
}
bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type)
{
DataTypePtr left_type_strict = removeNullable(recursiveRemoveLowCardinality(left_type));
DataTypePtr right_type_strict = removeNullable(recursiveRemoveLowCardinality(right_type));
return left_type_strict->equals(*right_type_strict);
}
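For context, a minimal sketch of join keys that are equal up to nullability, where no supertype conversion is expected to be required:

SELECT *
FROM (SELECT toNullable(1) AS k) AS l
JOIN (SELECT 1 AS k) AS r USING (k);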
}
@ -307,19 +316,21 @@ NotJoined::NotJoined(const TableJoin & table_join, const Block & saved_block_sam
table_join.splitAdditionalColumns(right_sample_block, right_table_keys, sample_block_with_columns_to_add);
Block required_right_keys = table_join.getRequiredRightKeys(right_table_keys, tmp);
bool remap_keys = table_join.hasUsing();
std::unordered_map<size_t, size_t> left_to_right_key_remap;
for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i)
if (table_join.hasUsing())
{
const String & left_key_name = table_join.keyNamesLeft()[i];
const String & right_key_name = table_join.keyNamesRight()[i];
for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i)
{
const String & left_key_name = table_join.keyNamesLeft()[i];
const String & right_key_name = table_join.keyNamesRight()[i];
size_t left_key_pos = result_sample_block.getPositionByName(left_key_name);
size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name);
size_t left_key_pos = result_sample_block.getPositionByName(left_key_name);
size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name);
if (remap_keys && !required_right_keys.has(right_key_name))
left_to_right_key_remap[left_key_pos] = right_key_pos;
if (!required_right_keys.has(right_key_name))
left_to_right_key_remap[left_key_pos] = right_key_pos;
}
}
/// result_sample_block: left_sample_block + left expressions, right not key columns, required right keys

View File

@ -2,6 +2,8 @@
#include <Core/Block.h>
#include <Interpreters/IJoin.h>
#include <Interpreters/ActionsDAG.h>
#include <Interpreters/ExpressionActions.h>
namespace DB
{
@ -36,6 +38,8 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const TableJ
void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count);
bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type);
}
/// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table.

View File

@ -276,7 +276,7 @@ void StorageDistributedDirectoryMonitor::flushAllData()
if (quit)
return;
std::unique_lock lock{mutex};
std::lock_guard lock{mutex};
const auto & files = getFiles();
if (!files.empty())
@ -303,7 +303,7 @@ void StorageDistributedDirectoryMonitor::shutdownAndDropAllData()
void StorageDistributedDirectoryMonitor::run()
{
std::unique_lock lock{mutex};
std::lock_guard lock{mutex};
bool do_sleep = false;
while (!quit)
@ -320,12 +320,12 @@ void StorageDistributedDirectoryMonitor::run()
{
do_sleep = !processFiles(files);
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
last_exception = std::exception_ptr{};
}
catch (...)
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
do_sleep = true;
++error_count;
@ -344,7 +344,7 @@ void StorageDistributedDirectoryMonitor::run()
const auto now = std::chrono::system_clock::now();
if (now - last_decrease_time > decrease_error_count_period)
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
error_count /= 2;
last_decrease_time = now;
@ -453,10 +453,15 @@ std::map<UInt64, std::string> StorageDistributedDirectoryMonitor::getFiles()
}
}
metric_pending_files.changeTo(files.size());
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
if (files_count != files.size())
LOG_TRACE(log, "Files set to {} (was {})", files.size(), files_count);
if (bytes_count != new_bytes_count)
LOG_TRACE(log, "Bytes set to {} (was {})", new_bytes_count, bytes_count);
metric_pending_files.changeTo(files.size());
files_count = files.size();
bytes_count = new_bytes_count;
}
@ -509,9 +514,7 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
}
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
Poco::File{file_path}.remove();
metric_pending_files.sub();
markAsSend(file_path);
LOG_TRACE(log, "Finished processing `{}`", file_path);
}
@ -661,7 +664,7 @@ struct StorageDistributedDirectoryMonitor::Batch
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, parent.disk, parent.relative_path);
for (UInt64 file_index : file_indices)
Poco::File{file_index_to_path.at(file_index)}.remove();
parent.markAsSend(file_index_to_path.at(file_index));
}
else
{
@ -747,16 +750,24 @@ BlockInputStreamPtr StorageDistributedDirectoryMonitor::createStreamFromFile(con
return std::make_shared<DirectoryMonitorBlockInputStream>(file_name);
}
bool StorageDistributedDirectoryMonitor::scheduleAfter(size_t ms)
bool StorageDistributedDirectoryMonitor::addAndSchedule(size_t file_size, size_t ms)
{
if (quit)
return false;
{
std::lock_guard metrics_lock(metrics_mutex);
metric_pending_files.add();
bytes_count += file_size;
++files_count;
}
return task_handle->scheduleAfter(ms, false);
}
StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::getStatus() const
StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::getStatus()
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
return Status{
path,
@ -780,7 +791,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
batch.readText(in);
file_indices_to_skip.insert(batch.file_indices.begin(), batch.file_indices.end());
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
std::unordered_map<BatchHeader, Batch, BatchHeader::Hash> header_to_batch;
@ -850,7 +860,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
if (batch.isEnoughSize())
{
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
}
@ -858,7 +867,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
{
Batch & batch = kv.second;
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
{
@ -871,7 +879,7 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
}
}
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path) const
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path)
{
const auto last_path_separator_pos = file_path.rfind('/');
const auto & base_path = file_path.substr(0, last_path_separator_pos + 1);
@ -884,12 +892,37 @@ void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_p
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path + "/broken/");
Poco::File{file_path}.renameTo(broken_file_path);
Poco::File file(file_path);
{
std::lock_guard metrics_lock(metrics_mutex);
size_t file_size = file.getSize();
--files_count;
bytes_count -= file_size;
}
file.renameTo(broken_file_path);
LOG_ERROR(log, "Renamed `{}` to `{}`", file_path, broken_file_path);
}
void StorageDistributedDirectoryMonitor::markAsSend(const std::string & file_path)
{
Poco::File file(file_path);
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e) const
size_t file_size = file.getSize();
{
std::lock_guard metrics_lock(metrics_mutex);
metric_pending_files.sub();
--files_count;
bytes_count -= file_size;
}
file.remove();
}
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e)
{
/// mark file as broken if necessary
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
@ -912,7 +945,7 @@ void StorageDistributedDirectoryMonitor::updatePath(const std::string & new_rela
std::lock_guard lock{mutex};
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
relative_path = new_relative_path;
path = disk->getPath() + relative_path + '/';
}

View File

@ -48,7 +48,7 @@ public:
static BlockInputStreamPtr createStreamFromFile(const String & file_name);
/// For scheduling via DistributedBlockOutputStream
bool scheduleAfter(size_t ms);
bool addAndSchedule(size_t file_size, size_t ms);
/// system.distribution_queue interface
struct Status
@ -60,7 +60,7 @@ public:
size_t bytes_count;
bool is_blocked;
};
Status getStatus() const;
Status getStatus();
private:
void run();
@ -70,8 +70,9 @@ private:
void processFile(const std::string & file_path);
void processFilesWithBatching(const std::map<UInt64, std::string> & files);
void markAsBroken(const std::string & file_path) const;
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e) const;
void markAsBroken(const std::string & file_path);
void markAsSend(const std::string & file_path);
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e);
std::string getLoggerName() const;
@ -91,7 +92,7 @@ private:
struct BatchHeader;
struct Batch;
mutable std::mutex metrics_mutex;
std::mutex metrics_mutex;
size_t error_count = 0;
size_t files_count = 0;
size_t bytes_count = 0;

View File

@ -114,6 +114,7 @@ Block DistributedBlockOutputStream::getHeader() const
void DistributedBlockOutputStream::writePrefix()
{
storage.delayInsertOrThrowIfNeeded();
}
@ -717,6 +718,7 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::
auto dir_sync_guard = make_directory_sync_guard(*it);
}
auto file_size = Poco::File(first_file_tmp_path).getSize();
/// remove the temporary file, enabling the OS to reclaim inode after all threads
/// have removed their corresponding files
Poco::File(first_file_tmp_path).remove();
@ -726,7 +728,7 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::
for (const auto & dir_name : dir_names)
{
auto & directory_monitor = storage.requireDirectoryMonitor(disk, dir_name);
directory_monitor.scheduleAfter(sleep_ms.totalMilliseconds());
directory_monitor.addAndSchedule(file_size, sleep_ms.totalMilliseconds());
}
}

View File

@ -17,6 +17,10 @@ class ASTStorage;
#define LIST_OF_DISTRIBUTED_SETTINGS(M) \
M(Bool, fsync_after_insert, false, "Do fsync for every insert. Will decrease performance of inserts (only for async INSERT, i.e. insert_distributed_sync=false)", 0) \
M(Bool, fsync_directories, false, "Do fsync for temporary directory (that is used for async INSERT only) after all part operations (writes, renames, etc.).", 0) \
/** Inserts settings. */ \
M(UInt64, bytes_to_throw_insert, 0, "If more than this number of compressed bytes will be pending for async INSERT, an exception will be thrown. 0 - do not throw.", 0) \
M(UInt64, bytes_to_delay_insert, 0, "If more than this number of compressed bytes will be pending for async INSERT, the query will be delayed. 0 - do not delay.", 0) \
M(UInt64, max_delay_to_insert, 60, "Max delay of inserting data into Distributed table in seconds, if there are a lot of pending bytes for async send.", 0) \
DECLARE_SETTINGS_TRAITS(DistributedSettingsTraits, LIST_OF_DISTRIBUTED_SETTINGS)
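A hedged sketch of how these per-table settings could be applied (cluster and table names are illustrative; note that the validation added elsewhere in this patch requires bytes_to_throw_insert to stay above bytes_to_delay_insert):

CREATE TABLE dist_example AS data_example
    ENGINE = Distributed(my_cluster, currentDatabase(), data_example)
    SETTINGS bytes_to_delay_insert = 100000000,   -- start delaying async INSERTs above ~100 MB pending
             max_delay_to_insert = 60,            -- delay for at most 60 seconds
             bytes_to_throw_insert = 1000000000;  -- reject outright above ~1 GB pending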

View File

@ -60,6 +60,27 @@ static Names extractColumnNames(const ASTPtr & node)
}
}
/** Is used to order Graphite::Retentions by age and precision in descending order.
  * Throws an exception unless both age and precision are consistently less than or consistently greater than the other retention's.
*/
static bool compareRetentions(const Graphite::Retention & a, const Graphite::Retention & b)
{
if (a.age > b.age && a.precision > b.precision)
{
return true;
}
else if (a.age < b.age && a.precision < b.precision)
{
return false;
}
String error_msg = "age and precision should only grow up: "
+ std::to_string(a.age) + ":" + std::to_string(a.precision) + " vs "
+ std::to_string(b.age) + ":" + std::to_string(b.precision);
throw Exception(
error_msg,
ErrorCodes::BAD_ARGUMENTS);
}
/** Read the settings for Graphite rollup from config.
* Example
*
@ -157,8 +178,7 @@ appendGraphitePattern(const Poco::Util::AbstractConfiguration & config, const St
/// retention should be in descending order of age.
if (pattern.type & pattern.TypeRetention) /// TypeRetention or TypeAll
std::sort(pattern.retentions.begin(), pattern.retentions.end(),
[] (const Graphite::Retention & a, const Graphite::Retention & b) { return a.age > b.age; });
std::sort(pattern.retentions.begin(), pattern.retentions.end(), compareRetentions);
patterns.emplace_back(pattern);
}

View File

@ -14,10 +14,12 @@
#include <Columns/ColumnConst.h>
#include <Common/Macros.h>
#include <Common/ProfileEvents.h>
#include <Common/escapeForFileName.h>
#include <Common/typeid_cast.h>
#include <Common/quoteString.h>
#include <Common/randomSeed.h>
#include <Common/formatReadable.h>
#include <Parsers/ASTDropQuery.h>
#include <Parsers/ASTExpressionList.h>
@ -69,6 +71,13 @@ const UInt64 FORCE_OPTIMIZE_SKIP_UNUSED_SHARDS_ALWAYS = 2;
const UInt64 DISTRIBUTED_GROUP_BY_NO_MERGE_AFTER_AGGREGATION = 2;
}
namespace ProfileEvents
{
extern const Event DistributedRejectedInserts;
extern const Event DistributedDelayedInserts;
extern const Event DistributedDelayedInsertsMilliseconds;
}
namespace DB
{
@ -85,6 +94,8 @@ namespace ErrorCodes
extern const int UNABLE_TO_SKIP_UNUSED_SHARDS;
extern const int INVALID_SHARD_ID;
extern const int ALTER_OF_COLUMN_IS_FORBIDDEN;
extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES;
extern const int ARGUMENT_OUT_OF_BOUND;
}
namespace ActionLocks
@ -768,6 +779,14 @@ std::vector<StorageDistributedDirectoryMonitor::Status> StorageDistributed::getD
return statuses;
}
std::optional<UInt64> StorageDistributed::totalBytes(const Settings &) const
{
UInt64 total_bytes = 0;
for (const auto & status : getDirectoryMonitorsStatuses())
total_bytes += status.bytes_count;
return total_bytes;
}
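Since totalBytes() now sums the pending bytes across directory monitors, the value is expected to surface in system.tables; a minimal sketch (the table name is illustrative, see also the total_bytes/total_rows test later in this patch):

SELECT total_bytes, total_rows
FROM system.tables
WHERE database = currentDatabase() AND name = 'dist_example';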
size_t StorageDistributed::getShardCount() const
{
return getCluster()->getShardCount();
@ -967,6 +986,54 @@ void StorageDistributed::renameOnDisk(const String & new_path_to_table_data)
relative_data_path = new_path_to_table_data;
}
void StorageDistributed::delayInsertOrThrowIfNeeded() const
{
if (!distributed_settings.bytes_to_throw_insert &&
!distributed_settings.bytes_to_delay_insert)
return;
UInt64 total_bytes = *totalBytes(global_context.getSettingsRef());
if (distributed_settings.bytes_to_throw_insert && total_bytes > distributed_settings.bytes_to_throw_insert)
{
ProfileEvents::increment(ProfileEvents::DistributedRejectedInserts);
throw Exception(ErrorCodes::DISTRIBUTED_TOO_MANY_PENDING_BYTES,
"Too many bytes pending for async INSERT: {} (bytes_to_throw_insert={})",
formatReadableSizeWithBinarySuffix(total_bytes),
formatReadableSizeWithBinarySuffix(distributed_settings.bytes_to_throw_insert));
}
if (distributed_settings.bytes_to_delay_insert && total_bytes > distributed_settings.bytes_to_delay_insert)
{
/// Step is 5% of the delay, but not less than one second.
/// NOTE: max_delay_to_insert is in seconds, while the step is in milliseconds.
const size_t step_ms = std::max<double>(1000., double(distributed_settings.max_delay_to_insert) * 1'000 * 0.05);
UInt64 delayed_ms = 0;
do {
delayed_ms += step_ms;
std::this_thread::sleep_for(std::chrono::milliseconds(step_ms));
} while (*totalBytes(global_context.getSettingsRef()) > distributed_settings.bytes_to_delay_insert && delayed_ms < distributed_settings.max_delay_to_insert*1000);
ProfileEvents::increment(ProfileEvents::DistributedDelayedInserts);
ProfileEvents::increment(ProfileEvents::DistributedDelayedInsertsMilliseconds, delayed_ms);
UInt64 new_total_bytes = *totalBytes(global_context.getSettingsRef());
LOG_INFO(log, "Too many bytes pending for async INSERT: was {}, now {}, INSERT was delayed to {} ms",
formatReadableSizeWithBinarySuffix(total_bytes),
formatReadableSizeWithBinarySuffix(new_total_bytes),
delayed_ms);
if (new_total_bytes > distributed_settings.bytes_to_delay_insert)
{
ProfileEvents::increment(ProfileEvents::DistributedRejectedInserts);
throw Exception(ErrorCodes::DISTRIBUTED_TOO_MANY_PENDING_BYTES,
"Too many bytes pending for async INSERT: {} (bytes_to_delay_insert={})",
formatReadableSizeWithBinarySuffix(new_total_bytes),
formatReadableSizeWithBinarySuffix(distributed_settings.bytes_to_delay_insert));
}
}
}
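The new counters can be observed through system.events; a minimal sketch listing just the events introduced above:

SELECT event, value
FROM system.events
WHERE event IN ('DistributedRejectedInserts', 'DistributedDelayedInserts', 'DistributedDelayedInsertsMilliseconds');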
void registerStorageDistributed(StorageFactory & factory)
{
@ -1033,6 +1100,17 @@ void registerStorageDistributed(StorageFactory & factory)
distributed_settings.loadFromQuery(*args.storage_def);
}
if (distributed_settings.max_delay_to_insert < 1)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"max_delay_to_insert cannot be less then 1");
if (distributed_settings.bytes_to_throw_insert && distributed_settings.bytes_to_delay_insert &&
distributed_settings.bytes_to_throw_insert <= distributed_settings.bytes_to_delay_insert)
{
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"bytes_to_throw_insert cannot be less or equal to bytes_to_delay_insert (since it is handled first)");
}
return StorageDistributed::create(
args.table_id, args.columns, args.constraints,
remote_database, remote_table, cluster_name,

View File

@ -41,6 +41,7 @@ class StorageDistributed final : public ext::shared_ptr_helper<StorageDistribute
friend struct ext::shared_ptr_helper<StorageDistributed>;
friend class DistributedBlockOutputStream;
friend class StorageDistributedDirectoryMonitor;
friend class StorageSystemDistributionQueue;
public:
~StorageDistributed() override;
@ -76,6 +77,7 @@ public:
unsigned /*num_streams*/) override;
bool supportsParallelInsert() const override { return true; }
std::optional<UInt64> totalBytes(const Settings &) const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;
@ -83,7 +85,6 @@ public:
void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override;
void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override;
void renameOnDisk(const String & new_path_to_table_data);
void checkAlterIsPossible(const AlterCommands & commands, const Context & context) const override;
@ -98,65 +99,24 @@ public:
bool storesDataOnDisk() const override { return true; }
Strings getDataPaths() const override;
const ExpressionActionsPtr & getShardingKeyExpr() const { return sharding_key_expr; }
const String & getShardingKeyColumnName() const { return sharding_key_column_name; }
size_t getShardCount() const;
const String & getRelativeDataPath() const { return relative_data_path; }
std::string getRemoteDatabaseName() const { return remote_database; }
std::string getRemoteTableName() const { return remote_table; }
std::string getClusterName() const { return cluster_name; } /// Returns empty string if tables is used by TableFunctionRemote
/// create directory monitors for each existing subdirectory
void createDirectoryMonitors(const DiskPtr & disk);
/// ensure directory monitor thread and connectoin pool creation by disk and subdirectory name
StorageDistributedDirectoryMonitor & requireDirectoryMonitor(const DiskPtr & disk, const std::string & name);
/// Return list of metrics for all created monitors
/// (note that monitors are created lazily, i.e. until at least one INSERT executed)
std::vector<StorageDistributedDirectoryMonitor::Status> getDirectoryMonitorsStatuses() const;
void flushClusterNodesAllData(const Context & context);
ClusterPtr getCluster() const;
static IColumn::Selector createSelector(const ClusterPtr cluster, const ColumnWithTypeAndName & result);
/// Apply the following settings:
/// - optimize_skip_unused_shards
/// - force_optimize_skip_unused_shards
ClusterPtr getOptimizedCluster(const Context &, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const;
ClusterPtr skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, const Context & context) const;
ActionLock getActionLock(StorageActionBlockType type) override;
NamesAndTypesList getVirtuals() const override;
size_t getRandomShardIndex(const Cluster::ShardsInfo & shards);
/// Used by InterpreterInsertQuery
std::string getRemoteDatabaseName() const { return remote_database; }
std::string getRemoteTableName() const { return remote_table; }
/// Returns an empty string if the table is used by TableFunctionRemote
std::string getClusterName() const { return cluster_name; }
ClusterPtr getCluster() const;
const DistributedSettings & getDistributedSettingsRef() const { return distributed_settings; }
/// Used by InterpreterSystemQuery
void flushClusterNodesAllData(const Context & context);
String remote_database;
String remote_table;
ASTPtr remote_table_function_ptr;
/// Used by ClusterCopier
size_t getShardCount() const;
const Context & global_context;
Poco::Logger * log;
/// Used to implement TableFunctionRemote.
std::shared_ptr<Cluster> owned_cluster;
/// Is empty if this storage implements TableFunctionRemote.
const String cluster_name;
bool has_sharding_key;
bool sharding_key_is_deterministic = false;
ExpressionActionsPtr sharding_key_expr;
String sharding_key_column_name;
/// Used for global monotonic ordering of files to send.
SimpleIncrement file_names_increment;
ActionBlocker monitors_blocker;
protected:
private:
StorageDistributed(
const StorageID & id_,
const ColumnsDescription & columns_,
@ -186,6 +146,60 @@ protected:
bool attach,
ClusterPtr owned_cluster_ = {});
void renameOnDisk(const String & new_path_to_table_data);
const ExpressionActionsPtr & getShardingKeyExpr() const { return sharding_key_expr; }
const String & getShardingKeyColumnName() const { return sharding_key_column_name; }
const String & getRelativeDataPath() const { return relative_data_path; }
/// create directory monitors for each existing subdirectory
void createDirectoryMonitors(const DiskPtr & disk);
/// ensure directory monitor thread and connection pool creation by disk and subdirectory name
StorageDistributedDirectoryMonitor & requireDirectoryMonitor(const DiskPtr & disk, const std::string & name);
/// Return list of metrics for all created monitors
/// (note that monitors are created lazily, i.e. not until at least one INSERT has been executed)
///
/// Used by StorageSystemDistributionQueue
std::vector<StorageDistributedDirectoryMonitor::Status> getDirectoryMonitorsStatuses() const;
static IColumn::Selector createSelector(const ClusterPtr cluster, const ColumnWithTypeAndName & result);
/// Apply the following settings:
/// - optimize_skip_unused_shards
/// - force_optimize_skip_unused_shards
ClusterPtr getOptimizedCluster(const Context &, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const;
ClusterPtr skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, const Context & context) const;
size_t getRandomShardIndex(const Cluster::ShardsInfo & shards);
const DistributedSettings & getDistributedSettingsRef() const { return distributed_settings; }
void delayInsertOrThrowIfNeeded() const;
private:
String remote_database;
String remote_table;
ASTPtr remote_table_function_ptr;
const Context & global_context;
Poco::Logger * log;
/// Used to implement TableFunctionRemote.
std::shared_ptr<Cluster> owned_cluster;
/// Is empty if this storage implements TableFunctionRemote.
const String cluster_name;
bool has_sharding_key;
bool sharding_key_is_deterministic = false;
ExpressionActionsPtr sharding_key_expr;
String sharding_key_column_name;
/// Used for global monotonic ordering of files to send.
SimpleIncrement file_names_increment;
ActionBlocker monitors_blocker;
String relative_data_path;
/// Can be empty if relative_data_path is empty. In this case, a directory for the data to be sent is not created.

View File

@ -13,9 +13,11 @@ const char * auto_contributors[] {
"Aleksandra (Ася)",
"Aleksandrov Vladimir",
"Aleksei Levushkin",
"Aleksei Semiglazov",
"Aleksey",
"Aleksey Akulovich",
"Alex Bocharov",
"Alex Karo",
"Alex Krash",
"Alex Ryndin",
"Alex Zatelepin",
@ -81,8 +83,10 @@ const char * auto_contributors[] {
"Andrey Skobtsov",
"Andrey Urusov",
"Andy Yang",
"Anmol Arora",
"Anna",
"Anna Shakhova",
"Anthony N. Simon",
"Anton Ivashkin",
"Anton Kobzev",
"Anton Kvasha",
@ -143,6 +147,7 @@ const char * auto_contributors[] {
"Colum",
"Constantin S. Pan",
"CurtizJ",
"DIAOZHAFENG",
"Daniel Bershatsky",
"Daniel Dao",
"Daniel Qin",
@ -203,6 +208,7 @@ const char * auto_contributors[] {
"Fan()",
"FawnD2",
"FeehanG",
"FgoDt",
"Filipe Caixeta",
"Flowyi",
"Francisco Barón",
@ -224,7 +230,10 @@ const char * auto_contributors[] {
"Grigory Buteyko",
"Grigory Pervakov",
"Guillaume Tassery",
"Haavard Kvaalen",
"Habibullah Oladepo",
"Hamoon",
"Hasitha Kanchana",
"Hasnat",
"Hiroaki Nakamura",
"HuFuwang",
@ -261,6 +270,7 @@ const char * auto_contributors[] {
"Ivan Remen",
"Ivan Starkov",
"Ivan Zhukov",
"JackyWoo",
"Jacob Hayes",
"JaosnHsieh",
"Jason",
@ -271,6 +281,7 @@ const char * auto_contributors[] {
"Jiang Tao",
"Jochen Schalanda",
"John",
"John Hummel",
"Jonatas Freitas",
"Kang Liu",
"Karl Pietrzak",
@ -304,6 +315,7 @@ const char * auto_contributors[] {
"Luis Bosque",
"Lv Feng",
"Léo Ercolanelli",
"M0r64n",
"Maks Skorokhod",
"Maksim",
"Maksim Fedotov",
@ -317,9 +329,11 @@ const char * auto_contributors[] {
"Mark Frost",
"Mark Papadakis",
"Maroun Maroun",
"Marquitos",
"Marsel Arduanov",
"Marti Raudsepp",
"Martijn Bakker",
"Marvin Taschenberger",
"Masha",
"Matthew Peveler",
"Matwey V. Kornilov",
@ -338,6 +352,7 @@ const char * auto_contributors[] {
"Maxim Smirnov",
"Maxim Ulanovskiy",
"MaximAL",
"Mc.Spring",
"MeiK",
"Metehan Çetinkaya",
"Metikov Vadim",
@ -386,6 +401,7 @@ const char * auto_contributors[] {
"Nikita Vasilev",
"Nikolai Kochetov",
"Nikolai Sorokin",
"Nikolay",
"Nikolay Degterinsky",
"Nikolay Kirsh",
"Nikolay Semyachkin",
@ -439,6 +455,7 @@ const char * auto_contributors[] {
"Roman Peshkurov",
"Roman Tsisyk",
"Ruslan",
"Russ Frank",
"Ruzal Ibragimov",
"S.M.A. Djawadi",
"Sabyanin Maxim",
@ -461,12 +478,14 @@ const char * auto_contributors[] {
"Sergey Shtykov",
"Sergey V. Galtsev",
"Sergey Zaikin",
"Sergi Almacellas Abellana",
"Sergi Vladykin",
"SevaCode",
"Sherry Wang",
"Silviu Caragea",
"Simon Liu",
"Simon Podlipsky",
"Sina",
"Sjoerd Mulder",
"Slach",
"Snow",
@ -502,6 +521,7 @@ const char * auto_contributors[] {
"Vadim Plakhtinskiy",
"Vadim Skipin",
"VadimPE",
"Val",
"Valera Ryaboshapko",
"Vasily Kozhukhovskiy",
"Vasily Morozov",
@ -525,6 +545,7 @@ const char * auto_contributors[] {
"Vlad Arkhipov",
"Vladimir",
"Vladimir Bunchuk",
"Vladimir Ch",
"Vladimir Chebotarev",
"Vladimir Golovchenko",
"Vladimir Goncharov",
@ -662,11 +683,13 @@ const char * auto_contributors[] {
"frank",
"franklee",
"fredchenbj",
"fuwhu",
"g-arslan",
"ggerogery",
"giordyb",
"glockbender",
"glushkovds",
"guoleiyi",
"gyuton",
"hao.he",
"hchen9",
@ -675,6 +698,7 @@ const char * auto_contributors[] {
"hexiaoting",
"hotid",
"hustnn",
"idfer",
"igor",
"igor.lapko",
"ikopylov",
@ -686,9 +710,11 @@ const char * auto_contributors[] {
"javartisan",
"javi",
"javi santana",
"jennyma",
"jetgm",
"jianmei zhang",
"jyz0309",
"keenwolf",
"kmeaw",
"koshachy",
"kreuzerkrieg",
@ -697,6 +723,7 @@ const char * auto_contributors[] {
"kssenii",
"l",
"lalex",
"lehasm",
"leozhang",
"levushkin aleksej",
"levysh",
@ -781,6 +808,7 @@ const char * auto_contributors[] {
"shangshujie",
"shedx",
"simon-says",
"spff",
"spongedc",
"spyros87",
"stavrolia",
@ -799,6 +827,7 @@ const char * auto_contributors[] {
"tiger.yan",
"tison",
"topvisor",
"turbo jason",
"tyrionhuang",
"ubuntu",
"unegare",
@ -820,6 +849,7 @@ const char * auto_contributors[] {
"yangshuai",
"ygrek",
"yhgcn",
"yiguolei",
"ylchou",
"yonesko",
"yuefoo",
@ -829,8 +859,10 @@ const char * auto_contributors[] {
"zhang2014",
"zhangshengyu",
"zhangxiao018",
"zhangxiao871",
"zhen ni",
"zhukai",
"zlx19950903",
"zvrr",
"zvvr",
"zzsmdfj",

View File

@ -60,6 +60,7 @@ if [ "$DATA_DIR_PATTERN" != "$DATA_DIR" ]; then
cat ${CONFIG_SERVER_DIR}/ints_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/ints_dictionary.xml
cat ${CONFIG_SERVER_DIR}/strings_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/strings_dictionary.xml
cat ${CONFIG_SERVER_DIR}/decimals_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/decimals_dictionary.xml
cat ${CONFIG_SERVER_DIR}/executable_pool_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/executable_pool_dictionary.xml
fi
CLICKHOUSE_EXTRACT_CONFIG=${CLICKHOUSE_EXTRACT_CONFIG:="${CLICKHOUSE_EXTRACT} --config=$CLICKHOUSE_CONFIG"}

View File

@ -2,7 +2,6 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>

View File

@ -0,0 +1,154 @@
<dictionaries>
<dictionary>
<name>executable_pool_simple</name>
<structure>
<id>
<name>x</name>
</id>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data\t$read_data a\t$read_data b\n"; done</command>
<size>5</size>
</executable_pool>
</source>
<layout>
<direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_complex</name>
<structure>
<key>
<attribute>
<name>x</name>
<type>String</type>
</attribute>
<attribute>
<name>y</name>
<type>String</type>
</attribute>
</key>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data\tvalue a\tvalue b\n"; done</command>
</executable>
</source>
<layout>
<complex_key_direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_simple_implicit_key</name>
<structure>
<id>
<name>x</name>
</id>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data a\t$read_data b\n"; done</command>
<size>5</size>
<implicit_key>true</implicit_key>
</executable_pool>
</source>
<layout>
<direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_complex_implicit_key</name>
<structure>
<key>
<attribute>
<name>x</name>
<type>String</type>
</attribute>
<attribute>
<name>y</name>
<type>String</type>
</attribute>
</key>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "data a\tdata b\n"; done</command>
<size>5</size>
<implicit_key>true</implicit_key>
</executable_pool>
</source>
<layout>
<complex_key_direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
</dictionaries>
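A hedged usage sketch for the executable_pool_simple dictionary defined above (given its echo-style command, attribute `a` for key 1 is expected to be '1 a'):

SELECT dictGet('executable_pool_simple', 'a', toUInt64(1)) AS a,
       dictGet('executable_pool_simple', 'b', toUInt64(1)) AS b;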

View File

@ -39,11 +39,13 @@ ln -sf $SRC_PATH/users.d/readonly.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/access_management.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/database_atomic_drop_detach_sync.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/opentelemetry.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/remote_queries.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/ints_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/strings_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/decimals_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/executable_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/executable_pool_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/top_level_domains $DEST_SERVER_PATH/

View File

@ -0,0 +1,13 @@
<yandex>
<profiles>
<default>
<!-- Avoid "Connection failed at try №1" messages. -->
<connect_timeout_with_failover_ms>2000</connect_timeout_with_failover_ms>
<connect_timeout_with_failover_secure_ms>3000</connect_timeout_with_failover_secure_ms>
<!-- Avoid this logic in tests to avoid EOF (10 hours is enough for tests)-->
<idle_connection_timeout>36000</idle_connection_timeout>
<!-- NOTE: instead of tuning idle_connection_timeout,
SYSTEM RELOAD CONFIG can be executed before each test -->
</default>
</profiles>
</yandex>

View File

@ -94,4 +94,25 @@
</retention>
</default>
</graphite_rollup_broken>
<graphite_rollup_wrong_age_precision>
<path_column_name>metric</path_column_name>
<time_column_name>timestamp</time_column_name>
<value_column_name>value</value_column_name>
<version_column_name>updated</version_column_name>
<default>
<function>avg</function>
<retention>
<age>0</age>
<precision>60</precision>
</retention>
<retention>
<age>36000</age>
<precision>600</precision>
</retention>
<retention>
<age>72000</age>
<precision>300</precision>
</retention>
</default>
</graphite_rollup_wrong_age_precision>
</yandex>

View File

@ -3,6 +3,7 @@ import os.path as p
import time
import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
@ -442,3 +443,20 @@ SELECT * FROM test.graphite;
''')
assert TSV(result) == TSV(expected)
def test_wrong_rollup_config(graphite_table):
with pytest.raises(QueryRuntimeException) as exc:
q('''
CREATE TABLE test.graphite_not_created
(metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
ENGINE = GraphiteMergeTree('graphite_rollup_wrong_age_precision')
PARTITION BY toYYYYMM(date)
ORDER BY (metric, timestamp)
SETTINGS index_granularity=1;
''')
# The order of retentions is not guaranteed
assert ("age and precision should only grow up: " in str(exc.value))
assert ("36000:600" in str(exc.value))
assert ("72000:300" in str(exc.value))

View File

@ -471,12 +471,14 @@ def select_without_columns(clickhouse_node, mysql_node, service_name):
mysql_node.query("CREATE DATABASE db")
mysql_node.query("CREATE TABLE db.t (a INT PRIMARY KEY, b INT)")
clickhouse_node.query(
"CREATE DATABASE db ENGINE = MaterializeMySQL('{}:3306', 'db', 'root', 'clickhouse')".format(service_name))
"CREATE DATABASE db ENGINE = MaterializeMySQL('{}:3306', 'db', 'root', 'clickhouse') SETTINGS max_flush_data_time = 100000".format(service_name))
check_query(clickhouse_node, "SHOW TABLES FROM db FORMAT TSV", "t\n")
clickhouse_node.query("SYSTEM STOP MERGES db.t")
clickhouse_node.query("CREATE VIEW v AS SELECT * FROM db.t")
mysql_node.query("INSERT INTO db.t VALUES (1, 1), (2, 2)")
mysql_node.query("DELETE FROM db.t WHERE a=2;")
mysql_node.query("DELETE FROM db.t WHERE a = 2;")
# We need to execute a DDL statement to flush the data buffer
mysql_node.query("CREATE TABLE db.temporary(a INT PRIMARY KEY, b INT)")
optimize_on_insert = clickhouse_node.query("SELECT value FROM system.settings WHERE name='optimize_on_insert'").strip()
if optimize_on_insert == "0":

View File

@ -2583,6 +2583,8 @@ def test_kafka_issue14202(kafka_cluster):
kafka_format = 'JSONEachRow';
''')
time.sleep(3)
instance.query(
'INSERT INTO test.kafka_q SELECT t, some_string FROM ( SELECT dt AS t, some_string FROM test.empty_table )')
# check instance is alive

View File

@ -1,17 +1,22 @@
<test>
<create_query>DROP TABLE IF EXISTS test_prl;</create_query>
<create_query>CREATE TABLE test_prl (n UInt64) ENGINE MergeTree ORDER BY n;</create_query>
<create_query>GRANT CREATE ROW POLICY ON *.* TO CURRENT_USER</create_query>
<create_query>CREATE ROW POLICY OR REPLACE test_prl_policy ON test_prl AS PERMISSIVE FOR SELECT USING n % 7 TO ALL;</create_query>
<settings>
<max_threads>1</max_threads>
<max_insert_threads>10</max_insert_threads>
<optimize_move_to_prewhere>0</optimize_move_to_prewhere>
</settings>
<fill_query>INSERT INTO test_prl SELECT number FROM numbers(50000000);</fill_query>
<create_query>DROP TABLE IF EXISTS test_prl</create_query>
<create_query>CREATE TABLE test_prl (n UInt64) ENGINE MergeTree ORDER BY n</create_query>
<create_query>CREATE ROW POLICY OR REPLACE test_prl_policy ON test_prl AS PERMISSIVE FOR SELECT USING n % 7 TO ALL</create_query>
<query>SELECT * FROM test_prl;</query>
<query>SELECT * FROM test_prl WHERE n % 3 AND n % 5 SETTINGS optimize_move_to_prewhere = 0;</query>
<query>SELECT * FROM test_prl PREWHERE n % 3 AND n % 5;</query>
<query>SELECT * FROM test_prl PREWHERE n % 3 WHERE n % 5;</query>
<query>SELECT * FROM test_prl PREWHERE n % 5 WHERE n % 3;</query>
<fill_query>INSERT INTO test_prl SELECT number FROM numbers(100000000)</fill_query>
<drop_query>DROP ROW POLICY IF EXISTS test_prl_policy ON test_prl;</drop_query>
<drop_query>DROP TABLE IF EXISTS test_prl;</drop_query>
<query>SELECT sum(n) FROM test_prl</query>
<query>SELECT sum(n) FROM test_prl WHERE n % 3 AND n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 3 AND n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 3 WHERE n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 5 WHERE n % 3</query>
<drop_query>DROP ROW POLICY IF EXISTS test_prl_policy ON test_prl</drop_query>
<drop_query>DROP TABLE IF EXISTS test_prl</drop_query>
</test>

View File

@ -0,0 +1,4 @@
Check total_bytes/total_rows for Distributed
0 \N
1 \N
0 \N

View File

@ -0,0 +1,11 @@
SELECT 'Check total_bytes/total_rows for Distributed';
CREATE TABLE check_system_tables_null (key Int) Engine=Null();
CREATE TABLE check_system_tables AS check_system_tables_null Engine=Distributed(test_shard_localhost, currentDatabase(), check_system_tables_null);
SYSTEM STOP DISTRIBUTED SENDS check_system_tables;
SELECT total_bytes, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
INSERT INTO check_system_tables SELECT * FROM numbers(1) SETTINGS prefer_localhost_replica=0;
SELECT total_bytes>0, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
SYSTEM FLUSH DISTRIBUTED check_system_tables;
SELECT total_bytes, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
DROP TABLE check_system_tables_null;
DROP TABLE check_system_tables;

View File

@ -1,7 +1,3 @@
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'none';
SET connect_timeout_with_failover_ms = 5000;
DROP TABLE IF EXISTS source_table1;
DROP TABLE IF EXISTS source_table2;
DROP TABLE IF EXISTS distributed_table1;

View File

@ -1,9 +1,5 @@
-- TODO: correct testing with real unique shards
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'fatal';
SET connect_timeout_with_failover_ms = 5000;
set optimize_distributed_group_by_sharding_key=1;
drop table if exists dist_01247;

View File

@ -1,9 +1,5 @@
-- TODO: correct testing with real unique shards
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'fatal';
SET connect_timeout_with_failover_ms = 5000;
set optimize_distributed_group_by_sharding_key=1;
drop table if exists dist_01247;

View File

@ -1,5 +1,5 @@
send_timeout Seconds 300
connect_timeout Seconds 10
connect_timeout_with_failover_ms Milliseconds 50
connect_timeout_with_failover_secure_ms Milliseconds 100
connect_timeout_with_failover_ms Milliseconds 2000
connect_timeout_with_failover_secure_ms Milliseconds 3000
max_memory_usage UInt64 10000000000

View File

@ -10,20 +10,15 @@ select * from system.distribution_queue;
select 'INSERT';
system stop distributed sends dist_01293;
insert into dist_01293 select * from numbers(10);
-- metrics are updated only after distributed_directory_monitor_sleep_time_ms
set distributed_directory_monitor_sleep_time_ms=10;
-- 1 second should guarantee a metrics update
-- XXX: but this is kind of a quirk; it would be much better to account for these metrics without any delays.
select sleep(1) format Null;
select is_blocked, error_count, data_files, data_compressed_bytes>100 from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes>100 from system.distribution_queue where database = currentDatabase();
system flush distributed dist_01293;
select 'FLUSH';
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue where database = currentDatabase();
select 'UNBLOCK';
system start distributed sends dist_01293;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue where database = currentDatabase();
drop table null_01293;
drop table dist_01293;

View File

@ -0,0 +1,14 @@
drop table if exists dist_01670;
drop table if exists data_01670;
create table data_01670 (key Int) engine=Null();
create table dist_01670 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01670) settings bytes_to_throw_insert=1;
system stop distributed sends dist_01670;
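-- with sends stopped, inserted blocks stay pending on disk, so bytes_to_throw_insert can be triggered deterministically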
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01670 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will fail because of bytes_to_throw_insert=1
-- (the previous block is definitely larger, since it includes a header)
insert into dist_01670 select * from numbers(1) settings prefer_localhost_replica=0; -- { serverError 574 }
system flush distributed dist_01670;
drop table dist_01670;
drop table data_01670;

View File

@ -0,0 +1,3 @@
max_delay_to_insert will throw
max_delay_to_insert will succeed
flushed

View File

@ -0,0 +1,86 @@
#!/usr/bin/env bash
# NOTE: $SECONDS has one-second accuracy, so we need some delta, hence the -1 in the time conditions.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
max_delay_to_insert=5
${CLICKHOUSE_CLIENT} -nq "
drop table if exists dist_01675;
drop table if exists data_01675;
"
${CLICKHOUSE_CLIENT} -nq "
create table data_01675 (key Int) engine=Null();
create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert;
system stop distributed sends dist_01675;
"
#
# Case 1: max_delay_to_insert will throw.
#
echo "max_delay_to_insert will throw"
start_seconds=$SECONDS
${CLICKHOUSE_CLIENT} --testmode -nq "
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will fail, because of bytes_to_delay_insert=1 and max_delay_to_insert=5,
-- while distributed sends are stopped.
--
-- (the previous block is definitely larger, since it includes a header)
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; -- { serverError 574 }
system flush distributed dist_01675;
"
end_seconds=$SECONDS
if (( (end_seconds-start_seconds)<(max_delay_to_insert-1) )); then
echo "max_delay_to_insert was not satisfied ($end_seconds-$start_seconds)"
fi
#
# Case 2: max_delay_to_insert will eventually succeed.
#
echo "max_delay_to_insert will succeed"
max_delay_to_insert=10
${CLICKHOUSE_CLIENT} -nq "
drop table dist_01675;
create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert;
system stop distributed sends dist_01675;
"
flush_delay=4
function flush_distributed_worker()
{
sleep $flush_delay
${CLICKHOUSE_CLIENT} -q "system flush distributed dist_01675"
echo flushed
}
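# Flush in the background after $flush_delay seconds so that the delayed INSERT below
# can complete well before max_delay_to_insert expires.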
flush_distributed_worker &
start_seconds=$SECONDS
${CLICKHOUSE_CLIENT} --testmode -nq "
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will succeed, thanks to the SYSTEM FLUSH DISTRIBUTED running in the background.
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
"
end_seconds=$SECONDS
wait
if (( (end_seconds-start_seconds)<(flush_delay-1) )); then
echo "max_delay_to_insert was not wait flush_delay ($end_seconds-$start_seconds)"
fi
if (( (end_seconds-start_seconds)>=(max_delay_to_insert-1) )); then
echo "max_delay_to_insert was overcommited ($end_seconds-$start_seconds)"
fi
${CLICKHOUSE_CLIENT} -nq "
drop table dist_01675;
drop table data_01675;
"

View File

@ -1,3 +1,4 @@
3
3
1
1

Some files were not shown because too many files have changed in this diff.